
    *i]              	       :   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
 d dlmZ d dlmZmZmZmZmZ d dlmZmZmZmZmZmZmZmZmZmZmZmZ d dl m!Z! d dl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( h d	Z)h d
Z*dZ+dZ,g dg dg dg dg dg dg dg ddZ- G d d      Z.y)    )SentenceTransformerN)Path)fuzz)ListDictAnyOptionalTuple)EMBEDDINGS_DIRHYBRID_SEARCH_OVERSAMPLEHYBRID_VECTOR_WEIGHTHYBRID_KEYWORD_WEIGHTHYBRID_MIN_KEYWORD_SCOREHYBRID_BM25_WEIGHTMAX_CHUNKS_PER_SOURCETECHNICAL_TERMSSYNONYMSTRADING_KEYWORD_TAXONOMYTRADING_SYNONYMSDOMAIN_KEYWORD_MAPPING)get_keyword_manager)VectorStoreErrorIndexLoadErrorIndexSaveErrorEmbeddingErrorValidationErrorResourceError>N   alsobeenbotheachevenfromgivehavehelpintolikemakemanymoremostmuchneedonlyontoshowsomesuchtellthanthatthemtheythisuponverywantwerewhenwillwithyouraboutafterbasedbeingcouldgimmeguidemightothershallstepstheirtherethesethosewherewhichwhilewouldanswerbeforedetailguidespleaseshouldwithinbecausebetweendetailsexplainlookingproviderelatedsummarythroughdescribequestion	according	regarding	summaries	summarizeinformation>   gbudpotbookbudsgramgrowhempweedbooksganjagramsrH   plantflowerrX   manualplantsflowersgrowingharvestmanualscannabisdocument	cultivate	documents	marijuanacultivation      ?g      ?)
hydroponichydroponicshydro	soil-less	soil lesssoillesszwater cultureznutrient solutionznutrient tankzebb and flow	reservoirzdeep water culturedwczdrip systemzflood and drainsoilless mixsoilless mixture)	r   r   r   r   zperlite mixz	coco coirrockwoolzexpanded clayz	grow cube)medicalmedicine	medicinalpatientdoctor	physiciantherapeutictherapy	treatmentdosagedoseclinicalsymptom	condition	diagnosisprescriptionwellnesshealth
healthcarezpain reliefzanxiety reliefzanti-inflammatory)finance	financialfundingcapitalloancreditdebt
investmentinvestorroizreturn on investmentz	cash flowprofitprofitabilityrevenueincomemarginbudget)legallawlawsstatute
regulation
regulatory
compliancelicense	licensingpermit
permittingdecriminalizelegalizationcourtpolicy	ordinance)businessentrepreneurentrepreneurshipstartupcompanycorporationmarket	marketingsalesbranding	franchisezsupply chain	wholesaleretailcustomerclientstrategyscaling
operations
management)zmental health
psychologyr   
counseling	wellbeingstressanxietyptsd
depressionmindfulness
meditation)history
historical	chronicletimelineeracenturyancientzmodern history	historiancivilizationheritagelegacyanthropologyarchaeologyculturezcultural historyrecords)r   zsoilless growingr   r   r   r   r   r   c                   &   e Zd Zd dZd Zd Zd Zd Zd Zd Z	d	 Z
d!d
Zd"dededee   deeeef      fdZd"dee   dedee   deeeef      fdZdededefdZdededee   defdZdededefdZd Zd Zd Zd Zd Zd#dZd$defdZy)%VectorStoreNc                    ddl m} t               | _        ||}t	        |      | _        d | _        g | _        g | _        g | _	        g | _
        d| _        t        dz  | _        t        dz  | _        t        j                          | _        y )Nr   )EMBEDDING_MODEL        zfaiss_index.indexzdocuments_metadata.json)airagagent.configr   r   keyword_managerr   modelindexr   searchable_texts_rawsearchable_texts_lowerdoc_lengthsavg_doc_lengthr   
index_filemetadata_file	threadingLock_lock)self
model_namer   s      1/var/www/html/backtest/airagagent/vector_store.py__init__zVectorStore.__init__i   s    5  34(J(4

$&!&(#!(+>>+.GG ^^%
    c                    | j                   j                         sy| j                  j                         r	 t        j                  t        | j                               | _        t        | j                  dd      5 }t        j                  |      | _
        ddd       | j                          t        dt        | j                         d       yddlm} |dz  }|j                         r	 t        d       t        j                  t        | j                               | _        ddl}t        |d      5 }|j                  |      | _
        ddd       | j)                          | j                          |j+                          t        dt        | j                         d       yy# 1 sw Y   xY w# t        j                  $ r0}t        t        | j                        d	t        |      d
      d}~wt         $ r0}t        t        | j                         dt        |      d
      d}~ww xY w# 1 sw Y   xY w# t         $ r&}t        t        |      dt        |      d
      d}~ww xY w)z&Load existing FAISS index and metadataFrutf-8encodingNu   ✓ Loaded existing index with z
 documentsTjson_decode
error_typeoriginal_error
faiss_loadr   )r   zdocuments_metadata.pklz'Migrating from pickle to JSON format...rbu   ✓ Migrated index with z documents to JSON formatpickle_migration)r   existsr   faiss
read_indexstrr   openjsonloadr   _prepare_searchable_textsprintlenJSONDecodeErrorr   	Exceptionconfigr   picklesave_metadataunlink)r  fer   old_pickle_filer   s         r  load_existing_indexzVectorStore.load_existing_index}   s   %%'$$&"--c$//.BC
$,,cGD 2%)YYq\DN2..07DNN8K7LJWX 	*(+CC!!#?@"--c$//.BC
/40 4A%+[[^DN4 ""$..0&&(0T^^1D0EE^_` O2 2
 '' $**+#0CFK   $(#/3q6J 4 4  $(#5QP sn   AF6 <F):F6 .AH? 6H3AH? )F3.F6 6H0	+G44H0 +H++H03H<8H? ?	I.!I))I.c           	         | j                   5  |s
	 ddd       yt        |t              st        ddt	        |      i      t        |      D ]G  \  }}t        |t              st        d| d|t	        |      d      d|vs7t        d| dd	|i       |D cg c]  }|d   	 }}	 | j                  j                  |d
ddd      }| j                  .t        j                  |j                   d         | _        g | _        t%        j&                  |      j)                  d      }t        j*                  |       | j                  j-                  |       | j"                  j/                  |       |D ]W  }| j1                  |      }	| j2                  j5                  |	       | j6                  j5                  |	j9                                Y t;        dt        |       d       ddd       yc c}w # t        $ r5}|r|d   dd dz   nd}t        |t        |      t        |      d      d}~ww xY w# 1 sw Y   yxY w)z%Add new documents to the vector storeNzChunks must be a listreceived_typezChunk z must be a dictionary)chunk_index
chunk_typecontentz missing 'content' keyr)      TF
batch_sizeshow_progress_barconvert_to_numpynormalize_embeddingsr      z... )r  r.     float32u
   ✓ Added z chunks to vector store)r  
isinstancelistr   type	enumeratedictr   encoder  r   r  r  r   r  IndexFlatIPshaper   nparrayastypenormalize_L2addextend_build_searchable_textr   appendr   lowerr  )
r  chunksichunkcontents
embeddingsr$  problematic_contentembeddings_npsearchable_texts
             r  add_documentszVectorStore.add_documents   sR   ZZ /	E/	E /	E fd+%&=QUV\Q]?^__ &f- b5!%.)F1#5J*K]^nrsxnyMz{{E))F1#5K*L}^_N`aa	b 7==Ui(=H=s!ZZ..!&*%)). / 
 zz!"..z/?/?/BC
!# HHZ077	BM}-JJNN=) NN!!&) L"&"="=e"D))00A++22?3H3H3JKL
 Js6{m+BCD_/	E /	E >  sCKhqk$3&7%&?QS#$%8SQRVcfgocp:qrrs1/	E /	EsN   H5A(H5H5G/&H5) G4	DH5/H54	H2=0H--H22H55H>c           	         | j                   5  | j                  t        dddi      	 t        j                  | j                  t        | j                               | j                          t        d       	 ddd       y# t        $ r/}t        t        | j                        dt        |      i      d}~ww xY w# 1 sw Y   yxY w)z!Save the FAISS index and metadataNz No index to save - index is Noneindex_statenoneu   ✓ Saved vector store indexr  )
r  r   r   r  write_indexr  r   r!  r  r  )r  r$  s     r  
save_indexzVectorStore.save_index   s    ZZ 		Wzz!$%G-Y_I`aaW!!$**c$//.BC""$45		W 		W  W$S%9<LcRSf;UVVW		W 		Ws)   B<AB	B9
*B44B99B<<Cc                    	 t        | j                  dd      5 }t        j                  | j                  |dd       ddd       y# 1 sw Y   yxY w# t
        $ r0}t        t        | j                        dt        |      d	      d}~ww xY w)
z#Save document metadata to JSON filewr	  r
  F   )ensure_asciiindentN	json_saver  )r  r   r  dumpr   r  r   r  )r  r#  r$  s      r  r!  zVectorStore.save_metadata   s    	qd((#@ KA		$..!%JK K K 	q T%7%7!8hklmhn:opp	qs3   A $AA AA A 	B+BBc           	         | j                   5  |s| j                  |       	 ddd       yt        d| dt        |       d       | j                  D cg c](  }|j                  di       j                  d      |k7  r|* }}t        dt        |       d       ||z   }t        d	t        |       d
       | j                  | j                  j                  }n+| j                  j                  dg      }|j                  d   }t        j                  |      | _        g | _        g | _        |D cg c]  }|d   	 }}d}	t        dt        |      |	      D ]  }
||
|
|	z    }	 | j                  j                  ||	ddd      }t!        j"                  |      j%                  d      }t        j&                  |       | j                  j)                  |       t+        ||
|
|	z          D ]Z  \  }}| j-                  |      }| j                  j/                  |       | j                  j/                  |j1                                \  || _        t        d| dt        |       d       t        d| j                  j8                          ddd       yc c}w c c}w # t2        $ r }t5        d|
 t7        |      |
d      d}~ww xY w# 1 sw Y   yxY w)a:  
        Safely replace all documents for a source with new chunks.
        This rebuilds the index to ensure consistency (prevents index corruption).

        Args:
            source_name: Name of the source file to replace
            new_chunks: List of new chunk dictionaries to replace old ones with
        Nz Replacing documents for source: z (z new chunks)metadatasourcez
  Keeping z documents from other sourcesz  Rebuilding index with z total documents...testr4  r+  r,  r   FTr-  r5  z"Error embedding batch starting at )r  batch_startu"   ✓ Successfully replaced source 'z': z chunksz  Total documents in index: )r  _remove_source_from_indexr  r  r   getr   dr   r;  r=  r  r<  r   r   ranger>  r?  r@  rA  rB  r9  rD  rE  rF  r  r   r  ntotal)r  source_name
new_chunksdockept_documentsupdated_documents	dimensionsample_embeddingrJ  r.  rH  batchrK  rM  jrN  r$  s                    r  replace_source_documentsz$VectorStore.replace_source_documents   s    ZZ H	F..{;	H	F H	F 4[MC
OCTT`ab  $~~77:r*..x8KG N 
 Js>233PQR !/ ; ,S1B-C,DDWXY zz% JJLL	 $(::#4#4fX#> ,2215	 **95DJ )+D%*,D' 3DD3IDHDJ1c(mZ8  Q^4!%!2!2#-*/)--2 "3 "J %'HHZ$8$?$?	$JM&&}5JJNN=1 #,,=a!j.,Q"R T3*.*E*Ec*J1188I33::?;P;P;RST6 /DN6{m3s:FWW^_`01B1B0CDEQH	F H	F< E. ! (<QC@+.q6!D }H	F H	FsU   K)K-JB1K2J>%K$C"JAK
K	K(KKKKc           	         | j                   5  | j                  D cg c](  }|j                  di       j                  d      |k7  r|* }}t        |      t        | j                        k(  rt	        d|        	 ddd       yt	        dt        | j                        t        |      z
   d|        | j
                  | j
                  j                  }n+| j                  j                  dg      }|j                  d   }t        j                  |      | _        g | _        g | _        |D cg c]  }|d	   	 }}d
}t        dt        |      |      D ]  }||||z    }	| j                  j                  |	|ddd      }
t        j                   |
      j#                  d      }t        j$                  |       | j
                  j'                  |       t)        ||||z          D ]Z  \  }}| j+                  |      }| j                  j-                  |       | j                  j-                  |j/                                \  || _        t	        d| dt        | j                                ddd       yc c}w c c}w # 1 sw Y   yxY w)zCRemove all documents for a source by rebuilding index without them.r]  r^  zNo documents found for source: Nz	Removing z documents for source: r_  r4  r+  r,  r   FTr-  r5  u   ✓ Removed source 'z'. Total documents: )r  r   rb  r  r  r   rc  r   r;  r=  r  r<  r   r   rd  r>  r?  r@  rA  rB  r9  rD  rE  rF  )r  rf  rh  ri  rk  rl  rJ  r.  rH  rm  rK  rM  rn  rN  s                 r  ra  z%VectorStore._remove_source_from_indexI  su   ZZ -	a#~~77:r*..x8KG N 
 >"c$..&997}EF-	a -	a Ic$..1C4GGHH_`k_lmn zz% JJLL	#'::#4#4fX#> ,2215	**95DJ(*D%*,D'2@A3IAHAJ1c(mZ8 P Q^4!ZZ..)&+%)). / 
 !# 4 ; ;I F""=1

}-'q1z>(JK PFAs&*&A&A#&FO--44_E//667L7L7NOPP$ ,DN(5I#dnnJ]I^_`[-	a -	a, B/-	a -	as0   J-I7	2JB*J.I<:D4J7
JJ
c                     d}| j                   D ]A  }|j                  di       j                  d      |k(  s(d|j                  di       d<   |dz  }C |dkD  r"| j                          t	        d| d| d	       |S )
z
        DEPRECATED: Use replace_source_documents instead.
        This method only marks documents as deleted but doesn't remove vectors from index,
        which can cause index corruption. Kept for backward compatibility but not recommended.
        r   r]  r^  Tdeletedr4  u   ⚠ Marked z documents from z: as deleted (deprecated method - may cause index mismatch))r   rb  
setdefaultr!  r  )r  rf  countrh  s       r  mark_source_as_deletedz"VectorStore.mark_source_as_deletedz  s     >> 	Cwwz2&**84C<@z2.y9
	
 19 Kw&6{mC}~r  c                 p   | j                   t        dddi      t        | j                        dk(  rg S |st	        dddi      t        |t              s!t	        dd	t        |      j                  i      t        |j                               dk(  rt	        d
dt        |      i      |j                         }t        j                  d|      }g }g }g }t               }|D ]#  }	|	t        v s|j!                  t        |	          % t"        D ]7  }
|
|v s|
|vs|j%                  |
       |
t&        vs'|j%                  |
       9 |D ]2  }
|
|vs|j%                  |
       |
t&        vs"|j%                  |
       4 |D ]4  }	t        |	      dk  r|	t(        v r|	|v s|	|v r$|j%                  |	       6 g }t               }||z   D ]m  }
|
|v r|j%                  |
t        j*                  dt        j,                  |
      z   dz   t        j.                        |
|v |
|v d       |j1                  |
       o | j3                          |r| j5                  |      ni }	 | j6                  j9                  |g      }t;        j<                  |      j?                  d      }tA        jB                  |       tE        |tF        z  |      }| j                   jI                  ||      \  }}g }g }tO        |d   |d         D ]  \  }}|t        | j                        k\  r | j                  |   }|jQ                  di       jQ                  d      rQ|j%                  |       | jR                  |   }| jT                  |   }d}|rtW        jX                  ||      dz  }g }d}d}d}d}t[        d |D              } t[        d |D              }!|r|D ]X  }"|"d   }#|#jI                  |      s|j%                  |"d          |"d   rd}|"jQ                  d      rd}t]        d|dz         }d}Z t        |      d kD  r(t]        d|d!t]        t        |      d z
  d      z  z         }| j_                  ||      }|s|r|dd" }|r|s| st]        |d#      }|su|D ]p  }	t        |	      dkD  st        j*                  dt        j,                  |	      z   dz   t        j.                        }$|$jI                  |      sat]        d|d$z         } n |t`        k  rd}|r|!r|s|!s| r|s| s|s%| jc                  |||      }%td        tg        |      z  th        |z  z   tj        |%z  z   }&|d%   |d   |&tg        |      ||%|||d&	}'|j%                  |'        tm        d' |D              }(|D )cg c]  })|)jQ                  d      s|) }*})|*s|D )cg c]
  })|)d   s	|) }*})|*s|}*|*r9|(|k  r4||(z
  }+| jo                  |*|+t        |      |(      },|jq                  |,       |js                  d) d*       g }-i }.t               }/|D ]  }'|'jQ                  di       jQ                  d+d,      }0|.jQ                  |0d      }1|1tt        k\  rA|-j%                  |'       |1d z   |.|0<   |'jQ                  d-      }2|2|/j1                  |2       t        |-      |k\  s n |*r|-D 3ch c]$  }3|3jQ                  di       jQ                  d+d,      & }4}3tE        d|t        |4      z
        }5|5dkD  r| jo                  |*|5|/|4|.      }6|6D ]  }7|7jQ                  di       jQ                  d+d,      }0|0|4v r*|-j%                  |7       |4j1                  |0       |.jQ                  |0d      d z   |.|0<   |7jQ                  d-      }2|2|/j1                  |2       t        |-      |k\  s n t        |-      |k  r.|D ])  }'|'|-v r|-j%                  |'       t        |-      |k\  s( |-S  |-S # tJ        $ r}tM        d|        g cY d}~S d}~ww xY wc c})w c c})w c c}3w )/zFSearch for similar documents using hybrid vector + keyword re-ranking.NzIndex not loadedrQ  rR  r   zQuery cannot be emptyquery_lengthzQuery must be a string
query_typezQuery cannot be only whitespacequery_contentz\b\w+\b   \btermpattern	is_domain
is_primaryr5  zError during search: r]  rr  r   g      Y@Fc              3   &   K   | ]	  }|d      yw)r  N .0ps     r  	<genexpr>z%VectorStore.search.<locals>.<genexpr>  s     &P!q~&P   c              3   &   K   | ]	  }|d      yw)r  Nr  r  s     r  r  z%VectorStore.search.<locals>.<genexpr>  s     'RA,'Rr  r~  r}  r  Tr        ?      ?r4  皙?   皙?333333?r+  	r+  r]  scorevector_scorekeyword_score
bm25_scorehas_keyword_matchkeyword_snippet	doc_indexc              3   B   K   | ]  }|j                  d       rd  yw)r  r4  Nrb  )r  	candidates     r  r  z%VectorStore.search.<locals>.<genexpr>4  s%       
}}01  
s   )exclude_indices	idf_tablec                     | d   S Nr  r  )items    r  <lambda>z$VectorStore.search.<locals>.<lambda>K  s
    g r  keyreverser^  Unknownr  )r  exclude_sourcesr  );r   r   r  r   r   r6  r  r8  __name__stripreprrF  refindallsetr   updater   rE  GENERIC_DOMAIN_TERMSQUERY_STOPWORDScompileescape
IGNORECASErB  _ensure_searchable_texts_compute_idf_tabler   r;  r>  r?  r@  r  rA  maxr   searchr  r  ziprb  r   r   r   partial_ratioanymin_extract_keyword_snippetr   _compute_bm25_scorer   floatr   r   sum_keyword_fallback_searchrC  sortr   )8r  querykquery_lowerquery_wordsdomain_termsprimary_domain_termsgeneral_termsexpanded_termswordr}  keyword_patternsadded_termsr  query_embedding
oversamplescoresindicesr$  
candidatesretrieved_indicesr  idxrh  searchable_text_lowersearchable_text_rawr  matched_termsr  r  matched_domainmatched_primaryrequire_domain_matchrequire_primary_matchpattern_infoterm_patternword_patternr  combined_scorer  keyword_hit_countr  fallback_patternsneededfallback_candidatesbalanced_candidatessource_countsused_indicesr^  rt  doc_idxcexisting_sourcesneeded_sourcesextra_candidatesextra_candidates8                                                           r  r  zVectorStore.search  s	   ::"#5v7NOOt~~!#I!"9NA;NOO%%!":\4PU;K_K_<`aau{{}"!"CoW[\aWbEcddkkmjj[9! 	6Dx%%htn5	6 $ 	6D{"t<'?##D)33(//5		6 # 	6D<'##D)33(//5		6   	'D4yA~&|#t}'<  &	' e =0 		"D{"##::ebiio&=&Er}}U!\1"&::	%  OOD!		" 	%%'AQD++,<=WY			"jj//8O hh7>>yIO/Q!991=J"jj//LOFG
 
fQi4 [	)JE3c$..))..%Cwwz2&**95$$S)$($?$?$D!"&";";C"@  M$ $ 2 2;@U VY^ ^ M %"O"N#O#&&P?O&P#P $''RAQ'R$R!$4 	1L#/	#:L#**+>?%,,\&-AB'4-1N+//=26(+C1D(E,0)	1 }%)$']S3s=GY\]G]_`Ca=a-a$bM"&"?"?@SUe"f&+<&9$3&?O$^DX$'s$;M !' "D4y1}')zz%"))D/2IE2QSUS`S`'a'../BC,/]S5H,IM!" 77 #(,1En+4E11#7GSJ %uU|3%56"Z/0  y>
O' %e!.(%6#2 
I i(w[	)|    
) 
 
 )9P1AEE,<OQPP ,< Oq+ O O  0!2Q!6**F"&"?"?! #$5 6#	 #@ # 126E u# 	I]]:r266xKF!%%fa0E--&&y1$)AIM&!mmK0G"  )&'1,	 XklSTj" 5 9 9(I Nll A,<(=$=>N!#'#@#@%"$0$4' $A $  (8 O,00R@DDXyYF!11 '..?$((0,9,=,=fa,H1,LM&)-11+>G*$((1./14 "#a'' 	 33#**95*+q0"" #"[  	)!-.I	V Q O@  ms=   B` `)5`)
`.`.)`3	`&`!`&!`&r  r  domainreturnc           
         | j                   j                  |d      }| j                  j                  dt	        |       d|dd D cg c]  }|d   	 c}        | j                   j                  ||      }| j                  j                  dt	        |       d	       |rF| j                   j                  ||      }| j                  j                  d
t	        |       d|        | j                  ||dz        }g }|D ]  }	|	j                  dd      }
|	j                  di       }| j                   j                  |
d      }g }|D ]  }|d   j                         }||
j                         v s)| j                  ||
      }| j                  ||
|      }|d   dz  |dz  z   |dz  z   }|j                  |d   ||d   |
j                         j                  |      d        |rt        d |D              t	        |      z  nd}|	j                  dd      }|dz  |dz  z   }|	j                         }|j!                  ||||t	        |      d|dd |r| j#                  |
|      ndd       |j                  |        |j%                  d d       |r%|D cg c]  }|j                  dd      dkD  s| }}|d| S c c}w c c}w ) aR  
        Enhanced search using advanced keyword management and domain filtering.

        Args:
            query: Search query
            k: Number of results to return
            domain: Optional domain filter (sports, crypto, stocks, forex)

        Returns:
            List of search results with enhanced keyword analysis
           )max_keywordsz
Extracted z keywords from query: N   keywordzExpanded query to z termszFiltered to z domain-specific terms for rW  r+  r3  r]  
   r  r  r  r  )r  r  r  	frequencyc              3   &   K   | ]	  }|d      yw)r  Nr  )r  kss     r  r  z.VectorStore.enhanced_search.<locals>.<genexpr>  s     #IBBwK#Ir  r   ffffff?)matched_keywordsdoc_keywordsr  total_matchesr  )r  keyword_analysisquery_expansiondomain_relevancec                     | d   S r  r  xs    r  r  z-VectorStore.enhanced_search.<locals>.<lambda>  s
    AgJ r  Tr  r  )r   extract_keywordsloggerinfor  expand_queryfilter_by_domainr  rb  rF  !_calculate_keyword_position_score _calculate_keyword_context_scorerE  rt  r  copyr  _calculate_domain_relevancer  )r  r  r  r  extracted_keywordskwr  base_resultsenhanced_resultsresultr+  r]  r  keyword_scoreskw_lowerposition_scorecontext_scoretotal_kw_scoredoc_keyword_scoreoriginal_scoreenhanced_scoreenhanced_resultr  s                          r  enhanced_searchzVectorStore.enhanced_search  sH    "11BB5WYBZ:c*<&=%>>T  oA  BD  CD  oE  VFhjVXYbVc  VF  UG  H  	I --::5&I-c..A-B&IJ !11BB>SYZNKK|C,?+@@[\b[cde {{5!a%0 " .	5FjjB/Gzz*b1H  //@@WY@ZL  N( i=..0w}}.%)%K%KHV]%^N$($I$I(T[]k$lM&(kC&7.3:N&NQ^adQd&dN"))#%i=!/"$X,%,]]_%:%:8%D	+ " dr#I.#I ICP^L_ _wx $ZZ3N+c14E4KKN %kkmO""'(6$0%6%(%8	% $2#2#6Y_D$D$DWf$Ueh
$ 
 ##O4].	5b 	"6E +;baquuEWYZ?[^a?abb##O VFJ  cs   KK
:K
keywordsc                    | j                   j                  |dj                  |            }| j                  j	                  d|dd D cg c]  \  }}|	 c}}        t        |      }|dd D ]L  \  }}	| j                   j                  |d      }
|j                  |
D cg c]  \  }}|dkD  s| c}}       N g }|D ]  }|t        j                  dt        j                  |      z   dz   t        j                        |r|| j                   j                  |      v nd	||dd D 	cg c]  \  }}	|	 c}	}v d
}|j                  |        | j                  ||dz  i       }g }|D ]/  }|j                  dd      }g }|D ]  }|j!                         |j!                         v s$t#        t        j$                  dt        j                  |      z   dz   |t        j                              }|dkD  sq| j                   j                  |g|      d   d   }|j                  |||d        |rt'        d |D              t#        |      z  nd}|j)                         }|j                  ||t#        |      t+        |      d       |j                  |       2 |j-                  d d       |d| S c c}}w c c}}w c c}	}w )a/  
        Search focused specifically on keyword matching with advanced ranking.

        Args:
            keywords: List of keywords to search for
            k: Number of results to return
            domain: Optional domain filter

        Returns:
            Keyword-focused search results
         zRanked keywords: Nr  rz  )top_kr  r{  Fr|  rW  )r  r+  r3  r   r4  )r  matches	relevancec              3   2   K   | ]  }|d    |d   z    yw)r  r  Nr  )r  kms     r  r  z5VectorStore.keyword_focused_search.<locals>.<genexpr>  s     ZB;"Y- ?Zs   )r  keyword_matchestotal_keyword_matchesexpanded_keywordsc                 &    | j                  dd      S )Nr  r   r  r  s    r  r  z4VectorStore.keyword_focused_search.<locals>.<lambda>  s    AEE/1,E r  Tr  )r   rank_keywords_by_relevancejoinr  r   r  find_similar_keywordsr  r  r  r  r  get_domain_keywordsrE  r  rb  rF  r  r  r  r  r7  r  )r  r  r  r  ranked_keywordsr  r  r   r  _similarr  r~  resultsr
  r  r+  r  r  relevance_scorer  r  s                         r  keyword_focused_searchz"VectorStore.keyword_focused_search  s    ..II(TWT\T\]eTfg,/RTSTBU-VYRb-V,WXY  M)"1- 	SJGQ**@@PQ@RG$$'%QYRUS[b%QR	S
 ( 	-G::ebii.@&@5&H"--X[aVt';';'O'OPV'WWgl%/"1:M)NQ")NN	G ##G,	- //0@!a%SU/V  	5FjjB/G !O, 	==?gmmo5!"**URYYw5G-G%-OQXZ\ZgZg"hiG{*.*>*>*Y*Y[bZcel*mno*pqr*s'..'.'.)80 	 vECZ/ZZ]`ap]qq  KLM$kkmO""!.#2),_)=%)*;%<	$  ##O47	5< 	"EtT##q .W &R *Os   
J9#J?1J?,Kr  r+  c                     |j                         }|j                  |      }|dk  ryd|t        |      z  z
  }|t        |      dz  k  r|dz  }t        |d      S )z6Calculate score based on keyword position in document.r   r  r  r   )rF  findr  r  )r  r  r+  content_lowerkeyword_posr  s         r  r  z-VectorStore._calculate_keyword_position_score  sf    #((1? c'l :; W++c!N>3''r  context_termsc                 2   |j                         }|j                        }|dk  ryd}t        d||z
        }t        t	        |      |t	              z   |z         }||| t        fd|D              }	|rt        |	t	        |      z  d      }
|
S d}
|
S )z3Calculate score based on surrounding context terms.r   r2  c              3   8   K   | ]  }|v s|k7  sd   ywr4  Nr  )r  r}  contextr  s     r  r  z?VectorStore._calculate_keyword_context_score.<locals>.<genexpr>>  s      aDTW_QUY`Q`aas   	r  )rF  r-  r  r  r  r  )r  r  r+  r0  r.  r/  window_sizestartendcontext_matchesr  r4  s    `         @r  r  z,VectorStore._calculate_keyword_context_score/  s    #((1? A{[01#m$kCL&@;&NOc* aMaa KXOc-.@@#F ^_r  c                     |sy| j                   j                  |      }|sy|j                         t        fd|D              }|t	        |      z  }|dk\  r|dz  }t        |d      S )z:Calculate how relevant a document is to a specific domain.r  r  c              3   H   K   | ]  }|j                         v sd   ywr3  )rF  )r  r  r.  s     r  r  z:VectorStore._calculate_domain_relevance.<locals>.<genexpr>O  s     QBRXXZ=5PaQs   ""rz  g333333?)r   r%  rF  r  r  r  )r  r+  r  domain_keywordsr  r  r.  s         @r  r  z'VectorStore._calculate_domain_relevanceE  sq    ..BB6JQ/QQ c/22	 a<I9c""r  c                    |j                  di       }g }|j                  d      r)|j                  t        |j                  d                   |j                  d      r8|j                  |j                  dg       D cg c]  }t        |       c}       |j                  d      r8|j                  |j                  dg       D cg c]  }t        |       c}       |j                  d      r)|j                  t        |j                  d                   |j                  t        |j                  dd                   dj	                  |      }|j                         |j                  d	g       }t        j                         D ]A  \  }}	||v rt        fd
|	D              s |j                  |       |j                  |       C |rt        t        |            }
dj	                  |
      |d<   |j                  d      sH|j                  dd      }d|d    d}|j                  |      s| | j                         |d<   d|d<   dj	                  |      S c c}w c c}w )z>Compose searchable text using enriched metadata + raw content.r]  rc   
key_pointsthemesclean_excerptr+  r3  r  topicsc              3   &   K   | ]  }|v  
 y wNr  )r  variant
lower_texts     r  r  z5VectorStore._build_searchable_text.<locals>.<genexpr>p  s     AW7j(As   z, topic_label_topic_taggedz	[Topics: z] T)rb  rE  r  rC  r#  rF  rs  CANONICAL_TOPIC_KEYWORDSitemsr  sortedr  
startswithr  )r  rh  r]  searchable_text_partspointtheme	base_textr@  	canonicalvariantsdistinct_topicsrc   topic_prefixrD  s                @r  rD  z"VectorStore._build_searchable_textZ  s   77:r* "<<	"!((X\\)-D)EF<<%!(((,,|]_B`)a#e*)ab<<!!(((,,xY[B\)]#e*)]^<<(!((X\\/-J)KL$$SB)?%@AHH23	__&
$$Xr2#;#A#A#C 	8IxF"AAAi(%,,Y7	8 $S[1O&*ii&@H]#<<0",,y"5!*8M+B*C2F)),7-9N7)*D*J*J*LHY',0)xx-..9 *b)]s    I%	I*c                    | j                   D cg c]  }| j                  |       c}| _        | j                  D cg c]  }|j                          c}| _        | j                  D cg c]  }t        |j                                c}| _        | j                  r1t        | j                        t        | j                        z  | _	        yd| _	        yc c}w c c}w c c}w )z/Build cached searchable text for all documents.r   N)
r   rD  r   rF  r   r  splitr   r  r   )r  rh  texts      r  r  z%VectorStore._prepare_searchable_texts  s     ~~%
 '',%
!
 &*%>%>'
!DJJL'
# +/*C*C
"&C


 "%d&6&6"7#d>N>N:O"OD"%D%
'

s   CC* C!c                     t        | j                        t        | j                        k7  s+t        | j                        t        | j                        k7  r| j	                          yy)z8Ensure searchable texts cache is in sync with documents.N)r  r   r   r   r  )r  s    r  r  z$VectorStore._ensure_searchable_texts  sN     ))*c$...AA++,DNN0CC**, Dr  c                 (   i }t        | j                        }|dk(  r|S |D ]n  }|d   }||v r|d   }d}| j                  D ]  }|j                  |      s|dz  } t        j                  ||z
  dz   |dz   z  dz         }	t        |	d      ||<   p |S )z6Compute IDF values for query terms using BM25 formula.r   r}  r~  r4  r  r  r   )r  r   r  mathlogr  )
r  r  r  
total_docsr   r}  r~  dfrU  idfs
             r  r  zVectorStore._compute_idf_table  s    	445
?$ 
	,D<Dy 9oGB33 >>$'!GB ((JOc1b3h?#EFC!#smIdO
	, r  c                    |r|sy|t        | j                        k\  ry| j                  |   }|t        | j                        k  r| j                  |   nt        |j                               }| j                  dkD  r| j                  n|xs d}d}|D ]u  }|d   }	|d   }
t        |
j                  |            }|dk(  r-|j                  |	d      }|t        dz   z  }|t        dt        z
  t        ||z  z  z   z  z   }||||z  z  z  }w |S )z9Compute BM25 score for a document given keyword patterns.r   r   r4  r}  r~  r  )	r  r   r   rT  r   r  rb  BM25_K1BM25_B)r  r  r  r  doc_textdoc_lenavg_lenbm25r   r}  r~  tfr\  	numeratordenominators                  r  r  zVectorStore._compute_bm25_score  s!   yD7788..y91:SAQAQ=R1R$""9-X[\d\j\j\lXm)-)<)<q)@$%%glQR$ 		4D<D9oGW__X./BQw--c*Cgm,Iw#,7WCT9U*UVVKC9{233D		4 r  c                    g }|xs
 t               }|rt        |      n	t               }t        | j                        D ]  \  }||v r| j                  |   }	|	j	                  di       j	                  d      r=t        fd|D              r|	j	                  di       j	                  dd      }
|
|v rx| j                  |      }|sd}| j                  |||xs i       }t        |z  t        |z  z   }|	d   |	d   |d||d	||d
	}|j                  |       |j                  |       |j                  |
       t        |      |k\  s |S  |S )zFScan all documents for keyword matches when vector search misses them.r]  rr  c              3   F   K   | ]  }|d    j                          yw)r~  N)r  )r  r   r  s     r  r  z7VectorStore._keyword_fallback_search.<locals>.<genexpr>  s!     \44	?))*=>\s   !r^  r  r  r+  r   Tr  )r  r9  r   r   rb  r  r  r  r   r   rE  rB  r  )r  r  r  r  r  r  r  local_exclude_sourcesr  rh  r^  snippetr  rc  combinedr  r  s                   @r  r  z$VectorStore._keyword_fallback_search  s    )2SU8GO 4SU(1$2K2K(L %	$C$o%..%Cwwz2&**95\K[\\R044XyI22778KM]^ #//5EyTVW)M9&-. 
  #9~ #J%$'%2"&)-'.!$
	 $**95##C(%))&1&'61""O%	N #"r  windowc                 6   |r|sy|D ]  }|d   j                  |      }|st        d|j                         |dz  z
        }t        t	        |      |j                         |dz  z         }||| }t        j                  dd|      j                         }|c S  y)z9Extract a concise snippet around the first keyword match.Nr~  r   rW  z\s+r  )	r  r  r6  r  r  r7  r  subr  )	r  raw_textr  rl  r   matchr6  r7  rj  s	            r  r  z$VectorStore._extract_keyword_snippet  s    /$ 	DO**84EAu{{}v{:;#h-v{)BC"5-&&g6<<>	 r  rB  )r  )r  N)NNN)r  )r  
__module____qualname__r  r&  rO  rT  r!  ro  ra  ru  r  r  intr	   r   r   r   r  r+  r  r  r  r  rD  r  r  r  r  r  r  r  r  r  r   r   h   sE   &(/b1EfWqQFf/ab"r#hU$S U$S U$hsm U$W[\`adfiai\jWk U$nF$tCy F$S F$hWZm F$gklpqtvyqylzg{ F$P( (s (u (" c Z^_bZc hm ,#3 # # #*$/L&"-&,-#^3 r  r   )/sentence_transformersr   r  numpyr>  r  r  rX  r   pathlibr   	rapidfuzzr   typingr   r   r   r	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   airagagent.keyword_managerr   airagagent.exceptionsr   r   r   r   r   r   r  r  r^  r_  rG  r   r  r  r  <module>r{     s    5    	     3 3    ;   	

M+ ZV Vr  