
    9iL                        d dl Z d dlmZmZ d dlmZmZ d dlmZ d dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlmZmZmZ d dlmZ d dlZd dlmZ d dlmZmZmZmZ d dlZd d	lmZ  ej>                  e       Z!d
edefdZ"d
edefdZ#d
edefdZ$d
edefdZ%d
edefdZ&defdZ'	 	 d6dee(   de)de*fdZ+dddde#fdede*de*de*de*deeegef   fd Z,d! Z-ddd"e#fd#ed$ede*de*de*deeegef   fd%Z.d& Z/d'efd(Z0d) Z1d* Z2d7d+Z3	 	 	 	 	 	 	 d8d,e(d-ee(   d.ee(edf   d/ee(   d0ee(   d1eee(df   d2eee(      d3ee)e(df   d4e(fd5Z4y)9    N)Tensordevice)ListCallable)tqdm)DictOptionalUnion)Path)HUGGINGFACE_HUB_CACHE)HfApi
hf_hub_urlcached_downloadHfFolder)versionabc                     t        | |      S )
    Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.
    :return: Matrix with res[i][j]  = cos_sim(a[i], b[j])
    )cos_simr   r   s     d/var/www/html/backtest/airagagent/rag_env/lib/python3.12/site-packages/sentence_transformers/util.pypytorch_cos_simr      s    
 1a=    c                 ^   t        | t        j                        st        j                  |       } t        |t        j                        st        j                  |      }t	        | j
                        dk(  r| j                  d      } t	        |j
                        dk(  r|j                  d      }t        j                  j                  j                  | dd      }t        j                  j                  j                  |dd      }t        j                  ||j                  dd            S )r      r      pdim)
isinstancetorchr   tensorlenshape	unsqueezenn
functional	normalizemm	transpose)r   r   a_normb_norms       r   r   r      s    
 a&LLOa&LLO
177|qKKN
177|qKKNXX  **1q*9FXX  **1q*9F88FF,,Q233r   c                    t        | t        j                        st        j                  |       } t        |t        j                        st        j                  |      }t	        | j
                        dk(  r| j                  d      } t	        |j
                        dk(  r|j                  d      }t        j                  | |j                  dd            S )z
    Computes the dot-product dot_prod(a[i], b[j]) for all i and j.
    :return: Matrix with res[i][j]  = dot_prod(a[i], b[j])
    r   r   )	r!   r"   r   r#   r$   r%   r&   r*   r+   r   s     r   	dot_scorer/   4   s    
 a&LLOa&LLO
177|qKKN
177|qKKN88Aq{{1a())r   c                     t        | t        j                        st        j                  |       } t        |t        j                        st        j                  |      }| |z  j	                  d      S )zt
   Computes the pairwise dot-product dot_prod(a[i], b[i])
   :return: Vector with res[i] = dot_prod(a[i], b[i])
   )r    )r!   r"   r   r#   sumr   s     r   pairwise_dot_scorer3   H   sO    
 a&LLOa&LLOE;;2;r   c                     t        | t        j                        st        j                  |       } t        |t        j                        st        j                  |      }t	        t        |       t        |            S )zm
   Computes the pairwise cossim cos_sim(a[i], b[i])
   :return: Vector with res[i] = cos_sim(a[i], b[i])
   )r!   r"   r   r#   r3   normalize_embeddingsr   s     r   pairwise_cos_simr6   V   sS    
 a&LLOa&LLO2157KA7NOOr   
embeddingsc                 Z    t         j                  j                  j                  | dd      S )z[
    Normalizes the embeddings matrix, so that each sentence embedding has unit length
    r   r   r   )r"   r'   r(   r)   )r7   s    r   r5   r5   d   s&     88((qa(@@r   	sentencesshow_progress_bar
batch_sizec                 J    | j                  |||d      }t        |g|i |S )ab  
    Given a list of sentences / texts, this function performs paraphrase mining. It compares all sentences against all
    other sentences and returns a list with the pairs that have the highest cosine similarity score.

    :param model: SentenceTransformer model for embedding computation
    :param sentences: A list of strings (texts or sentences)
    :param show_progress_bar: Plotting of a progress bar
    :param batch_size: Number of texts that are encoded simultaneously by the model
    :param query_chunk_size: Search for most similar pairs for #query_chunk_size at the same time. Decrease, to lower memory footprint (increases run-time).
    :param corpus_chunk_size: Compare a sentence simultaneously against #corpus_chunk_size other sentences. Decrease, to lower memory footprint (increases run-time).
    :param max_pairs: Maximal number of text pairs returned.
    :param top_k: For each sentence, we retrieve up to top_k other sentences
    :param score_function: Function for computing scores. By default, cosine similarity.
    :return: Returns a list of triplets with the format [score, id1, id2]
    T)r:   r;   convert_to_tensor)encodeparaphrase_mining_embeddings)modelr9   r:   r;   argskwargsr7   s          r   paraphrase_miningrC   k   s2    . i;LYcw{|J'
DTDVDDr   i  i i  d   query_chunk_sizecorpus_chunk_size	max_pairstop_kscore_functionc                    |dz  }t        j                         }d}d}t        dt        |       |      D ])  }	t        dt        |       |      D ]  }
 || |
|
|z    | |	|	|z          }t	        j
                  |t        |t        |d               ddd      \  }}|j                         j                         }|j                         j                         }t        t        |            D ]n  }t        ||         D ][  \  }}|
|z   }|	|z   }||k7  s||   |   |kD  s"|j                  ||   |   ||f       |dz  }||k\  sG|j                         }|d   }] p  , t               }g }|j                         sg|j                         \  }}}t        ||g      \  }}||k7  r-||f|vr'|j                  ||f       |j!                  |||g       |j                         sgt        |d d      }|S )	a  
    Given a list of sentences / texts, this function performs paraphrase mining. It compares all sentences against all
    other sentences and returns a list with the pairs that have the highest cosine similarity score.

    :param embeddings: A tensor with the embeddings
    :param query_chunk_size: Search for most similar pairs for #query_chunk_size at the same time. Decrease, to lower memory footprint (increases run-time).
    :param corpus_chunk_size: Compare a sentence simultaneously against #corpus_chunk_size other sentences. Decrease, to lower memory footprint (increases run-time).
    :param max_pairs: Maximal number of text pairs returned.
    :param top_k: For each sentence, we retrieve up to top_k other sentences
    :param score_function: Function for computing scores. By default, cosine similarity.
    :return: Returns a list of triplets with the format [score, id1, id2]
    r   r1   r   TFr    largestsortedc                     | d   S )Nr    xs    r   <lambda>z.paraphrase_mining_embeddings.<locals>.<lambda>   s
    !A$ r   keyreverse)queuePriorityQueueranger$   r"   topkmincputolist	enumerateputgetsetemptyrM   addappend)r7   rE   rF   rG   rH   rI   pairs	min_score	num_addedcorpus_start_idxquery_start_idxscoresscores_top_k_valuesscores_top_k_idx	query_itr	top_k_idx
corpus_itrijentryadded_pairs
pairs_listscoresorted_isorted_js                            r   r?   r?      sE   & 
QJE !EII!!S_6GH 1$QJ9IJ 	1O#JO_?_$`blm}  O  Pa  a  cb  cF49JJvs5RUV\]^V_R`Gaghrv  @E  5F1!1"5"9"9";"B"B"D/335<<>"3v;/ 1	-67G	7R-S 
1)Iz')3A(:5AAv"5i"@"Ki"W		#6y#A)#LaQR"ST!Q	$	1$)IIKE(-aI
11	11, %KJkkmiikq!#QF^(xXx$8$KOOXx01uam, kkm 
EJr   c                      t        | i |S )z8This function is deprecated. Use semantic_search instead)semantic_search)rA   rB   s     r   information_retrievalry      s    D+F++r   
   query_embeddingscorpus_embeddingsc                 .   t        | t        j                  t        j                  f      rt	        j
                  |       } n%t        | t              rt	        j                  |       } t        | j                        dk(  r| j                  d      } t        |t        j                  t        j                  f      rt	        j
                  |      }n%t        |t              rt	        j                  |      }|j                  | j                  k7  r| j                  |j                        } t        t        |             D cg c]  }g  }}t        dt        |       |      D ]  }t        dt        |      |      D ]  }	 || |||z    ||	|	|z          }
t	        j                  |
t        |t        |
d               ddd      \  }}|j!                         j#                         }|j!                         j#                         }t        t        |
            D ]=  }t%        ||   ||         D ]&  \  }}|	|z   }||z   }||   j'                  ||d       ( ?   t        t        |            D ]"  }t)        ||   d d      ||<   ||   d| ||<   $ |S c c}w )	a  
    This function performs a cosine similarity search between a list of query embeddings  and a list of corpus embeddings.
    It can be used for Information Retrieval / Semantic Search for corpora up to about 1 Million entries.

    :param query_embeddings: A 2 dimensional tensor with the query embeddings.
    :param corpus_embeddings: A 2 dimensional tensor with the corpus embeddings.
    :param query_chunk_size: Process 100 queries simultaneously. Increasing that value increases the speed, but requires more memory.
    :param corpus_chunk_size: Scans the corpus 100k entries at a time. Increasing that value increases the speed, but requires more memory.
    :param top_k: Retrieve top k matching entries.
    :param score_function: Function for computing scores. By default, cosine similarity.
    :return: Returns a list with one entry for each query. Each entry is a list of dictionaries with the keys 'corpus_id' and 'score', sorted by decreasing cosine similarity scores.
    r   r   TFrK   )	corpus_idrt   c                     | d   S )Nrt   rO   rP   s    r   rR   z!semantic_search.<locals>.<lambda>  s    RST[R\ r   rS   )r!   npndarraygenericr"   
from_numpyliststackr$   r%   r&   r   torX   rY   rZ   r[   r\   ziprc   rM   )r{   r|   rE   rF   rH   rI   _queries_result_listrh   rg   
cos_scorescos_scores_top_k_valuescos_scores_top_k_idxrl   sub_corpus_idrt   r~   query_ididxs                      r   rx   rx      s   & "RZZ$<= ++,<=	$d	+ ;;'78
!!"a'+55a8#bjj"**%=>!,,->?	%t	,!KK(9: #3#:#::+../@/G/GH',S1A-B'CD!2DD C(8$9;KL c %a->)?AR S 	c'(8YiIi(jl}  O  P`  ar  Pr  ms  tJ =BJJzSVW\^ablmnbo^pSqwx  CG  PU  =V9#%9&=&A&A&C&J&J&L##7#;#;#=#D#D#F "3z?3 c	,/0DY0OQhirQs,t c(M5 0= @I.:H'188y[`9abcc	cc$ S,-. E#)*=c*BH\fj#kC #6s#;Ae#DC E 1 Es   >	Jc                    t         j                  j                  |      dk7  r4t        j                  t         j                  j                  |      d       t	        j
                  | d      }|j                  dk7  rFt        dj                  | |j                        t        j                         |j                          y|d	z   }t        |d
      5 }|j                  j                  d      }|t        |      nd}t        d|d      }|j!                  d      D ]0  }|s|j#                  t%        |             |j'                  |       2 	 ddd       t        j(                  ||       j+                          y# 1 sw Y   0xY w)z1
    Downloads a URL to a given path on disc
     Texist_ok)stream   z1Exception when trying to download {}. Response {})fileN_partwbzContent-LengthB)unittotal
unit_scale   )
chunk_size)ospathdirnamemakedirsrequestsr_   status_codeprintformatsysstderrraise_for_statusopenheadersintr   iter_contentupdater$   writerenameclose)	urlr   reqdownload_filepathfile_binarycontent_lengthr   progresschunks	            r   http_getr     s9    
wwt"
BGGOOD)D9
,,s4
(C
#AHHcoo^eheoeopW		& )+)9:'5'AN#tS$?%%%6 	)EE
+!!%(	)	) II&NN) )s   AF%-FFtarget_devicec                 j    | D ]-  }t        | |   t              s| |   j                  |      | |<   / | S )z4
    send a pytorch batch to a device (CPU/GPU)
    )r!   r   r   )batchr   rT   s      r   batch_to_devicer   &  s>      6eCj&)s}5E#J6 Lr   c                     | j                   j                  }||t        j                   j                  k(  r| j                   j                  S |dz   | j                   j                  z   S )z
  Gives a full name (package_name.class_name) for a class / object in Python. Will
  be used to load the correct classes from JSON files
  .)	__class__
__module__str__name__)omodules     r   fullnamer   1  sR     ;;!!&^v!9!99;;C<!++....r   c                 (   	 | j                  dd      \  }}	 t        j                  |       }	 t        ||      S # t        $ r d| z  }t        |      w xY w#  t        j                  |      }Y CxY w# t        $ r d|d|d}t        |      w xY w)z
    Import a dotted module path and return the attribute/class designated by the
    last name in the path. Raise ImportError if the import failed.
    r   r   z"%s doesn't look like a module pathzModule "z" does not define a "z" attribute/class)rsplit
ValueErrorImportError	importlibimport_modulegetattrAttributeError)dotted_pathmodule_path
class_namemsgr   s        r   import_from_stringr   =  s    
"-"4"4S!"<Z
6((5vz**  2[@#6((5  FQS]^#s   : A A3 AA03Bc                    t        j                  || j                        }g }t        |t	        |             }t        t        d|z  d      t	        |             }t        dt	        |       |      D ]  }t        | |||z    |       }|j                  |d      \  }}	t        t	        |            D ]  }
||
   d   |k\  sg }||
   j                  |d      \  }}|d   |kD  r:t        d|z  t	        |             }||
   j                  |d      \  }}|d   |kD  r:t        |j                         |      D ]  \  }}||k  r n|j                  |        |j                  |        ~ t        |d d	      }g }t               }t        |      D ]`  \  }}t        |      }g }|D ]  }||vs|j                  |        t	        |      |k\  s?|j                  |       |j                  |       b t        |d
 d	      }|S )aV  
    Function for Fast Community Detection
    Finds in the embeddings all communities, i.e. embeddings that are close (closer than threshold).
    Returns only communities that are larger than min_community_size. The communities are returned
    in decreasing order. The first element in each list is the central point in the community.
    )r   r   2   r   T)krL   r1   c                     t        |       S Nr$   rP   s    r   rR   z%community_detection.<locals>.<lambda>  s
    A r   rS   c                     t        |       S r   r   rP   s    r   rR   z%community_detection.<locals>.<lambda>  s
    #a& r   )r"   r#   r   rZ   r$   maxrX   r   rY   r   r\   rc   rM   r`   r]   r   )r7   	thresholdmin_community_sizer;   extracted_communitiessort_max_size	start_idxr   top_k_valuesr   ro   new_clustertop_val_largetop_idx_larger   valunique_communitiesextracted_ids
cluster_id	communitynon_overlapped_communitys                        r   community_detectionr   T  sO    Yz/@/@AI /ZAA 22B7ZIM1c*oz: 	Z	)j2HI:V
 %//,>/Ma s<() 	:AAr"i/  0:!}/A/AM[_/A/`,} $B')3$'M(93z?$KM3=a=3E3E_c3E3d0M= $B')3 !$M$8$8$:M J ,HCY&&s+	, &,,[9%	:( 9> ##8>NX\] EM!*+@!A 	;
I9%	#%  	5C-'(//4	5 '(,>>%%&>?  !9:	;   28HRVWr   repo_idrevision	cache_dirlibrary_namelibrary_version
user_agentignore_filesuse_auth_tokenreturnc           	         |t         }t        |t              rt        |      }t	               }d}	t        |t              r|}	n|rt        j                         }	|j                  | ||	      }
t        j                  j                  || j                  dd            }|
j                  }t        |      D ]*  \  }}|j                  dk(  s||= |j                  |        n |D ]o  }|0d}|D ]&  }t!        j                   |j                  |      s$d} n |r6t#        | |j                  |
j$                        }t        j                  j                  |j                  j'                  d       }t        j                  j)                  t        j                  j                  ||            }t        j*                  |d	       |||||||d
}t-        j.                  t0        j2                        t-        j.                  d      k\  rd|d<   t5        di |}t        j                  j7                  |dz         sXt        j8                  |dz          r |S )z
    Method derived from huggingface_hub.
    Adds a new parameters 'ignore_files', which allows to ignore certain files / file-patterns
    N)r   r   token/r   zmodules.jsonFT)filenamer   r   )r   r   force_filenamer   r   r   r   z0.8.1legacy_cache_layoutz.lockrO   )r   r!   r   r   r   r   	get_token
model_infor   r   joinreplacesiblingsr]   	rfilenamerc   fnmatchr   shasplitr   r   r   parsehuggingface_hub__version__r   existsremove)r   r   r   r   r   r   r   r   _apir   r   storage_folder	all_filesr   repofile
model_fileskip_downloadpatternr   relative_filepathnested_dirnamecached_download_argsr   s                          r   snapshot_downloadr    s    )	)T"	N	7DE.#&	""$85QJWW\\7??3,N ##I"9- X/#X&	   &&
#!M' ??:#7#7A$(M
 j22Z^^
 GGLL**>*>*D*DS*IJ GGLL):;
 	NT2'*'/(.$, . ==445w9OO ;? !676!5677>>$.)IIdWn%M&&P r   )F    )g      ?rz   r   )NNNNNNN)5r   r"   r   r   typingr   r   tqdm.autonotebookr   r   r   r   numpyr   rV   loggingr   r	   r
   pathlibr   r  huggingface_hub.constantsr   r   r   r   r   r  	packagingr   	getLoggerr   loggerr   r   r/   r3   r6   r5   r   boolr   rC   r?   ry   rx   r   r   r   r   r   r  rO   r   r   <module>r     s~      ! " 
  	     ( (   ; H H  			8	$v & 4v 4& 4,* *F *(& V P P6 PAV A 16')E!%cE)-E "%E: /3/5'-#&KR<V <(+<),< "%< !	<
 '//?/G&H<~, -0-3!#IP=f ='-=&)= (+= 	=
 %-ff-=v-E$F=@6& 
/.BZ #(,"&%))-(,-1QQsmQ S$_%Q 3-	Q
 c]Q dCo&Q 49%Q $T/*Q 	Qr   