
    i1                         S r SSKrSSKrSSKrSSKJr   SSKrS rSS jrS r " S S	5      r " S
 S\5      rSS jr SS jr " S S\5      rS rS rSS jr  SS jrg! \	 a    \
" S5         NYf = f)zO
This contrib module contains a few routines useful to do clustering variants.
    N)
ThreadPoolz2scipy not accessible, Python k-means will not workc                      g N )argkwargss     h/home/dmtnaga/Documents/work/airagagent/rag_env/lib/python3.13/site-packages/faiss/contrib/clustering.py	print_nopr
      s        c                    U R                   S   nUR                  SS5      nU(       a  [        O[        nU" SU R                    SU SU 35        U" S5        [        R
                  " Xa4USS	.UD6n	U	R                  U 5        U	R                  /n
U" 5         U	R                  nU" S
5        [        R                  " 5       nU	R                  U 5      u  p[        R                  " XS9nU" S[        R                  " 5       U-
  S S[        U5       S[        U5       35        UR                  5       nA	U(       d+  [        R                   " US-   5      U-  U-  nUSS USS -
  nOc[        R"                  " U5      nUU-  US   -  nUSS=== USS -  sss& [%        U5      U:X  d   eU" S[        U5       S[        U5       35        Sn/ n[        R                  " 5       n['        U5       H  n[)        UU   5      nU" S[        R                  " 5       U-
  S SU SU SU S3	SSS9  UUU   -   nUUU n[        R*                  " UU   U:H  5      (       d   e[        R
                  " Xb40 UD6n	U U   nU	R                  U5        U
R-                  U	R                  5        UR-                  U	R                  5        A	UnM     U" S[        R                  " 5       U-
  S S35        [        R.                  " U5      U
4$ )a%  
perform 2-level clustering on a training set xt
nc1 and nc2 are the number of clusters at each level, the final number of
clusters is nc2. Additional arguments are passed to the Kmeans object.

Rebalance allocates the number of sub-clusters depending on the number of
first-level assignment.
   verboseFz2-level clustering of z nb 1st level clusters = z total zperform coarse trainingi  )nitermax_points_per_centroidzassigning the training set	minlengthzdone in z.2fz s. Sizes of clusters -Nznb 2nd-level centroids r   [z s] training sub-cluster /z nc2= Tendflushz s)shapegetprintr
   faissKmeanstrainiteration_stats	centroidstimeassignnpbincountminmaxargsortarangecumsumsumrangeintallappendvstack)xtnc1nc2	rebalanceclustering_niterargsdr   logkmr"   
centroids1t0_assign1bcoccall_nc2bc_sumi0c2c1i1subsetxtsubs                             r	   two_level_clusteringrK      s    	Ahhy%(G%	C
 
*CC5PSuUV!"		
& $
 
B
 HHRL))*OE J$%	B2JA	W	,B(499;#C((>s2wiqR	RSA
YYsQw#%,QR&2cr7"23,&*,ws|#7|s"""%c'l^1S\NCD 
B	B	BCj'"+a		b %%>rd!C5cURTU[]eij"R&[2bvvgfo+,,,,\\!)D)6

r112
		",,  (499;#C(+,99R=/))r   c                 &   [         R                  " U 5      n [        U [         R                  5      (       a  [	        U R
                  R                  5       5       H@  nU R
                  R                  U5      nUR                  U5        UR                  U5      nMB     [        U R                  U40 UD6  SU l        g[        U [         R                  5      (       d   eU R                  [         R                  :X  d   e[!        ["        R$                  " U R&                  5      5      n[)        SU5        [+        XU R&                  40 UD6u  pgU R,                  R                  U5        U R,                  R/                  U5        U R                  U5        g)zB
Applies 2-level clustering to an index_ivf embedded in an index.
TNz
REBALANCE=)r   downcast_index
isinstanceIndexPreTransformr.   chainsizeatr!   applytrain_ivf_index_with_2levelindex
is_trainedIndexIVFmetric_type	METRIC_L2r/   r&   sqrtnlistr   rK   	quantizeradd)rU   r3   r8   ivtr4   r#   r>   s           r	   rT   rT   _   s#   
   'E%0011u{{'')*A"BHHRL"B + 	$EKK<t<eU^^,,,,///
bggekk"
#C	,'EELI	OO)$	OO	"	KKOr   c                   @    \ rS rSrSrS rS rS rS rS r	SS	 jr
S
rg)DatasetAssign   Wrapper for a matrix that offers a function to assign the vectors
to centroids. All other implementations offer the same interfacec                 8    [         R                  " USS9U l        g Nfloat32dtype)r&   ascontiguousarrayxselfrj   s     r	   __init__DatasetAssign.__init__   s    %%ay9r   c                 4    U R                   R                  S   $ )Nr   rj   r   rl   s    r	   countDatasetAssign.count       vv||Ar   c                 4    U R                   R                  S   $ Nr   rp   rq   s    r	   dimDatasetAssign.dim   rt   r   c                      U R                   U   $ r   rj   rl   indicess     r	   
get_subsetDatasetAssign.get_subset   s    vvgr   c                 F    [         R                  " U R                  US5      $ rv   )r   knnrj   rl   r#   s     r	   perform_searchDatasetAssign.perform_search   s    yyA..r   Nc                    U R                  U5      u  p4UR                  5       nUR                  5       nUR                  u  pV[        R                  " XV4SS9nUc+  [        R
                  R                  XtU R                  5        OB[        R
                  R                  XtUS S 2[        R                  4   U R                  -  5        XCU4$ re   )	r   ravelr   r&   zerosr]   rR   rj   newaxis)rl   r#   weightsDIncr9   sum_per_centroids           r	   	assign_toDatasetAssign.assign_to   s    ""9-GGIGGI88RG9=?FFII&4662FFII&71bjj=+ADFF+JK%%%r   rz   r   )__name__
__module____qualname____firstlineno____doc__rm   rr   rw   r}   r   r   __static_attributes__r   r   r	   ra   ra      s&    H:/&r   ra   c                   (    \ rS rSrSrSS jrS rSrg)DatasetAssignGPU   zGPU version of the previous c                    [         R                  X5        [        R                  " UR                  S   5      nUS:  a0  [        R
                  " [        R                  " 5       X$5      U l        g [        R                  " U5      U l        g )Nr   r   )	ra   rm   r   IndexFlatL2r   index_cpu_to_gpuStandardGpuResourcesrU   index_cpu_to_all_gpus)rl   rj   gpu_idr   rU   s        r	   rm   DatasetAssignGPU.__init__   sd    t'!!!''!*-Q;//**,DJ
 44U;DJr   c                     U R                   R                  5         U R                   R                  U5        U R                   R                  U R                  S5      $ rv   )rU   resetr]   searchrj   r   s     r	   r   DatasetAssignGPU.perform_search   s=    



y!zz  ++r   )rU   N)F)r   r   r   r   r   rm   r   r   r   r   r	   r   r      s    '	<,r   r   c                    U R                   S   nUR                   S   nUc  US-  R                  S5      nUc4  [        R                  " U R	                  S5      R                  S5      5      nUSU -  UR
                  -  -
  nUR                  SS9nUR                  5       U[        R                  " U5      U-  -      UR                  5       -   nX4$ )zassignment function for xq is sparse, xb is dense
uses a matrix multiplication. The squared norms can be provided if
available.
r      r   )axis)	r   r-   r&   arraypowerTargminr   r+   )	xqxbxq_normsxb_normsnqnbd2r   r   s	            r	   sparse_assign_to_denser      s    
 
!B	!B!G==#88BHHQKOOA./
QVbdd]
"B
		q	A

1ryy}r))*X^^-==A4Kr   c           
        ^ ^^^^^^
^^ T R                   S   nTR                   S   m[        R                  " USS9m
T
R                  [        R                  5        [        R
                  " U[        S9* mTc  TS-  R                  S5      mU
UUUUUUU U4	S jnUS:X  d  US:X  d  UT::  a$  [        [        U[        SUT5      5      5        T
T4$ [        U5      n	U	R                  U[        SUT5      5        T
T4$ )z
decomposes the sparse_assign_to_dense function into blocks to avoid a
possible memory blow up. Can be run in multithreaded mode, because scipy's
sparse-dense matrix multiplication is single-threaded.
r   rf   rg   r   r   c           
      L  >	 TX T-    nT
X T-    nT	X T-    nTc5  [         R                  " UR                  S5      R                  S5      5      nOTX T-    n[	        STT5       H>  n[        UTXUT-    UTXUT-    S9u  pgUS:X  a
  XrS S & XcS S & M+  Xc:  nXx   U-   X('   Xh   X8'   M@     g )Nr   r   r   )r   r   )r&   r   r   r-   r.   r   )r^   xq_blockIblockDblockxq_norms_blockjDiIimaskr   r   bbsr   qbsr   r   r   r   s            r	   handle_query_block9sparse_assign_to_dense_blocks.<locals>.handle_query_block   s    ac'?13w13wXXhnnQ&7&;&;A&>?N%ac'2Nq"c"A+13w'!!#g.	FB Avq	q	{!x!|!x #r   )r   r&   emptyfillinfonesr/   r-   listmapr.   r   )r   r   r   r   r   r   ntr   r   poolr   r   r   s   ``````    @@@r	   sparse_assign_to_dense_blocksr      s     
!B	!B
9%AFF266N	3	A!G==#( (. 
Qw"'R3YS#U1b#%678
 a4K "~#U1b#%67a4Kr   c                   4    \ rS rSrSrS rS rS rS	S jrSr	g)
DatasetAssignSparse   rc   c                     UR                   [        R                  R                  :X  d   eXl        [
        R                  " UR                  S5      R                  S5      5      U l	        g )Nr   r   )
	__class__scipysparse
csr_matrixrj   r&   r   r   r-   squared_normsrk   s     r	   rm   DatasetAssignSparse.__init__   sE    {{ell55555XXaggajnnQ&78r   c                 d    [         R                  " U R                  U   R                  5       5      $ r   )r&   r   rj   todenser{   s     r	   r}   DatasetAssignSparse.get_subset  s"    xxw//122r   c                 >    [        U R                  XR                  S9$ )N)r   )r   rj   r   r   s     r	   r   "DatasetAssignSparse.perform_search  s    ,FFI(:(:< 	<r   Nc                    U R                  U5      u  p4UR                  5       nUR                  5       nU R                  R                  S   nUc  [        R
                  " USS9n[        U5      n[        R                  R                  X$[        R                  " US-   5      4Xe4S9n[        R                  " XpR                  -  R                  5       5      nXCU4$ )Nr   rf   rg   r   )r   )r   r   rj   r   r&   r   lenr   r   
csc_matrixr+   r   r   )	rl   r#   r   r   r   nr   mr   s	            r	   r   DatasetAssignSparse.assign_to	  s    ""9-GGIGGIFFLLO?ggay1G^LL##1q5)*' $  88QZ$8$8$:;%%%r   )r   rj   r   )
r   r   r   r   r   rm   r}   r   r   r   r   r   r	   r   r      s    H9
3<&r   r   c                     [         R                  " USS9n[        R                  " [	        U5      U [        R
                  " U5      5      $ )Nint64rg   )r&   ri   r   imbalance_factorr   swig_ptr)kr%   s     r	   r   r     s6    !!&8F!!#f+q%..2HIIr   c                     U R                   [        R                  :X  a  gSS Kn[	        XR
                  5      (       a  g[        S[        U 5       35      e)NFr   TzUnknown tensor type )r   r&   ndarraytorchrN   TensorNotImplementedErrortype)rj   r   s     r	   check_if_torchr      sA    {{bjj !\\""
 4T!WI>
??r   c                 
   Uc  [         R                  nUR                  u  p4Sn[        U5      n[         R                  " U S:H  5      S   n[        U5      S:X  a  gU(       a  SSKnUR                  US   5      n	O[         R                  " US   5      n	U	SSS2==   S-  ss'   U	SSS2==   S-  ss'   [        U5      S:  a  U R                  S5      S-
  n
SXS:  '   XR                  5       -  n
U
S:  R                  5       n[        XR                  5      nUR                  X<U
S9n[        USU U5       H3  u  pX   nUU	-  X'   UU	-  X'   X   S-  X'   X==   X   -  ss'   US-  nM5     X|S n[        U5      S:  a  M  U$ )z.reassign centroids when some of them collapse Nr   r   g      P?r   float)rQ   p)r&   randomr   r   wherer   r   	ones_likeastyper-   r(   rQ   choicezip)hassignr#   rsr   r9   nsplitis_torchempty_centsr   facprobasnnznreplacecjscicjcs                    r	   reassign_centroidsr   )  s   	zYY??DAFi(H((7a<(+K
;1ooil+ll9Q<(!H	H1II k
Q
(1,z**,z s,,-iiFi3+ix0#6FBAGIMGIM!+*GKK7;&KaKF 7 "),) k
Q
, Mr   c           
         UR                  5       UR                  5       pU(       a  [        O[        n	U	" SXxXU4-  5        [        R
                  R                  U5      n
[        S5        [        R                  " 5       nU
R                  XpSS9nUR                  U5      n[        U5      n/ nU	" S5        Sn/ n[        U5       GH  n[        R                  " 5       nU	" SSS	S
9  UR                  U5      u  nnnU	" SSS	S
9  U[        R                  " 5       U-
  -  nUR                  5       nU(       a  UR                  5       nUR                  U5        [        R                   " UU S9nUR#                  SS5      R%                  S5      nSUUS:H  '   U(       a.  SSKnUR)                  U5      R+                  UR,                  5      nUU-  n[/        UX5      nU[        R                  " 5       U-
  U[1        U U5      US.nU	" SUUS   US   UUS   U4-  5        UR                  U5        Uc  GMn  U	" SU5        U(       a  SSKnUR3                  X5        GM  [        R2                  " XM5        GM     U(       a  X4$ U$ )a  Pure python kmeans implementation. Follows the Faiss C++ version
quite closely, but takes a DatasetAssign instead of a training data
matrix. Also redo is not implemented.

For the torch implementation, the centroids are tensors (possibly on GPU),
but the indices remain numpy on CPU.
zAClustering %d points in %dD to %d clusters, %d iterations seed %dz
preproc...F)rQ   replacez  doner   	assigningr   Tr   zcompute centroidsr   r   r   rf   N)objr$   time_searchr   r   zM  Iteration %d (%.2f s, search %.2f s): objective=%g imbalance=%.3f nsplit=%dr$   r  r   zstoring centroids in)rr   rw   r   r
   r&   r   RandomStater$   r   r}   r   r.   r   r-   itemr1   r'   reshaper   r   
from_numpytodevicer   r   save)r   datar   seed
checkpointr   return_statsr   r9   r:   r   r=   permr#   r   r"   t_search_totr  r^   t0sr%   r   sumserrr   r   r   r   ss                                r	   kmeansr  Z  sL    ::<q%	C 
$()a'=	> ? 
		t	$B	,	B99Q9.D%Ii(HOML
C5\iikKT...34T6		c))eeg((*C

3++f2oob!$++I6C1H""3'**4;;7C3J	#GY; YY[2%' 0F ;
 	 5ai=!1,-9 	
 	q!!&
3

91
.c f ))r   )T   )NN)NN @  r  Nr   )r  i  NTF)r   numpyr&   r   r$   multiprocessing.poolr   scipy.sparser   ImportErrorr   r
   rK   rT   ra   r   r   r   r   r   r   r   r  r   r   r	   <module>r     s       +@	D*NF& &D,} ,($ HL-`&- &DJ
@-b CGRQ
  @	
>?@s   A# #A43A4