
    i/                        S SK r S SKrS SKrS SKrSSKJrJrJrJ	r	  SSK
Jr   " S S5      r " S S\5      r\R                  " 5       rS	S
S\ S34 H)  q\ R"                  R%                  [         5      (       d  M)    O   SqS r " S S\5      rS r " S S\5      r " S S\5      r " S S\5      r " S S\5      r " S S\5      rSS jrg)    N   )
fvecs_read
ivecs_read
bvecs_mmap
fvecs_mmap)knnc                   ^    \ rS rSrSrS rS rSS jrS rSS jr	SS	 jr
SS
 jrS rS rSrg)Dataset   z*Generic abstract class for a test dataset c                 J    SU l         SU l        SU l        SU l        SU l        g)z1the constructor should set the following fields: L2Ndmetricnqnbntselfs    f/home/dmtnaga/Documents/work/airagagent/rag_env/lib/python3.13/site-packages/faiss/contrib/datasets.py__init__Dataset.__init__   s%        c                     [        5       e)z&return the queries as a (nq, d) array NotImplementedErrorr   s    r   get_queriesDataset.get_queries       !##r   Nc                     [        5       e)z&return the queries as a (nt, d) array r   r   maxtrains     r   	get_trainDataset.get_train   r    r   c                     [        5       e)z&return the queries as a (nb, d) array r   r   s    r   get_databaseDataset.get_database"   r    r   c              #      #    U R                  5       nUu  pEU R                  U-  U-  U R                  US-   -  U-  pv[        XgU5       H  nX8[        X-   U5       v   M     g7f)a  returns an iterator on database vectors.
bs is the number of vectors per batch
split = (nsplit, rank) means the dataset is split in nsplit
shards and we want shard number rank
The default implementation just iterates over the full matrix
returned by get_dataset.
r   N)r'   r   rangemin	r   bssplitxbnsplitranki0i1j0s	            r   database_iteratorDataset.database_iterator&   sg       46)477dQh+?6+IB#BRWb)** $s   A!A#c                     [        5       e)z6return the ground truth for k-nearest neighbor search r   r   ks     r   get_groundtruthDataset.get_groundtruth4   r    r   c                     [        5       e)z)return the ground truth for range search r   )r   threshs     r   get_groundtruth_rangeDataset.get_groundtruth_range8   r    r   c           
          SU R                    SU R                   SU R                   SU R                   SU R                   3
$ )Nzdataset in dimension z, with metric z
, size: Q z B z T r   r   s    r   __str__Dataset.__str__<   sD    'x~dkk] K77)3twwis477)= 	>r   c                    U R                  5       R                  U R                  U R                  4:X  d   eU R                  S:  a@  U R                  SS9nUR                  SU R                  4:X  d   SUR                  < 35       eU R                  5       R                  U R                  U R                  4:X  d   eU R                  SS9R                  U R                  S4:X  d   eg)z7runs the previous and checks the sizes of the matrices r   {   )r#   zshape=   )r9   N)	r   shaper   r   r   r$   r'   r   r:   )r   xts     r   check_sizesDataset.check_sizes@   s    !''DGGTVV+<<<<77Q;-B88TVV},GBHH.GG,  "((TWWdff,====##b#)//DGGR=@@@r   )r   r   r   r   r   N   )r   r   )__name__
__module____qualname____firstlineno____doc__r   r   r$   r'   r5   r:   r>   rA   rH   __static_attributes__ r   r   r
   r
      s3    5$$$+$$>Ar   r
   c                   B    \ rS rSrSrS
S jrS rSS jrS rSS jr	S	r
g)SyntheticDatasetJ   zGA dataset that is not completely random but still challenging to
index
c                    [         R                  U 5        XX44u  U l        U l        U l        U l        SnX2-   U-   n[        R                  R                  U5      n	U	R                  X4S9n
[        R                  " XR                  Xq5      5      n
XR                  U5      S-  S-   -  n
[        R                  " U
5      n
U
R                  S5      n
XPl        U
S U U l        XX#-    U l        XU-   S  U l        g )N
   )size   g?float32)r
   r   r   r   r   r   nprandomRandomStatenormaldotrandsinastyper   rG   r/   xq)r   r   r   r   r   r   seedd1nrsxs              r   r   SyntheticDataset.__init__O   s    ,-2M)$'GbLYY""4(IIA7I#FF1ggbn% a#%&FF1IHHYCR&rw-GH+r   c                     U R                   $ rJ   )rd   r   s    r   r   SyntheticDataset.get_queriesa       wwr   Nc                 B    Ub  UOU R                   nU R                  S U $ rJ   )r   rG   r"   s     r   r$   SyntheticDataset.get_traind   s#    '38wwy!!r   c                     U R                   $ rJ   )r/   r   s    r   r'   SyntheticDataset.get_databaseh   rm   r   c                     [        U R                  U R                  UU R                  S:X  a  [        R
                  5      S   $ [        R                  5      S   $ )Nr   r   )r   rd   r/   r   faiss	METRIC_L2METRIC_INNER_PRODUCTr8   s     r   r:    SyntheticDataset.get_groundtruthk   sW    GGTWWa#{{d2EOO
  	8=8R8R
  	r   )r   r   r   r   r   r/   rd   rG   )r   i:  rJ   )d   rM   rN   rO   rP   rQ   r   r   r$   r'   r:   rR   rS   r   r   rU   rU   J   s     $"r   rU   z/datasets01/simsearch/041218/z7/mnt/vol/gfsai-flash3-east/ai-group/datasets/simsearch/z/home/z/simsearch/data/zdata/c                     U q g rJ   )dataset_basedir)paths    r   set_dataset_basedirr|      s    Or   c                   >    \ rS rSrSrS rS rS
S jrS rS
S jr	S	r
g)DatasetSIFT1M   S
The original dataset is available at: http://corpus-texmex.irisa.fr/
(ANN_SIFT1M)
c                     [         R                  U 5        Su  U l        U l        U l        U l        [        S-   U l        g )N)rL   順 @B '  zsift1M/r
   r   r   r   r   r   rz   basedirr   s    r   r   DatasetSIFT1M.__init__   2    ,G)$'&2r   c                 2    [        U R                  S-   5      $ )Nzsift_query.fvecsr   r   r   s    r   r   DatasetSIFT1M.get_queries       $,,);;<<r   Nc                 Z    Ub  UOU R                   n[        U R                  S-   5      S U $ )Nzsift_learn.fvecsr   r   r   r"   s     r   r$   DatasetSIFT1M.get_train   .    '38$,,);;<YhGGr   c                 2    [        U R                  S-   5      $ )Nzsift_base.fvecsr   r   s    r   r'   DatasetSIFT1M.get_database       $,,)::;;r   c                 b    [        U R                  S-   5      nUb  US::  d   eUS S 2S U24   nU$ )Nzsift_groundtruth.ivecsrw   r   r   r   r9   gts      r   r:   DatasetSIFT1M.get_groundtruth   <    '??@=8O8ArrEB	r   r   r   r   r   r   rJ   rx   rS   r   r   r~   r~      !    
3
=H<r   r~   c                 ,    [         R                  " U SS9$ )Nr[   dtype)r\   ascontiguousarray)ri   s    r   sanitizer      s    33r   c                   L    \ rS rSrSrSS jrS rSS jrSS jrS r	SS	 jr
S
rg)DatasetBigANN   zS
The original dataset is available at: http://corpus-texmex.irisa.fr/
(ANN_SIFT1B)
c                     [         R                  U 5        US;   d   eXl        US-  nSSUS4u  U l        U l        U l        U l        [        S-   U l        g )N)
r         rX      2   rw      i    r   rL    r   zbigann/)	r
   r   nb_Mr   r   r   r   rz   r   )r   r   r   s      r   r   DatasetBigANN.__init__   sX    AAAA	E\,/E,A)$'&2r   c                 J    [        [        U R                  S-   5      S S  5      $ )Nzbigann_query.bvecs)r   r   r   r   s    r   r   DatasetBigANN.get_queries   s!    
4<<2F#FGJKKr   Nc                 l    Ub  UOU R                   n[        [        U R                  S-   5      S U 5      $ )Nzbigann_learn.bvecs)r   r   r   r   r"   s     r   r$   DatasetBigANN.get_train   s3    '38
4<<2F#FG	RSSr   c                 |    [        U R                  SU R                  -  -   5      nUb  US::  d   eUS S 2S U24   nU$ )Nzgnd/idx_%dM.ivecsrw   )r   r   r   r   s      r   r:   DatasetBigANN.get_groundtruth   sE    ':TYY'FFG=8O8ArrEB	r   c                     U R                   S:  d   S5       e[        [        U R                  S-   5      S U R                   5      $ )Nrw   dataset too large, use iteratorbigann_base.bvecs)r   r   r   r   r   r   s    r   r'   DatasetBigANN.get_database   s=    yy3A AA
4<<2E#EFxPQQr   c           	   #      #    [        U R                  S-   5      nUu  pEU R                  U-  U-  U R                  US-   -  U-  pv[        XgU5       H  n[	        X8[        X-   U5       5      v   M      g 7f)Nr   r   )r   r   r   r*   r   r+   r,   s	            r   r5   DatasetBigANN.database_iterator   sp     '::;46)477dQh+?6+IB#B2#bgr"2344 $   A2A4)r   r   r   r   r   r   )r   rJ   rK   rM   rN   rO   rP   rQ   r   r   r$   r:   r'   r5   rR   rS   r   r   r   r      s(    
3LTR5r   r   c                   L    \ rS rSrSrSS jrS rSS jrSS jrS r	SS	 jr
S
rg)DatasetDeep1B   zf
See
https://github.com/facebookresearch/faiss/tree/main/benchs#getting-deep1b
on how to get the data
c                     [         R                  U 5        SSSSSS.nX;   d   eSSUS	4u  U l        U l        U l        U l        [        S
-   U l        U R                  < SX R                     < S3U l        g )N100k1M10M100M1B)r   r   i r    ʚ;`   i]r   zdeep1b/deepz_groundtruth.ivecs)	r
   r   r   r   r   r   rz   r   gt_fname)r   r   
nb_to_names      r   r   DatasetDeep1B.__init__   sx    

 ,.	2u,D)$'&2LL*WW-/r   c                 D    [        [        U R                  S-   5      5      $ )Nzdeep1B_queries.fvecs)r   r   r   r   s    r   r   DatasetDeep1B.get_queries   s    
4<<2H#HIJJr   Nc                 l    Ub  UOU R                   n[        [        U R                  S-   5      S U 5      $ )Nzlearn.fvecs)r   r   r   r   r"   s     r   r$   DatasetDeep1B.get_train   s2    '38
4<<-#?@(KLLr   c                 \    [        U R                  5      nUb  US::  d   eUS S 2S U24   nU$ )Nrw   )r   r   r   s      r   r:   DatasetDeep1B.get_groundtruth   s6    &=8O8ArrEB	r   c                     U R                   S::  d   S5       e[        [        U R                  S-   5      S U R                    5      $ )Nr   r   
base.fvecs)r   r   r   r   r   s    r   r'   DatasetDeep1B.get_database   s>    ww%B!BB
4<<,#>?IJJr   c           	   #      #    [        U R                  S-   5      nUu  pEU R                  U-  U-  U R                  US-   -  U-  pv[        XgU5       H  n[	        X8[        X-   U5       5      v   M      g 7f)Nr   r   )r   r   r   r*   r   r+   r,   s	            r   r5   DatasetDeep1B.database_iterator   so     |3446)477dQh+?6+IB#B2#bgr"2344 $r   )r   r   r   r   r   r   )r   rJ   rK   r   rS   r   r   r   r      s(    /KMK5r   r   c                   8    \ rS rSrSrS	S jrS rS rS
S jrSr	g)DatasetGlovei  z<
Data from http://ann-benchmarks.com/glove-100-angular.hdf5
Nc                 &   SS K nU(       a   S5       eU(       d	  [        S-   nUR                  US5      U l        SU l        Su  U l        U l        U R                  S   R                  S   U l        U R                  S   R                  S   U l	        g )	Nr   znot implementedzglove/glove-100-angular.hdf5rIP)rw   r   traintest)
h5pyrz   File
glove_h5pyr   r   r   rF   r   r   )r   locdownloadr   s       r   r   DatasetGlove.__init__  s}    ...|!$BBC))C- //'*003//&)//2r   c                 x    [         R                  " U R                  S   5      n[        R                  " U5        U$ )Nr   r\   arrayr   rs   normalize_L2r   rd   s     r   r   DatasetGlove.get_queries  s,    XXdoof-.2	r   c                 x    [         R                  " U R                  S   5      n[        R                  " U5        U$ )Nr   r   r   r/   s     r   r'   DatasetGlove.get_database  s,    XXdoog./2	r   c                 P    U R                   S   nUb  US::  d   eUS S 2S U24   nU$ )N	neighborsrw   )r   r   s      r   r:   DatasetGlove.get_groundtruth  s6    __[)=8O8ArrEB	r   )r   r   r   r   r   r   )NFrJ   
rM   rN   rO   rP   rQ   r   r   r'   r:   rR   rS   r   r   r   r     s    
3

r   r   c                   4    \ rS rSrSrS rS rS rS	S jrSr	g)
DatasetMusic100i&  zC
get dataset from
https://github.com/stanis-morozov/ip-nsw#dataset
c                     [         R                  U 5        Su  U l        U l        U l        U l        SU l        [        S-   U l        g )N)rw   r   r   r   r   z
music-100/)	r
   r   r   r   r   r   r   rz   r   r   s    r   r   DatasetMusic100.__init__,  s9    ,@)$'&5r   c                 n    [         R                  " U R                  S-   SS9nUR                  SS5      nU$ )Nzquery_music100.binr[   r   r   rw   r\   fromfiler   reshaper   s     r   r   DatasetMusic100.get_queries2  s1    [[(<<INZZC 	r   c                 n    [         R                  " U R                  S-   SS9nUR                  SS5      nU$ )Nzdatabase_music100.binr[   r   r   rw   r   r   s     r   r'   DatasetMusic100.get_database7  s1    [[(??yQZZC 	r   Nc                 x    [         R                  " U R                  S-   5      nUb  US::  d   eUS S 2S U24   nU$ )Nzgt.npyrw   )r\   loadr   r   s      r   r:   DatasetMusic100.get_groundtruth<  s?    WWT\\H,-=8O8ArrEB	r   )r   r   r   r   r   r   rJ   r   rS   r   r   r   r   &  s    
6

r   r   c                   >    \ rS rSrSrS rS rS
S jrS rS
S jr	S	r
g)DatasetGIST1MiC  r   c                     [         R                  U 5        Su  U l        U l        U l        U l        [        S-   U l        g )N)i  r   r   r   zgist1M/r   r   s    r   r   DatasetGIST1M.__init__I  r   r   c                 2    [        U R                  S-   5      $ )Nzgist_query.fvecsr   r   s    r   r   DatasetGIST1M.get_queriesN  r   r   Nc                 Z    Ub  UOU R                   n[        U R                  S-   5      S U $ )Nzgist_learn.fvecsr   r"   s     r   r$   DatasetGIST1M.get_trainQ  r   r   c                 2    [        U R                  S-   5      $ )Nzgist_base.fvecsr   r   s    r   r'   DatasetGIST1M.get_databaseU  r   r   c                 b    [        U R                  S-   5      nUb  US::  d   eUS S 2S U24   nU$ )Nzgist_groundtruth.ivecsrw   r   r   s      r   r:   DatasetGIST1M.get_groundtruthX  r   r   r   rJ   rx   rS   r   r   r   r   C  r   r   r   c                    U S:X  a
  [        5       $ U S:X  a
  [        5       $ U R                  S5      (       a  U S:X  a  SO[        U SS 5      n[	        US9$ U R                  S	5      (       aW  U S
S nUS   S:X  a  S[        USS 5      -  nO.US:X  a  SnO%US   S:X  a  S[        USS 5      -  nO
 SU-   5       e[        US9$ U S:X  a
  [        5       $ U S:X  a	  [        US9$ [        SU -   5      e)zconverts a string describing a dataset to a Dataset object
Supports sift1M, bigann1M..bigann1B, deep1M..deep1B, music-100 and glove
sift1Mgist1Mbigannbigann1Br      r   )r   r   rZ   NMr   r   r   r9   zdid not recognize suffix )r   z	music-100glove)r   zunknown dataset )	r~   r   
startswithintr   r   r   r   RuntimeError)datasetr   dbsizeszsufs       r   dataset_from_namer  `  s   
 (	H				H	%	% J.C"4F&))			F	#	#9s5":.Fd]F2Y#Ccr
O+F=5==5''	K	  	G	X.. -788r   )deep1MF)osnumpyr\   rs   getpassvecs_ior   r   r   r   exhaustive_searchr   r
   rU   getuserusernamerz   r{   existsr|   r~   r   r   r   r   r   r   r  rS   r   r   <module>r"     s    
    D C "8A 8Av%w %\ ?? 	(A

*+-O 
ww~~o&&- O
G :4%5G %5P-5G -5` 7  Fg :G :#9r   