
    Ԕ9i@                        d dl Z d dlZd dlZd dlZddlmZmZmZm	Z	m
Z
mZ ddlmZ  G d d      Z G d de      Z ej                          Zd	d
de dfD ]#  ae j&                  j)                  t$              s# n dad Z G d de      Zd Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      ZddZy)     N   )
fvecs_read
ivecs_read
bvecs_mmap
fvecs_mmap
bvecs_iterbvecs_iter_chunked)knnc                   N    e Zd ZdZd Zd ZddZd ZddZddZ	dd	Z
d
 Zd Zy)Datasetz+ Generic abstract class for a test dataset c                 J    d| _         d| _        d| _        d| _        d| _        y)z2 the constructor should set the following fields: L2Ndmetricnqnbntselfs    `/var/www/html/backtest/airagagent/rag_env/lib/python3.12/site-packages/faiss/contrib/datasets.py__init__zDataset.__init__   s%        c                     t               )z' return the queries as a (nq, d) array NotImplementedErrorr   s    r   get_querieszDataset.get_queries       !##r   Nc                     t               )z' return the queries as a (nt, d) array r   r   maxtrains     r   	get_trainzDataset.get_train   r   r   c                     t               )z' return the queries as a (nb, d) array r   r   s    r   get_databasezDataset.get_database#   r   r   c              #      K   | j                         }|\  }}| j                  |z  |z  | j                  |dz   z  |z  }}t        |||      D ]  }||t        ||z   |         yw)a7  returns an iterator on database vectors.
        bs is the number of vectors per batch
        split = (nsplit, rank) means the dataset is split in nsplit
        shards and we want shard number rank
        The default implementation just iterates over the full matrix
        returned by get_dataset.
        r   N)r%   r   rangemin	r   bssplitxbnsplitranki0i1j0s	            r   database_iteratorzDataset.database_iterator'   sw       46)477dQh+?6+IBB# 	+BRR"Wb)**	+s   A$A&c                     t               )z7 return the ground truth for k-nearest neighbor search r   r   ks     r   get_groundtruthzDataset.get_groundtruth5   r   r   c                     t               )z* return the ground truth for range search r   )r   threshs     r   get_groundtruth_rangezDataset.get_groundtruth_range9   r   r   c           
          d| j                    d| j                   d| j                   d| j                   d| j                   
S )Nzdataset in dimension z, with metric z
, size: Q z B z T r   r   s    r   __str__zDataset.__str__=   sD    'x~dkk] K77)3twwis477)= 	>r   c                    | j                         j                  | j                  | j                  fk(  sJ | j                  dkD  rA| j                  d      }|j                  d| j                  fk(  sJ d|j                         | j                         j                  | j                  | j                  fk(  sJ | j                  d      j                  | j                  dfk(  sJ y)z8 runs the previous and checks the sizes of the matrices r   {   )r"   zshape=   )r5   N)	r   shaper   r   r   r#   r%   r   r6   )r   xts     r   check_sizeszDataset.check_sizesA   s    !''DGGTVV+<<<<77Q;-B88TVV},GBHH.GG,  "((TWWdff,====##b#)//DGGR=@@@r   N   )r   r   )__name__
__module____qualname____doc__r   r   r#   r%   r2   r6   r9   r;   rA    r   r   r   r      s3    5$$$+$$>Ar   r   c                   4    e Zd ZdZddZd Zd	dZd Zd
dZy)SyntheticDatasetzOA dataset that is not completely random but still challenging to
    index
    c                    t         j                  |        ||||f\  | _        | _        | _        | _        d}||z   |z   }t        j                  j                  |      }	|	j                  ||f      }
t        j                  |
|	j                  ||            }
|
|	j                  |      dz  dz   z  }
t        j                  |
      }
|
j                  d      }
|| _        |
d | | _        |
|||z    | _        |
||z   d  | _        y )N
   )size   g?float32)r   r   r   r   r   r   nprandomRandomStatenormaldotrandsinastyper   r@   r,   xq)r   r   r   r   r   r   seedd1nrsxs              r   r   zSyntheticDataset.__init__P   s    ,-r2rM)$'GbLYY""4(IIAr7I#FF1bggb!n% a#%&FF1IHHYCR&BrBw-BGH+r   c                     | j                   S rB   )rY   r   s    r   r   zSyntheticDataset.get_queriesb       wwr   Nc                 @    ||n| j                   }| j                  d | S rB   )r   r@   r!   s     r   r#   zSyntheticDataset.get_traine   s#    '38wwy!!r   c                     | j                   S rB   )r,   r   s    r   r%   zSyntheticDataset.get_databasei   r`   r   c                     t        | j                  | j                  || j                  dk(  rt        j
                        d   S t        j                        d   S )Nr   r   )r
   rY   r,   r   faiss	METRIC_L2METRIC_INNER_PRODUCTr4   s     r   r6   z SyntheticDataset.get_groundtruthl   sW    GGTWWa#{{d2EOO
  	8=8R8R
  	r   )r   i:  rB   )d   	rE   rF   rG   rH   r   r   r#   r%   r6   rI   r   r   rK   rK   K   s     $"r   rK   z/datasets01/simsearch/041218/z7/mnt/vol/gfsai-flash3-east/ai-group/datasets/simsearch/z/home/z/simsearch/data/zdata/c                     | a y rB   )dataset_basedir)paths    r   set_dataset_basedirrl      s    Or   c                   2    e Zd ZdZd Zd ZddZd ZddZy)	DatasetSIFT1Mz_
    The original dataset is available at: http://corpus-texmex.irisa.fr/
    (ANN_SIFT1M)
    c                     t         j                  |        d\  | _        | _        | _        | _        t        dz   | _        y )N)rD   順 @B '  zsift1M/r   r   r   r   r   r   rj   basedirr   s    r   r   zDatasetSIFT1M.__init__   2    ,G)$'&2r   c                 2    t        | j                  dz         S )Nzsift_query.fvecsr   rt   r   s    r   r   zDatasetSIFT1M.get_queries       $,,);;<<r   Nc                 X    ||n| j                   }t        | j                  dz         d | S )Nzsift_learn.fvecsr   r   rt   r!   s     r   r#   zDatasetSIFT1M.get_train   .    '38$,,);;<YhGGr   c                 2    t        | j                  dz         S )Nzsift_base.fvecsrw   r   s    r   r%   zDatasetSIFT1M.get_database       $,,)::;;r   c                 ^    t        | j                  dz         }||dk  sJ |d d d |f   }|S )Nzsift_groundtruth.ivecsrg   r   rt   r   r5   gts      r   r6   zDatasetSIFT1M.get_groundtruth   <    '??@=8O8ArrEB	r   rB   rh   rI   r   r   rn   rn      !    
3
=H<r   rn   c                 0    t        j                  | d      S )NrP   dtype)rQ   ascontiguousarray)r^   s    r   sanitizer      s    33r   c                   <    e Zd ZdZd	dZd Zd
dZd
dZd ZddZ	y)DatasetBigANNz_
    The original dataset is available at: http://corpus-texmex.irisa.fr/
    (ANN_SIFT1B)
    c                     t         j                  |        |dv sJ || _        |dz  }dd|df\  | _        | _        | _        | _        t        dz   | _        y )N)
r         rM      2   rg      i    rq   rD    rr   zbigann/)	r   r   nb_Mr   r   r   r   rj   rt   )r   r   r   s      r   r   zDatasetBigANN.__init__   sZ    AAAA	E\,/E,A)$'&2r   c                 J    t        t        | j                  dz         d d        S )Nzbigann_query.bvecs)r   r   rt   r   s    r   r   zDatasetBigANN.get_queries   s!    
4<<2F#FGJKKr   Nc                 j    ||n| j                   }t        t        | j                  dz         d |       S )Nzbigann_learn.bvecs)r   r   r   rt   r!   s     r   r#   zDatasetBigANN.get_train   s3    '38
4<<2F#FG	RSSr   c                 x    t        | j                  d| j                  z  z         }||dk  sJ |d d d |f   }|S )Nzgnd/idx_%dM.ivecsrg   )r   rt   r   r   s      r   r6   zDatasetBigANN.get_groundtruth   sE    ':TYY'FFG=8O8ArrEB	r   c                     | j                   dk  sJ d       t        t        | j                  dz         d | j                         S )Nrg   dataset too large, use iteratorbigann_base.bvecs)r   r   r   rt   r   r   s    r   r%   zDatasetBigANN.get_database   s=    yy3A AA
4<<2E#EFxPQQr   c           	   #      K   t        | j                  dz         }|\  }}| j                  |z  |z  | j                  |dz   z  |z  }}t        |||      D ]  }t	        ||t        ||z   |              ! y w)Nr   r   )r   rt   r   r'   r   r(   r)   s	            r   r2   zDatasetBigANN.database_iterator   s     '::;46)477dQh+?6+IBB# 	5B2b#b2gr"2344	5   A5A7)r   rB   rC   
rE   rF   rG   rH   r   r   r#   r6   r%   r2   rI   r   r   r   r      s(    
3LTR5r   r   c                   <    e Zd ZdZd	dZd Zd
dZd
dZd ZddZ	y)DatasetDeep1Bzv
    See
    https://github.com/facebookresearch/faiss/tree/main/benchs#getting-deep1b
    on how to get the data
    c                     t         j                  |        dddddd}||v sJ dd|d	f\  | _        | _        | _        | _        t        d
z   | _        | j                  d|| j                     d| _        y )N100k1M10M100M1B)rp   rq   逖 r    ʚ;`   i]rr   zdeep1b/deepz_groundtruth.ivecs)	r   r   r   r   r   r   rj   rt   gt_fname)r   r   
nb_to_names      r   r   zDatasetDeep1B.__init__   s|    

 Z,.	2u,D)$'&2LL*TWW-/r   c                 D    t        t        | j                  dz               S )Nzdeep1B_queries.fvecs)r   r   rt   r   s    r   r   zDatasetDeep1B.get_queries   s    
4<<2H#HIJJr   Nc                 j    ||n| j                   }t        t        | j                  dz         d |       S )Nzlearn.fvecs)r   r   r   rt   r!   s     r   r#   zDatasetDeep1B.get_train   s2    '38
4<<-#?@(KLLr   c                 X    t        | j                        }||dk  sJ |d d d |f   }|S )Nrg   )r   r   r   s      r   r6   zDatasetDeep1B.get_groundtruth   s6    &=8O8ArrEB	r   c                     | j                   dk  sJ d       t        t        | j                  dz         d | j                          S )Nr   r   
base.fvecs)r   r   r   rt   r   s    r   r%   zDatasetDeep1B.get_database   s>    ww%B!BB
4<<,#>?IJJr   c           	   #      K   t        | j                  dz         }|\  }}| j                  |z  |z  | j                  |dz   z  |z  }}t        |||      D ]  }t	        ||t        ||z   |              ! y w)Nr   r   )r   rt   r   r'   r   r(   r)   s	            r   r2   zDatasetDeep1B.database_iterator   s     |3446)477dQh+?6+IBB# 	5B2b#b2gr"2344	5r   )r   rB   rC   r   rI   r   r   r   r      s(    /KMK5r   r   c                   ,    e Zd ZdZddZd Zd ZddZy)	DatasetGlovezD
    Data from http://ann-benchmarks.com/glove-100-angular.hdf5
    Nc                    dd l }|rJ d       |s	t        dz   }|j                  |d      | _        d| _        d\  | _        | _        | j                  d   j                  d   | _        | j                  d   j                  d   | _	        y )	Nr   znot implementedzglove/glove-100-angular.hdf5rIP)rg   r   traintest)
h5pyrj   File
glove_h5pyr   r   r   r?   r   r   )r   locdownloadr   s       r   r   zDatasetGlove.__init__	  s}    ...|!$BBC))C- //'*003//&)//2r   c                 t    t        j                  | j                  d         }t        j                  |       |S )Nr   rQ   arrayr   rd   normalize_L2r   rY   s     r   r   zDatasetGlove.get_queries  s,    XXdoof-.2	r   c                 t    t        j                  | j                  d         }t        j                  |       |S )Nr   r   r   r,   s     r   r%   zDatasetGlove.get_database  s,    XXdoog./2	r   c                 L    | j                   d   }||dk  sJ |d d d |f   }|S )N	neighborsrg   )r   r   s      r   r6   zDatasetGlove.get_groundtruth  s6    __[)=8O8ArrEB	r   )NFrB   rE   rF   rG   rH   r   r   r%   r6   rI   r   r   r   r     s    
3

r   r   c                   *    e Zd ZdZd Zd Zd ZddZy)DatasetMusic100zO
    get dataset from
    https://github.com/stanis-morozov/ip-nsw#dataset
    c                     t         j                  |        d\  | _        | _        | _        | _        d| _        t        dz   | _        y )N)rg   r   rq   rr   r   z
music-100/)	r   r   r   r   r   r   r   rj   rt   r   s    r   r   zDatasetMusic100.__init__-  s9    ,@)$'&5r   c                 r    t        j                  | j                  dz   d      }|j                  dd      }|S )Nzquery_music100.binrP   r   r   rg   rQ   fromfilert   reshaper   s     r   r   zDatasetMusic100.get_queries3  s1    [[(<<INZZC 	r   c                 r    t        j                  | j                  dz   d      }|j                  dd      }|S )Nzdatabase_music100.binrP   r   r   rg   r   r   s     r   r%   zDatasetMusic100.get_database8  s1    [[(??yQZZC 	r   Nc                 r    t        j                  | j                  dz         }||dk  sJ |d d d |f   }|S )Nzgt.npyrg   )rQ   loadrt   r   s      r   r6   zDatasetMusic100.get_groundtruth=  s?    WWT\\H,-=8O8ArrEB	r   rB   r   rI   r   r   r   r   '  s    
6

r   r   c                   2    e Zd ZdZd Zd ZddZd ZddZy)	DatasetGIST1Mz_
    The original dataset is available at: http://corpus-texmex.irisa.fr/
    (ANN_GIST1M)
    c                     t         j                  |        d\  | _        | _        | _        | _        t        dz   | _        y )N)i  rp   rq   rr   zgist1M/rs   r   s    r   r   zDatasetGIST1M.__init__K  ru   r   c                 2    t        | j                  dz         S )Nzgist_query.fvecsrw   r   s    r   r   zDatasetGIST1M.get_queriesP  rx   r   Nc                 X    ||n| j                   }t        | j                  dz         d | S )Nzgist_learn.fvecsrz   r!   s     r   r#   zDatasetGIST1M.get_trainS  r{   r   c                 2    t        | j                  dz         S )Nzgist_base.fvecsrw   r   s    r   r%   zDatasetGIST1M.get_databaseW  r}   r   c                 ^    t        | j                  dz         }||dk  sJ |d d d |f   }|S )Nzgist_groundtruth.ivecsrg   r   r   s      r   r6   zDatasetGIST1M.get_groundtruthZ  r   r   rB   rh   rI   r   r   r   r   E  r   r   r   c                   J    e Zd ZdZddZd ZddZd ZddZddZ	dd	Z
d
 Zy)DatasetDINO10Ba  
    Data from https://dl.fbaipublicfiles.com/large_objects/dino_vitl_10B/
    The dataset contains 10 billion 1024-d vectors extracted from image patches from the YFCC100M dataset, using a Dino-ViT-L 16 model (facebook/dinov3-vitl16-pretrain-lvd1689m).
    The dataset is sharded in multiple chunked .bvecs files. Downloading instructions can be obtained with "wget https://dl.fbaipublicfiles.com/large_objects/dino_vitl_10B/README.md".
    Supported sizes : 100k 200k 500k 1M ... 5B 10B listed in supported_nbs (see __init__).
    c                    t         j                  |        g d}||vr|st        d| d|       t        j                  j                  t              st        dt               t        dz   | _        | j                  dz   | _        t        j                  j                  | j                        sJ d| j                          | j                  dz   | _	        t        j                  j                  | j                        sJ d	| d
| j                          | j                  dz   dz   t        |      z   dz   dz   | _        | j                  dz   | _        || _        d| _        d| _        d| _        d| _        y )N)rp   i@ i  rq   i i@KL r   i -1ir   i i er   i 5wl    rT     d(	 zUnsupported dataset size: z, supported values are: z0Provided dataset base directory does not exist: zdino_vitl_10B/chunked_base_10Bz2Index path should exist, check your dataset path: zqueries_clean.bvecsz*Queries path should exist as dataset size z is supported: zgts/gts_dino_patch__zk10.npyztrain_queries_99M.bvecsi   rp   ir   )r   r   
ValueErrorosrk   existsrj   rt   indexdir
queriesdirstrgtsdirtrain_queriesdirr   r   r   r   r   )r   r   ignore_supportedsupported_nbss       r   r   zDatasetDINO10B.__init__h  sf    ]]"+;9"=UVcUdeffww~~o.OP_O`abb&)99'99ww~~dmm,r0bcgcpcpbq.rr,,,)>>ww~~doo.  	B2\]_\``optpp  pA  1B  	B.llV+.??#b'ICOR[[ $/H Hr   c                 B    t        | j                        }t        |      S )z!Get all vectors as a single array)r   r   r   )r   queriess     r   r   zDatasetDINO10B.get_queries}  s    T__-  r   Nc                 h    ||dkD  rt        d      t        t        | j                        d|       S )z,Get training query vectors as a single arrayNr   zThe training set is potentially too large to fit in RAM (400 GB of data). Please use train_iterator or use maxtrain parameter below 10_000_000 to get the first maxtrain training vectors.)r   r   r   r   r!   s     r   r#   zDatasetDINO10B.get_train  sA    x*4%  'c  d  d
4#8#89)8DEEr   c                     | j                   dkD  rt        d      t        t        | j                  | j                         j                               S )z*Get all database vectors as a single arrayr   zThe dataset is potentially too large to fit in RAM. Please use database_iterator or use a dataset size equal to or below 10_000_000.
batch_size)r   r   r   r	   r   __next__r   s    r   r%   zDatasetDINO10B.get_database  sG    77Z%  'm  n  n.t}}QZZ\]]r   c              #   
  K   d}t        | j                  |      D ]c  }||j                  d   z   | j                  kD  r|d| j                  |z
   }t	        |       ||j                  d   z  }|| j                  k\  sc y yw)z_Iterator over the database of size nb, corresponding to the first nb vectors in the .bvecs filer   r   N)r	   r   r?   r   r   )r   r*   
total_readbatchs       r   r2   z DatasetDINO10B.database_iterator  s     
'"E 	EEKKN*TWW43tww345/!%++a.(JTWW$	s   A<B?Bc              #   ^   K   t        | j                  |      D ]  }t        |        yw)z;Iterator over all training query vectors in the .bvecs filer   N)r   r   r   )r   r*   r   s      r   train_iteratorzDatasetDINO10B.train_iterator  s,      5 5"E 	"E5/!	"s   +-c                 z    |dkD  rt        d      t        j                  | j                        }|ddd|f   }|S )zGet ground truth from .npy filerM   z+Ground truth files only available for k<=10N)r   rQ   r   r   )r   r5   gtss      r   r6   zDatasetDINO10B.get_groundtruth  s=    r6%&STTggdkk"!RaR%j
r   c                      y)N	euclideanrI   r   s    r   distancezDatasetDINO10B.distance  s    r   )FrB   )rr   )rM   )rE   rF   rG   rH   r   r   r#   r%   r2   r   r6   r   rI   r   r   r   r   a  s1    *!
F^	"
r   r   c                 2   | dk(  r
t               S | dk(  r
t               S | j                  d      r!| dk(  rdnt        | dd       }t	        |      S | j                  d	      rW| d
d }|d   dk(  rdt        |dd       z  }n,|dk(  rd}n$|d   dk(  rdt        |dd       z  }n
J d|z          t        |      S | dk(  r
t               S | dk(  rt        |      S | j                  d      r!| dk(  rdnt        | d
d       }t        |      S t        d| z         )z converts a string describing a dataset to a Dataset object
    Supports sift1M, bigann1M..bigann1B, deep1M..deep1B, music-100 and glove
    sift1Mgist1Mbigannbigann1Br      r   )r   r   rO   NMrq   r   r   r5   zdid not recognize suffix )r   z	music-100glove)r   dinodino10Br   zunknown dataset )
rn   r   
startswithintr   r   r   r   r   RuntimeError)datasetr   dbsizeszsufs       r   dataset_from_namer
    s?   
 (	H				H	% J.C"4F&))			F	#9s5":.Fd]F2Y#Ccr
O+F=5==5''	K	  	G	X..			F	##*i#7S=M(( -788r   )deep1MF)r   numpyrQ   rd   getpassvecs_ior   r   r   r   r   r	   exhaustive_searchr
   r   rK   getuserusernamerj   rk   r   rl   rn   r   r   r   r   r   r   r   r
  rI   r   r   <module>r     s    
    d c "8A 8Av%w %\ 7?? 	(A

*+- O 
ww~~o& O
G :4%5G %5P-5G -5` 7  Fg <G 8GW GR'9r   