
from typing import Dict, Iterator, List, Optional, Tuple, Union

from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC

from .base_tokenizer import BaseTokenizer


class SentencePieceBPETokenizer(BaseTokenizer):
    """SentencePiece BPE Tokenizer

    Represents the BPE algorithm, with the pretokenization used by SentencePiece
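
    Example (a minimal usage sketch; the corpus and sizes are illustrative only):
        tokenizer = SentencePieceBPETokenizer()
        tokenizer.train_from_iterator(["hello world", "hello there"], vocab_size=50)
        print(tokenizer.encode("hello world").tokens)  # word starts are marked with "▁"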
    N<unk>Tvocabmerges	unk_tokenreplacementadd_prefix_spacedropoutfuse_unkc           	         ||t        t        |||||            }nt        t        |||            }|j                  t        |            |j	                  t        |      g       t               |_        t        j                  ||      |_	        t        j                  ||      |_        d||||d}	t        
| 5  ||	       y )N)r   r   r   )r   r   SentencePieceBPE)modelr   r   r   r   )r
   r   token_to_idstradd_special_tokensr   
normalizerr   	Metaspacepre_tokenizerr   decodersuper__init__)selfr   r   r   r   r   r   r   	tokenizer
parameters	__class__s             v/var/www/html/backtest/airagagent/rag_env/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_bpe.pyr'   z"SentencePieceBPETokenizer.__init__   s     !3!#eVWPYdl"mnI!#gU]"^_I  Y0<((#i.)9:#v	"0":":{eu"v	$..;Yij	 ("& 0

 	J/    vocab_filenamemerges_filenamec                 N    t        j                  | |      \  }}t        ||fi |S )N)r   	read_filer   )r.   r/   kwargsr   r   s        r,   	from_filez#SentencePieceBPETokenizer.from_file0   s(    noFv(A&AAr-   i0u     i  files
vocab_sizemin_frequencyspecial_tokenslimit_alphabetinitial_alphabetshow_progressc                     t        j                  ||||||      }t        |t              r|g}| j                  j                  ||       y)z%Train the model using the given filesr6   r7   r8   r9   r:   r;   )trainerN)r   
BpeTrainer
isinstancer    
_tokenizertrain)	r(   r5   r6   r7   r8   r9   r:   r;   r>   s	            r,   rB   zSentencePieceBPETokenizer.train5   sP     %%!'))-'
 eS!GEeW5r-   iteratorlengthc	                 v    t        j                  ||||||      }	| j                  j                  ||	|       y)z(Train the model using the given iteratorr=   )r>   rD   N)r   r?   rA   train_from_iterator)
r(   rC   r6   r7   r8   r9   r:   r;   rD   r>   s
             r,   rF   z-SentencePieceBPETokenizer.train_from_iteratorM   sH     %%!'))-'
 	++ 	, 	
r-   )NNr   u   ▁TNF)__name__
__module____qualname____doc__r   r   r    r   intr   r	   boolfloatr'   staticmethodr3   r   rB   r   rF   __classcell__)r+   s   @r,   r   r   
   s    7;OS,3 !%#'#(0c4S>1230 sDsCx%S/)I$JJKL0 j)	0
 0 0 %0 4.0@ B# B B B  8?y"&("6S$s)^$6 6 	6
 U3
?346 6 s)6 66  8?y"&(" $
x'>>?
 
 	

 U3
?34
 
 s)
 
 
r-   r   N)typingr   r   r   r   r   r   
tokenizersr	   r
   r   r   r   tokenizers.modelsr   tokenizers.normalizersr   base_tokenizerr   r    r-   r,   <module>rV      s(    ? ? P P ! ' )\
 \
r-   
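

# --- Illustrative usage sketch (not part of the upstream module) -------------
# Demonstrates the class above end to end: train a tiny in-memory vocabulary,
# then encode and decode one sentence. The corpus and vocab_size are assumed
# values for illustration. Run with
#   python -m tokenizers.implementations.sentencepiece_bpe
# so that the relative import of BaseTokenizer resolves.
if __name__ == "__main__":
    corpus = [
        "SentencePiece-style BPE replaces spaces with the ▁ (U+2581) metaspace character.",
        "Training on a tiny in-memory corpus keeps this sketch self-contained.",
    ]
    tok = SentencePieceBPETokenizer()
    tok.train_from_iterator(corpus, vocab_size=200, min_frequency=1, show_progress=False)

    encoding = tok.encode("replaces spaces with metaspace")
    print(encoding.tokens)           # subword pieces, word starts marked with "▁"
    print(tok.decode(encoding.ids))  # the Metaspace decoder restores the spaces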