
    9i2                       d Z ddlmZ ddlZddlZddlmZmZ ddlm	Z	 ddl
mZ ddlmZmZmZmZ ddlmZmZ dd	lmZmZ erdd
lmZmZmZmZ ddlmZ 	 ddlZdZ	 ddl!m"Z" dZ# ejH                  e%      Z& edd      Z' G d dee      Z( G d de(      Z) G d de*e      Z+ e	d       G d d             Z,ddZ-y# e $ r dZY lw xY w# e $ r dZ#Y pw xY w)zText splitter base interface.    )annotationsN)ABCabstractmethod)	dataclass)Enum)TYPE_CHECKINGAnyLiteralTypeVar)BaseDocumentTransformerDocument)Selfoverride)Callable
CollectionIterableSequence)SetTF)PreTrainedTokenizerBaseTSTextSplitter)boundc                      e Zd ZdZddedddf	 	 	 	 	 	 	 	 	 	 	 	 	 ddZedd       Z	 d	 	 	 	 	 dd	Zdd
Z	ddZ
ddZe	 	 	 	 	 	 dd       Zedd e       df	 	 	 	 	 	 	 	 	 	 	 dd       Ze	 	 	 	 	 	 dd       Zy)r   z)Interface for splitting text into chunks.i     FTc                    |dk  rd| }t        |      |dk  rd| }t        |      ||kD  rd| d| d}t        |      || _        || _        || _        || _        || _        || _        y)ad  Create a new TextSplitter.

        Args:
            chunk_size: Maximum size of chunks to return
            chunk_overlap: Overlap in characters between chunks
            length_function: Function that measures the length of given chunks
            keep_separator: Whether to keep the separator and where to place it
                            in each corresponding chunk (True='start')
            add_start_index: If `True`, includes chunk's start index in metadata
            strip_whitespace: If `True`, strips whitespace from the start and end of
                              every document
        r   zchunk_size must be > 0, got z chunk_overlap must be >= 0, got zGot a larger chunk overlap (z) than chunk size (z), should be smaller.N)
ValueError_chunk_size_chunk_overlap_length_function_keep_separator_add_start_index_strip_whitespace)self
chunk_sizechunk_overlaplength_functionkeep_separatoradd_start_indexstrip_whitespacemsgs           g/var/www/html/backtest/airagagent/rag_env/lib/python3.12/site-packages/langchain_text_splitters/base.py__init__zTextSplitter.__init__/   s    * ?0=CS/!14]ODCS/!:%.}o ><46  S/!%+ /- /!1    c                     y)z$Split text into multiple components.N )r#   texts     r+   
split_textzTextSplitter.split_textW   s    r-   Nc           	        |xs i gt        |      z  }g }t        |      D ]  \  }}d}d}| j                  |      D ]  }	t        j                  ||         }
| j
                  r>||z   | j                  z
  }|j                  |	t        d|            }||
d<   t        |	      }t        |	|
      }|j                  |         |S )z&Create documents from a list of texts.r   start_index)page_contentmetadata)len	enumerater1   copydeepcopyr!   r   findmaxr   append)r#   texts	metadatas
metadatas_	documentsir0   indexprevious_chunk_lenchunkr5   offsetnew_docs                r+   create_documentszTextSplitter.create_documents[   s     32$U"3
	 ' 	*GAtE!". *==A7(("%77$:M:MMF IIeSF^<E.3H]+),U&"I  )*	* r-   c                    g g }}|D ]8  }|j                  |j                         |j                  |j                         : | j                  ||      S )zSplit documents.)r>   )r<   r4   r5   rG   )r#   r@   r=   r>   docs        r+   split_documentszTextSplitter.split_documentso   sV    ry 	+CLL))*S\\*	+ $$Ui$@@r-   c                h    |j                  |      }| j                  r|j                         }|xs d S N)joinr"   strip)r#   docs	separatorr0   s       r+   
_join_docszTextSplitter._join_docsw   s.    ~~d#!!::<D|tr-   c                \   | j                  |      }g }g }d}|D ]i  }| j                  |      }||z   t        |      dkD  r|ndz   | j                  kD  r|| j                  kD  r!t        j	                  d|| j                         t        |      dkD  r| j                  ||      }	|	|j                  |	       || j                  kD  s*||z   t        |      dkD  r|ndz   | j                  kD  ro|dkD  rj|| j                  |d         t        |      dkD  r|ndz   z  }|dd  }|| j                  kD  r?||z   t        |      dkD  r|ndz   | j                  kD  r|dkD  rj|j                  |       ||t        |      dkD  r|ndz   z  }l | j                  ||      }	|	|j                  |	       |S )Nr   zACreated a chunk of size %d, which is longer than the specified %d   )r   r6   r   loggerwarningrQ   r<   r   )
r#   splitsrP   separator_lenrO   current_doctotaldlen_rI   s
             r+   _merge_splitszTextSplitter._merge_splits}   s    --i8!# 	KA((+D[1AA1E1M""# 4+++NN'((	 {#a'//+yACC(  $"5"55[9IA9MSTU**+!AI!6!6{1~!F-0-=-AMq"  '2!"o  $"5"55[9IA9MSTU**+!AI q!Tc+.>.B]JJE=	K> ook95?KKr-   c                    t         sd}t        |      t        t              sd}t        |      dfd} | dd|i|S )z>Text splitter that uses HuggingFace tokenizer to count length.z`Could not import transformers python package. Please install it with `pip install transformers`.zATokenizer received was not an instance of PreTrainedTokenizerBasec                8    t        j                  |             S rL   )r6   tokenizer0   	tokenizers    r+   _huggingface_tokenizer_lengthzNTextSplitter.from_huggingface_tokenizer.<locals>._huggingface_tokenizer_length   s    y))$/00r-   r&   r0   strreturnintr/   )_HAS_TRANSFORMERSr   
isinstancer   )clsra   kwargsr*   rb   s    `   r+   from_huggingface_tokenizerz'TextSplitter.from_huggingface_tokenizer   sT    
 !E  S/!)%<=UCS/!	1 K#@KFKKr-   gpt2allc                   	 t         sd}t        |      |t        j                  |      	nt        j                  |      	d	fd}t        | t              r||d}i ||} | dd|i|S )z9Text splitter that uses tiktoken encoder to count length.zCould not import tiktoken python package. This is needed in order to calculate max_tokens_for_prompt. Please install it with `pip install tiktoken`.c                >    t        j                  |             S N)allowed_specialdisallowed_special)r6   encode)r0   rq   rr   encs    r+   _tiktoken_encoderz=TextSplitter.from_tiktoken_encoder.<locals>._tiktoken_encoder   s*    

$3'9   r-   )encoding_name
model_namerq   rr   r&   rc   r/   )_HAS_TIKTOKENImportErrortiktokenencoding_for_modelget_encoding
issubclassTokenTextSplitter)
ri   rv   rw   rq   rr   rj   r*   ru   extra_kwargsrt   s
      ``    @r+   from_tiktoken_encoderz"TextSplitter.from_tiktoken_encoder   s     A 
 c""!--j9C''6C	 c,-!.(#2&8	L 0/,/F?#4???r-   c                6    | j                  t        |            S )z2Transform sequence of documents by splitting them.)rJ   list)r#   r@   rj   s      r+   transform_documentsz TextSplitter.transform_documents   s    
 ##DO44r-   )r$   rf   r%   rf   r&   zCallable[[str], int]r'   zbool | Literal['start', 'end']r(   boolr)   r   re   Noner0   rd   re   	list[str]rL   )r=   r   r>   zlist[dict[Any, Any]] | Nonere   list[Document])r@   zIterable[Document]re   r   )rO   r   rP   rd   re   
str | None)rV   zIterable[str]rP   rd   re   r   )ra   r   rj   r	   re   r   )rv   rd   rw   r   rq   !Literal['all'] | AbstractSet[str]rr    Literal['all'] | Collection[str]rj   r	   re   r   )r@   Sequence[Document]rj   r	   re   r   )__name__
__module____qualname____doc__r6   r,   r   r1   rG   rJ   rQ   r\   classmethodrk   setr   r   r   r/   r-   r+   r   r   ,   sp   3  039> %!%&2&2 &2 .	&2
 7&2 &2 &2 
&2P 3 3 JN+F	(A*X L/L;>L	L L(  $!%=@U?D(@(@ (@ ;	(@
 =(@ (@ 
(@ (@T 5+57:5	5 5r-   c                  V     e Zd ZdZdd e       df	 	 	 	 	 	 	 	 	 	 	 d fdZddZ xZS )	r~   z/Splitting text to tokens using model tokenizer.rl   Nrm   c                    t        |   di | t        sd}t        |      |t	        j
                  |      }nt	        j                  |      }|| _        || _        || _	        y)zCreate a new TextSplitter.zCould not import tiktoken python package. This is needed in order to for TokenTextSplitter. Please install it with `pip install tiktoken`.Nr/   )
superr,   rx   ry   rz   r{   r|   
_tokenizer_allowed_special_disallowed_special)	r#   rv   rw   rq   rr   rj   r*   rt   	__class__s	           r+   r,   zTokenTextSplitter.__init__   sm     	"6"A 
 c""!--j9C''6C /#5 r-   c                     d fd}t         j                   j                   j                  j                  |      }t        ||      S )an  Splits the input text into smaller chunks based on tokenization.

        This method uses a custom tokenizer configuration to encode the input text
        into tokens, processes the tokens in chunks of a specified size with overlap,
        and decodes them back into text chunks. The splitting is performed using the
        `split_text_on_tokens` function.

        Args:
            text: The input text to be split into smaller chunks.

        Returns:
            A list of text chunks, where each chunk is derived from a portion
            of the input text based on the tokenization and chunking rules.
        c                h    j                   j                  | j                  j                        S rp   )r   rs   r   r   )_textr#   s    r+   _encodez-TokenTextSplitter.split_text.<locals>._encode  s4    ??)) $ 5 5#'#;#; *  r-   )r%   tokens_per_chunkdecoders   r`   )r   rd   re   z	list[int])	Tokenizerr   r   r   r   split_text_on_tokens)r#   r0   r   ra   s   `   r+   r1   zTokenTextSplitter.split_text  sE     	 --!--??))	
	 $CCr-   )rv   rd   rw   r   rq   r   rr   r   rj   r	   re   r   r   )r   r   r   r   r   r,   r1   __classcell__)r   s   @r+   r~   r~      s[    9 $!%=@U?D66 6 ;	6
 =6 6 
64Dr-   r~   c                  |    e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZy)Languagez"Enum of the programming languages.cppgojavakotlinjstsphpprotopythonrstrubyrustscalaswiftmarkdownlatexhtmlsolcsharpcobolcluaperlhaskellelixir
powershellvisualbasic6N)r   r   r   r   CPPGOJAVAKOTLINJSr   PHPPROTOPYTHONRSTRUBYRUSTSCALASWIFTMARKDOWNLATEXHTMLSOLCSHARPCOBOLCLUAPERLHASKELLELIXIR
POWERSHELLVISUALBASIC6r/   r-   r+   r   r   /  s    ,
C	BDF	B	B
CEF
CDDEEHED
CFEA
CDGFJ!Lr-   r   )frozenc                  @    e Zd ZU dZded<   	 ded<   	 ded<   	 ded<   y	)
r   zTokenizer data class.rf   r%   r   zCallable[[list[int]], str]r   zCallable[[str], list[int]]rs   N)r   r   r   r   __annotations__r/   r-   r+   r   r   O  s)    *,&&=&&=r-   r   c                   g }|j                  |       }d}|j                  |j                  k  rd}t        |      |t	        |      k  rt        ||j                  z   t	        |            }||| }|s	 |S |j                  |      }|r|j                  |       |t	        |      k(  r	 |S ||j                  |j                  z
  z  }|t	        |      k  r|S )z6Split incoming text and return chunks using tokenizer.r   z3tokens_per_chunk must be greater than chunk_overlap)rs   r   r%   r   r6   minr   r<   )	r0   ra   rV   	input_ids	start_idxr*   cur_idx	chunk_idsdecodeds	            r+   r   r   ]  s    F  &II!!Y%<%<<Co
c)n
$i)"<"<<c)nMi0	 M ""9-MM'"c)n$M 	Y//)2I2III	 c)n
$ Mr-   )r0   rd   ra   r   re   r   ).r   
__future__r   r8   loggingabcr   r   dataclassesr   enumr   typingr   r	   r
   r   langchain_core.documentsr   r   typing_extensionsr   r   collections.abcr   r   r   r   r   AbstractSetrz   rx   ry   $transformers.tokenization_utils_baser   rg   	getLoggerr   rT   r   r   r~   rd   r   r   r   r/   r-   r+   <module>r      s    # "   # !   G ,HH2ML 
		8	$T(B5*C B5J;D ;D|"sD "@ $
> 
> 
>A
  M  s$   B? C ?C	C	CC