
    i2                       S r SSKJr  SSKrSSKrSSKJrJr  SSKJ	r	  SSK
Jr  SSKJrJrJrJr  SSKJrJr  SS	KJrJr  \(       a  SS
KJrJrJrJr  SSKJr   SSKrSr SSK!J"r"  Sr#\RH                  " \%5      r&\" SSS9r' " S S\\5      r( " S S\(5      r) " S S\*\5      r+\	" SS9 " S S5      5       r,SS jr-g! \  a    Sr Nhf = f! \  a    Sr# Nmf = f)zText splitter base interface.    )annotationsN)ABCabstractmethod)	dataclass)Enum)TYPE_CHECKINGAnyLiteralTypeVar)BaseDocumentTransformerDocument)Selfoverride)Callable
CollectionIterableSequence)SetTF)PreTrainedTokenizerBaseTSTextSplitter)boundc                     \ rS rSrSrSS\SSS4             SS jjr\SS j5       r S     SS
 jjr	SS jr
SS jrSS jr\      SS j5       r\SS	\" 5       S4           SS jj5       r\      SS j5       rSrg	)r   ,   z)Interface for splitting text into chunks.i     FTc                    US::  a  SU 3n[        U5      eUS:  a  SU 3n[        U5      eX!:  a  SU SU S3n[        U5      eXl        X l        X0l        X@l        XPl        X`l        g)a  Create a new TextSplitter.

Args:
    chunk_size: Maximum size of chunks to return
    chunk_overlap: Overlap in characters between chunks
    length_function: Function that measures the length of given chunks
    keep_separator: Whether to keep the separator and where to place it
                    in each corresponding chunk (True='start')
    add_start_index: If `True`, includes chunk's start index in metadata
    strip_whitespace: If `True`, strips whitespace from the start and end of
                      every document
r   zchunk_size must be > 0, got z chunk_overlap must be >= 0, got zGot a larger chunk overlap (z) than chunk size (z), should be smaller.N)
ValueError_chunk_size_chunk_overlap_length_function_keep_separator_add_start_index_strip_whitespace)self
chunk_sizechunk_overlaplength_functionkeep_separatoradd_start_indexstrip_whitespacemsgs           m/home/dmtnaga/Documents/work/airagagent/rag_env/lib/python3.13/site-packages/langchain_text_splitters/base.py__init__TextSplitter.__init__/   s    * ?0=CS/!14]ODCS/!%.}o ><46  S/!%+ /- /!1    c                    g)z$Split text into multiple components.N )r$   texts     r,   
split_textTextSplitter.split_textW   s    r/   Nc           	        U=(       d    0 /[        U5      -  n/ n[        U5       H  u  pVSnSnU R                  U5       H  n	[        R                  " X5   5      n
U R
                  (       a<  Xx-   U R                  -
  nUR                  U	[        SU5      5      nXzS'   [        U	5      n[        XS9nUR                  U5        M     M     U$ )z&Create documents from a list of texts.r   start_index)page_contentmetadata)len	enumerater3   copydeepcopyr"   r   findmaxr   append)r$   texts	metadatas
metadatas_	documentsir2   indexprevious_chunk_lenchunkr8   offsetnew_docs                r,   create_documentsTextSplitter.create_documents[   s     32$U"3
	 'GAE!".==7(("7$:M:MMF IIeSF^<E.3]+),U&"I  ) / ( r/   c                    / / p2U H9  nUR                  UR                  5        UR                  UR                  5        M;     U R                  X#S9$ )zSplit documents.)rA   )r?   r7   r8   rJ   )r$   rC   r@   rA   docs        r,   split_documentsTextSplitter.split_documentso   sM    ryCLL))*S\\*  $$U$@@r/   c                |    UR                  U5      nU R                  (       a  UR                  5       nU=(       d    S $ N)joinr#   strip)r$   docs	separatorr2   s       r,   
_join_docsTextSplitter._join_docsw   s.    ~~d#!!::<D|tr/   c                p   U R                  U5      n/ n/ nSnU GHr  nU R                  U5      nXh-   [        U5      S:  a  UOS-   U R                  :  Ga  X`R                  :  a!  [        R	                  SUU R                  5        [        U5      S:  a  U R                  XR5      n	U	b  UR                  U	5        X`R                  :  d,  Xh-   [        U5      S:  a  UOS-   U R                  :  at  US:  an  X`R                  US   5      [        U5      S:  a  UOS-   -  nUSS  nX`R                  :  a  M@  Xh-   [        U5      S:  a  UOS-   U R                  :  a  US:  a  Mn  UR                  U5        Xh[        U5      S:  a  UOS-   -  nGMu     U R                  XR5      n	U	b  UR                  U	5        U$ )Nr   zACreated a chunk of size %d, which is longer than the specified %d   )r    r9   r   loggerwarningrV   r?   r   )
r$   splitsrU   separator_lenrT   current_doctotaldlen_rM   s
             r,   _merge_splitsTextSplitter._merge_splits}   s    --i8!#A((+D[1AA1E1M""# +++NN'((	 {#a'//+ACC(  "5"55[9IA9MSTU**+!AI!6!6{1~!F-0-=-AMq"  '2!"o  "5"55[9IA9MSTU**+!AI q!c+.>.B]JJE= > ook5?KKr/   c                   ^ [         (       d  Sn[        U5      e[        T[        5      (       d  Sn[        U5      eSU4S jjnU " SSU0UD6$ )z>Text splitter that uses HuggingFace tokenizer to count length.z`Could not import transformers python package. Please install it with `pip install transformers`.zATokenizer received was not an instance of PreTrainedTokenizerBasec                8   > [        TR                  U 5      5      $ rQ   )r9   tokenizer2   	tokenizers    r,   _huggingface_tokenizer_lengthNTextSplitter.from_huggingface_tokenizer.<locals>._huggingface_tokenizer_length   s    y))$/00r/   r'   r2   strreturnintr1   )_HAS_TRANSFORMERSr   
isinstancer   )clsrh   kwargsr+   ri   s    `   r,   from_huggingface_tokenizer'TextSplitter.from_huggingface_tokenizer   sZ    
 ! E  S/!)%<==UCS/!	1 K#@KFKKr/   gpt2allc                  ^^^	 [         (       d  Sn[        U5      eUb  [        R                  " U5      m	O[        R                  " U5      m	SUUU	4S jjn[        U [        5      (       a  UUTTS.n0 UEUEnU " SSU0UD6$ )z9Text splitter that uses tiktoken encoder to count length.zCould not import tiktoken python package. This is needed in order to calculate max_tokens_for_prompt. Please install it with `pip install tiktoken`.c                8   > [        TR                  U TTS95      $ N)allowed_specialdisallowed_special)r9   encode)r2   rz   r{   encs    r,   _tiktoken_encoder=TextSplitter.from_tiktoken_encoder.<locals>._tiktoken_encoder   s*    

$3'9   r/   )encoding_name
model_namerz   r{   r'   rk   r1   )_HAS_TIKTOKENImportErrortiktokenencoding_for_modelget_encoding
issubclassTokenTextSplitter)
rq   r   r   rz   r{   rr   r+   r~   extra_kwargsr}   s
      ``    @r,   from_tiktoken_encoder"TextSplitter.from_tiktoken_encoder   s     }A 
 c""!--j9C''6C	 	 c,--!.(#2&8	L 0/,/F?#4???r/   c                6    U R                  [        U5      5      $ )z2Transform sequence of documents by splitting them.)rN   list)r$   rC   rr   s      r,   transform_documents TextSplitter.transform_documents   s    
 ##DO44r/   )r"   r   r   r!   r    r#   )r%   rn   r&   rn   r'   zCallable[[str], int]r(   zbool | Literal['start', 'end']r)   boolr*   r   rm   Noner2   rl   rm   	list[str]rQ   )r@   r   rA   zlist[dict[Any, Any]] | Nonerm   list[Document])rC   zIterable[Document]rm   r   )rT   r   rU   rl   rm   
str | None)r\   zIterable[str]rU   rl   rm   r   )rh   r   rr   r	   rm   r   )r   rl   r   r   rz   !Literal['all'] | AbstractSet[str]r{    Literal['all'] | Collection[str]rr   r	   rm   r   )rC   Sequence[Document]rr   r	   rm   r   )__name__
__module____qualname____firstlineno____doc__r9   r-   r   r3   rJ   rN   rV   rb   classmethodrs   setr   r   r   __static_attributes__r1   r/   r,   r   r   ,   sp   3  039> %!%&2&2 &2 .	&2
 7&2 &2 &2 
&2P 3 3 JN+F	(A*X L/L;>L	L L(  $!%=@U?D(@(@ (@ ;	(@
 =(@ (@ 
(@ (@T 5+57:5	5 5r/   c                  f   ^  \ rS rSrSrSS\" 5       S4           S	U 4S jjjrS
S jrSrU =r	$ )r      z/Splitting text to tokens using model tokenizer.ru   Nrv   c                   > [         TU ]  " S0 UD6  [        (       d  Sn[        U5      eUb  [        R
                  " U5      nO[        R                  " U5      nXpl        X0l        X@l	        g)zCreate a new TextSplitter.zCould not import tiktoken python package. This is needed in order to for TokenTextSplitter. Please install it with `pip install tiktoken`.Nr1   )
superr-   r   r   r   r   r   
_tokenizer_allowed_special_disallowed_special)	r$   r   r   rz   r{   rr   r+   r}   	__class__s	           r,   r-   TokenTextSplitter.__init__   si     	"6"}A 
 c""!--j9C''6C /#5 r/   c                   ^  SU 4S jjn[        T R                  T R                  T R                  R                  US9n[        XS9$ )a  Splits the input text into smaller chunks based on tokenization.

This method uses a custom tokenizer configuration to encode the input text
into tokens, processes the tokens in chunks of a specified size with overlap,
and decodes them back into text chunks. The splitting is performed using the
`split_text_on_tokens` function.

Args:
    text: The input text to be split into smaller chunks.

Returns:
    A list of text chunks, where each chunk is derived from a portion
    of the input text based on the tokenization and chunking rules.
c                b   > TR                   R                  U TR                  TR                  S9$ ry   )r   r|   r   r   )_textr$   s    r,   _encode-TokenTextSplitter.split_text.<locals>._encode  s4    ??)) $ 5 5#'#;#; *  r/   )r&   tokens_per_chunkdecoder|   rg   )r   rl   rm   z	list[int])	Tokenizerr   r   r   r   split_text_on_tokens)r$   r2   r   rh   s   `   r,   r3   TokenTextSplitter.split_text  sC     	 --!--??))	
	 $CCr/   )r   r   r   )r   rl   r   r   rz   r   r{   r   rr   r	   rm   r   r   )
r   r   r   r   r   r   r-   r3   r   __classcell__)r   s   @r,   r   r      sf    9 $!%=@U?D66 6 ;	6
 =6 6 
6 64D Dr/   r   c                      \ rS rSrSrSrSrSrSrSr	Sr
S	rS
rSrSrSrSrSrSrSrSrSrSrSrSrSrSrSrSrSrSrSrSr g) Languagei/  z"Enum of the programming languages.cppgojavakotlinjstsphpprotopythonrstrubyrustscalaswiftmarkdownlatexhtmlsolcsharpcobolcluaperlhaskellelixir
powershellvisualbasic6r1   N)!r   r   r   r   r   CPPGOJAVAKOTLINJSr   PHPPROTOPYTHONRSTRUBYRUSTSCALASWIFTMARKDOWNLATEXHTMLSOLCSHARPCOBOLCLUAPERLHASKELLELIXIR
POWERSHELLVISUALBASIC6r   r1   r/   r,   r   r   /  s    ,
C	BDF	B	B
CEF
CDDEEHED
CFEA
CDGFJ!Lr/   r   )frozenc                  H    \ rS rSr% SrS\S'    S\S'    S\S'    S\S	'   S
rg)r   iO  zTokenizer data class.rn   r&   r   zCallable[[list[int]], str]r   zCallable[[str], list[int]]r|   r1   N)r   r   r   r   r   __annotations__r   r1   r/   r,   r   r   O  s)    *,&&=&&=r/   r   c                   / nUR                  U 5      nSnUR                  UR                  ::  a  Sn[        U5      eU[	        U5      :  a  [        XAR                  -   [	        U5      5      nX4U nU(       d   U$ UR                  U5      nU(       a  UR                  U5        U[	        U5      :X  a   U$ XAR                  UR                  -
  -  nU[	        U5      :  a  M  U$ )z6Split incoming text and return chunks using tokenizer.r   z3tokens_per_chunk must be greater than chunk_overlap)r|   r   r&   r   r9   minr   r?   )	r2   rh   r\   	input_ids	start_idxr+   cur_idx	chunk_idsdecodeds	            r,   r   r   ]  s    F  &II!!Y%<%<<Co
c)n
$i"<"<<c)nM0	 M ""9-MM'"c)n$M 	//)2I2III	 c)n
$ Mr/   )r2   rl   rh   r   rm   r   ).r   
__future__r   r;   loggingabcr   r   dataclassesr   enumr   typingr   r	   r
   r   langchain_core.documentsr   r   typing_extensionsr   r   collections.abcr   r   r   r   r   AbstractSetr   r   r   $transformers.tokenization_utils_baser   ro   	getLoggerr   rZ   r   r   r   rl   r   r   r   r1   r/   r,   <module>r     s    # "   # !   G ,HH2ML 
		8	$T(B5*C B5J;D ;D|"sD "@ $
> 
> 
>A
  M  s$   B? "C ?C
	C
CC