
    iJ                        S r SSKJr  SSKrSSKJrJr  SSKJr  SSK	J
r
  SSKJr   " S S	\5      r " S
 S5      r " S S\5      r " S S\5      r " S S5      rg)zMarkdown text splitters.    )annotationsN)Any	TypedDict)Document)Language)RecursiveCharacterTextSplitterc                  0   ^  \ rS rSrSrSU 4S jjrSrU =r$ )MarkdownTextSplitter   z=Attempts to split the text along Markdown-formatted headings.c                h   > U R                  [        R                  5      n[        TU ]  " SSU0UD6  g)z"Initialize a MarkdownTextSplitter.
separatorsN )get_separators_for_languager   MARKDOWNsuper__init__)selfkwargsr   	__class__s      q/home/dmtnaga/Documents/work/airagagent/rag_env/lib/python3.13/site-packages/langchain_text_splitters/markdown.pyr   MarkdownTextSplitter.__init__   s.    55h6G6GH
9J9&9    r   )r   r   returnNone)__name__
__module____qualname____firstlineno____doc__r   __static_attributes____classcell__)r   s   @r   r
   r
      s    G: :r   r
   c                  \    \ rS rSrSr   S	         S
S jjrSS jrSS jrSS jrSr	g)MarkdownHeaderTextSplitter   z4Splitting markdown files based on specified headers.Nc                ^    X l         [        US SS9U l        X0l        U=(       d    0 U l        g)a  Create a new MarkdownHeaderTextSplitter.

Args:
    headers_to_split_on: Headers we want to track
    return_each_line: Return each line w/ associated headers
    strip_headers: Strip split headers from the content of the chunk
    custom_header_patterns: Optional dict mapping header patterns to their
        levels. For example: {"**": 1, "***": 2} to treat **Header** as
        level 1 and ***Header*** as level 2 headers.
c                    [        U S   5      $ )Nr   )len)splits    r   <lambda>5MarkdownHeaderTextSplitter.__init__.<locals>.<lambda>0   s    3uQx=r   T)keyreverseN)return_each_linesortedheaders_to_split_onstrip_headerscustom_header_patterns)r   r/   r-   r0   r1   s        r   r   #MarkdownHeaderTextSplitter.__init__   s6    $ !1 $*%@$$
  +&<&B#r   c           	     N  ^ TU R                   ;  a  g[        R                  " T5      nSU SU SU SU S3	n[        R                  " XA5      nU(       aQ  UR	                  S5      R                  5       nU(       a+  [        U4S jUR                  S	S
5       5       5      (       d  gg)zCheck if line matches a custom header pattern.

Args:
    line: The line to check
    sep: The separator pattern to match

Returns:
    True if the line matches the custom pattern format
F^z(?!z
)(.+?)(?<!)$   c              3  ,   >#    U  H	  oT;   v   M     g 7fNr   ).0cseps     r   	<genexpr>?MarkdownHeaderTextSplitter._is_custom_header.<locals>.<genexpr>R   s     "N5M85Ms     T)r1   reescapematchgroupstripallreplace)r   liner<   escaped_seppatternrC   contents     `    r   _is_custom_header,MarkdownHeaderTextSplitter._is_custom_header7   s     d111 iin }C}J{m1[MQRS 	 'kk!n**,G s"NW__S"5M"NNNr   c                   / nU H  nU(       a'  US   S   US   :X  a  US   S==   SUS   -   -  ss'   M1  U(       a  US   S   US   :w  av  [        US   S   5      [        US   5      :  aU  US   S   R                  S5      S   S   S:X  a4  U R                  (       d#  US   S==   SUS   -   -  ss'   US   US   S'   M  UR                  U5        M     U Vs/ s H  n[	        US   US   S9PM     sn$ s  snf )	zlCombine lines with common metadata into chunks.

Args:
    lines: Line of text / associated header metadata
metadatarK   z  

r   #page_contentrP   )r'   r(   r0   appendr   )r   linesaggregated_chunksrH   chunks        r   aggregate_lines_to_chunks4MarkdownHeaderTextSplitter.aggregate_lines_to_chunksV   s8    -/D!%b)*5j9II
 ""%i0FT)_4LL0!%b)*5j9II)"-j9:SjAQ=RR%b))4::4@DQG3N** ""%i0FT)_4LL0484D!"%j1 "((.9 @ +
* %	"2U:=NO*
 	
 
s   C8c                J   UR                  S5      n/ n/ n0 n/ n0 nSnSn	U GH  n
U
R                  5       nSR                  [        [        R
                  U5      5      nU(       dK  UR                  S5      (       a  UR                  S5      S:X  a  SnSn	O5UR                  S5      (       a  SnSn	OUR                  U	5      (       a  SnSn	U(       a  UR                  U5        M  U R                   GH  u  pUR                  U5      =(       a/    [        U5      [        U5      :H  =(       d    U[        U5         S:H  nU R                  X5      nU(       d	  U(       d  Mn  Ub  XR                  ;   a  U R                  U   nOUR                  S
5      nU(       aN  US   S   U:  aB  UR                  5       nUS   U;   a  UR                  US   5        U(       a  US   S   U:  a  MB  U(       a'  U[        U5      [        U5      *  R                  5       nOU[        U5      S	 R                  5       nUUUS.nUR                  U5        US   X}'   U(       aA  UR                  SR                  U5      UR                  5       S.5        UR                  5         U R                   (       d  UR                  U5          Oc   U(       a  UR                  U5        OHU(       aA  UR                  SR                  U5      UR                  5       S.5        UR                  5         UR                  5       nGM     U(       a#  UR                  SR                  U5      US.5        U R"                  (       d  U R%                  U5      $ U Vs/ s H  n['        US   US   S9PM     sn$ s  snf )z4Split markdown file.

Args:
    text: Markdown file
rQ   Fr@   z```r7   Tz~~~r?   NrR   rO   levelname)r\   r]   datar^   )rK   rP   rK   rP   rS   )r(   rE   joinfilterstrisprintable
startswithcountrU   r/   r'   rL   r1   popcopyclearr0   r-   rY   r   )r   textrV   lines_with_metadatacurrent_contentcurrent_metadataheader_stackinitial_metadatain_code_blockopening_fencerH   stripped_liner<   r]   is_standard_headeris_custom_headercurrent_header_levelpopped_headerheader_textheaderrX   s                        r   
split_text%MarkdownHeaderTextSplitter.split_text   s    

4 .0%'+-)++-D JJLM GGF3??M$JKM  ++E22}7J7J57QUV7V$(M$)M"--e44$(M$)M))-88 % "&&}5 "55	%2%=%=c%B & &#c(2TmCH6MQT6T #
 $(#9#9-#M  &)9)9'"="==373N3Ns3S03699S>0 ) ,R 0 9=Q Q -9,<,<,>M  -V48HH 0 4 4]65J K ) ,R 0 9=Q Q , +8CCH9*M*S*S*UK +8C
*C*I*I*KK &:$($/.
 %++F317(. '+22+/99_+E,<,A,A,C (--/--'..}=C 6F !#**=9$'..'+yy'A(8(=(=(? $))+/446I L &&#yy9 0 $$112EFF -
, %	"2U:=NO,
 	
 
s   N )r1   r/   r-   r0   )FTN)
r/   zlist[tuple[str, str]]r-   boolr0   ry   r1   zdict[str, int] | Noner   r   )rH   ra   r<   ra   r   ry   )rV   zlist[LineType]r   list[Document]rh   ra   r   rz   )
r   r   r   r   r   r   rL   rY   rw   r    r   r   r   r#   r#      s\    >
 "'"8<C2C C 	C
 !6C 
C:>)
VI
r   r#   c                  .    \ rS rSr% SrS\S'   S\S'   Srg)	LineTypei  zLine type as typed dict.zdict[str, str]rP   ra   rK   r   Nr   r   r   r   r   __annotations__r    r   r   r   r}   r}     s    "Lr   r}   c                  8    \ rS rSr% SrS\S'   S\S'   S\S'   Srg	)

HeaderTypei  zHeader type as typed dict.intr\   ra   r]   r^   r   Nr~   r   r   r   r   r     s    $J
I
Ir   r   c                      \ rS rSrSrSSSSSSS	.r   S       SS jjrSS jrSS jrSS jr	SS jr
SS jrSS jrSS jrSrg
)&ExperimentalMarkdownSyntaxTextSplitteri  a0  An experimental text splitter for handling Markdown syntax.

This splitter aims to retain the exact whitespace of the original text while
extracting structured metadata, such as headers. It is a re-implementation of the
MarkdownHeaderTextSplitter with notable changes to the approach and
additional features.

Key Features:

* Retains the original whitespace and formatting of the Markdown text.
* Extracts headers, code blocks, and horizontal rules as metadata.
* Splits out code blocks and includes the language in the "Code" metadata key.
* Splits text on horizontal rules (`---`) as well.
* Defaults to sensible splitting behavior, which can be overridden using the
    `headers_to_split_on` parameter.

Example:
```python
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
]
splitter = ExperimentalMarkdownSyntaxTextSplitter(
    headers_to_split_on=headers_to_split_on
)
chunks = splitter.split(text)
for chunk in chunks:
    print(chunk)
```

This class is currently experimental and subject to change based on feedback and
further development.
zHeader 1zHeader 2zHeader 3zHeader 4zHeader 5zHeader 6)rR   z##z###z####z#####z######Nc                    / U l         [        SS9U l        / U l        X0l        U(       a  [        U5      U l        OU R                  U l        X l        g)a  Initialize the text splitter with header splitting and formatting options.

This constructor sets up the required configuration for splitting text into
chunks based on specified headers and formatting preferences.

Args:
    headers_to_split_on (Union[list[tuple[str, str]], None]):
        A list of tuples, where each tuple contains a header tag (e.g., "h1")
        and its corresponding metadata key. If `None`, default headers are used.
    return_each_line (bool):
        Whether to return each line as an individual chunk.
        Defaults to `False`, which aggregates lines into larger chunks.
    strip_headers (bool):
        Whether to exclude headers from the resulting chunks.
r@   rT   N)	chunksr   current_chunkcurrent_header_stackr0   dictsplittable_headersDEFAULT_HEADER_KEYSr-   )r   r/   r-   r0   s       r   r   /ExperimentalMarkdownSyntaxTextSplitter.__init__H  sK    * ')%26;=!*&*+>&?D#&*&>&>D# 0r   c           
        U R                   R                  5         [        SS9U l        U R                  R                  5         UR                  SS9nU(       Gay  UR                  S5      nU R                  U5      nU R                  U5      nU R                  U5      nU(       a}  U R                  5         U R                  (       d  U R                  =R                  U-  sl        [        UR                  S5      5      nUR                  S5      nU R                  Xx5        OU(       ai  U R                  5         U R!                  X25      U R                  l        UR                  S5      U R                  R"                  S'   U R                  5         O7U(       a  U R                  5         OU R                  =R                  U-  sl        U(       a  GMy  U R                  5         U R$                  (       ar  U R                    V	V
s/ s HX  n	U	R                  R                  5         H6  n
U
(       d  M  U
R'                  5       (       a  M#  [        XR"                  S	9PM8     MZ     sn
n	$ U R                   $ s  sn
n	f )
a  Split the input text into structured chunks.

This method processes the input text line by line, identifying and handling
specific patterns such as headers, code blocks, and horizontal rules to
split it into structured chunks based on headers, code blocks, and
horizontal rules.

Args:
    text: The input text to be split into chunks.

Returns:
    A list of `Document` objects representing the structured
    chunks of the input text. If `return_each_line` is enabled, each line
    is returned as a separate `Document`.
r@   r   T)keependsr   r7      CoderS   )r   rg   r   r   r   
splitlinesre   _match_header_match_code_match_horz_complete_chunk_docr0   rT   r'   rD   _resolve_header_stack_resolve_code_chunkrP   r-   isspace)r   rh   	raw_linesraw_lineheader_match
code_match
horz_matchheader_depthru   rX   rH   s              r   rw   1ExperimentalMarkdownSyntaxTextSplitter.split_texth  s   " 	%26!!'')OOTO2	 }}Q'H--h7L))(3J))(3J((*))&&33x?3  #<#5#5a#89*003**<E((*262J2J3""/ 7A6F6Fq6I""++F3((*((*""//8;/3 i6 	  "    "[[(E!..99;D E !% Ed^^D; E(  {{s   )I20I2I2c                    [        U R                  5       H#  u  nu  pEXA:  d  M  U R                  S U U l          O   U R                  R                  X45        g r9   )	enumerater   rU   )r   r   ru   idepth_s         r   r   <ExperimentalMarkdownSyntaxTextSplitter._resolve_header_stack  sU    &t'@'@AMAz$,0,E,Ebq,I)	 B
 	!!((,)DEr   c                    UnU(       a6  UR                  S5      nX4-  nU R                  U5      (       a  U$ U(       a  M6  g)Nr   r@   )re   r   )r   current_liner   rX   r   s        r   r   :ExperimentalMarkdownSyntaxTextSplitter._resolve_code_chunk  sB     }}Q'HE))	 i
 r   c                d   U R                   R                  nU(       a  UR                  5       (       dp  U R                   H;  u  p#U R                  R                  SU-  5      nX0R                   R                  U'   M=     U R                  R                  U R                   5        [        SS9U l         g )NrR   r@   r   )
r   rT   r   r   r   getrP   r   rU   r   )r   chunk_contentr   value
header_keys        r   r   :ExperimentalMarkdownSyntaxTextSplitter._complete_chunk_doc  s    **77!6!6!8!8 $ 9 9!4488uE
:?""++J7 !: KKt112%26r   c                    [         R                  " SU5      nU(       a!  UR                  S5      U R                  ;   a  U$ g )Nz^(#{1,6}) (.*)r7   )rA   rC   rD   r   )r   rH   rC   s      r   r   4ExperimentalMarkdownSyntaxTextSplitter._match_header  s3    *D1U[[^t'>'>>Lr   c                ~    S Vs/ s H  n[         R                  " X!5      PM     nn[        S U 5       S 5      $ s  snf )N)z^```(.*)z^~~~(.*)c              3  6   #    U  H  o(       d  M  Uv   M     g 7fr9   r   r:   rC   s     r   r=   EExperimentalMarkdownSyntaxTextSplitter._match_code.<locals>.<genexpr>       9u5UU   
	rA   rC   nextr   rH   rulematchess       r   r   2ExperimentalMarkdownSyntaxTextSplitter._match_code  s:    4NO4ND288D'4NO994@@ P    :c                ~    S Vs/ s H  n[         R                  " X!5      PM     nn[        S U 5       S 5      $ s  snf )N)z
^\*\*\*+\nz^---+\nz^___+\nc              3  6   #    U  H  o(       d  M  Uv   M     g 7fr9   r   r   s     r   r=   EExperimentalMarkdownSyntaxTextSplitter._match_horz.<locals>.<genexpr>  r   r   r   r   s       r   r   2ExperimentalMarkdownSyntaxTextSplitter._match_horz  sB    -T
-TTBHHT -T 	 
 994@@
r   )r   r   r   r-   r   r0   )NFT)r/   zlist[tuple[str, str]] | Noner-   ry   r0   ry   r   r   r{   )r   r   ru   ra   r   r   )r   ra   r   z	list[str]r   ra   )r   r   )rH   ra   r   zre.Match[str] | None)r   r   r   r   r   r   r   rw   r   r   r   r   r   r   r    r   r   r   r   r     s     F  =A!&"	191 1 	1
 
1@=~F
7AAr   r   )r   
__future__r   rA   typingr   r   langchain_core.documentsr   langchain_text_splitters.baser   "langchain_text_splitters.characterr   r
   r#   r}   r   r   r   r   r   <module>r      sZ     " 	 ! - 2 M:9 :s
 s
ly  xA xAr   