
    #i                     $   S SK r S SKrS SKrS SKJr  S SKrS SKJr  S SK	J
r
  S SKJrJrJrJrJrJrJrJrJrJrJrJrJrJrJr   S SKrSr S SKJr  Sr  " S
 S5      r#g! \ a    Sr Nf = f! \ a     S S	K!J"r"  Sr  N*! \ a    Sr   N5f = ff = f)    N)Path)RecursiveCharacterTextSplitter)SentenceTransformer)PDF_DIRDOCUMENTS_DIRMETADATA_DIR
CHUNK_SIZECHUNK_OVERLAPCHUNK_MIN_SIZECHUNK_MAX_SIZESENTENCE_OVERLAPMIN_SENTENCES_PER_CHUNKEMBEDDING_MODELSEMANTIC_CHUNKING_ENABLEDSEMANTIC_MAX_CHARSSEMANTIC_SIMILARITY_THRESHOLDTECHNICAL_TERMSMEASUREMENT_PATTERNSTF)extract_text	PdfReaderc                      \ rS rSrS rS rS rS rS rS\	S\
4S	 jrS
\
S\
S\
4S jrS\
S\4S jrS rS rS rS
\
S\
S\
4S jrS
\
4S jrS
\
S\4S jrS
\
S\S\4S jrS
\
S\4S jrS
\
S\4S jrS,S
\
S\
S\S\S\4
S jjrS-S
\
S\S\4S jjrS-S \
S\S\4S! jjrS.S"\4S# jjr S/S$\
S%\
S&\S'\S(\
S)\
S\
S\S\4S* jjrS+r g)0PDFProcessor.   c           	          [        [        [        S-  5      [        5      n[	        [        U[
        / SQS9U l        SU l        [        U l	        [        U l        / SQU l        S S SSSSSSS.U l        g )N皙?)


z.  )
chunk_sizechunk_overlaplength_function
separatorsF)	u   copyright\s+©?\s*\d{4}zall rights reservedzpage\s+\d+\s+of\s+\d+confidentialdraftz^\s*$z^\d+$z[a-f0-9]{32,}z	^[\s\W]*$r   )
start_timeend_timechunks_createdchunks_filteredsections_detectedtables_detectedlists_detectedmemory_peak_mb)maxintr	   r
   r   lentext_splittersemantic_chunking_enabledr   semantic_max_charsr   semantic_similarity_thresholdboilerplate_patternsprocessing_stats)selfoverlap_sizes     :/var/www/html/leadgen/backtest/airagagent/pdf_processor.py__init__PDFProcessor.__init__/   sy    3zC/0-@;!&0	
 */&"4-J*
%
!  !" 	!
    c                     [        US5       n[        R                  " UR                  5       5      R	                  5       sSSS5        $ ! , (       d  f       g= f)z/Generate hash of file content to detect changesrbN)openhashlibmd5read	hexdigest)r7   	file_pathfs      r9   get_file_hashPDFProcessor.get_file_hashV   s5    )T"a;;qvvx(224 #""s   2A		
Ac                     [         S-  nUR                  5       (       a,  [        US5       n[        R                  " U5      sSSS5        $ 0 $ ! , (       d  f       0 $ = f)z%Get record of already processed filesprocessed_files.jsonrN)r   existsr?   jsonload)r7   metadata_filerE   s      r9   get_processed_files PDFProcessor.get_processed_files[   sN    $'==!!mS)Qyy| *)	 *)	s   A
Ac                    UR                   nU R                  5       nU[        U5      [        UR	                  5       R
                  5      S.XT'   [        [        S-  S5       n[        R                  " XVSS9  SSS5        [        [        U5      R                   S3-  n[        US5       n[        R                  " X6SS9  SSS5        g! , (       d  f       NX= f! , (       d  f       g= f)z%Record processed file and save chunks)hashchunk_countprocessed_daterI   w   )indentNz_chunks.json)namerO   r0   strstatst_mtimer?   r   rL   dumpr   r   stem)r7   rD   	file_hashchunksfilenamemetadatarE   
chunk_files           r9   save_processed_file PDFProcessor.save_processed_filec   s    >> ++-v;!).."2";";<
 ,!77=IIh!, > #X(;(;'<L%II
*c"aIIf* #" >=
 #"s   "C
+C

C
C)c                    UR                   R                  5       S:X  ay  [        (       a   U R                  U5      $  [        (       a  [        [        U5      5      nU$ SSK	J
n  U" U5      nSnUR                   H  nX6R                  5       S-   -  nM     U$ UR                   R                  5       S:X  a&   [        USSS9 nUR                  5       sSSS5        $ [        SUR                    35      e! [         a  n[        SU SU 35         SnANSnAff = f! [         a  n[        S	U SU 35         SSK	J
n  U" U5      nSnUR                   H  nX6R                  5       S-   -  nM     Us SnA$ ! [         a%  n[        S
U SU 35        SU 3s SnAs SnA$ SnAff = fSnAff = f! , (       d  f       g= f! [         a_     [        USSS9 nUR                  5       sSSS5        s $ ! , (       d  f        g= f! [         a  n[        SU SU 35        e SnAff = f[         a  n[        SU SU 35        e SnAff = f)z?Extract text from PDF or text files using best available methodz.pdfz'Warning: PyMuPDF extraction failed for z: Nr   r    r   z#Warning: PDF extraction failed for z.Error: Both PDF extraction methods failed for z Error extracting text from PDF: z.txtrJ   zutf-8)encodingzlatin-1z"Warning: Failed to read text file z with latin-1 encoding: zError reading text file zUnsupported file type: )suffixlowerHAS_PYMUPDF_extract_with_pymupdf	ExceptionprintHAS_PDFMINERr   rY   PyPDF2r   pagesr?   rB   UnicodeDecodeError
ValueError)	r7   rD   etextr   readerpagee2rE   s	            r9   extract_text_from_file#PDFProcessor.extract_text_from_filew   sO   !!#v-{V55i@@C<'I7DK 1&y1FD & 1 1 3d :: !-K ##%/)S7;q668 <; 6y7G7G6HIJJ[ ! VCI;bQRPSTUUV   C;I;bLM	C0&y1FD & 1 1 3d :: !-K  CJ9+UWXZW[\]=bTBBBCC <;% iyAQ vvx BAAA  >ykIabcadef  02aSABs   C4  D 9D 7G F1	G 4
D>DD
F.&F)89E71F.7
F&F!F&F)F.!F&&F))F.1
F?;G ?G 
I
HG4(	H1I
4
H	>HH
H'H""H''I
3II
rD   returnc                 2   [         R                  " U5      n/ n U HC  nUR                  S[         R                  [         R                  -  S9nUR                  U5        ME     UR                  5         SR                  U5      $ ! UR                  5         f = f)zBHigh-fidelity extraction using PyMuPDF with basic structural cues.rt   )flagsr   )fitzr?   get_textTEXT_DEHYPHENATETEXT_PRESERVE_WHITESPACEappendclosejoin)r7   rD   docrp   rv   rt   s         r9   rk   "PDFProcessor._extract_with_pymupdf   s    ii	"	}}//$2O2OO %  T"  IIKyy IIKs   A	B Brt   r`   c                 z  ^	^
 UR                  5       m
UR                  5       m	/ SQn/ SQn/ SQn/ SQn[        U	U
4S jU 5       5      [        U	U
4S jU 5       5      [        U	U
4S jU 5       5      [        U	U
4S jU 5       5      S	.n[        UR                  5       5      nUS
:  a  [        XwR                  S9$ g)zw
Detect document type based on content and filename.
Returns: 'technical', 'research', 'legal', 'manual', or 'default'
)specification	technicalapiprotocolstandardmeasurementcalibrationconfiguration	parameterphppmtemperaturevoltagecurrent)abstractintroductionmethodologyresults
conclusion
referencescitationstudyresearch
experiment
hypothesiszpeer-reviewedjournal)whereasherebypursuantstatute
regulation
compliancelegallawact	ordinanceclausesection
subsectionarticle	paragraph)
manualguidetutorialzhow tostep	procedureinstructionuserzgetting startedzquick startc              3   B   >#    U  H  oT;   d  UT;   d  M  S v   M     g7f   N .0indfilename_lower
text_lowers     r9   	<genexpr>5PDFProcessor._detect_document_type.<locals>.<genexpr>   s#     m*>3BSWZ^lWlQQ*>   	c              3   B   >#    U  H  oT;   d  UT;   d  M  S v   M     g7fr   r   r   s     r9   r   r      s#     k)<#z@QUX\jUjAA)<r   c              3   B   >#    U  H  oT;   d  UT;   d  M  S v   M     g7fr   r   r   s     r9   r   r      s"     e&6s:KsVdOd&6r   c              3   B   >#    U  H  oT;   d  UT;   d  M  S v   M     g7fr   r   r   s     r9   r   r      s#     g'8:<MQTXfQf!!'8r   )r   r   r   r   rV   )keydefault)ri   sumr.   valuesget)r7   rt   r`   technical_indicatorsresearch_indicatorslegal_indicatorsmanual_indicatorsscores	max_scorer   r   s            @@r9   _detect_document_type"PDFProcessor._detect_document_type   s    
 ZZ\
!) 



 m*>mmk)<kke&6eeg'8gg	
 (	>v::..r<   document_typec                 <    [         R                  U[         S   5      $ )z8Get chunking configuration for a specific document type.r   )DOCUMENT_TYPE_CONFIGSr   )r7   r   s     r9   _get_chunking_config!PDFProcessor._get_chunking_config   s    $((8Mi8XYYr<   c                 
  ^  SSK nSSKn SSKnSnUR                  nT R                  U5      nT R                  5       nXh;   a  X   S   U:X  a  [        SU S35        g[        SU S	35        U(       a  WR                  UR                  5       5      n	UR                  5       SSSSSSSS
S.	T l
        T R                  U5      n
T R                  X5      nT R                  X5      nUT R                  S'   [        SU 35        T R                  U5      nT R                  X5      nSU 3n[!        U5      T R                  S'   U(       d(  [        SU S35        T R#                  U
SSU5      nSU 3nU(       d,  [        SU S35        T R$                  R'                  U
5      nSn/ n[)        U5       HV  u  nnT R+                  UU5      (       a  UR-                  U5        M0  T R                  S==   S-  ss'   [        SU S35        MX     / n[)        U5       H  u  nnSnSnUR/                  S5      nU(       a  [!        US   R1                  5       5      S:  ar  US   R1                  5       nUR3                  5       (       a  [!        U5      S:  a  UnSnO6[4        R6                  " SU5      (       a  [4        R8                  " SSU5      nSnT R;                  UUU[!        U5      XUU5      nUR-                  UUS.5        M     UR                  5       T R                  S '   U(       a.  W	R=                  5       R>                  S!-  S!-  T R                  S"'   O: SSK nURC                  URD                  5      RF                  S!-  T R                  S"'   U Vs/ s H  n[!        US#   5      PM     nnU(       a  [I        U5      [!        U5      -  T R                  S$'   [K        U5      T R                  S%'   [M        U5      T R                  S&'   [I        U 4S' jU 5       5      [!        U5      -  S(-  T R                  S)'   T RO                  XU5        T R                  S    T R                  S*   -
  n[        S+U S,[!        U5       S-[!        U5       S.35        [        S/U S0US1 S2T R                  S"   S3 S435        [        S5T R                  RQ                  S$S5      S6 S7T R                  RQ                  S%S5       S8T R                  RQ                  S&S5       35        U$ ! [         a    Sn GNf = f!   ST R                  S"'    GN= fs  snf )9z3Process a single PDF file with Phase 3 enhancementsr   NTFrR   u   ✓ z already processed, skipping...zProcessing z...r   )	r&   r'   r(   r)   r*   r+   r,   r-   r   r   z  Detected document type: structure_aware_r(   z-Warning: Structure-aware chunking failed for z, using sentence-basedrf   sentence_based_z&Warning: Sentence chunking failed for z, using recursive fallbackrecursive_fallbackr)   r   z  Filtered out chunk z (boilerplate/low quality)r   P      z	^\d+\.\s+)contentra   r'   i   r-   r   chunk_size_avgchunk_size_minchunk_size_maxc              3   L   >#    U  H  oTR                   S    -
  S-  v   M     g7f)r   rV   N)r6   )r   xr7   s     r9   r   +PDFProcessor.process_pdf.<locals>.<genexpr>m  s4       ;B  vApq@U@UVf@g<gjk;k  vAs   !$      ?chunk_size_stdr&   u   ✓ Processed z into z chunks (filtered from )z  Type: z, Time: z.2fzs, Memory: z.1fMBz  Chunk sizes: avg=z.0fz, min=z, max=))timeospsutilImportErrorrX   rF   rO   rm   Processgetpidr6   rx   _extract_document_titler   r   _structure_aware_chunk_textr0   _sentence_based_chunk_textr1   
split_text	enumerate_is_valid_chunkr   splitstripisupperrematchsub_extract_chunk_metadatamemory_inforssresource	getrusageRUSAGE_SELF	ru_maxrssr   minr.   rc   r   )r7   rD   r   r   r   
HAS_PSUTILr`   r^   processed_filesprocessrt   document_titler   
doc_configr_   chunk_methodfiltered_chunksichunk
chunk_datasection_titlesection_levellines
first_linera   r   chunk_sizesdurations   `                           r9   process_pdfPDFProcessor.process_pdf   s   	J >>&&y1	224 &(0I=XJ&EFGH:S)* nnRYY[1G))+ !" &
!
 **9555dE 224B1>o.*=/:; ..}=
 11$C)-925f+./ A(Kabc44T2q*MF,]O<L :8*D^_`''2248F/L !&)HAu##E:66&&u-%%&78A=8-aS0JKL * 
!/2HAuMMKK%EU1X^^-.3"1X^^-
%%''C
Oq,@$.M$%MXXlJ77$&FF<Z$HM$%M33xC$8.}H  $ ) 34 -1IIKj)6=6I6I6K6O6ORV6VY]6]D!!"23<:B:L:LXMaMa:b:l:los:s%%&67
 ;EE*s5+,*E69+6F[IY6YD!!"2369+6FD!!"2369+6FD!!"237:  ;B  vA  ;B  8B  EH  IT  EU  8U  X[  7[D!!"23  zB ((4t7L7L\7ZZxjs:.??VWZ[aWbVccdefx~[I^I^_oIpqtHuuwxy#D$9$9$=$=>NPQ$RSV#W X**../?CD E**../?CDF 	G {  	J	P<:;%%&67 Fs#   T 9T- UT*)T*-Uc                 n   SS/n/ nU H1  nUR                  [        [        R                  " U5      5      5        M3     U R	                  5       n/ nU H_  nUR
                  U;  a  UR                  U5        M&  U R                  U5      nXFR
                     S   U:w  d  MN  UR                  U5        Ma     U$ )z<Get list of files (PDFs and text files) that need processingz*.pdfz*.txtrR   )extendlistr   globrO   rX   r   rF   )r7   supported_extensionsfilesextr  	new_filesrD   current_hashs           r9   get_new_filesPDFProcessor.get_new_files{  s     '1'CLLgll3/01 ( 224	I~~_4  +  $11)<">>26:lJ$$Y/  r<   c                     U R                  5       n/ nU(       d  [        S5        / $ [        S[        U5       S35        U H.  nU R                  U5      nU(       d  M  UR	                  U5        M0     U$ )z!Process all new or modified fileszNo new files to process.zFound z! new/modified files to process...)r  rm   r0   r  r  )r7   r  
all_chunksrD   r_   s        r9   process_all_newPDFProcessor.process_all_new  sr    &&(	
,-Is9~&&GHI"I%%i0Fv!!&) #
 r<   fallbackc                 z  ^ U(       d  [        U5      R                  $ UR                  5        H  mTR                  5       mT(       d  M  [	        T5      S:  d  [	        TR                  5       5      S:X  a  MJ  [        U4S jS 5       5      (       a  Mf  [        S T 5       5      S:  a  M~  TR                  5       T:X  a  [        S T 5       5      S:  a  M  [        R                  " S	S
T5      n[	        UR                  5       5      S:  a  M  USS s  $    [        U5      R                  R                  SS5      $ )z6Derive a human friendly title from the extracted text.   r   c              3   H   >#    U  H  oTR                  5       ;   v   M     g 7fN)ri   )r   tokenlines     r9   r   7PDFProcessor._extract_document_title.<locals>.<genexpr>  s     s6rUDJJL(6rs   ")libraryzdue date	copyrightisbnwwwemailc              3   @   #    U  H  oR                  5       v   M     g 7fr(  )isdigitr   cs     r9   r   r+    s     -199;;      c              3   @   #    U  H  oR                  5       v   M     g 7fr(  )isalphar3  s     r9   r   r+    s     +FAIIKKr5  r   z[^A-Za-z0-9\s\'\-:,]rf   N   _r   )r   r]   
splitlinesr   r0   r   anyr   upperr   r   replace)r7   rt   r$  cleanedr*  s       @r9   r   $PDFProcessor._extract_document_title  s    >&&&OO%D::<D4y1}DJJL 1Q 6s6rsss---1zz|t#+F+F(F(Jff4b$?G7==?#a'4C= ! &" H~""**344r<   c           	         U R                   (       d  g[        R                  " SU5       Vs/ s H2  n[        UR	                  5       5      S:  d  M"  UR	                  5       PM4     nn[        U5      S:  a  g U R                   R                  USSSS9n/ nUS   /n[        US   5      n[        S
[        U5      5       H  n	[        [        R                  " XI   XIS
-
     5      5      n
U[        X9   5      -   S-   nXR                  :  d  XR                  :  aK  SR                  U5      R	                  5       nU(       a  UR                  U5        X9   /n[        X9   5      nM  UR                  X9   5        UnM     U(       a7  SR                  U5      R	                  5       nU(       a  UR                  U5        U R!                  U5      n[        U5      S
::  a  gU$ s  snf ! [         a  n[        SU S	35         SnAgSnAff = f)zHSplit text into semantically coherent chunks using paragraph embeddings.Nz\n{2,}r   r       T)
batch_sizeconvert_to_numpynormalize_embeddingsz*Warning: semantic chunk embedding failed (z).r   rV   r   )embedding_modelr   r   r0   r   encoderl   rm   rangefloatnpdotr4   r3   r   r   _merge_small_chunks)r7   rt   para
paragraphs
embeddingsexcr_   r   current_lenidx
similaritycandidate_len
chunk_texts                r9   _semantic_chunk_text!PDFProcessor._semantic_chunk_text  s   ##/1xx	4/Hb/HtCPTPZPZP\L]`aLaldjjl/H
bz?Q		--44!%%)	 5 J a=/*Q-(C
O,Crvvjoz'7JKLJ'#jo*>>BM>>>-RiRiBi#[[1779
MM*-%?+!*/2z/+ - W-335Jj)))&1v;!W c  	>se2FG	s#   !GG8G$ $
H.HHc                    / nUR                  S5      n/ SQn[        U5       GH  u  pVUR                  5       nU(       a  [        U5      S:  a  M.  UR	                  5       (       a$  [        U5      S:  a  UR                  XuS45        Mg  [        R                  " SU5      nU(       ad  [        S UR                  5       SS  5       5      n	[        U	S5      n
UR                  S5      n[        U5      S:  a  UR                  XU
45        M  [        R                  " S	U5      nU(       aP  [        UR                  S5      5      n
UR                  S
5      n[        U5      S:  a  UR                  XU
45        GMW  [        R                  " SU5      (       d  GMv  [        U5      S:  d  GM  UR                  S5      (       a  GM  UR                  XuS
45        GM     U$ )z
Detect headings and sections in the document.
Returns list of tuples: (heading_text, position, level)
Level: 1 = major section, 2 = subsection, 3 = sub-subsection
r   ))z^\s*[A-Z][A-Z\s]{3,}\s*$r   )z^\s*(\d+)\.\s+([A-Z][^\n]+)$N)z+^\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,})\s*$rV   )^\s*(#{1,3})\s+(.+)$Nr6  r   r   z)^\s*(\d+)(?:\.(\d+))?(?:\.(\d+))?\s+(.+)$c              3   6   #    U  H  o(       d  M  S v   M     g7fr   r   )r   gs     r9   r   0PDFProcessor._detect_headings.<locals>.<genexpr>  s     H'B!aAA'Bs   
	NrY  rV   z ^[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+$r   ).!?:)r   r   r   r0   r   r   r   r   r   groupsr  groupendswith)r7   rt   headingsr  patternsr
  r*  line_strippednumbered_matchdepthlevelheading_textmarkdown_matchs                r9   _detect_headingsPDFProcessor._detect_headings  s    

4 	
 !'GA JJLM C$6$: $$&&3}+=+B1 56  XX&RTabNH~'<'<'>r'BHHE1-33A6|$q(OO\e$<=  XX&=}MNN0034-33A6|$q(OO\e$<= xx;]KK}%*=3I3IJ^3_3_OO]q$9:C (F r<   rd  c                    U(       d  SUS4/$ / nUR                  S5      nUS   S   S:  aC  SR                  USUS   S    5      R                  5       nU(       a  UR                  SUS45        [	        [        U5      5       Hq  nX&   u  pxn	US-   [        U5      :  a
  X&S-      S   O
[        U5      n
SR                  XHS-   U
 5      R                  5       nU(       d  M^  UR                  X{U	45        Ms     U$ )zv
Split text into sections based on detected headings.
Returns list of tuples: (section_title, section_content, level)
rf   r   r   r   NIntroduction)r   r   r   r   rH  r0   )r7   rt   rd  sectionsr  start_contentr
  rj  heading_posheading_levelend_possection_contents               r9   _split_into_sections!PDFProcessor._split_into_sections&  s    
 qM?"

4  A;q>A IIeOXa[^&<=CCEM BC s8}%A7?{4L} +,A#H*=hsmA&3u:G #ii!mG(DEKKMO NO & r<   c                    / nUR                  S5      nSnU[        U5      :  Ga  X4   R                  5       nU[        U5      S-
  :  Ga'  [        R                  " SU5      (       d  [        R                  " SU5      (       a  U/nUS-   nU[        U5      :  a  XtS-   :  a  X7   R                  5       n[        R                  " SU5      (       d8  [        R                  " S	U5      (       d  [        R                  " S
U5      (       a  UR                  U5        US-  nOOU[        U5      :  a
  XtS-   :  a  M  [        U5      S:  a*  SR                  U5      n	UR                  U	SXG45        UnGM\  [        R                  " SU5      (       d  [        R                  " SU5      (       Ga  U/n
US-   nU[        U5      :  a  XtS-   :  a  X7   R                  5       n[        R                  " SU5      (       dJ  [        R                  " SU5      (       d.  U(       a>  [        U5      S:  a/  X7   R                  S5      (       a  U
R                  U5        US-  nOU(       d  US-  nOOU[        U5      :  a
  XtS-   :  a  M  [        U
5      S:  a*  SR                  U
5      nUR                  USXG45        UnGM  US-  nU[        U5      :  a  GM  U$ )z|
Detect tables and lists in text.
Returns list of tuples: (content, type, start_pos, end_pos)
type: 'table', 'list', 'code'
r   r   rV   z^[\w\s]+\|[\w\s]+z^[\w\s]+\t+[\w\s]+r   2   z^[\w\s\-\.]+\|[\w\s\-\.]+z^[\w\s\-\.]+\t+[\w\s\-\.]+z
^[\-=\s]+$tableu   ^[\s]*[•\-\*\+]\s+z^[\s]*\d+[\.\)]\s+d   r   r  )r   r0   r   r   r   r   r   
startswith)r7   rt   detectedr  r
  r*  table_linesj	next_linetable_content
list_lineslist_contents               r9   _detect_tables_and_lists%PDFProcessor._detect_tables_and_listsF  se    

4 #e*n8>>#D 3u:>!880$77288DY[_;`;`#'&KAAc%j.QRZ$)HNN$4	88$@)LL88$A9MM88M9=='..y9FA! c%j.QRZ ;'1,(,		+(> (FG  xx/66"((CXZ^:_:_"V
E#e*nW % 0Ixx 7CCxx 5yAA!c)nq&8UX=P=PQT=U=U")))4Q&Q #e*nW z?a'#'99Z#8LOO\61$@AAFAe #e*nh r<   c                     U(       d  / $ Sn[         R                  " X!5      n/ nU H5  nUR                  5       n[        U5      S:  d  M$  UR	                  U5        M7     U$ )z5Split text into sentences using regex-based approach.z?(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])\s*$|(?<=[.!?"\'])\s+(?=[A-Z])
   )r   r   r   r0   r   )r7   rt   sentence_endings	sentencesr?  sents         r9   _split_into_sentences"PDFProcessor._split_into_sentences  s\    I
 ^HH-4	 D::<D4y2~t$ 
 r<   Nr  r  configc                    Uc	  [         S   nUR                  S[        5      nUR                  S[        5      nUR                  S[        5      nUR                  S[
        5      nUR                  S[        5      n	U(       a  [        UR                  5       5      U:  a  / $ U R                  U5      n
[        U
5      U	:  a  / $ / n/ nSn/ nU
 GH@  n[        U5      S	-   nUU-   U:  a~  U(       aw  S
R                  U5      R                  5       n[        U5      U:  a  UR                  U5        [        U5      U:  a  X* S OUnUR                  5       n[        S U 5       5      nUR                  U5        UU-  nX:  d  M  [        U5      U	:  d  M  S
R                  U5      R                  5       n[        U5      U:  d  M  UR                  U5        [        U5      U:  a  X* S O/ nUR                  5       n[        S U 5       5      nGMC     U(       a?  S
R                  U5      R                  5       n[        U5      U:  a  UR                  U5        [        U5      S:  a  U$ / $ )z
Split text into chunks using sentence-based sliding window.
This is the PRIMARY chunking method - deterministic and reliable.
Now supports section-aware chunking and document-type-specific parameters.
Nr   r    chunk_min_sizechunk_max_sizesentence_overlapmin_sentencesr   r   r   c              3   >   #    U  H  n[        U5      S -   v   M     g7fr   r0   r   ss     r9   r   :PDFProcessor._sentence_based_chunk_text.<locals>.<genexpr>  s     $GASVaZ   c              3   >   #    U  H  n[        U5      S -   v   M     g7fr   r  r  s     r9   r   r    s     (K]Q!]r  )r   r   r	   r   r   r   r   r0   r   r  r   r   copyr   )r7   rt   r  r  r  r    r  r  r  r  r  r_   current_chunkcurrent_lengthoverlap_sentencessentencesentence_lengthrU  s                     r9   r   'PDFProcessor._sentence_based_chunk_text  s<    >*95FZZj9
$4nE$4nE!::&8:JK

?4KLs4::<(>9I ..t4	y>M)I!H!(ma/O /.@] XXm4::<
z?n4MM*- JM]I[_oIoM2C2D$E  vC! 1 6 6 8!$$G$G!G   *o-N +M0Bm0S XXm4::<
z?n4MM*-MPQ^M_csMs6G6H(Iy{%$5$:$:$<M%((K](K%KN7 "< -0668J:.0j)Vqv0b0r<   c                    Uc	  [         S   nUR                  S[        5      nUR                  S[        5      nU(       a  [	        UR                  5       5      U:  a  / $ U R                  U5      n[	        U5      U R                  S'   U R                  X5      n/ nU GH  u  pn
U	(       a  [	        U	R                  5       5      U:  a  M-  U R                  U	5      nU HE  u  p  nUS:X  a  U R                  S==   S-  ss'   M&  US	:X  d  M.  U R                  S
==   S-  ss'   MG     U(       a  U	n[        U5       Hw  u  pnnUR                  S5      nUUU nSR                  U5      n[	        U5      U:  a   [	        U5      U::  a  UR                  U5        SR                  USU UUS -   5      nMy     UR                  5       (       aF  [	        UR                  5       5      U:  a&  U R                  XX5      nUR                  U5        GMp  GMs  GMv  U R                  XX5      nUR                  U5        GM     [	        U5      S:X  a  U R                  USSU5      $ U$ )z
Structure-aware chunking: detect sections, process hierarchically.
This is the ENHANCED chunking method that respects document structure.
Now supports document-type-specific parameters.
Nr   r  r  r*   rz  r+   r   r  r,   r   r   rf   )r   r   r   r   r0   r   rl  r6   rv  r  reversedr   r   r   r   r  )r7   rt   r  r  r  rd  rp  r!  r  ru  r  tables_listsr:  content_typeprocessed_contentr  	start_posrt  r  r~  
table_textsection_chunkss                         r9   r   (PDFProcessor._structure_aware_chunk_text  sY    >*95F$4nE$4nEs4::<(>9I ((.58]12 ,,T< 
=E9MM"c/*?*?*A&B^&S  88IL *6%A7*))*;<A<!V+))*:;q@;	 *6 $3!GOP\G]CMG-33D9E"'	'":K!%;!7J :.8S_P^=^"))*5 )-		%
2CeGHo2U(V% H^ %**,,5F5L5L5N1OSa1a%)%D%DEVgt%}N%%n5 2b,
 "&!@!@an!w!!.1M >FR z?a224QGGr<   r  c                    Uc	  [         S   nUR                  S[        5      nUR                  S[        5      nU(       a  [	        UR                  5       5      U:  a  g[        [        R                  " SU5      5      n[        [        R                  " SU[        R                  5      5      nU(       d  U(       a  [	        UR                  5       5      U:  $ U R                  U5      n[	        U5      U:  a  gUR                  5       nU R                   H/  n	[        R                  " X[        R                  5      (       d  M/    g   [        S U 5       5      n
[	        U5      S:  a  U
[	        U5      -  S	:  a  g[        R                  " S
U5      n[	        U5      S:  aV  [	        [!        U5      5      nU[	        U5      -  nUS:  a  g1 SknU Vs/ s H  oU;  d  M
  UPM     nn[	        U5      S:  a  ggs  snf )z
Validate chunk quality - reject boilerplate and low-quality chunks.
Enhanced quality gates for Phase 2 & 3.
Returns True if chunk should be kept, False if it should be filtered out.
r   r  r  F\|   ^\s*[•\-\*\+]\s+c              3   |   #    U  H2  oR                  5       (       d  UR                  5       (       d  M.  S v   M4     g7fr   )isalnumisspacer3  s     r9   r   /PDFProcessor._is_valid_chunk.<locals>.<genexpr>H  s"      PEqYY[[AIIKEs   -<	<r   r   \b\w+\bg333333?>-   ar
  anatbebydoheinisitofonortoweandarebutcandidforhadhasmayshethewasyoubeendoeshavemustthattheythiswerewillwithcouldmightthesethosewouldshould   T)r   r   r   r   r0   r   boolr   search	MULTILINEr  ri   r5   
IGNORECASEr   findallset)r7   r  r  r  r  is_tableis_listr  chunk_lowerpatternalphanumeric_charswordsunique_wordsunique_ratio
stop_wordsrU   meaningful_wordss                    r9   r   PDFProcessor._is_valid_chunk%  s    >*95F$4nE

?4KLEKKM*^; 		%/0ryy!6r||LMwu{{}%77 ..u5	y>M) kkm00Gyyr}}== 1
 ! PE PPu:>03u:=C 

:{3u:>s5z?L'#e*4Lc! ZJ+0H5aZ4G5H#$q(	  Is   	G8G8	min_charsc                     U(       d  U$ / nU H:  nU(       a  [        U5      U:  a  US   S-   U-   US'   M)  UR                  U5        M<     U$ )z*Avoid creating very small dangling chunks.r   )r0   r   )r7   r_   r  mergedr  s        r9   rL   PDFProcessor._merge_small_chunks\  sQ    ME#e*y0#BZ&058r
e$	  r<   chunk_contentsourcechunk_idtotal_chunksr  r  c	                   ^ UR                  5       mU R                  U5      n	[        R                  " SU5      n
Sn[        R                  " STSS [        R
                  5      nU(       a   [        UR                  S5      5      n[        [        R                  " SU5      5      n[        [        R                  " SU[        R                  5      5      nU(       a  SO
U(       a  S	OS
n[        U4S jS 5       5      n[        U4S jU R                   5       5      nU	(       aG  [        S U	 5       5      [        U	5      -  n[        S[        SS[!        US-
  5      S-  -
  5      5      nOSn[        R"                  " SSUSS 5      R%                  5       nUUUUUUU(       a  UOSU(       a  UOS[        U	5      [        U
5      [        U5      US.['        US5      UUS.US.$ !    GNc= f)zg
Extract clean, actionable metadata from chunk content.
Enhanced with section information for Phase 2.
r  Nz(?:page|p\.?)\s*(\d+)r9  r   r  r  rz  r  rt   c              3   ,   >#    U  H	  oT;   v   M     g 7fr(  r   )r   termcontent_lowers     r9   r   7PDFProcessor._extract_chunk_metadata.<locals>.<genexpr>  s!       $P  GOdM$9  GOs   )r   r   r   r   datar   r   c              3   p   >#    U  H+  n[         R                  " UT[         R                  5      v   M-     g 7fr(  )r   r  r  )r   r  r  s     r9   r   r    s(     w]vRYRYYwr}}MM]vs   36c              3   T   #    U  H  n[        UR                  5       5      v   M      g 7fr(  )r0   r   r  s     r9   r   r    s     %Hic!'')nnis   &(g      ?g           r   z\s+r   )sentence_count
word_count
char_countr  rV   )readability_scorehas_technical_contentis_boilerplate)r  r  r  r  r  rv   r   r  structural_infocontent_qualityclean_excerpt)ri   r  r   r  r  r  r/   rb  r  r  r<  r5   r   r0   r  r.   absr   r   round)r7   r  r  r  r  r  r  r  r  r  r  page_number
page_matchr  r  r  r  r  avg_sentence_lengthr  r
  r  s                        @r9   r   $PDFProcessor._extract_chunk_metadataj  s    &++- ..}=	

:}5 YY7t9Lbmm\
!*"2"21"56
 		%78ryy!6r||TU"*w7 !$  $P  GO  $P  !Pw]a]v]vww "%%Hi%H"H3y>"Y #CS#<ORT<T8UXZ8Z2Z)[ \ # vsM$3,?@FFH  (,((5}4.;]"%i.!%j!-0 ,	  &++<a%@)>"0 
 +)
 	
-s   -G% %G*)r5   r6   r2   r3   r4   r1   )rf   r   Nr(  )i  )rf   r   )!__name__
__module____qualname____firstlineno__r:   rF   rO   rc   rx   r   rY   rk   r   dictr   r  r  r"  r   rV  r  rl  rv  r  r  r/   r   r   r  r   rL  r   __static_attributes__r   r<   r9   r   r   .   s   %
N5
+(4Kl t     0# 0 0 0dZ# Z$ ZFP*$5C 53 53 500 0d8S 8T 8t   @>S >T >@# $ (A1s A13 A1\_ A1mq A1  ~B A1FF FT FT FP5S 5$ 5$ 5nS   OP>
S >
# >
QT >
.1>
CF>
VY>
/2>
HK>
TX>
 >
r<   r   )$r@   rL   r   pathlibr   numpyrJ  langchain_text_splittersr   sentence_transformersr   airagagent.configr   r   r   r	   r
   r   r   r   r   r   r   r   r   r   r   r}   rj   r   pdfminer.high_levelr   rn   ro   r   r   r   r<   r9   <module>r     s      	   C 5    &K
0Lz
 z
  K  $ 	s<   A  A.  A+*A+.B5A??BB
BB