
    #i                        d dl Z d dlZd dlZd dlmZ d dlZd dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ 	 d dlZdZ	 d dlmZ dZ  G d
 d      Z#y# e$ r dZY w xY w# e$ r 	 d d	l!m"Z" dZ n# e$ r dZ Y nw xY wY 6w xY w)    N)Path)RecursiveCharacterTextSplitter)SentenceTransformer)PDF_DIRDOCUMENTS_DIRMETADATA_DIR
CHUNK_SIZECHUNK_OVERLAPCHUNK_MIN_SIZECHUNK_MAX_SIZESENTENCE_OVERLAPMIN_SENTENCES_PER_CHUNKEMBEDDING_MODELSEMANTIC_CHUNKING_ENABLEDSEMANTIC_MAX_CHARSSEMANTIC_SIMILARITY_THRESHOLDTECHNICAL_TERMSMEASUREMENT_PATTERNSTF)extract_text	PdfReaderc                   \   e Zd Zd Zd Zd Zd Zd Zdede	fdZ
d	e	d
e	de	fdZde	defdZd Zd Zd Zd	e	de	de	fdZd	e	fdZd	e	defdZd	e	dedefdZd	e	defdZd	e	defdZd*d	e	de	dededef
dZd+d	e	dedefdZd+de	dedefd Zd,d!efd"Z	 d-d#e	d$e	d%ed&ed'e	d(e	de	dedefd)Zy).PDFProcessorc           	          t        t        t        dz        t              }t	        t        |t
        g d      | _        d| _        t        | _	        t        | _        g d| _        d d ddddddd| _        y )N皙?)


z.  )
chunk_sizechunk_overlaplength_function
separatorsF)	u   copyright\s+©?\s*\d{4}zall rights reservedzpage\s+\d+\s+of\s+\d+confidentialdraftz^\s*$z^\d+$z[a-f0-9]{32,}z	^[\s\W]*$r   )
start_timeend_timechunks_createdchunks_filteredsections_detectedtables_detectedlists_detectedmemory_peak_mb)maxintr	   r
   r   lentext_splittersemantic_chunking_enabledr   semantic_max_charsr   semantic_similarity_thresholdboilerplate_patternsprocessing_stats)selfoverlap_sizes     2/var/www/html/backtest/airagagent/pdf_processor.py__init__zPDFProcessor.__init__/   sy    3zC/0-@;!&0	
 */&"4-J*
%
!  !" 	!
    c                     t        |d      5 }t        j                  |j                               j	                         cddd       S # 1 sw Y   yxY w)z/Generate hash of file content to detect changesrbN)openhashlibmd5read	hexdigest)r6   	file_pathfs      r8   get_file_hashzPDFProcessor.get_file_hashV   s>    )T" 	5a;;qvvx(224	5 	5 	5s   1AAc                     t         dz  }|j                         r+t        |d      5 }t        j                  |      cddd       S i S # 1 sw Y   i S xY w)z%Get record of already processed filesprocessed_files.jsonrN)r   existsr=   jsonload)r6   metadata_filerC   s      r8   get_processed_filesz PDFProcessor.get_processed_files[   sO    $'==!mS) $Qyy|$ $	$	s   AAc                    |j                   }| j                         }|t        |      t        |j	                         j
                        d||<   t        t        dz  d      5 }t        j                  ||d       ddd       t        t        |      j                   dz  }t        |d      5 }t        j                  ||d       ddd       y# 1 sw Y   VxY w# 1 sw Y   yxY w)z%Record processed file and save chunks)hashchunk_countprocessed_daterF   w   )indentNz_chunks.json)namerL   r/   strstatst_mtimer=   r   rI   dumpr   r   stem)r6   rB   	file_hashchunksfilenamemetadatarC   
chunk_files           r8   save_processed_filez PDFProcessor.save_processed_filec   s    >> ++-v;!).."2";";<
 ,!77= 	-IIh!,	- #X(;(;'<L%II
*c" 	+aIIfa*	+ 	+	- 	-
	+ 	+s   #C/CCC&c                 p   |j                   j                         dk(  rnt        r	 | j                  |      S 	 t        rt        t        |            }|S ddl	m
}  ||      }d}|j                  D ]  }||j                         dz   z  } |S |j                   j                         dk(  r)	 t        |dd      5 }|j                         cddd       S t        d|j                          # t        $ r}t        d| d|        Y d}~d}~ww xY w# t        $ r}t        d	| d|        	 ddl	m
}  ||      }d}|j                  D ]  }||j                         dz   z  } |cY d}~S # t        $ r%}t        d
| d|        d| cY d}~cY d}~S d}~ww xY wd}~ww xY w# 1 sw Y   nxY wy# t        $ r^ 	 t        |dd      5 }|j                         cddd       cY S # 1 sw Y   nxY wn## t        $ r}t        d| d|         d}~ww xY wY yt        $ r}t        d| d|         d}~ww xY w)z?Extract text from PDF or text files using best available methodz.pdfz'Warning: PyMuPDF extraction failed for z: Nr   r    r   z#Warning: PDF extraction failed for z.Error: Both PDF extraction methods failed for z Error extracting text from PDF: z.txtrG   zutf-8)encodingzlatin-1z"Warning: Failed to read text file z with latin-1 encoding: zError reading text file zUnsupported file type: )suffixlowerHAS_PYMUPDF_extract_with_pymupdf	ExceptionprintHAS_PDFMINERr   rU   PyPDF2r   pagesr=   r@   UnicodeDecodeError
ValueError)	r6   rB   etextr   readerpagee2rC   s	            r8   extract_text_from_filez#PDFProcessor.extract_text_from_filew   sx   !!#v-V55i@@C'I7DK 1&y1FD & ; 1 1 3d ::;K ##%/)S7; $q668$ $ 6y7G7G6HIJJ[ ! VCI;bQRPSTUUV   C;I;bLM	C0&y1FD & ; 1 1 3d ::;K  CJ9+UWXZW[\]=bTBBBCC$ $ $% iyA (Q vvx( ( ( (  >ykIabcadef(
  02aSABs   C* D 8D *F0 8F#	F0 *	D3D		D	F F,8E*$F *	F3FF	FF FFF #F,(F0 0	H5:G1G$	G1!H5$G-	)G10H51	H:HHH5H5H00H5rB   returnc                 4   t        j                  |      }g }	 |D ]E  }|j                  dt         j                  t         j                  z        }|j                  |       G 	 |j                          dj                  |      S # |j                          w xY w)zBHigh-fidelity extraction using PyMuPDF with basic structural cues.ro   )flagsr   )fitzr=   get_textTEXT_DEHYPHENATETEXT_PRESERVE_WHITESPACEappendclosejoin)r6   rB   docrk   rq   ro   s         r8   rf   z"PDFProcessor._extract_with_pymupdf   s    ii	"	 #}}//$2O2OO %  T"# IIKyy IIKs   A
B Bro   r\   c                 x  	
 |j                         
|j                         	g d}g d}g d}g d}t        	
fd|D              t        	
fd|D              t        	
fd|D              t        	
fd|D              d	}t        |j                               }|d
k\  rt        ||j                        S y)z
        Detect document type based on content and filename.
        Returns: 'technical', 'research', 'legal', 'manual', or 'default'
        )specification	technicalapiprotocolstandardmeasurementcalibrationconfiguration	parameterphppmtemperaturevoltagecurrent)abstractintroductionmethodologyresults
conclusion
referencescitationstudyresearch
experiment
hypothesiszpeer-reviewedjournal)whereasherebypursuantstatute
regulation
compliancelegallawact	ordinanceclausesection
subsectionarticle	paragraph)
manualguidetutorialzhow tostep	procedureinstructionuserzgetting startedzquick startc              3   4   K   | ]  }|v s|v sd   yw   N .0indfilename_lower
text_lowers     r8   	<genexpr>z5PDFProcessor._detect_document_type.<locals>.<genexpr>   s!     m3#BSWZ^lWlQm   c              3   4   K   | ]  }|v s|v sd   ywr   r   r   s     r8   r   z5PDFProcessor._detect_document_type.<locals>.<genexpr>   s!     k#z@QUX\jUjAkr   c              3   4   K   | ]  }|v s|v sd   ywr   r   r   s     r8   r   z5PDFProcessor._detect_document_type.<locals>.<genexpr>   s      es#:KsVdOder   c              3   4   K   | ]  }|v s|v sd   ywr   r   r   s     r8   r   z5PDFProcessor._detect_document_type.<locals>.<genexpr>   s!     gC:<MQTXfQf!gr   )r   r   r   r   rR   )keydefault)rd   sumr-   valuesget)r6   ro   r\   technical_indicatorsresearch_indicatorslegal_indicatorsmanual_indicatorsscores	max_scorer   r   s            @@r8   _detect_document_typez"PDFProcessor._detect_document_type   s    
 ZZ\
!) 



 m*>mmk)<kke&6eeg'8gg	
 (	>v6::..r:   document_typec                 <    t         j                  |t         d         S )z8Get chunking configuration for a specific document type.r   )DOCUMENT_TYPE_CONFIGSr   )r6   r   s     r8   _get_chunking_configz!PDFProcessor._get_chunking_config   s    $((8Mi8XYYr:   c                 $
    ddl }ddl}	 ddl}d}|j                  } j                  |      } j                         }||v r||   d   |k(  rt        d| d       yt        d| d	       |rj                  |j                               }	|j                         dddddddd
d	 _
         j                  |      }
 j                  |
|      } j                  |
|      }| j                  d<   t        d|         j                  |      } j                  |
|      }d| }t!        |       j                  d<   |s(t        d| d        j#                  |
dd|      }d| }|s,t        d| d        j$                  j'                  |
      }d}g }t)        |      D ]O  \  }} j+                  ||      r|j-                  |       * j                  dxx   dz  cc<   t        d| d       Q g }t)        |      D ]  \  }}d}d}|j/                  d      }|rt!        |d   j1                               dk  re|d   j1                         }|j3                         rt!        |      dk\  r|}d}n/t5        j6                  d|      rt5        j8                  dd|      }d} j;                  |||t!        |      ||||      }|j-                  ||d        |j                          j                  d <   |r.	j=                         j>                  d!z  d!z   j                  d"<   n:	 ddl }|jC                  |jD                        jF                  d!z   j                  d"<   |D cg c]  }t!        |d#          }}|rtI        |      t!        |      z   j                  d$<   tK        |       j                  d%<   tM        |       j                  d&<   tI         fd'|D              t!        |      z  d(z   j                  d)<    jO                  |||        j                  d     j                  d*   z
  }t        d+| d,t!        |       d-t!        |       d.       t        d/| d0|d1d2 j                  d"   d3d4       t        d5 j                  jQ                  d$d      d6d7 j                  jQ                  d%d       d8 j                  jQ                  d&d              |S # t        $ r d}Y w xY w#  d j                  d"<   Y xY wc c}w )9z3Process a single PDF file with Phase 3 enhancementsr   NTFrN   u   ✓ z already processed, skipping...zProcessing z...r   )	r%   r&   r'   r(   r)   r*   r+   r,   r   r   z  Detected document type: structure_aware_r'   z-Warning: Structure-aware chunking failed for z, using sentence-basedra   sentence_based_z&Warning: Sentence chunking failed for z, using recursive fallbackrecursive_fallbackr(   r   z  Filtered out chunk z (boilerplate/low quality)r   P      z	^\d+\.\s+)contentr]   r&   i   r,   r   chunk_size_avgchunk_size_minchunk_size_maxc              3   H   K   | ]  }|j                   d    z
  dz    yw)r   rR   N)r5   )r   xr6   s     r8   r   z+PDFProcessor.process_pdf.<locals>.<genexpr>m  s/       ;BpqA@U@UVf@g<gjk;k  ;Bs   "      ?chunk_size_stdr%   u   ✓ Processed z into z chunks (filtered from )z  Type: z, Time: z.2fzs, Memory: z.1fMBz  Chunk sizes: avg=z.0fz, min=z, max=))timeospsutilImportErrorrT   rD   rL   rh   Processgetpidr5   rs   _extract_document_titler   r   _structure_aware_chunk_textr/   _sentence_based_chunk_textr0   
split_text	enumerate_is_valid_chunkr{   splitstripisupperrematchsub_extract_chunk_metadatamemory_inforssresource	getrusageRUSAGE_SELF	ru_maxrssr   minr-   r_   r   )r6   rB   r   r   r   
HAS_PSUTILr\   rZ   processed_filesprocessro   document_titler   
doc_configr[   chunk_methodfiltered_chunksichunk
chunk_datasection_titlesection_levellines
first_liner]   r   chunk_sizesdurations   `                           r8   process_pdfzPDFProcessor.process_pdf   s   	J >>&&y1	224 &x(0I=XJ&EFGH:S)* nnRYY[1G))+ !" &
!
 **9555dHE 224B1>o.*=/:; ..}=
 11$
C)-925f+./ A(Kabc44T2q*MF,]O<L :8*D^_`''2248F/L !&) 	MHAu##E:6&&u-%%&78A=8-aS0JKL	M 
!/2 	HAuMMKK%EU1X^^-.3"1X^^-
%%'C
Oq,@$.M$%MXXlJ7$&FF<Z$HM$%M33xC$8.,}H  $ )	4 -1IIKj)6=6I6I6K6O6ORV6VY]6]D!!"23<:B:L:LXMaMa:b:l:los:s%%&67
 ;EEs5+,EE69+6F[IY6YD!!"2369+6FD!!"2369+6FD!!"237:  ;B  vA  ;B  8B  EH  IT  EU  8U  X[  7[D!!"23  IzB ((4t7L7L\7ZZxjs:.??VWZ[aWbVccdefx~[I^I^_oIpqtHuuwxy#D$9$9$=$=>NPQ$RSV#W X**../?CD E**../?CDF 	G {  	J	P<:;%%&67 Fs#   S$ (9S6 %T$S32S36T
c                 \   ddg}g }|D ]/  }|j                  t        t        j                  |                   1 | j	                         }g }|D ]Z  }|j
                  |vr|j                  |       #| j                  |      }||j
                     d   |k7  sJ|j                  |       \ |S )z<Get list of files (PDFs and text files) that need processingz*.pdfz*.txtrN   )extendlistr   globrL   rT   r{   rD   )r6   supported_extensionsfilesextr   	new_filesrB   current_hashs           r8   get_new_fileszPDFProcessor.get_new_files{  s     '1' 	2CLLgll3/01	2 224	 	0I~~_4  +  $11)<"9>>26:lJ$$Y/	0 r:   c                     | j                         }g }|st        d       g S t        dt        |       d       |D ]'  }| j                  |      }|s|j	                  |       ) |S )z!Process all new or modified fileszNo new files to process.zFound z! new/modified files to process...)r  rh   r/   r  r
  )r6   r  
all_chunksrB   r[   s        r8   process_all_newzPDFProcessor.process_all_new  su    &&(	
,-Is9~&&GHI" 	*I%%i0F!!&)	*
 r:   fallbackc                 >   |st        |      j                  S |j                         D ]  j                         st	              dk  st	        j                               dk(  rAt        fddD              rVt        d D              dkD  rlj                         k(  rt        d D              dkD  rt        j                  d	d
      }t	        |j                               dk  r|dd c S  t        |      j                  j                  dd      S )z6Derive a human friendly title from the extracted text.   r   c              3   B   K   | ]  }|j                         v   y wN)rd   )r   tokenlines     r8   r   z7PDFProcessor._extract_document_title.<locals>.<genexpr>  s     sU5DJJL(ss   )libraryzdue date	copyrightisbnwwwemailc              3   <   K   | ]  }|j                           y wr  )isdigitr   cs     r8   r   z7PDFProcessor._extract_document_title.<locals>.<genexpr>  s     -1199;-      c              3   <   K   | ]  }|j                           y wr  )isalphar$  s     r8   r   z7PDFProcessor._extract_document_title.<locals>.<genexpr>  s     +FAAIIK+Fr&  r   z[^A-Za-z0-9\s\'\-:,]ra   N   _r   )r   rY   
splitlinesr   r/   r   anyr   upperr   r   replace)r6   ro   r  cleanedr  s       @r8   r   z$PDFProcessor._extract_document_title  s    >&&&OO% 	!D::<D4y1}DJJL 1Q 6s6rss---1zz|t#+F+F(F(Jff4b$?G7==?#a'4C= !	!" H~""**344r:   c           	         | j                   syt        j                  d|      D cg c]/  }t        |j	                               dkD  s |j	                         1 }}t        |      dk  ry	 | j                   j                  |ddd      }g }|d   g}t        |d         }t        d
t        |            D ]  }	t        t        j                  ||	   ||	d
z
                 }
|t        ||	         z   dz   }|
| j                  k  s|| j                  kD  rGdj                  |      j	                         }|r|j                  |       ||	   g}t        ||	         }|j                  ||	          |} |r2dj                  |      j	                         }|r|j                  |       | j!                  |      }t        |      d
k  ry|S c c}w # t        $ r}t        d| d	       Y d}~yd}~ww xY w)zHSplit text into semantically coherent chunks using paragraph embeddings.Nz\n{2,}r   r       T)
batch_sizeconvert_to_numpynormalize_embeddingsz*Warning: semantic chunk embedding failed (z).r   rR   r   )embedding_modelr   r   r/   r   encoderg   rh   rangefloatnpdotr3   r2   r}   r{   _merge_small_chunks)r6   ro   para
paragraphs
embeddingsexcr[   r   current_lenidx
similaritycandidate_len
chunk_texts                r8   _semantic_chunk_textz!PDFProcessor._semantic_chunk_text  s   ##/1xx	4/HbtCPTPZPZP\L]`aLadjjlb
bz?Q		--44!%%)	 5 J a=/*Q-(C
O, 	,Crvvjoz#'7JKLJ'#jo*>>BMD>>>-RVRiRiBi#[[1779
MM*-%c?+!*S/2z#/+	, W-335Jj)))&1v;!W c  	>se2FG	s#   !GG,G 	G/G**G/c                 h   g }|j                  d      }g d}t        |      D ]  \  }}|j                         }|rt        |      dk  r(|j	                         r#t        |      dk\  r|j                  ||df       [t        j                  d|      }|rct        d |j                         dd D              }	t        |	d      }
|j                  d      }t        |      dkD  r|j                  |||
f       t        j                  d	|      }|rOt        |j                  d            }
|j                  d
      }t        |      dkD  r|j                  |||
f       =t        j                  d|      sUt        |      dk  se|j                  d      rx|j                  ||d
f        |S )z
        Detect headings and sections in the document.
        Returns list of tuples: (heading_text, position, level)
        Level: 1 = major section, 2 = subsection, 3 = sub-subsection
        r   ))z^\s*[A-Z][A-Z\s]{3,}\s*$r   )z^\s*(\d+)\.\s+([A-Z][^\n]+)$N)z+^\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,})\s*$rR   )^\s*(#{1,3})\s+(.+)$Nr'  r   r   z)^\s*(\d+)(?:\.(\d+))?(?:\.(\d+))?\s+(.+)$c              3   &   K   | ]	  }|sd   ywr   r   )r   gs     r8   r   z0PDFProcessor._detect_headings.<locals>.<genexpr>  s     H!aAHs   NrH  rR   z ^[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+$r   ).!?:)r   r   r   r/   r   r{   r   r   r   groupsr   groupendswith)r6   ro   headingsr  patternsr   r  line_strippednumbered_matchdepthlevelheading_textmarkdown_matchs                r8   _detect_headingszPDFProcessor._detect_headings  s    

4 	
 !' !	;GAt JJLM C$6$: $$&3}+=+B1 56  XX&RTabNH~'<'<'>r'BHHE1-33A6|$q(OO\1e$<=  XX&=}MNN0034-33A6|$q(OO\1e$<= xx;]K}%*=3I3IJ^3_OO]Aq$9:C!	;F r:   rR  c                    |sd|dfgS g }|j                  d      }|d   d   dkD  r>dj                  |d|d   d          j                         }|r|j                  d|df       t	        t        |            D ]n  }||   \  }}}	|dz   t        |      k  r||dz      d   n
t        |      }
dj                  ||dz   |
       j                         }|s[|j                  |||	f       p |S )z
        Split text into sections based on detected headings.
        Returns list of tuples: (section_title, section_content, level)
        ra   r   r   r   NIntroduction)r   r}   r   r{   r8  r/   )r6   ro   rR  sectionsr  start_contentr   rX  heading_posheading_levelend_possection_contents               r8   _split_into_sectionsz!PDFProcessor._split_into_sections&  s   
 qM?"

4  A;q>A IIeOXa[^&<=CCEM BC s8}% 
	PA7?{4L+} +,A#H*=hqsmA&3u:G #iik!mG(DEKKMO NO
	P r:   c                    g }|j                  d      }d}|t        |      k  r\||   j                         }|t        |      dz
  k  rt        j                  d|      st        j                  d|      r|g}|dz   }|t        |      k  r||dz   k  r||   j                         }t        j                  d|      s,t        j                  d	|      st        j                  d
|      r|j                  |       |dz  }nn|t        |      k  r	||dz   k  rt        |      dk\  r*dj                  |      }	|j                  |	d||f       |}:t        j                  d|      st        j                  d|      r|g}
|dz   }|t        |      k  r||dz   k  r||   j                         }t        j                  d|      s:t        j                  d|      s$|r9t        |      dkD  r+||   j                  d      r|
j                  |       |dz  }n	|s|dz  }nn|t        |      k  r	||dz   k  rt        |
      dk\  r*dj                  |
      }|j                  |d||f       |}V|dz  }|t        |      k  r\|S )z
        Detect tables and lists in text.
        Returns list of tuples: (content, type, start_pos, end_pos)
        type: 'table', 'list', 'code'
        r   r   rR   z^[\w\s]+\|[\w\s]+z^[\w\s]+\t+[\w\s]+r   2   z^[\w\s\-\.]+\|[\w\s\-\.]+z^[\w\s\-\.]+\t+[\w\s\-\.]+z
^[\-=\s]+$tableu   ^[\s]*[•\-\*\+]\s+z^[\s]*\d+[\.\)]\s+d   r   r  )r   r/   r   r   r   r{   r}   
startswith)r6   ro   detectedr  r   r  table_linesj	next_linetable_content
list_lineslist_contents               r8   _detect_tables_and_listsz%PDFProcessor._detect_tables_and_listsF  s[    

4 #e*n8>>#D 3u:>!880$7288DY[_;`#'&KAAc%j.QRZ$)!HNN$4	88$@)L88$A9M88M9='..y9FA! c%j.QRZ ;'1,(,		+(> A(FG  xx/6"((CXZ^:_"V
E#e*nQW %a 0Ixx 7Cxx 5yA!c)nq&8U1X=P=PQT=U")))4Q&Q #e*nQW z?a'#'99Z#8LOO\61a$@AAFAe #e*nh r:   c                     |sg S d}t        j                  ||      }g }|D ]2  }|j                         }t        |      dkD  s"|j	                  |       4 |S )z5Split text into sentences using regex-based approach.z?(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])\s*$|(?<=[.!?"\'])\s+(?=[A-Z])
   )r   r   r   r/   r{   )r6   ro   sentence_endings	sentencesr0  sents         r8   _split_into_sentencesz"PDFProcessor._split_into_sentences  sc    I
 ^HH-t4	  	%D::<D4y2~t$	%
 r:   Nr  r  configc                    |	t         d   }|j                  dt              }|j                  dt              }|j                  dt              }|j                  dt
              }|j                  dt              }	|rt        |j                               |k  rg S | j                  |      }
t        |
      |	k  rg S g }g }d}g }|
D ]3  }t        |      d	z   }||z   |kD  rx|rvd
j                  |      j                         }t        |      |k\  r|j                  |       t        |      |k\  r|| d n|}|j                         }t        d |D              }|j                  |       ||z  }||k\  st        |      |	k\  sd
j                  |      j                         }t        |      |k\  s|j                  |       t        |      |k\  r|| d ng }|j                         }t        d |D              }6 |r>d
j                  |      j                         }t        |      |k\  r|j                  |       t        |      dkD  r|S g S )z
        Split text into chunks using sentence-based sliding window.
        This is the PRIMARY chunking method - deterministic and reliable.
        Now supports section-aware chunking and document-type-specific parameters.
        Nr   r   chunk_min_sizechunk_max_sizesentence_overlapmin_sentencesr   r   r   c              3   8   K   | ]  }t        |      d z     ywr   r/   r   ss     r8   r   z:PDFProcessor._sentence_based_chunk_text.<locals>.<genexpr>  s     $GASVaZ$G   c              3   8   K   | ]  }t        |      d z     ywr   r~  r  s     r8   r   z:PDFProcessor._sentence_based_chunk_text.<locals>.<genexpr>  s     (KQ!(Kr  )r   r   r	   r   r   r   r   r/   r   rv  r}   r{   copyr   )r6   ro   r  r  rw  r   ry  rz  r{  r|  rt  r[   current_chunkcurrent_lengthoverlap_sentencessentencesentence_lengthrE  s                     r8   r   z'PDFProcessor._sentence_based_chunk_text  sK    >*95FZZj9
$4nE$4nE!::&8:JK

?4KLs4::<(>9I ..t4	y>M)I! 	LH!(ma/O /.@] XXm4::<
z?n4MM*- JM]I[_oIoM3C2C2D$E  vC! 1 6 6 8!$$G$G!G   *o-N +M0Bm0S XXm4::<
z?n4MM*-MPQ^M_csMs7G6G6H(Iy{%$5$:$:$<M%((K](K%KN7	L< -0668J:.0j)Vqv0b0r:   c                 ~   |	t         d   }|j                  dt              }|j                  dt              }|rt	        |j                               |k  rg S | j                  |      }t	        |      | j                  d<   | j                  ||      }g }|D ]  \  }}	}
|	rt	        |	j                               |k  r'| j                  |	      }|D ]A  \  }}}}|dk(  r| j                  dxx   dz  cc<   %|d	k(  s+| j                  d
xx   dz  cc<   C |r|	}t        |      D ]u  \  }}}}|j                  d      }||| }dj                  |      }t	        |      |k\  rt	        |      |k  r|j                  |       dj                  |d| ||d z         }w |j                         st	        |j                               |k\  s5| j                  |||
|      }|j                  |       \| j                  |	||
|      }|j                  |        t	        |      dk(  r| j                  |dd|      S |S )z
        Structure-aware chunking: detect sections, process hierarchically.
        This is the ENHANCED chunking method that respects document structure.
        Now supports document-type-specific parameters.
        Nr   ry  rz  r)   rf  r*   r   r  r+   r   r   ra   )r   r   r   r   r/   r   rZ  r5   rc  rp  reversedr   r}   r{   r   r
  )r6   ro   rw  ry  rz  rR  r]  r  r  rb  r  tables_listsr+  content_typeprocessed_contentrm  	start_posra  r  rj  
table_textsection_chunkss                         r8   r   z(PDFProcessor._structure_aware_chunk_text  sq    >*95F$4nE$4nEs4::<(>9I ((.58]12 ,,T8< 
=E &	29M?M"c/*?*?*A&B^&S  88IL *6 A%<A7*))*;<A<!V+))*:;q@;	A $3!GOP\G] WCM<G-33D9E"'	'":K!%;!7J :.8S_P^=^"))*5 )-		%
2CeGHo2U(V%W %**,5F5L5L5N1OSa1a%)%D%DEVXegtv|%}N%%n5 "&!@!@R_anpv!w!!.1M&	2R z?a224QGGr:   r   c                    |	t         d   }|j                  dt              }|j                  dt              }|rt	        |j                               |k  ryt        t        j                  d|            }t        t        j                  d|t        j                              }|s|rt	        |j                               |k\  S | j                  |      }t	        |      |k  ry|j                         }| j                  D ])  }	t        j                  |	|t        j                        s) y t        d |D              }
t	        |      dkD  r|
t	        |      z  d	k  ryt        j                  d
|      }t	        |      dkD  rOt	        t!        |            }|t	        |      z  }|dk  ryh d}|D cg c]	  }||vs| }}t	        |      dk  ryyc c}w )z
        Validate chunk quality - reject boilerplate and low-quality chunks.
        Enhanced quality gates for Phase 2 & 3.
        Returns True if chunk should be kept, False if it should be filtered out.
        r   ry  r|  F\|   ^\s*[•\-\*\+]\s+c              3   b   K   | ]'  }|j                         s|j                         s$d  ) ywr   )isalnumisspacer$  s     r8   r   z/PDFProcessor._is_valid_chunk.<locals>.<genexpr>H  s       PqQYY[AIIK Ps   %//r   r   \b\w+\bg333333?>-   ar   anatbebydoheinisitofonortoweandarebutcandidforhadhasmayshethewasyoubeendoeshavemustthattheythiswerewillwithcouldmightthesethosewouldshould   T)r   r   r   r   r/   r   boolr   search	MULTILINErv  rd   r4   
IGNORECASEr   findallset)r6   r   rw  ry  r|  is_tableis_listrt  chunk_lowerpatternalphanumeric_charswordsunique_wordsunique_ratio
stop_wordsrQ   meaningful_wordss                    r8   r   zPDFProcessor._is_valid_chunk%  s    >*95F$4nE

?4KLEKKM*^; 		%/0ryy!6r||LMwu{{}%77 ..u5	y>M) kkm00 	Gyy+r}}=	
 ! PE PPu:>03u:=C 

:{3u:>s5z?L'#e*4Lc! ZJ+0HaAZ4GHH#$q(	  Is   2	G<G	min_charsc                     |s|S g }|D ]2  }|rt        |      |k  r|d   dz   |z   |d<   "|j                  |       4 |S )z*Avoid creating very small dangling chunks.r   )r/   r{   )r6   r[   r  mergedr   s        r8   r<  z PDFProcessor._merge_small_chunks\  sV    M 	%E#e*y0#BZ&058r
e$		% r:   chunk_contentsourcechunk_idtotal_chunksr   r   c	                    |j                         | j                  |      }	t        j                  d|      }
d}t        j                  ddd t        j
                        }|r	 t        |j                  d            }t        t        j                  d|            }t        t        j                  d|t        j                              }|rdn|rd	nd
}t        fddD              }t        fd| j                  D              }|	rGt        d |	D              t        |	      z  }t        dt        ddt!        |dz
        dz  z
              }nd}t        j"                  dd|dd       j%                         }|||||||r|nd|r|ndt        |	      t        |
      t        |      |dt'        |d      ||d|dS #  Y ExY w)z
        Extract clean, actionable metadata from chunk content.
        Enhanced with section information for Phase 2.
        r  Nz(?:page|p\.?)\s*(\d+)r*  r   r  r  rf  r  ro   c              3   &   K   | ]  }|v  
 y wr  r   )r   termcontent_lowers     r8   r   z7PDFProcessor._extract_chunk_metadata.<locals>.<genexpr>  s       $PdDM$9  $Ps   )r   r   r   r   datar   r   c              3   h   K   | ])  }t        j                  |t         j                         + y wr  )r   r  r  )r   r  r  s     r8   r   z7PDFProcessor._extract_chunk_metadata.<locals>.<genexpr>  s#     wRYRYYwr}}Mws   /2c              3   N   K   | ]  }t        |j                                 y wr  )r/   r   r  s     r8   r   z7PDFProcessor._extract_chunk_metadata.<locals>.<genexpr>  s     %Hc!'')n%Hs   #%g      ?g           r   z\s+r   )sentence_count
word_count
char_countr  rR   )readability_scorehas_technical_contentis_boilerplate)r  r  r  r   r   rq   r   r  structural_infocontent_qualityclean_excerpt)rd   rv  r   r  r  r  r.   rP  r  r  r-  r4   r   r/   r   r-   absr   r   round)r6   r  r  r  r  r   r   r  r  rt  r  page_number
page_matchr  r  r  r  r  avg_sentence_lengthr  r  r  s                        @r8   r   z$PDFProcessor._extract_chunk_metadataj  s    &++- ..}=	

:}5 YY7t9Lbmm\
!*"2"21"56
 		%78ryy!6r||TU"*w7 !$  $P  GO  $P  !Pw]a]v]vww "%%Hi%H"H3y>"Y #CS#<ORT<T8UXZ8Z2Z)[ \ # vsM$3,?@FFH  (,((5}4.;]"%i.!%j!-0 ,	  &++<a%@)>"0 
 +)
 	
-s   &G   G)ra   r   Nr  )i  )ra   r   )__name__
__module____qualname__r9   rD   rL   r_   rs   r   rU   rf   r   dictr   r  r  r  r   rF  r  rZ  rc  rp  rv  r.   r   r   r  r   r<  r   r   r:   r8   r   r   .   s   %
N5
+(4Kl t     0# 0 0 0dZ# Z$ ZFP*$5C 53 53 500 0d8S 8T 8t   @>S >T >@# $ (A1s A13 A1\_ A1mq A1  ~B A1FF FT FT FP5S 5$ 5$ 5nS   OP>
S >
# >
QT >
.1>
CF>
VY>
/2>
HK>
TX>
r:   r   )$r>   rI   r   pathlibr   numpyr:  langchain_text_splittersr   sentence_transformersr   airagagent.configr   r   r   r	   r
   r   r   r   r   r   r   r   r   r   r   rw   re   r   pdfminer.high_levelr   ri   rj   r   r   r   r:   r8   <module>r     s      	   C 5    &K
0Lz
 z
  K  $ 	sH   A  A-  A*)A*-B3A<;B<BBBBB