
    +Si`              #          d Z ddlZddlZddlmZmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZmZmZmZmZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ 	 	 	 	 	 	 	 	 	 	 	 	 	 d$de
de"de%de%dedz  de&dee&   dz  de%de'de&de%de%dz  de(de(de(de	ddf"dZ)	 	 	 	 	 	 d%d e#de%dee&   dz  de&d!e(de%dedz  de%fd"Z*	 	 	 	 	 d&d e#de%dee&   dz  de&d!e(dedz  dee   fd#Z+y)'zIFunctions that can be used for the most common use-cases for pdfminer.six    N)	ContainerIterator)StringIO)AnyBinaryIOcast)HOCRConverterHTMLConverterPDFPageAggregatorTextConverterXMLConverter)ImageWriter)LAParamsLTPage)	PDFDeviceTagExtractor)PDFValueError)PDFPageInterpreterPDFResourceManager)PDFPage)AnyIO
FileOrNameopen_filenameinfoutfpoutput_typecodeclaparamsmaxpagespage_numberspasswordscalerotation
layoutmode
output_dirstrip_controldebugdisable_cachingkwargsreturnc           	         |r1t        j                         j                  t         j                         d}|rt	        |      }t        |       }d}|dk7  r-|t        j                  k(  rt        j                  j                  }|dk(  rt        |||||      }nw|dk(  rt        ||||||      }n`|dk(  rt        |||||
||      }nH|d	k(  rt        |||||
      }n2|dk(  rt        |t        t        |      |      }nd| }t!        |      |J t#        ||      }t%        j&                  | ||||       D ]*  }|j(                  |	z   dz  |_        |j+                  |       , |j-                          y)ak  Parses text from inf-file and writes to outfp file-like object.

    Takes loads of optional arguments but the defaults are somewhat sane.
    Beware laparams: Including an empty LAParams is not the same as passing
    None!

    :param inf: a file-like object to read PDF structure from, such as a
        file handler (using the builtin `open()` function) or a `BytesIO`.
    :param outfp: a file-like object to write the text to.
    :param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
        Only 'text' works properly.
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. Default is None
        but may not layout correctly.
    :param maxpages: How many pages to stop parsing after
    :param page_numbers: zero-indexed page numbers to operate on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param scale: Scale factor
    :param rotation: Rotation factor
    :param layoutmode: Default is 'normal', see
        pdfminer.converter.HTMLConverter
    :param output_dir: If given, creates an ImageWriter for extracted images.
    :param strip_control: Does what it says on the tin
    :param debug: Output more logging data
    :param disable_caching: Does what it says on the tin
    :param other:
    :return: nothing, acting as it does on two streams. Use StringIO to get
        strings.
    Ncachingtext)r   r   imagewriterxml)r   r   r/   stripcontrolhtml)r   r"   r$   r   r/   hocr)r   r   r1   tag)r   z1Output type can be text, html, xml or tag but is r   r!   r-   ih  )logging	getLoggersetLevelDEBUGr   r   sysstdoutbufferr   r   r
   r	   r   r   r   r   r   r   	get_pagesrotateprocess_pageclose)r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r/   rsrcmgrdevicemsginterpreterpages                         \/var/www/html/leadgen/airagagent/rag_env/lib/python3.12/site-packages/pdfminer/high_level.pyextract_text_to_fprG      s   ^ $$W]]3K!*- _)<=G#Ff#**!4

!!f#
 
	#&
 
	!#
 
	&
 
	gtHe'<EJ B+OC  $Wf5K!!## ' {{X-4  &' LLN    pdf_filer-   c           	         |
t               }t        | d      5 }t               5 }t        t        |      }t        |      }	t        |	|||      }
t        |	|
      }t        j                  |||||      D ]  }|j                  |        |j                         cddd       cddd       S # 1 sw Y   nxY wddd       y# 1 sw Y   yxY w)aw  Parse and return the text contained in a PDF file.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. If None, uses
        some default settings that often work well.
    :return: a string containing all of the text extracted.
    Nrbr,   )r   r   r5   )r   r   r   r   r   r   r   r   r   r=   r?   getvalue)rI   r!   r    r   r-   r   r   fpoutput_stringrA   rB   rD   rE   s                rF   extract_textrO      s    , :	x	& ("hj (M(B$W5wUXV(&9%%
 	+D $$T*	+ %%'( ( ( ( ( ( (s#   CA7B.	C.B7	3CCc              #   Z  K   |
t               }t        | d      5 }t        t        |      }t	        |      }t        ||      }t        ||      }	t        j                  |||||      D ]'  }
|	j                  |
       |j                         }| ) 	 ddd       y# 1 sw Y   yxY ww)a  Extract and yield LTPage objects

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param laparams: An LAParams object from pdfminer.layout. If None, uses
        some default settings that often work well.
    :return: LTPage objects
    NrK   r,   )r   r5   )r   r   r   r   r   r   r   r   r=   r?   
get_result)rI   r!   r    r   r-   r   rM   resource_managerrB   rD   rE   layouts               rF   extract_pagesrT      s     ( :	x	& "(B-g>"#3hG()96B%%
 		D $$T*&&(FL		  s   B+A:B	B+B($B+)r.   utf-8Nr   N g      ?r   normalNFFF)rV   Nr   TrU   N)rV   Nr   TN),__doc__r6   r:   collections.abcr   r   ior   typingr   r   r   pdfminer.converterr	   r
   r   r   r   pdfminer.imager   pdfminer.layoutr   r   pdfminer.pdfdevicer   r   pdfminer.pdfexceptionsr   pdfminer.pdfinterpr   r   pdfminer.pdfpager   pdfminer.utilsr   r   r   strintfloatboolrG   rO   rT    rH   rF   <module>ri      s   O  
 /  & &  ' , 6 0 E $ ; ;  $*.!!w	ww w 	w
 ow w C.4'w w w w w d
w w w w  !w" 
#wx *. $(((((( C.4'(( 	((
 (( (( o(( 	((Z *. $%%% C.4'% 	%
 % o% f%rH   