
    +Si                          d dl Z d dlZd dlmZmZ d dlmZmZmZ d dl	m
Z
 d dlmZmZmZ d dlmZmZ d dlmZ d dlmZmZmZmZ d d	lmZ d d
lmZmZ  ej<                  e      Z  ed      Z! ed      Z" G d d      Z#y)    N)	ContainerIterator)AnyBinaryIOClassVar)settings)PDFDocumentPDFNoPageLabelsPDFTextExtractionNotAllowed)PDFObjectNotFoundPDFValueError)	PDFParser)
dict_value	int_value
list_valueresolve1)LIT)Rect
parse_rectPagePagesc                      e Zd ZU dZdededededz  ddf
dZdefd	Zh d
Z	e
ee      ed<   ededed    fd       Ze	 	 	 	 	 ddedee   dz  dededededed    fd       ZdedefdZdededefdZdedee   fdZy)PDFPageaz  An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Attributes
    ----------
      doc: a PDFDocument object.
      pageid: any Python object that can uniquely identify the page.
      attrs: a dictionary of page attributes.
      contents: a list of PDFStream objects that represents the page content.
      lastmod: the last modified time of the page.
      resources: a dictionary of resources used by the page.
      mediabox: the physical size of the page.
      cropbox: the crop rectangle of the page.
      rotate: the page rotation (in degree).
      annots: the page annotations.
      beads: a chain that represents natural reading order.
      label: the page's label (typically, the logical page number).

    docpageidattrslabelNreturnc                    || _         || _        t        |      | _        || _        t        | j                  j                  d            | _        t        | j                  j                  di             | _        | j                  | j                  j                  d            | _
        | j                  | j                  j                  d      | j                        | _        | j                  | j                  j                  d            | _        t        | j                  j                  dd            dz   dz  | _        | j                  j                  d	      | _        | j                  j                  d
      | _        y)zInitialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.
        label: page label string.
        LastModified	ResourcesMediaBoxCropBoxContentsRotater   ih  AnnotsBN)r   r   r   r   r   r   getlastmod	resources_parse_mediaboxmediabox_parse_cropboxcropbox_parse_contentscontentsr   rotateannotsbeads)selfr   r   r   r   s        Y/var/www/html/leadgen/airagagent/rag_env/lib/python3.12/site-packages/pdfminer/pdfpage.py__init__zPDFPage.__init__1   s    &



~ >?/7JJNN;+0
 ,,TZZ^^J-GH**4::>>)+DdmmT,,TZZ^^J-GH !!<=CsJjjnnX.ZZ^^C(
    c                 <    d| j                   d| j                  dS )Nz<PDFPage: Resources=z, MediaBox=>)r*   r,   )r4   s    r5   __repr__zPDFPage.__repr__P   s"    %dnn%7{4==BSSTUUr7   >   r%   r#   r"   r!   INHERITABLE_ATTRSdocumentc              #      	K   	 d
dt         dt        t        t         f   dt        t            d z  dt        t
        t        t        t         t        t         t         f   f   f      f 	fd		 j                         }d}dj                  v rB 	j                  d   j                        }|D ]  \  }}  ||t        |             d} |svj                  D ]f  }|j                         D ]Q  }	 j                  |      }t!        |t              r-|j#                  d	      t$        u r  ||t        |             S h y y # t        $ r t        j                  d       }Y w xY w# t&        $ r Y w xY ww)Nobjparentvisitedr   c              3     K   t        | t              r+| }t        j                  |            j	                         }n%| j
                  }t        |       j	                         }|
t               }||v ry |j                  |       |j                         D ]  \  }}|	j                  v s||vs|||<     |j                  d      }|!t        j                  s|j                  d      }|t        u rCd|v r?t        j                  d|d          t!        |d         D ]  } 
|||      E d {     y |t"        u rt        j                  d|       ||f y y 7 -w)NTypetypeKidszPages: Kids=%rzPage: %r)
isinstanceintr   getobjcopyobjidsetadditemsr;   r(   r   STRICTLITERAL_PAGESlogdebugr   LITERAL_PAGE)r>   r?   r@   	object_idobject_propertieskvobject_typechildclsdepth_first_searchr<   s            r5   rY   z0PDFPage.create_pages.<locals>.depth_first_search\   sd    
 #s#	$.xy/I$J$O$O$Q!  II	$.sO$8$8$:! %G#KK	" -1---!;L2L+,%a(- ,//7K"8??/33F;m+:K0K		*,=f,EF'(9&(AB UE1%9JGTTTU ,		*&78 "344 - Us   B'E+E0A>E.E/.EFr   TrB   N)r   dictstrrJ   r   tuplerF   get_page_labelsr
   	itertoolsrepeatcatalognextxrefs
get_objidsrG   rE   r(   rQ   r   )
rX   r<   page_labelspagesobjectsrI   treexrefr>   rY   s
   ``       @r5   create_pageszPDFPage.create_pagesZ   s    
 (,$	5$	5cN$	5 X_$	5 eCc4S>&9!::;<	$	5L	1080H0H0JK h&&&()9)9')BHDTDTUG& t(E4k1BCC   !__. E&ooe4%c40SWWV_5T"%hsD<M"NN	   	1#**40K	1" - sP   A(E<.E	 >A6E<5AE-E<	E*'E<)E**E<-	E96E<8E99E<fppagenosmaxpagespasswordcachingcheck_extractablec              #   $  K   t        |      }t        |||      }|j                  s-|rd|}	t        |	      d|d}
t        j                  |
       t        | j                  |            D ]  \  }}|r||vr| |s||dz   k  s y  y w)N)rn   ro   z Text extraction is not allowed: zThe PDF z contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case   )r   r	   is_extractabler   rO   warning	enumeraterj   )rX   rk   rl   rm   rn   ro   rp   parserr   	error_msgwarning_msgpagenopages                r5   	get_pageszPDFPage.get_pages   s      2&8WE !! >rfE	1)<< rf %A A  K(%c&6&6s&;< 	LFDF'1JH
2	s   B BBBvaluec                     d}|t         j                  d       |S 	 t        d t        |      D              S # t        $ r t         j                  d       |cY S w xY w)N)        r~   g      @g     @zHMediaBox missing from /Page (and not inherited), defaulting to US Letterc              3   2   K   | ]  }t        |        y wrZ   r   .0vals     r5   	<genexpr>z*PDFPage._parse_mediabox.<locals>.<genexpr>        GhsmG   z2Invalid MediaBox in /Page, defaulting to US Letter)rO   rt   r   r   r   )r4   r|   	us_letters      r5   r+   zPDFPage._parse_mediabox   sa    ,	=KK* 	GxGGG 	KKLM	s   8  AAr,   c                     ||S 	 t        d t        |      D              S # t        $ r t        j	                  d       |cY S w xY w)Nc              3   2   K   | ]  }t        |        y wrZ   r   r   s     r5   r   z)PDFPage._parse_cropbox.<locals>.<genexpr>   r   r   z0Invalid CropBox in /Page, defaulting to MediaBox)r   r   r   rO   rt   )r4   r|   r,   s      r5   r-   zPDFPage._parse_cropbox   sG    =O	GxGGG 	KKJKO	s   !  AAc                 J    g }|t        |      }t        |t              s|g}|S rZ   )r   rE   list)r4   r|   r0   s      r5   r/   zPDFPage._parse_contents   s,     Hh-$:r7   )Nr    TF)__name__
__module____qualname____doc__r	   objectr\   r6   r:   r;   r   rJ   __annotations__classmethodr   rj   r   r   rF   boolr{   r   r   r+   r-   r   r/    r7   r5   r   r      sJ   .)) ) 	)
 Tz) 
)>V# V-xC)  ;K ;HY4G ; ;z  *."'"" 3$&" 	"
 " "  " 
)	" "HS T "
C 
4 
D 
S T#Y r7   r   )$r_   loggingcollections.abcr   r   typingr   r   r   pdfminerr   pdfminer.pdfdocumentr	   r
   r   pdfminer.pdfexceptionsr   r   pdfminer.pdfparserr   pdfminer.pdftypesr   r   r   r   pdfminer.psparserr   pdfminer.utilsr   r   	getLoggerr   rO   rQ   rN   r   r   r7   r5   <module>r      sh      / * *  
 D ( I I ! +g! 6{GG Gr7   