
    +Si                     8   d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	m
Z
mZ d dlmZ d dlmZ d dlmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z# d dl$m%Z% d dl&m'Z' d d	l(m)Z) d d
l*m+Z+m,Z, d dl-m.Z.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?  ej                  eA      ZB G d de'      ZC G d deC      ZD e
de	ee5      ZE G d deCeeE         ZF G d deFe5         ZG G d deFe5         ZH G d deFe5         ZI G d deFe5         ZJy)    N)Sequence)BinaryIOClassVarGenericTextIOTypeVarcast)utils)ImageWriter)LAParamsLTAnnoLTCharLTComponentLTContainerLTCurveLTFigureLTImageLTItemLTLayoutContainerLTLineLTPageLTRectLTText	LTTextBoxLTTextBoxVerticalLTTextGroup
LTTextLineTextGroupElement)PDFColorSpace)PDFTextDevice)PDFValueError)PDFFontPDFUnicodeNotDefined)PDFGraphicStatePDFResourceManager)PDFPage)	PDFStream)AnyIOMatrixPathSegmentPointRectapply_matrix_ptapply_matrix_rectbbox2strencmake_compat_strmult_matrixc                      e Zd ZU eed<   eed<   	 	 d%dedededz  ddfdZ	d	e
deddfd
Zd	e
ddfdZdedededdfdZdeddfdZdededdfdZdededededee   ddfdZdededededededed edefd!Zdededefd"Zd#eddfd$Zy)&PDFLayoutAnalyzercur_itemctmNrsrcmgrpagenolaparamsreturnc                 Z    t        j                  | |       || _        || _        g | _        y N)r    __init__r8   r9   _stackselfr7   r8   r9   s       [/var/www/html/leadgen/airagagent/rag_env/lib/python3.12/site-packages/pdfminer/converter.pyr=   zPDFLayoutAnalyzer.__init__A   s)     	tW- /1    pagec                     t        ||j                        \  }}}}ddt        ||z
        t        ||z
        f}t        | j                  |      | _        y )Nr   )r.   mediaboxabsr   r8   r5   )r@   rC   r6   x0y0x1y1rE   s           rA   
begin_pagezPDFLayoutAnalyzer.begin_pageL   sK    ,S$--@RRq#b2g,BG5t{{H5rB   c                    | j                   r#J t        t        | j                                      t        | j                  t
              s#J t        t        | j                                     | j                  %| j                  j                  | j                         | xj                  dz  c_	        | j                  | j                         y )N   )r>   strlen
isinstancer5   r   typer9   analyzer8   receive_layout)r@   rC   s     rA   end_pagezPDFLayoutAnalyzer.end_pageQ   s    ;;5C$4 55$--0J#d4==6I2JJ0==$MM!!$--0qDMM*rB   namebboxmatrixc                     | j                   j                  | j                         t        ||t	        || j
                              | _        y r<   )r>   appendr5   r   r2   r6   )r@   rU   rV   rW   s       rA   begin_figurezPDFLayoutAnalyzer.begin_figureY   s3    4==) t[-JKrB   _c                 
   | j                   }t        | j                   t              s#J t        t	        | j                                      | j
                  j                         | _         | j                   j                  |       y r<   )r5   rP   r   rN   rQ   r>   popadd)r@   r[   figs      rA   
end_figurezPDFLayoutAnalyzer.end_figure]   sV    mm$--2LCT]]8K4LL2)#rB   streamc                 v   t        | j                  t              s#J t        t	        | j                                     t        ||| j                  j                  | j                  j                  | j                  j                  | j                  j                  f      }| j                  j                  |       y r<   )rP   r5   r   rN   rQ   r   rG   rH   rI   rJ   r^   )r@   rU   ra   items       rA   render_imagezPDFLayoutAnalyzer.render_imagec   s    $--2LCT]]8K4LL2]]t}}//1A1A4==CSCST

 	$rB   gstatestrokefillevenoddpathc                 x   dj                  d |D              }|dd dk7  ry|j                  d      dkD  rTt        j                  d|      D ]:  }||j	                  d      |j                  d       }| j                  |||||       < y|D 	cg c]%  }	t        t        |	d   dk7  r|	d	d n|d   d	d       ' }
}	|
D cg c]  }t        | j                  |       }}|D cg c]  }t        |d          }}|D cg c]W  }t        |ddd
   |d
dd
   d      D cg c]/  \  }}t        | j                  t        |      t        |      f      1 c}}Y }}}}t        ||d      D 	cg c]  \  }}	t        t        |g|	       }}}	t        |      dkD  r+|d	d dk(  r#|d	   |d   k(  r|dd	 dz   }|j!                          |dv r_t#        |j$                  |d   |d   ||||j&                  |j(                  ||j*                  
      }| j,                  j/                  |       y|dv r|\  \  }}\  }}\  }}\  }}}|d   |d   k(  }||k(  xr ||k(  xr ||k(  xr ||k(  xs ||k(  xr ||k(  xr ||k(  xr ||k(  }|rd|rbt1        |j$                  g |d   |d
   ||||j&                  |j(                  ||j*                  	      } | j,                  j/                  |        yt3        |j$                  |||||j&                  |j(                  ||j*                  	      }!| j,                  j/                  |!       yt3        |j$                  |||||j&                  |j(                  ||j*                  	      }!| j,                  j/                  |!       yc c}	w c c}w c c}w c c}}w c c}}}w c c}	}w )z@Paint paths described in section 4.4 of the PDF reference manual c              3   &   K   | ]	  }|d      yw)r   N ).0xs     rA   	<genexpr>z/PDFLayoutAnalyzer.paint_path.<locals>.<genexpr>u   s     +!+s   NrM   mzm[^m]+r   h   F)strict   lh>   mlmlh)original_pathdashing_style>   mlllhmllll   )joincountrefinditerstartend
paint_pathr	   r+   r-   r6   rN   zipfloatr*   rO   r]   r   	linewidthscolorncolordashr5   r^   r   r   )"r@   re   rf   rg   rh   ri   shaperq   subpathpraw_ptsptpts	operation	operatorsoperand1operand2transformed_pointsotransformed_pathlinerG   rH   rI   rJ   x2y2x3y3r[   is_closed_loophas_square_coordinatesrectcurves"                                     rA   r   zPDFLayoutAnalyzer.paint_pathl   s    +d++!9 [[![[E2 HqwwqzAEE!H5gwGH OSIJUadckAbcFtAwrs|DG  <CCR?488R0CCC<@AyYq\*AIA "&" " 	 /2!!$Q$14a4/*( $DHHuXh.PQ" "  	+=eL Aq [1'q'*    5zA~%*"4RCF9Jcr
S(	%
 $$FFMMMM"2"(++ !!$',,<?9R(2rHRhr2!$Q3q6!1"HCrCbBhC28*GBhE28EbER2X ' "&<!((*#a&*3q6*(
D MM%%d+#(((
E MM%%e,$$MMMM$KK
 !!%(s DA" s0   *N>N!N$>"N/ 4N)N//N6)N/fontfontsizescalingrisecidncsgraphicstatec	                 v   	 |j                  |      }	t        |	t              sJ t        t        |	                   	 |j                  |      }
|j                  |      }t        ||||||	|
|||
      }| j                  j                  |       |j                  S # t        $ r | j                  ||      }	Y zw xY wr<   )	to_unichrrP   rN   rQ   r#   handle_undefined_char
char_width	char_dispr   r5   r^   adv)r@   rW   r   r   r   r   r   r   r   text	textwidthtextdisprc   s                rA   render_charzPDFLayoutAnalyzer.render_char   s    	9>>#&DdC(9#d4j/9( OOC(	>>#&
 	$xx# $ 	9--dC8D	9s   :B B87B8c                 D    t         j                  d|d|       d| dS )Nzundefined: , z(cid:))logdebug)r@   r   r   s      rA   r   z'PDFLayoutAnalyzer.handle_undefined_char
  s)    		Kxr#12se1~rB   ltpagec                      y r<   rm   r@   r   s     rA   rS   z PDFLayoutAnalyzer.receive_layout  s    rB   rM   N)__name__
__module____qualname__r   __annotations__r)   r%   intr   r=   r&   rK   rT   rN   r,   rZ   r`   r'   rd   r$   boolr   r*   r   r"   r   r   r   r   r   rS   rm   rB   rA   r4   r4   =   s   	K
 $(		2#	2 	2 T/		2
 
	26w 6V 6 6
+W + +L LD L& LT LC D    i  D  {){) {) 	{)
 {) {#{) 
{)z  	
     & 
B'   V  rB   r4   c            	       L    e Zd Z	 	 d
dedededz  ddfdZdeddfdZdefd	Z	y)PDFPageAggregatorNr7   r8   r9   r:   c                 D    t         j                  | |||       d | _        y N)r8   r9   )r4   r=   resultr?   s       rA   r=   zPDFPageAggregator.__init__  s"     	""4("S%)rB   r   c                     || _         y r<   r   r   s     rA   rS   z PDFPageAggregator.receive_layout  s	    rB   c                 6    | j                   J | j                   S r<   r   r@   s    rA   
get_resultzPDFPageAggregator.get_result  s    {{&&&{{rB   r   )
r   r   r   r%   r   r   r=   r   rS   r   rm   rB   rA   r   r     sX     $(	*#* * T/	*
 
*V  F rB   r   IOTypec                   T    e Zd Z	 	 	 d
dedededededz  ddfdZe	de
defd	       Zy)PDFConverterNr7   outfpcodecr8   r9   r:   c                     t         j                  | |||       || _        || _        | j	                  | j                        | _        y r   )r4   r=   r   r   _is_binary_streamoutfp_binary)r@   r7   r   r   r8   r9   s         rA   r=   zPDFConverter.__init__)  s@     	""4("S"

 224::>rB   c                     dt        | dd      v ryt        | d      ryt        | t        j                        ryt        | t        j
                  t        j                  f      ryy)z"Test if an stream is binary or notbmoderk   TF)getattrhasattrrP   ioBytesIOStringIO
TextIOBase)r   s    rA   r   zPDFConverter._is_binary_stream6  sS     '%,,UF#rzz*R]];<rB   )utf-8rM   N)r   r   r   r%   r   rN   r   r   r=   staticmethodr(   r   r   rm   rB   rA   r   r   (  sr    
 $(?#? ? 	?
 ? T/? 
?  4  rB   r   c                        e Zd Z	 	 	 	 	 ddedededededz  dede	dz  d	df fd
Z
ded	dfdZded	dfdZdeded	dfdZdededededee   d	dfdZ xZS )TextConverterNr7   r   r   r8   r9   
showpagenoimagewriterr:   c                 J    t         |   |||||       || _        || _        y )Nr   r8   r9   )superr=   r   r   )	r@   r7   r   r   r8   r9   r   r   	__class__s	           rA   r=   zTextConverter.__init__G  s,     	%uVhW$&rB   r   c                     t        j                  || j                  d      }| j                  r8t	        t
        | j                        j                  |j                                y t	        t        | j                        j                  |       y )Nignore)
r
   compatible_encode_methodr   r   r	   r   r   writeencoder   r@   r   s     rA   
write_textzTextConverter.write_textU  s[    --dDJJI4::&,,T[[];$**40rB   r   c                      dt         dd f fd j                  r j                  d|j                   d        |        j                  d       y )Nrc   r:   c                 Z   t        | t              r| D ]
  } |        n/t        | t              rj                  | j	                                t        | t
              rj                  d       y t        | t              r)j                  j                  j                  |        y y y )N
)	rP   r   r   r   get_textr   r   r   export_image)rc   childrenderr@   s     rA   r   z,TextConverter.receive_layout.<locals>.render]  s    $,! "E5M"D&)0$	*%D'*t/?/?/K  --d3 0L*rB   zPage r   )r   r   r   pageidr@   r   r   s   ` @rA   rS   zTextConverter.receive_layout\  sK    		4 		4D 		4 ??OOeFMM?"56vrB   rU   ra   c                 L    | j                   t        j                  | ||       y y r<   )r   r   rd   )r@   rU   ra   s      rA   rd   zTextConverter.render_imagep  s%    '%%dD&9 (rB   re   rf   rg   rh   ri   c                      y r<   rm   )r@   re   rf   rg   rh   ri   s         rA   r   zTextConverter.paint_patht  s     	rB   )r   rM   NFN)r   r   r   r%   r(   rN   r   r   r   r   r=   r   r   rS   r'   rd   r$   r   r*   r   __classcell__)r   s   @rA   r   r   F  s    
 $( *.'#' ' 	'
 ' T/' ' !4'' 
'1s 1t 1V  (: :i :D :  	
  {# 
rB   r   c                      e Zd ZU dddddddZeeeef      ed<   d	dd
Zeeeef      ed<   	 	 	 	 	 	 	 	 	 	 	 	 d8de	de
dedededz  dededededededz  dedeeef   dz  deeef   dz  ddfdZdeddfdZd9dZd9d Zdeddfd!Zd"ed#ed$ed%ed&ed'eddfd(Zd"ed#ed)eddfd*Zd)ed#ed$ed%ed&ed'eddfd+Zd"eded$ed%ed,eddfd-Z	 d:d"ed#ed$ed%ed&ed'ed.eddfd/Zd"eddfd0Zded1ed2eddfd3Zd9d4Zd5eddfd6Z d9d7Z!y);HTMLConverteryellowmagentacyanredblackgray)figuretextlinetextbox	textgroupr   rC   RECT_COLORSblue)r  charTEXT_COLORSNr7   r   r   r8   r9   scale	fontscale
layoutmoder   
pagemarginr   r   rect_colorstext_colorsr:   c                 F   t         j                  | |||||       | j                  r| j                  st	        d      | j                  s| j                  rt	        d      |ddi}|ddd}|| _        || _        || _        |	| _        |
| _	        || _
        || _        || _        |rJ| j                  j                  | j                         | j                  j                  | j                         | j                  | _        d | _        g | _        | j'                          y )Nr   )Codec is required for a binary I/O outputz1Codec must not be specified for a text I/O outputr	  r  r  )r   rC   )r   r=   r   r   r!   r  r  r  r   r  r   r  r  updater  r
  _yoffset_font
_fontstackwrite_header)r@   r7   r   r   r8   r9   r  r  r  r   r  r   r   r  r  s                  rA   r=   zHTMLConverter.__init__  s   " 	 	 	
 TZZ KLL  TZZ STT!7+K$+V<K
"$$$&&&##D$4$45##D$4$45#/3
:<rB   r   c                     | j                   rCt        t        | j                        j	                  |j                  | j                                y t        t        | j                        j	                  |       y r<   r   r	   r   r   r   r   r   r   s     rA   r   zHTMLConverter.write  H    ::4::&,,T[[-DE$**40rB   c                     | j                  d       | j                  rd| j                   d}nd}| j                  |       | j                  d       y )Nz<html><head>
z<<meta http-equiv="Content-Type" content="text/html; charset=">
z5<meta http-equiv="Content-Type" content="text/html">
z</head><body>
)r   r   )r@   ss     rA   r  zHTMLConverter.write_header  sP    

#$::::,d, 
 IA

1

$%rB   c                     t        d| j                        D cg c]  }d| d| d }}ddj                  |       d}| j                  |       | j                  d       y c c}w )	NrM   z
<a href="#">z</a>z/<div style="position:absolute; top:0px;">Page: r   </div>
z</body></html>
)ranger8   r   r   )r@   i
page_linksr  s       rA   write_footerzHTMLConverter.write_footer  so    9>q$++9NOA
1#Rs$/O
OYYz*+85 	
 	

1

%& Ps   A#c                 8    | j                  t        |             y r<   )r   r0   r   s     rA   r   zHTMLConverter.write_text  s    

3t9rB   colorborderwidthro   ywrr   c                    | j                   j                  |      }|hd| d| d|| j                  z   d| j                  |z
  | j                  z   d|| j                  z   d|| j                  z   d}| j	                  |       y y )Nz(<span style="position:absolute; border:  zpx solid; left:px; top:
px; width:px; height:zpx;"></span>
)r  getr  r  r   )	r@   r&  r'  ro   r(  r)  rr   color2r  s	            rA   
place_rectzHTMLConverter.place_rect  s     !!%%e,!(!K= 1DJJ' ()TZZ78 9TZZ( )djj.)9  JJqM rB   rc   c                     | j                  |||j                  |j                  |j                  |j                         y r<   )r1  rG   rJ   widthheight)r@   r&  r'  rc   s       rA   place_borderzHTMLConverter.place_border  s(    {DGGTWWdjj$++VrB   c                 4   | j                   | j                   j                  |      }dt        |       d| d|| j                  z   d| j                  |z
  | j                  z   d|| j                  z   d|| j                  z   d}| j                  |       y y )Nz
<img src="z
" border="z!" style="position:absolute; left:r,  zpx;" width="
" height="" />
)r   r   r0   r  r  r   )	r@   rc   r'  ro   r(  r)  rr   rU   r  s	            rA   place_imagezHTMLConverter.place_image  s     '##006DSYKz+ ?DJJ' ()TZZ78 9djj.) *tzz>*&2  JJqM (rB   sizec           	      F   | j                   j                  |      }|d| d|| j                  z   d| j                  |z
  | j                  z   d|| j                  z  | j                  z   d	}| j                  |       | j                  |       | j                  d       y y )Nz&<span style="position:absolute; color:; left:r,  zpx; font-size:px;"></span>
)r  r/  r  r  r  r   r   )r@   r&  r   ro   r(  r:  r0  r  s           rA   
place_textzHTMLConverter.place_text  s     !!%%e, !DJJ' ()TZZ78 9!DJJ.?@	G  JJqMOOD!JJ{# rB   writing_modec                 0   | j                   j                  | j                         d | _        d| d| d| d|| j                  z   d| j                  |z
  | j                  z   d|| j                  z   d|| j                  z   d}| j                  |       y )	Nz'<div style="position:absolute; border: r+  zpx solid; writing-mode:r<  r,  r-  r.  r=  )r  rY   r  r  r  r   )	r@   r&  r'  ro   r(  r)  rr   r@  r  s	            rA   	begin_divzHTMLConverter.begin_div#  s     	tzz*
gQ{m ,(> *

N# $MMA%34 5^$ %$**n%U, 	
 	

1rB   c                     | j                   | j                  d       | j                  j                         | _         | j                  d       y )N</span>z</div>)r  r   r  r]   )r@   r&  s     rA   end_divzHTMLConverter.end_div:  s8    ::!JJy!__((*


8rB   fontnamer   c                     ||f}|| j                   k7  rj| j                   | j                  d       |j                  d      d   }| j                  d| d|| j                  z  | j                  z   d       || _         | j                  |       y )NrD  +z<span style="font-family: z; font-size:zpx">)r  r   splitr  r  r   )r@   r   rF  r   r   fontname_without_subset_tags         rA   put_textzHTMLConverter.put_text@  s    (#4::zz%

9%*2..*=b*A'JJ  ;< =%

2T^^CDDJ
 DJrB   c                 &    | j                  d       y )Nz<br>r   r   s    rA   put_newlinezHTMLConverter.put_newlineO  s    

6rB   r   c                      dt         t        z  dd f fddt        dd f fd |        xj                   j                  z  c_        y )Nrc   r:   c                 l    t        | t              r#j                  dd|        | D ]
  } |        y y )Nr  rM   )rP   r   r5  rc   r   r@   
show_groups     rA   rS  z0HTMLConverter.receive_layout.<locals>.show_groupS  s;    $,!!+q$7! &Eu%& -rB   c           
      N   t        | t              r؉xj                  | j                  z  c_        j	                  dd|        j
                  rdj                  dj                  | j                  z
  j                  z          j                  d| j                   d| j                   d       | D ]
  } |        | j                  | j                  D ]
  } |        y y t        | t              rj	                  dd|        y t        | t              r_j                  dd| j                  | j                  | j                  | j                         | D ]
  } |        j!                  d       y t        | t"              r?j%                  | d| j                  | j                  | j                  | j                         y j&                  d	k(  rt        | t(              r#j	                  d
d|        | D ]
  } |        y t        | t*              rbj	                  dd|        j-                  dt/        | j0                  dz         | j                  | j                  d       | D ]
  } |        y t        | t2              rUj	                  dd|        j-                  d| j5                         | j                  | j                  | j6                         y y t        | t(              r0| D ]
  } |        j&                  dk7  rj9                          y y t        | t*              rnj                  dd| j                  | j                  | j                  | j                  | j;                                | D ]
  } |        j!                  d       y t        | t2              rAt=        | j>                        }jA                  | j5                         || j6                         y t        | tB              r jE                  | j5                                y y )NrC   rM   z*<div style="position:absolute; top:%dpx;">z	<a name="z">Page z</a></div>
r   r  exactr  r     r	  loose)#rP   r   r  rJ   r5  r   r   r  r   groupsr   r   rB  rG   r3  r4  rE  r   r9  r  r   r   r?  rN   indexr   r   r:  rO  get_writing_moder1   rF  rL  r   r   )rc   r   grouprF  r   r@   rS  s       rA   r   z,HTMLConverter.receive_layout.<locals>.renderY  s_   $'(!!&!T2??JJD MMDGG3tzzABD JJ#DKK=}LQ " "E5M";;*!% *"5)* + D'*!!'1d3D(+xDGGTWWdjj$++V! "E5M"X&D'*  q$''477DJJTG+dJ/%%j!T:!% &u&i0%%iD9OO!DJJN+ "& &u&f-%%fa6OO		 . D*-! "E5M"??g-$$& .D),GGGGJJKK))+ " "E5M"Y'D&)*4==9dmmoxCD&)0 *rB   )r   r   r   r  r  r@   r   r   rS  s   ` @@rA   rS   zHTMLConverter.receive_layoutR  sN    	&[+;; 	& 	&J	1 J	1D J	1X 	v(rB   c                 $    | j                          y r<   r$  r   s    rA   closezHTMLConverter.close      rB   )r   rM   NrM   g      ?normalT2   Nr   NNr:   N)False)"r   r   r   r  r   dictrN   r   r
  r%   r(   r   r   r   r   r   r=   r   r  r$  r   r1  r   r5  r   r9  r?  rB  rE  rL  rO  r   rS   r_  rm   rB   rA   r   r     s	   -K$sCx.)  -K$sCx.)  $("*.-1-13#3 3 	3
 3 T/3 3 3 3 3 3 !4'3 3 #s(^d*3 #s(^d*3  
!3j1# 1$ 1
&'s t   	
    
*W# WC W{ Wt W  	
    
*$$ $ 	$
 $ $ 
$: $  	
     
.S T S C 5 T T)V T) T)lrB   r   c                       e Zd Z ej                  d      Z	 	 	 	 	 ddededede	de
dz  dedz  d	ed
dfdZded
dfdZddZddZded
dfdZded
dfdZddZy)XMLConverterz[ ---]Nr7   r   r   r8   r9   r   stripcontrolr:   c                     t         j                  | |||||       | j                  | j                   k(  rt	        d      || _        || _        | j                          y )Nr   r  )r   r=   r   r   r!   r   rh  r  )r@   r7   r   r   r8   r9   r   rh  s           rA   r=   zXMLConverter.__init__  sg     	 	 	
 TZZ0 KLL&(rB   r   c                     | j                   rCt        t        | j                        j	                  |j                  | j                                y t        t        | j                        j	                  |       y r<   r  r   s     rA   r   zXMLConverter.write  r  rB   c                     | j                   r | j                  d| j                    d       n| j                  d       | j                  d       y )Nz<?xml version="1.0" encoding="z" ?>
z<?xml version="1.0" ?>
z<pages>
r   r   r   s    rA   r  zXMLConverter.write_header  s<    ::JJ7

|6JKJJ12

;rB   c                 &    | j                  d       y )Nz	</pages>
rN  r   s    rA   r$  zXMLConverter.write_footer  s    

< rB   c                     | j                   r| j                  j                  d|      }| j                  t	        |             y Nrk   )rh  CONTROLsubr   r0   r   s     rA   r   zXMLConverter.write_text  s1    <<##B-D

3t9rB   r   c                 X     dt         dd f fddt         dd f fd |       y )Nrc   r:   c                 D   t        | t              r6j                  d| j                   dt	        | j
                         d       y t        | t              rIj                  dt	        | j
                         d       | D ]
  } |        j                  d       y y )N<textbox id="" bbox="r8  z<textgroup bbox="r  z</textgroup>
)rP   r   r   rY  r/   rV   r   rR  s     rA   rS  z/XMLConverter.receive_layout.<locals>.show_group  s    $	*

#DJJ<x8K7LFS D+.

.x		/B.C4HI! &Eu%&

+,	 /rB   c                 	   t        | t              rd| j                   dt        | j                         d| j
                   d}j                  |       | D ]
  } |        | j                  ;j                  d       | j                  D ]
  } |        j                  d       j                  d       y t        | t              r8d| j                   dt        | j                         d	}j                  |       y t        | t              r8d
| j                   dt        | j                         d	}j                  |       y t        | t              rId| j                   dt        | j                         d| j                          d}j                  |       y t        | t              rXd| j                   dt        | j                         d}j                  |       | D ]
  } |        j                  d       y t        | t              rIj                  dt        | j                         d       | D ]
  } |        j                  d       y t        | t               rod}t        | t"              rd}d| j$                   dt        | j                         d| d}j                  |       | D ]
  } |        j                  d       y t        | t&              rdt)        | j*                         dt        | j                         d| j,                  j                   d| j.                  j0                   d| j2                  dd}j                  |       j5                  | j7                                j                  d       y t        | t8              r$j                  d| j7                          d       y t        | t:              rj<                  Tj<                  j?                  |       }j                  d t)        |       d!| j@                   d"| jB                   d	       y j                  d#| j@                   d"| jB                   d	       y tE        tG        d$| f            )%Nz
<page id="ru  z
" rotate="r  z	<layout>
z
</layout>
z</page>
z<line linewidth="r8  z<rect linewidth="z<curve linewidth="z" pts="z"/>
z<figure name="z
</figure>
z<textline bbox="z</textline>
rk   z wmode="vertical"rt  "z>
z</textbox>
z<text font="z" colourspace="z" ncolour="z" size="z.3fr  z</text>
z<text>z<image src="z	" width="r7  z<image width="	Unhandled)$rP   r   r   r/   rV   rotater   rX  r   r   r   r   get_ptsr   rU   r   r   r   rY  r   r0   rF  r   r   r   r:  r   r   r   r   r   r   r3  r4  AssertionErrorrN   )	rc   r  r   r[  wmoderU   r   r@   rS  s	         rA   r   z+XMLConverter.receive_layout.<locals>.render  s   $'  .%dii01 2#{{m41 
 

1! "E5M";;*JJ|,!% *"5)*JJ}-

;'D&)""&..!1 2%dii019 
 

1D&)""&..!1 2%dii019 
 

1D'*""&..!1 2%dii01 2 LLN+52  

1D(+$TYYKx8K7LDQ

1! "E5M"

=)D*-

-htyy.A-B$GH! "E5M"

?+D),d$56/E#DJJ<x8K7LAeWTWX

1! "E5M"

>*D&) /0 1%dii01 2$$(HHMM? 3  $ 1 1 8 89 :!YYsO2/  

10

;'D&)

VDMMO#4I>?D'*##/++88>DJJ  #D	{ +""&** .##';;-v7 JJ(Jt{{m6R %S+t)<%=>>rB   r   r\  s   ` @@rA   rS   zXMLConverter.receive_layout  s6    		-V 		- 		-W	? W	?D W	?r 	vrB   c                 $    | j                          y r<   r^  r   s    rA   r_  zXMLConverter.closeF  r`  rB   )r   rM   NNFrc  )r   r   r   r   compilerp  r%   r(   rN   r   r   r   r   r=   r   r  r$  r   r   rS   r_  rm   rB   rA   rg  rg    s    bjj89G $(*."#  	
  T/ !4'  
61# 1$ 1 !s t 
eV e eNrB   rg  c                       e Zd ZdZ ej
                  d      Z	 	 	 	 ddedede	de
dedz  d	efd
Zdede	fdZde	ddfdZddZddZde	ddfdZddZdeddfdZddZy)HOCRConverterzKExtract an hOCR representation from explicit text information within a PDF.z[\x00-\x08\x0b-\x0c\x0e-\x1f]Nr7   r   r   r8   r9   rh  c                 v    t         j                  | |||||       || _        d| _        | j	                          y )Nr   F)r   r=   rh  within_charsr  )r@   r7   r   r   r8   r9   rh  s          rA   r=   zHOCRConverter.__init__]  sG     	 	 	
 )!rB   rV   r:   c                     |\  }}}}t        |      }t        | j                  d   |z
        }t        |      }t        | j                  d   |z
        }	d| d| d| d|	 S )Nrv   zbbox r+  )r   	page_bbox)
r@   rV   in_x0in_y0in_x1in_y1out_x0out_y0out_x1out_y1s
             rA   	bbox_reprzHOCRConverter.bbox_reprr  sq    '+$ueUT^^A&./UT^^A&./vhaxq&::rB   r   c                     | j                   rE|j                  | j                         }t        t        | j                        j                  |       y t        t        | j                        j                  |       y r<   )r   r   r	   r   r   r   r   )r@   r   encoded_texts      rA   r   zHOCRConverter.write{  sM    ::;;tzz2L4::&,,\:$**40rB   c                 l   | j                   r | j                  d| j                    d       n| j                  d       | j                  d       | j                  d       | j                  d       | j                  d       | j                  d       | j                  d	       | j                  d
       y )NzL<html xmlns='http://www.w3.org/1999/xhtml' xml:lang='en' lang='en' charset=''>
zD<html xmlns='http://www.w3.org/1999/xhtml' xml:lang='en' lang='en'>
z<head>
z<title></title>
zE<meta http-equiv='Content-Type' content='text/html;charset=utf-8' />
zA<meta name='ocr-system' content='pdfminer.six HOCR Converter' />
zR  <meta name='ocr-capabilities' content='ocr_page ocr_block ocr_line ocrx_word'/>
z</head>
z<body>
rl  r   s    rA   r  zHOCRConverter.write_header  s    ::JJ448JJ<tE
 JJW 	

:

&'

T	
 	

P	
 	

C	
 	

;

:rB   c                 H    | j                  d       | j                  d       y )Nz0<!-- comment in the following line to debug -->
zD<!--script src='https://unpkg.com/hocrjs'></script--></body></html>
rN  r   s    rA   r$  zHOCRConverter.write_footer  s    

FG

S	
rB   c                 v    | j                   r| j                  j                  d|      }| j                  |       y ro  )rh  rp  rq  r   r   s     rA   r   zHOCRConverter.write_text  s-    <<##B-D

4rB   c                    t        | j                        dkD  rd}d| j                  v rd}d| j                  v r|dz  }| j                  d| j                   d| j                   d	| d
| j                  | j                         d| j                   d| j                   d| j                  j                          d       d| _        y )Nr   rk   Italiczfont-style: italic; Boldzfont-weight: bold; z<span style='font:"z"; font-size:z; z' class='ocrx_word' title='z	; x_font z
; x_fsize '>rD  F)	rO   working_textworking_fontr   working_sizer  working_bboxstripr  )r@   bold_and_italic_styless     rA   
write_wordzHOCRConverter.write_word  s    t  !A%%'"4,,,)?&***&*??&JJ&t'8'8&9 :!../r)* +..):):;< =++, -,,-R$$**,-W6	 "rB   r   c                 6     dt         dd f fd |       y )Nrc   r:   c                 j   j                   r t        | t              rj                          t        | t              rm| j
                  _        j                  d| j                   dj                  | j
                         d       | D ]
  } |        j                  d       y t        | t              rOj                  dj                  | j
                         d       | D ]
  } |        j                  d       y t        | t              r\j                  d| j                   dj                  | j
                         d       | D ]
  } |        j                  d       y t        | t              rj                   sPd	_         | j                         _        | j
                  _        | j"                  _        | j&                  _        y t+        | j                         j-                               d
k(  r0j                          j                  | j                                y j                   d   | j
                  d   k7  s2j$                  | j"                  k7  sj(                  | j&                  k7  rCj                          | j
                  _        | j"                  _        | j&                  _        xj                  | j                         z  c_        j                   d
   j                   d   | j
                  d   j                   d   f_        y y )Nz<div class='ocr_page' id='z	' title='r  r   z<span class='ocr_line' title='r  r>  z<div class='ocr_block' id='Tr   rM   rt   rv   )r  rP   r   r  r   rV   r  r   r   r  r   r   rY  r   r   r  r  rF  r  r:  r  rO   r  )rc   r   
child_liner   r@   s      rA   r   z,HOCRConverter.receive_layout.<locals>.render  s     Zf%=!$'!%

;;- ("nnTYY78> " "E5M"

:&D*-

4T^^DII5N4OrR #' 'J:&'

;'D),

::, '"nnTYY78> " "E5M"

:&D&)(((,D%(,D%(,		D%(,D%(,		D%..01Q6OO%JJt}}/ ))!,		!<,,=,,		9),0II),0MM),0II)%%8%))!,))!,		!))!,	)D%+ *rB   r}  r   s   ` @rA   rS   zHOCRConverter.receive_layout  s     9	 9	D 9	v 	vrB   c                 $    | j                          y r<   r^  r   s    rA   r_  zHOCRConverter.close  r`  rB   )utf8rM   NFrc  )r   r   r   __doc__r   r  rp  r%   r(   rN   r   r   r   r=   r,   r  r   r  r$  r   r  r   rS   r_  rm   rB   rA   r  r  J  s    U  bjj9:G $("#  	
  T/ *;d ;s ;1# 1$ 12
s t 
"&<V < <|rB   r  )Kr   loggingr   collections.abcr   typingr   r   r   r   r   r	   pdfminerr
   pdfminer.imager   pdfminer.layoutr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   pdfminer.pdfcolorr   pdfminer.pdfdevicer    pdfminer.pdfexceptionsr!   pdfminer.pdffontr"   r#   pdfminer.pdfinterpr$   r%   pdfminer.pdfpager&   pdfminer.pdftypesr'   pdfminer.utilsr(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   	getLoggerr   r   r4   r   r   r   r   r   rg  r  rm   rB   rA   <module>r     s   	  	 $   &     * , , 0 : B $ '    g!R Rj) & 
68U	3$gfo <6L' 6rjL' jZ	[<& [|nL' nrB   