
    Ui>                        d dl Z d dlZd dlZ	 ddlmZ 	 ddlmZ ej                  Zej                  Z	dZ
dZdZdZ	 ej                  Zej&                  Zej*                  edf   Zej0                  e   Zej0                  e   Zej0                  e   Zej0                  e   Zej0                  ej@                     Z!	 	 	 	 	 dAd
ejD                  dededejF                  de$de%fdZ&	 	 	 	 	 	 dBd
ejD                  dededejF                  de$de%fdZ'	 	 	 	 dCd
ejD                  dededejF                  def
dZ(	 dDd
ejD                  dedejF                  defdZ)	 	 dEd
ejD                  de
de
dedejF                  f
dZ*	 	 	 	 	 dFd
ejD                  dededede$dedejF                  fdZ+	 dGdddd	dddd
ejD                  dedededejF                  de$fd Z,dDdefd!Z-d"ed#edefd$Z.d
ejD                  d%edefd&Z/de%fd'Z0de%fd(Z1d)ede2fd*Z3d)ede2fd+Z4d,ejj                  d"ede2fd-Z6d. Z7	 d/ Z8d0 Z9defd1Z:defd2Z;d3edefd4Z<d5e2d6ed7e2dejz                  fd8Z>d5e2d6edejz                  fd9Z?dDd:ed;e%dejz                  fd<Z@dDd5e2d6ed=e%dejz                  fd>ZAd5e2d6ed?edejz                  fd@ZBy# e$ r d dlZY w xY w# e$ r d dlZY w xY w# e$ r eez  ez  ZY w xY w)H    N   )pymupdf)mupdf
point_like	rect_likematrix_like	quad_likeFpageclipflagstextpagesortreturnc                    t        j                  |        |t         j                  }|}|| j                  ||      }nt	        |d      | k7  rt        d      |j                         }|~|r|j                  d        |S )a_  Return the text blocks on a page.

    Notes:
        Lines in a block are concatenated with line breaks.
    Args:
        flags: (int) control the amount of data parsed into the textpage.
    Returns:
        A list of the blocks. Each item contains the containing rectangle
        coordinates, text lines, running block number and block type.
    r   r   parentnot a textpage of this pagec                     | d   | d   fS N   r    )bs    Z/var/www/html/eventheodds/airagagent/rag_env/lib/python3.12/site-packages/pymupdf/utils.py<lambda>z!get_text_blocks.<locals>.<lambda>R   s    1Q41,     key)r   CheckParentTEXTFLAGS_BLOCKSget_textpagegetattr
ValueErrorextractBLOCKSr   )r
   r   r   r   r   tpblockss          r   get_text_blocksr&   4   s    " }((	B	zD6	X	$	&677F./Mr   r   c                    fd}t        j                  |        |t         j                  }|}|| j                  ||      }nt	        |d      | k7  rt        d      |j                  |      }	|]|[t        j                  |      }|	D 
cg c];  }
t        ||
dd z        dt        t        j                  |
dd             z  k\  s:|
= }	}
|~|	r
|r ||	      }	|	S c c}
w )a  Return the text words as a list with the bbox for each word.

    Args:
        page: pymupdf.Page
        clip: (rect-like) area on page to consider
        flags: (int) control the amount of data parsed into the textpage.
        textpage: (pymupdf.TextPage) either passed-in or None.
        sort: (bool) sort the words in reading sequence.
        delimiters: (str,list) characters to use as word delimiters.
        tolerance: (float) consider words to be part of the same line if
            top or bottom coordinate are not larger than this. Relevant
            only if sort=True.

    Returns:
        Word tuples (x0, y0, x1, y1, "word", bno, lno, wno).
    c                    | j                  d        g }| d   g}t        j                  | d   dd       }| dd D ]  }t        j                  |dd       }t        |j                  |j                  z
        k  s%t        |j
                  |j
                  z
        k  r|j                  |       ||z  }||j                  d        |j                  |       |g}|} |j                  d        |j                  |       |S )	z1Sort words line-wise, forgiving small deviations.c                     | d   | d   fS r   r   ws    r   r   z4get_text_words.<locals>.sort_words.<locals>.<lambda>r   s    !A$! r   r   r   N   r   c                     | d   S Nr   r   r*   s    r   r   z4get_text_words.<locals>.sort_words.<locals>.<lambda>   s
    ! r   c                     | d   S r.   r   r*   s    r   r   z4get_text_words.<locals>.sort_words.<locals>.<lambda>   s
    ! r   )r   r   Rectabsy0y1appendextend)wordsnwordslinelrectr+   wrect	tolerances         r   
sort_wordsz"get_text_words.<locals>.sort_wordsp   s    

-
.azU1Xbq\*qr 	ALL2A'EEHHuxx'(I5uxx%((*+y8A		n	-d#s	 			n	%dr   Nr   r   r   r,   g      ?)	r   r   TEXTFLAGS_WORDSr    r!   r"   extractWORDSr0   r1   )r
   r   r   r   r   
delimitersr;   r<   r$   r6   r+   s         `    r   get_text_wordsr@   V   s    42 }''	B	zD6	X	$	&677OOJ'E  0||D!
D1Ra5L 1S3w||AbqE?R;S5S SA
 
 5!L
s   ;C!
C!c           	      ~   d }t        | |||d|      D cg c]  }t        j                  |dd       |d   f! }}|syt        j                         }|D ]
  \  }	}
||	z  } g }|d   g}|d   d   }|dd D ]  \  }	}
|d	   \  }}t	        |j
                  |	j
                  z
        |k  s%t	        |j                  |	j                  z
        |k  r|j                  |	|
f       ||	z  }q |||      }|j                  ||f       |	|
fg}|	}  |||      }|j                  ||f       |j                  d
        |d   d   }
|d   d   j                  }|dd D ]Y  \  }}t        t        t        |j
                  |z
  |j                  z              d      }d|dz   z  }|
||z   z  }
|j                  }[ |
S c c}w )a  Extract plain text avoiding unacceptable line breaks.

    Text contained in clip will be sorted in reading sequence. Some effort
    is also spent to simulate layout vertically and horizontally.

    Args:
        page: pymupdf.Page
        clip: (rect-like) only consider text inside
        flags: (int) text extraction flags
        textpage: pymupdf.TextPage
        tolerance: (float) consider words to be on the same line if their top
            or bottom coordinates do not differ more than this.

    Notes:
        If a TextPage is provided, all text is checked for being inside clip
        with at least 50% of its bbox.
        This allows to use some "global" TextPage in conjunction with sub-
        selecting words in parts of the defined TextPage rectangle.

    Returns:
        A text string in reading sequence. Left indentation of each line,
        inter-line and inter-word distances strive to reflect the layout.
    c                    |j                  d        d}| j                  }t        j                         }|D ]  \  }}||z  }t	        t        t        |j                  |z
  |j                  z  t        |      z              || j                  k(  s|j                  |k  rdnd      }|d|z  |z   z  }|j                  } |S )a  Create the string of one text line.

        We are trying to simulate some horizontal layout here, too.

        Args:
            clip: (pymupdf.Rect) the area from which all text is being read.
            line: (list) word tuples (rect, text) contained in the line
        Returns:
            Text in this line. Generated from words in 'line'. Distance from
            predecessor is translated to multiple spaces, thus simulating
            text indentations and large horizontal distances.
        c                      | d   j                   S r.   )x0r*   s    r   r   z4get_sorted_text.<locals>.line_text.<locals>.<lambda>   s    ! r   r    r   r    )
r   rD   r   
EMPTY_RECTmaxintroundwidthlenx1)r   r8   ltextrM   r9   rtdists           r   	line_textz"get_sorted_text.<locals>.line_text   s     			'	(WW""$ 		DAqQJEE144"9/#a&89:DGGmqttrzD
 S4Z!^#EB		 r   T)r   r   r   r   r;   Nr,   rE   r   r   c                      | d   j                   S r.   )r3   )ls    r   r   z!get_sorted_text.<locals>.<lambda>  s    adgg r   r      
)r@   r   r0   rG   r1   r2   r3   r4   r   minrI   rJ   height)r
   r   r   r   r;   rR   r+   r6   totalboxwrtextlinesr8   r9   w0r_rN   r3   distancebreakss                       r   get_sorted_textrb      s
   >@  

 
ae	ad#
E 
 !!#H DB E!H:D!HQKE !"I DbQ uxx"%% I-UXX5E1F)1SKKT
#RKE h-ELL%(J<DE h%E	LL%  
JJ&J'8A;D	q!Bab	 us5%((R-5<<!?@A1EA&XX	 Kk
s   $F:rectc                     |}|| j                         }nt        |d      | k7  rt        d      |j                  |      }|~|S )Nr   r   )r    r!   r"   extractTextbox)r
   rc   r   r$   rcs        r   get_textboxrg     sU    
 
B	z 	X	$	&677			4	 BIr   p1p2c                     t        j                  |        |}|"| j                  |t         j                        }nt	        |d      | k7  rt        d      |j                  ||      }|~|S )Nr   r   r   )r   r   r    TEXT_DEHYPHENATEr!   r"   extractSelection)r
   rh   ri   r   r   r$   rf   s          r   get_text_selectionrm   *  sr     	B	zD0H0HI	X	$	&677			R	$BIr   languagedpifulltessdatac                 <   t        j                  |        t        j                        fd}|r || |||      S | j                  |      }| j	                  dt         j
                        d   D ]`  }|d   dk7  rt        j                  |d         }	|	j                  dk  s|	j                  dk  rD	 t        j                  |d	         }
|
j                  |
j                  z
  dk7  r$t        j                  t         j                  |
      }
|
j                  rt        j                  |
d
      }
t        j                  d|
j                  |            }|j                  d
      }d}
|j                   }t        j"                  d|j                  z  d|j                  z        }||d   z  }|j%                  |d
|       |j'                          c |S # t(        t*        j,                  f$ r( 	 d}t        j2                  d        || |||      cY c S w xY w)as  Create a Textpage from combined results of normal and OCR text parsing.

    Args:
        flags: (int) control content becoming part of the result.
        language: (str) specify expected language(s). Default is "eng" (English).
        dpi: (int) resolution in dpi, default 72.
        full: (bool) whether to OCR the full page image, or only its images (default)
    c                    |dz  }t        j                  ||      }| j                  |      }t        j                  d|j	                  d|            }|j                  d      }| j                  j                  |j                  j                  z  }	t        j                  |	|	      | j                  z  }
|j                  ||
      }|j                          d }t        j                  |       |_        |S )NH   )matrixpdfF)compressrn   rq   r   r   ru   )r   Matrix
get_pixmapDocumentpdfocr_tobytes	load_pagerc   rK   derotation_matrixr    closeweakrefproxyr   )r
   ro   rn   r   zoommatpixocr_pdfocr_pageunzoomctmtpagerq   s               r   full_ocrz"get_textpage_ocr.<locals>.full_ocrP  s    RxnnT4(ooSo)"""""%% #  $$Q'8==#6#66nnVV,t/E/EE%%E#%>}}T*r   )r   dictr%   typer   bboxr   imager   rv   )rn   rq   N	transformrx   zFalling back to full page OCR)r   r   get_tessdatar    get_textTEXT_PRESERVE_IMAGESr0   rK   rY   PixmapnalphacsRGBr{   r|   r}   rc   ry   extend_textpager   RuntimeErrorr   FzErrorBaseg_exceptions_verboseexception_infomessage)r
   r   rn   ro   rp   rq   r   r   blockr   r   imgdocimgpageimgrectshrinkr   s        `          r   get_textpage_ocrr   =  s     ##H-H, c8U33
 E*EvW-I-IJ8T 8=A||E&M*::?dkkQ.	8..w0Cuusyy A%nnW]]C8yynnS!,%%&&8&LF &&q)GCllG^^A$5q7>>7IJF5--C##E3#?LLN/8B L e//0 	8 EOO;<D#x77	8s   9DG<HH)r   r   r   r   r?   r;   optionc                   t         j                  t         j                  t         j                  t         j                  t         j
                  t         j                  t         j                  t         j                  t         j                  t         j                  d
}|j                         }||v sJ ||vrd}|||   }|dk(  rt        | |||||      S |dk(  rt        | ||||      S |dk(  r|rt        | ||||      S t        j                  |        d}	|d	v r| j                  }|t        j                  |      }d}	n't!        |       t         j"                  u r| j                  }	|}
|
| j%                  ||
      }
nt'        |
d      | k7  rt)        d      |dk(  r|
j+                  |	|      }n|dk(  r|
j-                  |	|      }n|dk(  r|
j/                  |	|      }nm|dk(  r|
j1                  |	|      }nT|dk(  r|
j3                         }n>|dk(  r|
j5                         }n(|dk(  r|
j7                         }n|
j9                  |      }|~
|S )a  Extract text from a page or an annotation.

    This is a unifying wrapper for various methods of the pymupdf.TextPage class.

    Args:
        option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml.
        clip: (rect-like) restrict output to this area.
        flags: bit switches to e.g. exclude images or decompose ligatures.
        textpage: reuse this pymupdf.TextPage and make no new one. If specified,
            'flags' and 'clip' are ignored.

    Returns:
        the output of methods get_text_words / get_text_blocks or pymupdf.TextPage
        methods extractText, extractHTML, extractDICT, extractJSON, extractRAWDICT,
        extractXHTML or etractXML respectively.
        Default and misspelling choice is "text".
    )
r\   htmljsonrawjsonxmlxhtmlr   rawdictr6   r%   r\   Nr6   )r   r   r   r   r?   r%   )r   r   r   r   )r   r   r   r;   )r   r   r   r   r   r   r   )cbr   r   r   r   r   r   r   )r   )r   TEXTFLAGS_TEXTTEXTFLAGS_HTMLTEXTFLAGS_DICTTEXTFLAGS_RAWDICTTEXTFLAGS_XMLTEXTFLAGS_XHTMLr=   r   lowerr@   r&   rb   r   cropboxr0   r   Pager    r!   r"   extractJSONextractRAWJSONextractDICTextractRAWDICTextractHTML
extractXMLextractXHTMLextractText)r
   r   r   r   r   r   r?   r;   formatsr   r$   rP   s               r   r   r     se   : &&&&&&,,$$((&&,,((**G \\^FWW}!
 	
 t58$
 	
 D
 	
 	B))||||D!	dw||	#\\	B	zD6	X	$	&677NNbtN,	9	$/	6	NNbtN,	9	$/	6	NN	5MMO	7	OONNN%Hr   c                    t        | t        j                        r| j                  |      }n;t        | t        j                        r| j
                  }nJ dt        |       d       |j                  dd}	 t        | d      r| j                  |d<   t        j                  dd      }|j                  t        j                  z  r|j                   j"                  |_        |j                  t        j$                  z  r|j                   j&                  |_        |j                  t        j(                  k(  r|j*                  |d<   |S |j                  t        j,                  k(  rS|j.                  |d	<   ||d
<   |j                  t        j0                  z  r|j2                  j"                  |d<   |S d|d<   |S |j                  t        j4                  k(  r|j6                  j9                  dd      |d<   |j.                  |d	<   |j.                  dk  r|j
                  |d
<   |S ||d
<   |j                  t        j0                  z  r|j2                  j"                  |d<   |S d|d<   |S |j                  t        j:                  k(  r!|j6                  j9                  dd      |d<   |S |j                  t        j<                  k(  ri|j>                  jA                         |jA                         z  rJ |jC                  |j>                         d
|v rt        j                  |d
         |d
<   |S |j.                  |d	<   |S # t        $ r! t        dk\  rt        j                          Y w xY w)Nr   zUnexpected type(ln)=.)kindxrefrc   from   urir
   tor   g        \/file)"
isinstancer   OutlinedestinationLinkdestr   r   hasattrrc   	Exceptionr   r   Pointr   LINK_FLAG_L_VALIDltxLINK_FLAG_T_VALIDyLINK_URIr   	LINK_GOTOr
   LINK_FLAG_R_IS_ZOOMrb
LINK_GOTOR	file_specreplaceLINK_LAUNCH
LINK_NAMEDnamedkeysupdate)lndocumentr   nlpnts        r   getLinkDictr     s   "goo&~~h'	B	%ww,)R{!,,q))Q	'B2vBvJ
 --1
CzzG---		zzG---		yyG$$$HH5	D IA 
g''	'YY6
4::333BvJ8 I5 BvJ4 I1 
g((	(^^++D#66
YY6
99q=yyBtH( I% BtHzzG777!WWYY6
  I !6
 I 
g))	)^^++D#66
 I 
g((	(JJOO%	122
		$**2:}}RX.BtH I YY6
I[  1$(>(>(@s   8L; ;&M%$M%r   ddictc                    |syd }d }d }d }d }t        |      t        t        fv r || d|d      }|S |j                  dt        j
                        }|t        j
                  k(  ry|d   t        j                  k(  rJ|j                  d	d      }	|j                  d
t	        j                  dd            }
|
\  }} || |||	      }|S |d   t        j                  k(  r  |t	        j                  |d               }|S |d   t        j                  k(  r#t	        j                  |d         } |||      }|S |d   t        j                  k(  rB|d   dk  r:t	        j                  |d         } |t	        j                  |d
         ||      }|S |d   t        j                  k(  rO|d   dk\  rGt	        j                  |d         } ||d   |d
   j                  |d
   j                  |d	   ||      }|S y)zrCalculate the PDF action string.

    Notes:
        Supports Link annotations and outline items (bookmarks).
    rE   c                 ,    d|  dt        |||f       dS )Nz/A<</S/GoTo/D[z	 0 R/XYZ z]>>	_format_g)ar   cds       r   r   zgetDestStr.<locals>.<lambda>A  s#    N1#Yy!QPQ?S>TTW"X r   c           	      8    d|  dt        |||f       d| d| d	S )Nz/A<</S/GoToR/D[z /XYZ z]/F<</F/UF/Type/Filespec>>>>r   )r   r   r   r   efs         r   r   zgetDestStr.<locals>.<lambda>B  s7    OA3fYPQSTVWyEYDZZabcaddghigjj|*} r   c                     d|  d| d| dS )Nz/A<</S/GoToR/Dz/F<</Fr   r   r   )r   r   r   s      r   r   zgetDestStr.<locals>.<lambda>C  s    >!F1#SCU!V r   c                     d|  d| dS )Nz/A<</S/Launch/F<</Fr   r   r   )r   r   s     r   r   zgetDestStr.<locals>.<lambda>D  s     3A3c!<NO r   c                     d|  dS )Nz/A<</S/URI/URIz>>r   )r   s    r   r   zgetDestStr.<locals>.<lambda>E  s    .2. r   r   r   r   r   r   r   r
   )r   rI   floatgetr   	LINK_NONEr   r   r   get_pdf_strr   r   r   r   )r   r   str_goto
str_gotor1
str_gotor2
str_launchstr_urir   d_kindd_zoomr   d_leftd_topfspecs                 r   
getDestStrr  9  s    XH}JVJOJ.GE{sEl"a*YYvw001F"""V})))61%YYtW]]1a01feV4V}(((w**5<8:V}+++##E&M2%'V}***uV}q/@##E&M2'--eDk:E5IV}***uV}/A##E&M2&M$KMM$KMM&M
 r   lnkc           	      r   | j                   }| }|d   }t        t        ||z              }d}|d   t        j                  k(  r|d   dk\  rt        j
                  d   }|d   }| j                  j                  |      }	|j                  dt        j                  dd            }
| j                  |   }|j                   }| }|
|z  } ||	|j                  |j                  |j                  dd      |      }nt        j
                  d	   } |t        j                  |d         |      }n|d   t        j                  k(  r|d   dk\  rt        j
                  d
   }|j                  dt        j                  dd            }
t        |
      t        j                  urt        j                  dd      }
 ||d   |
j                  |
j                  |j                  dd      |d   |d   |      }nt        j
                  d   } |t        j                  |d         |d   |      }n|d   t        j                  k(  r$t        j
                  d   } ||d   |d   |      }n|d   t        j                   k(  r t        j
                  d   } ||d   |      }nJ|d   t        j"                  k(  r4t        j
                  d   }|j                  d      }||d   } |||      }|s|S t%        | j'                         D cg c]#  }|d   t        j(                  k(  s|d   |d   f% c}      }|j                  dd      }|r|d   |f|j+                         v r|}nBd}t        j,                  j/                         dz   }	 ||z  }||j1                         vrn|dz  }|j3                  dd|z        }|S c c}w )Nr   rE   r   r
   r   goto1r   r   goto2gotor1r   gotor2launchr   r   name	nameddestr   r   idr   z-L%iz/Linkz/Link/NM(%s))transformation_matrixr   tupler   r   
annot_skelr   	page_xrefr   r   r   r   r   r   r   r   r   r   r   annot_xrefsPDF_ANNOT_LINKitemsTOOLSset_annot_stemvaluesr   )r
   r  r   ictmrO   rc   annottxtpnor   r   	dest_pagedest_ctm	dest_ictmipntlnamer   
link_namesold_namer  istems                         r   getLinkTextr'  s  s    
$
$C4DFAU1t8_%DE
6{g'''v;!$$W-Cf+C;;((-D''$a 34CC(I 66H!	I?Ddffdffcggfa.@$GE$$W-C++CI6=E	V**	*v;!$$X.C''$a 34CCy-mmAq)F"FFE $$X.C++CI6FTJE	V++	+  *CKVd3	V((	(  'CJ%	V**	*  )=$EE4  #//1T!QqTW=S=S5S!A$!TJ wwtR HS[(+z/?/?/AA}}++-6!8D:,,..FA	  MM'>D#89EL# 	Us   >N4N4c            
      l    t        j                         D  cg c]	  \  } }}}|  c}}}} S c c}}}} w )zP
    Returns a list of upper-case colour names.
    :rtype: list of strings
    r   colors_wx_list)r  rO   gr   s       r   getColorListr,    s,    
 '.&<&<&>??]T1aD???s   .
c                  *    t        j                         S )z
    Returns list of (name, red, gree, blue) tuples, where:
        name: upper-case color name.
        read, green, blue: integers in range 0..255.
    :rtype: list of tuples
    r)  r   r   r   getColorInfoListr.    s     !!##r   r  c                 f    t        j                         j                  | j                         d      S )zRetrieve RGB color in PDF format by name.

    Returns:
        a triple of floats in range 0 to 1. In case of name-not-found, "white" is returned.
    )r   r   r   )r   colors_pdf_dictr   r   )r  s    r   getColorr1    s&     ""$((yAAr   c                 <   	 t               t               j                  | j                                  }|d   dz  }|d   dz  }|d   dz  }t        |||      }t        |dz  d      }t        |||      }||z
  }|dk(  rd}	n6||k(  rd||z
  |z  d	z  z  }	n"||k(  rd||z
  |z  dz   z  }	nd||z
  |z  d
z   z  }	t        t        |	            }
|dk(  rd}n||z  }t        t        |dz              }|
||fS # t        $ r t
        rt        j                          Y yw xY w)zRetrieve the hue, saturation, value triple of a color name.

    Returns:
        a triple (degree, percent, percent). If not found (-1, -1, -1) is returned.
    )rS   rS   rS   r   g     o@r   r   d   r   g      N@   r,   )r.  r,  indexupperr   r   r   r   rH   rJ   rX   rI   )r  r   rO   r+  r   cmaxVcmindeltahueHsatSs                r   getColorHSVr?    sE   |~33DJJLAB
 	
!uA	!uA	!uAq!Q<DdSj!Aq!Q<D4KEz	A!+,	A!+,A!+,E#JAqydlE#)Aq!99  G$:$:$<s   2C5 5#DDdocc                 d   | j                  |      \  }}}}d}d}|dk(  r|||||fS |rj	 t        j                  |      }|j                  }|j                  }|j
                  }	||z
  dk  r |	j                  |k  r|	j                  }d|z
  }|||||fS |dk7  r/	 t        j                  |      }|j                  }|j                  }n
|dz  }|dz  }|||||fS # t        $ r! t        j                          |dz  }|dz  }Y uw xY w# t        $ r! t        j                          |dz  }|dz  }Y ]w xY w)Ng?gɿrE   )
fontbufferr   g333333?zn/a)	extract_fontr   Fontascender	descenderr   r2   r   r   )
r@  r   fontnameextstypebufferascdscfontr   s
             r   _get_font_propertiesrN    sW   #&#3#3D#9 Hc5&
C
C
byeS#--	<<62D--C..C99DSy1}77S=''C#g
 eS#--
e|	<<)D--C..C 	s
s
S%c))#  	""$3JC3JC	  	""$3JC3JC	s$   A"C -D 'DD'D/.D/c                     d}d}| j                   j                  }	 |sn!|dz  }||j                  z  }|j                  }$d| d| S )Nr   r   z
num_spans=z num_chars=)
m_internalheadrL   next)r\   	num_spans	num_charsspans       r   _show_fz_textrV  ;  s`    
 II??D
Q	TXX	yy  	{+i[99r   c                 f   | \  }}|dd j                  d      dd }|ddd}d}t        |      D ]~  \  }} |rd}| d	k(  r||dz      |d
<   d}| j                  d      r+| dd j                  dd      j                  dd      }||d<   Z| j                  d      slt	        | dd       }||d<    |S )a"  Make a Python dict from a PDF page label rule.

    Args:
        item -- a tuple (pno, rule) with the start page number and the rule
                string like <</S/D...>>.
    Returns:
        A dict like
        {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}.
    r   r   r   NrE   )	startpageprefixfirstpagenumFr>  styleTP()rZ  Str[  )split	enumerate
startswithr   rI   )itemr  ruler   skipr%  r   s          r   	rule_dictrg  ^  s     IC":C $DR;ADT? "4D3;a!eAgJD??3QR  b)11#r:AAhK??4 DHA !An" Hr   c                     |D cg c]  }|d   | k  s| c}d   }t        |      }|j                  dd      }|j                  dd      }|dv rdnd}| |d   z
  |d   z   |z   }t        |||      S c c}w )	zReturn the label for this page number.

    Args:
        pgNo: page number, 0-based.
        labels: result of doc._get_page_labels().
    Returns:
        The label (str) of the page number. Errors return an empty string.
    r   rS   rZ  rE   r\  )r   ArY  r[  )rg  r   construct_label)	pgNolabelsr   rd  re  rZ  r\  r:  
pagenumbers	            r   get_label_pnorn    s     .!1A.r2DT?DXXh#FHHWb!E:%B1E[))D,@@5HJ5&*55 /s
   A2A2c                 ,   d}| dk(  rt        |      }n{| dk(  rt        |      j                         }n\| dk(  rt        |      j                         }n=| dk(  rt	        |      j                         }n| dk(  rt	        |      j                         }||z   }|S )z9Construct a label based on style, prefix and page number.rE   DrO   Rr   ri  )strintegerToRomanr   r6  integerToLetter)r\  rZ  r  n_strresults        r   rj  rj    s     E|C	#s#))+	#s#))+	#$**,	#$**,e^FMr   c           
      \   ddl }|j                  }d| }}t        d|      |k  r7|t        t	        j                  d|            z  }|dz  }t        d|      |k  r7d}t        t        |            D ]8  }t        |t        t	        j                  d|                  \  }}|||   z  }|}: |S )z-Returns letter sequence string for integer i.r   Nr      rE   )stringascii_uppercasepowrI   mathreversedrangedivmod)	r%  ry  lsr   r   str_tjr   r+  s	            r   rt  rt    s     			BaqA
b!*/	S"a!!	Q b!*/ EeAh aTXXb!_-.1A Lr   numc                 f    dfd}dj                   ||       D cg c]  }| c}      S c c}w )z$Return roman numeral for an integer.))i  M)i  CM)i  rp  )i  CD)r3  C)Z   XC)2   L)(   XL)
   X)	   IX)rV   r8  )r,   IV)r   Ic              3   l   K   D ]*  \  }}t        | |      \  }}||z   | ||z  z  } | dk  s* y  y wr.   )r  )r  rO   ltrr   r_   romans        r   	roman_numz!integerToRoman.<locals>.roman_num  sI      	FAs#q>DAq'M1q5LCax	s   ,44rE   )join)r  r  r   r  s      @r   rs  rs    s2    E  77y~.!A.//.s   	.line_dirrU  r   c                 R   | |d   } | \  }}t        j                  |      }t         j                  j                         rd}n|d   |d   z
  }||d   z  }||z  }||z  }|dk\  rJ|dk  rE|j                  d|fz
  }	|j
                  |dfz   }
|j                  |dfz
  }|j
                  d|fz   }n|dk  rJ|dk  rE|j                  |dfz   }	|j                  d|fz
  }
|j                  d|fz   }|j                  |dfz
  }n|dk  rJ|dk\  rE|j
                  d|fz
  }	|j                  |dfz   }
|j
                  |dfz
  }|j                  d|fz   }nD|j                  |dfz   }	|j                  d|fz
  }
|j                  d|fz   }|j                  |dfz
  }t        j                  |	|
||      S )a  Compute the quad located inside the bbox.

    The bbox may be any of the resp. tuples occurring inside the given span.

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line or None.
        span: (dict) the span. May be from get_texttrace() method.
        bbox: (tuple) the bbox of the span or any of its characters.
    Returns:
        The quad which is wrapped by the bbox.
    dirr   rE  rF  sizer   )	r   r0   r  set_small_glyph_heightsbltrbrtlQuad)r  rU  r   cossinr   rY   hshculurlllrs                r   recover_bbox_quadr    s    ;HC<<D}},,.tK00fF 
#B	#B	Qw27WW2wWWAwWWAwWW2w	qR1WWWAwWW2wWW2wWWAw	qR1WWW2wWWAwWWAwWW2wWWAwWW2wWW2wWWAw<<BB''r   c                     t        |       t        ust        |       dk7  rt        d      t        |      t        urt        d      t        | ||d         S )zRecover the quadrilateral of a text span.

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line.
        span: the span.
    Returns:
        The quadrilateral enveloping the span's text.
    r   bad line dir argumentbad span argumentr   )r   r  rL   r"   r   r  )r  rU  s     r   recover_quadr    sR     H~U"c(mq&8011Dz,--XtT&\::r   r8   spansc           	      &   || d   }t        |      dk(  rt        d      | d   }|\  }}t        ||d         }t        |      dkD  rt        ||d         }n|}|j                  }|j                  }t        j                  ||      }	||	z  }
t
        j                  j                         }t        |D cg c]  }|d   |rdn
|d   |d	   z
  z   c}      }t        j                  d| |
j                  d      }|j                  }||	 z  }|S c c}w )
a  Calculate the line quad for 'dict' / 'rawdict' text extractions.

    The lower quad points are those of the first, resp. last span quad.
    The upper points are determined by the maximum span quad height.
    From this, compute a rect with bottom-left in (0, 0), convert this to a
    quad and rotate and shift back to cover the text of the spans.

    Args:
        spans: (list, optional) sub-list of spans to consider.
    Returns:
        pymupdf.Quad covering selected spans.
    r  r   zbad span listr  r   rS   r  rE  rF  )rL   r"   r  r  r  r   planish_liner  r  rH   r0   r   quad)r8   r  r  r  r  q0q1line_llline_lrmat0x_lrsmallsh	line_rect	line_quads                   r   recover_line_quadr    s    }W
5zQ))E{HHC	ha	)B
5zA~(E"I.eeGeeG1D T>DMM113EQVWA65aq}q~'E	GW	A QDFFA.II$I 	Xs   4Dcharsc                    | |d   } |t        | |      S d|j                         vrt        d      t        | ||d         }t	        |      dkD  rt        | ||d         }n|}|j
                  }|j                  }t        j                  ||      }||z  }t        j                  j                         }	|d   |	rdn
|d   |d	   z
  z  }
t        j                  d|
 |j                  d      }|j                  }|| z  }|S )
a^  Calculate the span quad for 'dict' / 'rawdict' text extractions.

    Notes:
        There are two execution paths:
        1. For the full span quad, the result of 'recover_quad' is returned.
        2. For the quad of a sub-list of characters, the char quads are
           computed and joined. This is only supported for the "rawdict"
           extraction option.

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line.
        span: (dict) the span.
        chars: (list, optional) sub-list of characters to consider.
    Returns:
        pymupdf.Quad covering selected characters.
    r  r  z)need 'rawdict' option to sub-select charsr   r   rS   r  rE  rF  )r  r   r"   recover_char_quadrL   r  r  r   r  r  r  r0   r   r  )r  rU  r  r  r  span_llspan_lrr  r  r  r  	span_rect	span_quads                r   recover_span_quadr  K  s   " ;}Hd++diik!DEE	8T58	4B
5zA~xuRy9eeGeeG1DT>DMM113EVUj)9D<M)MOAQDFFA.II$Ir   charc                 t   | |d   } t        |       t        ust        |       dk7  rt        d      t        |      t        urt        d      t        |      t        u rt        j                  |d         }n5t        |      t        u rt        j                  |d         }nt        d      t        | ||      S )aD  Recover the quadrilateral of a text character.

    This requires the "rawdict" option of text extraction.

    Args:
        line_dir: (tuple) 'line["dir"]' of the span's line.
        span: (dict) the span dict.
        char: (dict) the character dict.
    Returns:
        The quadrilateral enveloping the character.
    r  r   r  r  r   r   )r   r  rL   r"   r   r   r0   r  )r  rU  r  r   s       r   r  r  x  s     ;H~U"c(mq&8011Dz,--DzT||DL)	du	||DG$,--XtT22r   )NNNF)NNNFNr   )NNNr   )N)NN)r   engrt   FN)r\   )Cr|  typingr   rE   r   r   r   format_gr   r   r   r   r   r	   
ByteStringAttributeErrorbytes	bytearray
memoryviewAnyAnyTypeUnionrI   OptIntOptionalr   OptFloatrr  OptStrr   OptDictOptBytesSequenceOptSeqr   TextPageboollistr&   r@   rb   rg   rm   r   r   r   r  r'  r,  r.  r  r1  r?  r{   rN  rV  rg  rn  rj  rt  rs  r  r  r  r  r  r  r   r   r   <module>r     s      	33 
		0""J
 **	c4i	 ??5!		
//$
??:&		) !%
,,
  	
  
H !%L
,,L
L L 	L
 L 
Lb !%r
,,r
r r 	r 	rp "&
,,
  		( !%
,, 	 	
 * Q
,,QQ Q 
	Q
 Q Q Ql j !%j
,,jj 	j
 j j jZ8d 8v7S 7 7# 7tPgll P P# PB@d @$$ $B3 B5 B$c $e $N"*g.. "*c "*e "*J:"$D6*3 &# $0 0 0D/( /(T /( /(7<< /(d;5 ; ; ; *D * * *Z* *T *$ *',, *Z3 3T 3 3',, 3S#       0"Z/J0s3   J7 K K 7	KK	KKK('K(