
    9i>                         S SK r S SKrS SKr SSKJr   SSKJr  \R                  r\R                  r	Sr
SrSrSr \R                  r\R&                  r\R*                  \S4   r\R0                  \   r\R0                  \   r\R0                  \   r\R0                  \   r\R0                  \R@                     r!     SAS
\RD                  S\S\S\RF                  S\$S\%4S jjr&      SBS
\RD                  S\S\S\RF                  S\$S\%4S jjr'    SCS
\RD                  S\S\S\RF                  S\4
S jjr( SDS
\RD                  S\S\RF                  S\4S jjr)  SES
\RD                  S\
S\
S\S\RF                  4
S jjr*     SFS
\RD                  S\S\S\S\$S\S\RF                  4S jjr+ SGSSSS	SSS.S
\RD                  S\S\S\S\RF                  S\$4S  jjjr,SDS\4S! jjr-S"\S#\S\4S$ jr.S
\RD                  S%\S\4S& jr/S\%4S' jr0S\%4S( jr1S)\S\24S* jr3S)\S\24S+ jr4S,\Rj                  S"\S\24S- jr6S. r7 S/ r8S0 r9S\4S1 jr:S\4S2 jr;S3\S\4S4 jr<S5\2S6\S7\2S\Rz                  4S8 jr>S5\2S6\S\Rz                  4S9 jr?SDS:\S;\%S\Rz                  4S< jjr@SDS5\2S6\S=\%S\Rz                  4S> jjrAS5\2S6\S?\S\Rz                  4S@ jrBg! \ a    S SKr GNf = f! \ a    S SKr GNf = f! \ a    \\-  \-  r GNf = f)H    N   )pymupdf)mupdf
point_like	rect_likematrix_like	quad_likeFpageclipflagstextpagesortreturnc                    [         R                  " U 5        Uc  [         R                  nUnUc  U R                  XS9nO[	        US5      U :w  a  [        S5      eUR                  5       nUc  AU(       a  UR                  S S9  U$ )a?  Return the text blocks on a page.

Notes:
    Lines in a block are concatenated with line breaks.
Args:
    flags: (int) control the amount of data parsed into the textpage.
Returns:
    A list of the blocks. Each item contains the containing rectangle
    coordinates, text lines, running block number and block type.
r   r   parentnot a textpage of this pagec                     U S   U S   4$ N   r    )bs    M/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/pymupdf/utils.py<lambda>!get_text_blocks.<locals>.<lambda>R   s    1Q41,    key)r   CheckParentTEXTFLAGS_BLOCKSget_textpagegetattr
ValueErrorextractBLOCKSr   )r
   r   r   r   r   tpblockss          r   get_text_blocksr'   4   s    " }((	B	zD6	X	$	&677F./Mr   r   c                   ^ U4S jn[         R                  " U 5        Uc  [         R                  nUnUc  U R                  XS9nO[	        US5      U :w  a  [        S5      eUR                  U5      n	Ubd  Uba  [         R                  " U5      nU	 V
s/ s H>  n
[        XSS -  5      S[        [         R                  " U
SS 5      5      -  :  d  M<  U
PM@     n	n
Uc  AU	(       a  U(       a  U" U	5      n	U	$ s  sn
f )ah  Return the text words as a list with the bbox for each word.

Args:
    page: pymupdf.Page
    clip: (rect-like) area on page to consider
    flags: (int) control the amount of data parsed into the textpage.
    textpage: (pymupdf.TextPage) either passed-in or None.
    sort: (bool) sort the words in reading sequence.
    delimiters: (str,list) characters to use as word delimiters.
    tolerance: (float) consider words to be part of the same line if
        top or bottom coordinate are not larger than this. Relevant
        only if sort=True.

Returns:
    Word tuples (x0, y0, x1, y1, "word", bno, lno, wno).
c                   > U R                  S S9  / nU S   /n[        R                  " U S   SS 5      nU SS  H  n[        R                  " USS 5      n[        UR                  UR                  -
  5      T::  d&  [        UR
                  UR
                  -
  5      T::  a  UR                  U5        X5-  nM  UR                  S S9  UR                  U5        U/nUnM     UR                  S S9  UR                  U5        U$ )	z1Sort words line-wise, forgiving small deviations.c                     U S   U S   4$ r   r   ws    r   r   4get_text_words.<locals>.sort_words.<locals>.<lambda>r   s    !A$!r   r   r   N   r   c                     U S   $ Nr   r   r+   s    r   r   r-      s    !r   c                     U S   $ r0   r   r+   s    r   r   r-      s    !r   )r   r   Rectabsy0y1appendextend)wordsnwordslinelrectr,   wrect	tolerances         r   
sort_words"get_text_words.<locals>.sort_wordsp   s    

-
.azU1Xbq\*qrALL2A'EEHHuxx'(I5uxx%((*+y8A		n	-d#s  			n	%dr   Nr   r   r   r.   g      ?)	r   r   TEXTFLAGS_WORDSr!   r"   r#   extractWORDSr2   r3   )r
   r   r   r   r   
delimitersr=   r>   r%   r8   r,   s         `    r   get_text_wordsrC   V   s    42 }''	B	zD6	X	$	&677OOJ'E  0||D!
!DRa5L 1S3w||AbqE?R;S5S SAu 	 
 5!L
s   ;C5C5c           	         S n[        U UUUSUS9 Vs/ s H!  n[        R                  " USS 5      US   4PM#     nnU(       d  g[        R                  " 5       nU H	  u  pX-  nM     / nUS   /nUS   S   nUSS  H  u  pUS	   u  p[	        UR
                  U	R
                  -
  5      U::  d&  [	        UR                  U	R                  -
  5      U::  a  UR                  X45        X-  nMp  U" X5      nUR                  UU45        X4/nU	nM     U" X5      nUR                  UU45        UR                  S
 S9  US   S   n
US   S   R                  nUSS  HZ  u  nn[        [        [        UR
                  U-
  UR                  -  5      5      S5      nSUS-   -  nU
UU-   -  n
UR                  nM\     U
$ s  snf )ag  Extract plain text avoiding unacceptable line breaks.

Text contained in clip will be sorted in reading sequence. Some effort
is also spent to simulate layout vertically and horizontally.

Args:
    page: pymupdf.Page
    clip: (rect-like) only consider text inside
    flags: (int) text extraction flags
    textpage: pymupdf.TextPage
    tolerance: (float) consider words to be on the same line if their top
        or bottom coordinates do not differ more than this.

Notes:
    If a TextPage is provided, all text is checked for being inside clip
    with at least 50% of its bbox.
    This allows to use some "global" TextPage in conjunction with sub-
    selecting words in parts of the defined TextPage rectangle.

Returns:
    A text string in reading sequence. Left indentation of each line,
    inter-line and inter-word distances strive to reflect the layout.
c                    UR                  S S9  SnU R                  n[        R                  " 5       nU H  u  pVXE-  n[	        [        [        UR                  U-
  UR                  -  [        U5      -  5      5      X0R                  :X  d  UR                  U::  a  SOS5      nUSU-  U-   -  nUR                  nM     U$ )a  Create the string of one text line.

We are trying to simulate some horizontal layout here, too.

Args:
    clip: (pymupdf.Rect) the area from which all text is being read.
    line: (list) word tuples (rect, text) contained in the line
Returns:
    Text in this line. Generated from words in 'line'. Distance from
    predecessor is translated to multiple spaces, thus simulating
    text indentations and large horizontal distances.
c                      U S   R                   $ r0   )x0r+   s    r   r   4get_sorted_text.<locals>.line_text.<locals>.<lambda>   s    !r   r    r   r    )
r   rG   r   
EMPTY_RECTmaxintroundwidthlenx1)r   r:   ltextrQ   r;   rtdists           r   	line_text"get_sorted_text.<locals>.line_text   s     			'	(WW""$DAJEE144"9/#a&89:GGmqttrzD
 S4Z!^#EB  r   T)r   r   r   r   r=   Nr.   rI   r   r   c                      U S   R                   $ r0   )r5   )ls    r   r   !get_sorted_text.<locals>.<lambda>  s    adggr   r      
)rC   r   r2   rK   r3   r4   r5   r6   r   minrM   rN   height)r
   r   r   r   r=   rV   r,   r8   totalboxwrtextlinesr:   r;   w0r_rR   r5   distancebreakss                       r   get_sorted_textrh      s   >@  


A 
ae	ad#
 
 
 !!#H  E!H:D!HQKE !"Ib uxx"%% I-UXX5E1F)1SKK
#KE h-ELL%(J<DE  h%E	LL%  
JJ&J'8A;D	q!Bab	us5%((R-5<<!?@A1EA&XX	 " Kk
s   (F>rectc                     UnUc  U R                  5       nO[        US5      U :w  a  [        S5      eUR                  U5      nUc  AU$ )Nr   r   )r!   r"   r#   extractTextbox)r
   ri   r   r%   rcs        r   get_textboxrm     sU    
 
B	z 	X	$	&677			4	 BIr   p1p2c                     [         R                  " U 5        UnUc  U R                  U[         R                  S9nO[	        US5      U :w  a  [        S5      eUR                  X5      nUc  AU$ )Nr   r   r   )r   r   r!   TEXT_DEHYPHENATEr"   r#   extractSelection)r
   rn   ro   r   r   r%   rl   s          r   get_text_selectionrs   *  sp     	B	zD0H0HI	X	$	&677			R	$BIr   languagedpifulltessdatac           	      T  ^ [         R                  " U 5        [         R                  " T5      mU4S jnU(       a	  U" XX!5      $ U R                  US9nU R	                  S[         R
                  S9S    GHk  nUS   S:w  a  M  [         R                  " US   5      n	U	R                  S::  d  U	R                  S::  a  MJ   [         R                  " US	   5      n
U
R                  U
R                  -
  S:w  a%  [         R                  " [         R                  U
5      n
U
R                  (       a  [         R                  " U
S
5      n
[         R                  " SU
R                  UTS95      nUR                  S
5      nSn
UR                   n[         R"                  " SUR                  -  SUR                  -  5      nXS   -  nUR%                  US
US9  UR'                  5         GMn     U$ ! [(        [*        R,                  4 a'     Sn[         R2                  " S5        U" XX!5      s s  $ f = f)a[  Create a Textpage from combined results of normal and OCR text parsing.

Args:
    flags: (int) control content becoming part of the result.
    language: (str) specify expected language(s). Default is "eng" (English).
    dpi: (int) resolution in dpi, default 72.
    full: (bool) whether to OCR the full page image, or only its images (default)
c           	        > US-  n[         R                  " XD5      nU R                  US9n[         R                  " SUR	                  SUTS95      nUR                  S5      nU R                  R                  UR                  R                  -  n	[         R                  " X5      U R                  -  n
UR                  X:S9nUR                  5         S n[        R                  " U 5      Ul        U$ )NH   )matrixpdfF)compressrt   rw   r   r   r{   )r   Matrix
get_pixmapDocumentpdfocr_tobytes	load_pageri   rO   derotation_matrixr!   closeweakrefproxyr   )r
   ru   rt   r   zoommatpixocr_pdfocr_pageunzoomctmtpagerw   s               r   full_ocr"get_textpage_ocr.<locals>.full_ocrP  s    RxnnT(ooSo)"""""%% #  $$Q'8==#6#66nnV,t/E/EE%%E%>}}T*r   )r   dictr&   typer   bboxr   imager   r|   )rt   rw   N	transformr~   zFalling back to full page OCR)r   r   get_tessdatar!   get_textTEXT_PRESERVE_IMAGESr2   rO   r_   PixmapnalphacsRGBr   r   r   ri   r   extend_textpager   RuntimeErrorr   FzErrorBaseg_exceptions_verboseexception_infomessage)r
   r   rt   ru   rv   rw   r   r   blockr   r   imgdocimgpageimgrectshrinkr   s        `          r   get_textpage_ocrr   =  s     ##H-H, 833
 E*EvW-I-IJ8T=A||E&M*::?dkkQ.	8..w0Cuusyy A%nnW]]C8yynnS!,%%&&8&LF &&q)GCllG^^A$5q7>>7IJF--C##E3#?LLN/ UB L e//0 	8 EOO;<Dx77	8s   ?D G&&<H'&H')r   r   r   r   rB   r=   optionc                   [         R                  [         R                  [         R                  [         R                  [         R
                  [         R                  [         R                  [         R                  [         R                  [         R                  S.
nUR                  5       nX;   d   eX;  a  SnUc  X   nUS:X  a  [        U UUUUUS9$ US:X  a  [        XX4US9$ US:X  a  U(       a  [        U UUUUS9$ [         R                  " U 5        Sn	US	;   a  U R                  nUb  [         R                  " U5      nSn	O([!        U 5      [         R"                  L a  U R                  n	Un
U
c  U R%                  X#S
9n
O['        U
S5      U :w  a  [)        S5      eUS:X  a  U
R+                  XS9nOUS:X  a  U
R-                  XS9nOUS:X  a  U
R/                  XS9nOjUS:X  a  U
R1                  XS9nOTUS:X  a  U
R3                  5       nO=US:X  a  U
R5                  5       nO&US:X  a  U
R7                  5       nOU
R9                  US9nUc  A
U$ )a  Extract text from a page or an annotation.

This is a unifying wrapper for various methods of the pymupdf.TextPage class.

Args:
    option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml.
    clip: (rect-like) restrict output to this area.
    flags: bit switches to e.g. exclude images or decompose ligatures.
    textpage: reuse this pymupdf.TextPage and make no new one. If specified,
        'flags' and 'clip' are ignored.

Returns:
    the output of methods get_text_words / get_text_blocks or pymupdf.TextPage
    methods extractText, extractHTML, extractDICT, extractJSON, extractRAWDICT,
    extractXHTML or etractXML respectively.
    Default and misspelling choice is "text".
)
rb   htmljsonrawjsonxmlxhtmlr   rawdictr8   r&   rb   Nr8   )r   r   r   r   rB   r&   )r   r   r   r   )r   r   r   r=   )r   r   r   r   r   r   r   )cbr   r   r   r   r   r   r   )r   )r   TEXTFLAGS_TEXTTEXTFLAGS_HTMLTEXTFLAGS_DICTTEXTFLAGS_RAWDICTTEXTFLAGS_XMLTEXTFLAGS_XHTMLr@   r    lowerrC   r'   rh   r   cropboxr2   r   Pager!   r"   r#   extractJSONextractRAWJSONextractDICTextractRAWDICTextractHTML
extractXMLextractXHTMLextractText)r
   r   r   r   r   r   rB   r=   formatsr   r%   rT   s               r   r   r     sQ   : &&&&&&,,$$((&&,,((**G \\^F}!
 	
 5$
 	
 D
 	
 	B))||||D!	dw||	#\\	B	zD6	X	$	&677NNbN,	9	/	6	NNbN,	9	/	6	NN	5MMO	7	OONNN%Hr   c                 <   [        U [        R                  5      (       a  U R                  U5      nOA[        U [        R                  5      (       a  U R
                  nO S[        U 5      < S35       eUR                  SS.n [        U S5      (       a  U R                  US'   [        R                  " SS5      nUR                  [        R                  -  (       a  UR                   R"                  Ul        UR                  [        R$                  -  (       a  UR                   R&                  Ul        UR                  [        R(                  :X  a  UR*                  US'   U$ UR                  [        R,                  :X  aX  UR.                  US	'   XCS
'   UR                  [        R0                  -  (       a  UR2                  R"                  US'   U$ SUS'    U$ UR                  [        R4                  :X  a  UR6                  R9                  SS5      US'   UR.                  US	'   UR.                  S:  a  UR
                  US
'   U$ XCS
'   UR                  [        R0                  -  (       a  UR2                  R"                  US'   U$ SUS'    U$ UR                  [        R:                  :X  a!  UR6                  R9                  SS5      US'   U$ UR                  [        R<                  :X  aq  UR>                  RA                  5       URA                  5       -  (       a   eURC                  UR>                  5        S
U;   a  [        R                  " US
   5      US
'   U$ UR.                  US	'   U$ ! [         a#    [        S:  a  [        R                  " 5          GNf = f)Nr   zUnexpected type(ln)=.)kindxrefri   from   urir
   tor   g        \/file)"
isinstancer   OutlinedestinationLinkdestr   r   hasattrri   	Exceptionr   r   Pointr   LINK_FLAG_L_VALIDltxLINK_FLAG_T_VALIDyLINK_URIr   	LINK_GOTOr
   LINK_FLAG_R_IS_ZOOMrb
LINK_GOTOR	file_specreplaceLINK_LAUNCH
LINK_NAMEDnamedkeysupdate)lndocumentr   nlpnts        r   getLinkDictr     s   "goo&&~~h'	B	%	%ww,)R{!,,q))Q	'B2vBvJ
 --1
CzzG---		zzG---		yyG$$$HH5	D IA 
g''	'YY6
4::333BvJ8 I5 BvJ4 I1 
g((	(^^++D#66
YY6
99q=yyBtH( I% tHzzG777!WWYY6
  I !6
 I 
g))	)^^++D#66
 I 
g((	(JJOO%	122
		$**2:}}RX.BtH I YY6
I[  1$(>(>(@s    M. .)NNr   ddictc                    U(       d  gS nS nS nS nS n[        U5      [        [        4;   a  U" U SUS5      nU$ UR                  S[        R
                  5      nU[        R
                  :X  a  gUS   [        R                  :X  aH  UR                  S	S5      n	UR                  S
[        R                  " SS5      5      n
U
u  pU" XX5      nU$ US   [        R                  :X  a!  U" [        R                  " US   5      5      nU$ US   [        R                  :X  a#  [        R                  " US   5      nU" X5      nU$ US   [        R                  :X  aD  US   S:  a;  [        R                  " US   5      nU" [        R                  " US
   5      X5      nU$ US   [        R                  :X  aQ  US   S:  aH  [        R                  " US   5      nU" US   US
   R                  US
   R                  US	   UU5      nU$ g)zfCalculate the PDF action string.

Notes:
    Supports Link annotations and outline items (bookmarks).
rI   c                 *    SU  S[        XU45       S3$ )Nz/A<</S/GoTo/D[z	 0 R/XYZ z]>>	_format_g)ar   cds       r   r   getDestStr.<locals>.<lambda>A  s    N1#Yy!PQ?S>TTW"Xr   c           	      6    SU  S[        XU45       SU SU S3	$ )Nz/A<</S/GoToR/D[z /XYZ z]/F<</F/UF/Type/Filespec>>>>r   )r   r   r   r   efs         r   r   r   B  s2    OA3fYPQVWyEYDZZabcaddghigjj|*}r   c                     SU  SU SU S3$ )Nz/A<</S/GoToR/Dz/F<</Fr   r   r   )r   r   r   s      r   r   r   C  s    >!F1#SCU!Vr   c                     SU  SU S3$ )Nz/A<</S/Launch/F<</Fr   r   r   )r   r   s     r   r   r   D  s     3A3c!<NOr   c                     SU  S3$ )Nz/A<</S/URI/URIz>>r   )r   s    r   r   r   E  s    .2.r   r   r   r   r   r   r   r
   )r   rM   floatgetr   	LINK_NONEr   r   r   get_pdf_strr   r   r   r   )r   r   str_goto
str_gotor1
str_gotor2
str_launchstr_urir   d_kindd_zoomr   d_leftd_topfspecs                 r   
getDestStrr  9  s    XH}JVJOJ.GE{sEl"a*YYvw001F"""V})))61%YYtW]]1a01e4V}(((w**5<8:V}+++##E&M2%'V}***uV}q/@##E&M2'--eDk:EIV}***uV}/A##E&M2&M$KMM$KMM&M
 r   lnkc           	         U R                   nU) nUS   n[        [        XC-  5      5      nSnUS   [        R                  :X  a  US   S:  a  [        R
                  S   nUS   nU R                  R                  U5      n	UR                  S[        R                  " SS5      5      n
U R                  U   nUR                   nU) nX-  nU" XR                  UR                  UR                  SS5      U5      nGO[        R
                  S	   nU" [        R                  " US   5      U5      nGOUS   [        R                  :X  a  US   S:  a  [        R
                  S
   nUR                  S[        R                  " SS5      5      n
[        U
5      [        R                  La  [        R                  " SS5      n
U" US   U
R                  U
R                  UR                  SS5      US   US   U5      nO[        R
                  S   nU" [        R                  " US   5      US   U5      nOUS   [        R                  :X  a$  [        R
                  S   nU" US   US   U5      nOUS   [        R                   :X  a   [        R
                  S   nU" US   U5      nOKUS   [        R"                  :X  a4  [        R
                  S   nUR                  S5      nUc  US   nU" X5      nU(       d  U$ [%        U R'                  5        Vs/ s H&  nUS   [        R(                  :X  d  M  US   US   4PM(     sn5      nUR                  SS5      nU(       a  US   U4UR+                  5       ;   a  UnOESn[        R,                  R/                  5       S-   n UU-  nUUR1                  5       ;  a  OUS-  nM!  UR3                  SSU-  5      nU$ s  snf )Nr   rI   r   r
   r   goto1r   r   goto2gotor1r   gotor2launchr   r   name	nameddestr   r   idr   z-L%iz/Linkz/Link/NM(%s))transformation_matrixr   tupler   r   
annot_skelr   	page_xrefr   r   r   r   r  r   r   r   r   r   r   annot_xrefsPDF_ANNOT_LINKitemsTOOLSset_annot_stemvaluesr   )r
   r  r   ictmrS   ri   annottxtpnor   r   	dest_pagedest_ctm	dest_ictmipntlnamer   
link_namesold_namer  istems                         r   getLinkTextr/  s  s    
$
$C4DFAU18_%DE
6{g'''v;!$$W-Cf+C;;((-D''$a 34CC(I 66H!	I?Dffdffcggfa.@$GE$$W-C++CI6=E	V**	*v;!$$X.C''$a 34CCy-mmAq)F"FFE $$X.C++CI6FTJE	V++	+  *CKVd3	V((	(  'CJ%	V**	*  )=$EE  #//1T1!QqTW=S=S5S!A$!1TJ wwtR HS[(+z/?/?/AA}}++-6!8D:,,..FA	  MM'>D#89EL# 	Us   O,Oc            
      p    [         R                  " 5        V VVVs/ s H  u  pp#U PM
     snnnn $ s  snnnn f )zD
Returns a list of upper-case colour names.
:rtype: list of strings
r   colors_wx_list)r  rS   gr   s       r   getColorListr4    s+    
 '.&<&<&>?&>]TaD&>???s   0
c                  ,    [         R                  " 5       $ )z
Returns list of (name, red, gree, blue) tuples, where:
    name: upper-case color name.
    read, green, blue: integers in range 0..255.
:rtype: list of tuples
r1  r   r   r   getColorInfoListr6    s     !!##r   r  c                 h    [         R                  " 5       R                  U R                  5       S5      $ )zRetrieve RGB color in PDF format by name.

Returns:
    a triple of floats in range 0 to 1. In case of name-not-found, "white" is returned.
)r   r   r   )r   colors_pdf_dictr   r   )r  s    r   getColorr9    s&     ""$((yAAr   c                 >    [        5       [        5       R                  U R                  5       5         nUS   S-  nUS   S-  nUS   S-  n[        X#U5      n[        US-  S5      n[        X#U5      nXW-
  nUS:X  a  Sn	O3XR:X  a  SX4-
  U-  S	-  -  n	O XS:X  a  SXB-
  U-  S-   -  n	OSX#-
  U-  S
-   -  n	[        [        U	5      5      n
US:X  a  SnOX-  n[        [        US-  5      5      nXU4$ ! [         a#    [
        (       a  [        R                  " 5          gf = f)zRetrieve the hue, saturation, value triple of a color name.

Returns:
    a triple (degree, percent, percent). If not found (-1, -1, -1) is returned.
)rX   rX   rX   r   g     o@r   r   d   r   g      N@   r.   )r6  r4  indexupperr   r   r   r   rL   rN   r^   rM   )r  r   rS   r3  r   cmaxVcmindeltahueHsatSs                r   getColorHSVrG    s4   |~33DJJLAB
 	
!uA	!uA	!uAqQ<DdSj!AqQ<DKEz	!+,	!+,!+,E#JAqylE#)A!99  G$:$:$<s   2C/ /*DDdocc                 j   U R                  U5      u  p#pESnSnUS:X  a  X#XFU4$ U(       ag   [        R                  " US9nUR                  nUR                  nUR
                  n	Xg-
  S:  a!  U	R                  U:  a  U	R                  nSU-
  nX#XFU4$ US:w  a0   [        R                  " U5      nUR                  nUR                  nO
US-  nUS-  nX#XFU4$ ! [         a"    [        R                  " 5         US-  nUS-  n Nuf = f! [         a"    [        R                  " 5         US-  nUS-  n N_f = f)Ng?gɿrI   )
fontbufferr   g333333?zn/a)	extract_fontr   Fontascender	descenderr   r4   r   r   )
rH  r   fontnameextstypebufferascdscfontr   s
             r   _get_font_propertiesrV    sE   #&#3#3D#9 H5
C
C
bye#--	<<62D--C..C99Dy1}77S=''C#g
 e#--
e|	<<)D--C..C 	s
s
%c))#  	""$3JC3JC	  	""$3JC3JC	s$   A!C .D )DD)D21D2c                     SnSnU R                   R                  n U(       d  O!US-  nX#R                  -  nUR                  nM)  SU SU 3$ )Nr   r   z
num_spans=z num_chars=)
m_internalheadrP   next)rb   	num_spans	num_charsspans       r   _show_fz_textr^  ;  s^    
 II??D
Q	XX	yy  	{+i[99r   c                    U u  pUSS R                  S5      SS nUSSS.nSn[        U5       H  u  pPU(       a  SnM  U S	:X  a  X%S-      US
'   SnM$  U R                  S5      (       a+  U SS R                  SS5      R                  SS5      nXcS'   Me  U R                  S5      (       d  M}  [	        U SS 5      nXcS'   M     U$ )a  Make a Python dict from a PDF page label rule.

Args:
    item -- a tuple (pno, rule) with the start page number and the rule
            string like <</S/D...>>.
Returns:
    A dict like
    {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}.
r   r   r   NrI   )	startpageprefixfirstpagenumFrF  styleTP()rb  Strc  )split	enumerate
startswithr   rM   )itemr%  ruler   skipr-  r   s          r   	rule_dictro  ^  s     IC":C $DR;ADT?D3;!eAgJD??3QR  b)11#r:AhK??4  DHA !n # Hr   c                     U Vs/ s H  o"S   U ::  d  M  UPM     snS   n[        U5      nUR                  SS5      nUR                  SS5      nUS;   a  SOSnXS   -
  US   -   U-   n[        XeU5      $ s  snf )	zReturn the label for this page number.

Args:
    pgNo: page number, 0-based.
    labels: result of doc._get_page_labels().
Returns:
    The label (str) of the page number. Errors return an empty string.
r   rX   rb  rI   rd  )r   Ara  rc  )ro  r   construct_label)	pgNolabelsr   rl  rm  rb  rd  rB  
pagenumbers	            r   get_label_pnorv    s     .v!1Av.r2DT?DXXh#FHHWb!E:%B1E[))D,@@5HJ5*55 /s
   A6A6c                 4   SnU S:X  a  [        U5      nOU S:X  a  [        U5      R                  5       nO_U S:X  a  [        U5      R                  5       nO?U S:X  a  [	        U5      R                  5       nOU S:X  a  [	        U5      R                  5       nX-   nU$ )z9Construct a label based on style, prefix and page number.rI   DrS   Rr   rq  )strintegerToRomanr   r>  integerToLetter)rd  rb  r%  n_strresults        r   rr  rr    s     E|C	#s#))+	#s#))+	#$**,	#$**,^FMr   c           
      d   SSK nUR                  nSU pC[        SU5      U::  a:  U[        [        R                  " SU5      5      -  nUS-  n[        SU5      U::  a  M:  Sn[        [        U5      5       H8  n[        U[        [        R                  " SU5      5      5      u  pxXRU   -  nUnM:     U$ )z-Returns letter sequence string for integer i.r   Nr      rI   )stringascii_uppercasepowrM   mathreversedrangedivmod)	r-  r  lsr   r   str_tjr   r3  s	            r   r|  r|    s     			Baq
b!*/	S"a!!	Q b!*/ EeAhaTXXb!_-.A   Lr   numc                 l   ^ SmU4S jnSR                  U" U 5       Vs/ s H  o"PM     sn5      $ s  snf )z$Return roman numeral for an integer.))i  M)i  CM)i  rx  )i  CD)r;  C)Z   XC)2   L)(   XL)
   X)	   IX)r\   r@  )r.   IV)r   Ic              3   h   >#    T H'  u  p[        X5      u  p4X#-  v   XU-  -  n U S::  d  M'    g    g 7fr0   )r  )r  rS   ltrr   re   romans        r   	roman_num!integerToRoman.<locals>.roman_num  s9     FA#>DA'Mq5LCax s   &22rI   )join)r  r  r   r  s      @r   r{  r{    s6    E  77y~.~!A~.//.s   1line_dirr]  r   c                 b   U c  US   n U u  p4[         R                  " U5      n[         R                  R                  5       (       a  SnOUS   US   -
  nXQS   -  nXd-  nXc-  nUS:  aK  US::  aE  UR                  SU4-
  n	UR
                  US4-   n
UR                  US4-
  nUR
                  SU4-   nOUS::  aK  US::  aE  UR                  US4-   n	UR                  SU4-
  n
UR                  SU4-   nUR                  US4-
  nOUS::  aK  US:  aE  UR
                  SU4-
  n	UR                  US4-   n
UR
                  US4-
  nUR                  SU4-   nODUR                  US4-   n	UR                  SU4-
  n
UR                  SU4-   nUR                  US4-
  n[         R                  " XX5      $ )ao  Compute the quad located inside the bbox.

The bbox may be any of the resp. tuples occurring inside the given span.

Args:
    line_dir: (tuple) 'line["dir"]' of the owning line or None.
    span: (dict) the span. May be from get_texttrace() method.
    bbox: (tuple) the bbox of the span or any of its characters.
Returns:
    The quad which is wrapped by the bbox.
dirr   rM  rN  sizer   )	r   r2   r  set_small_glyph_heightsbltrbrtlQuad)r  r]  r   cossinr   r_   hshculurlllrs                r   recover_bbox_quadr    s    ;HC<<D}},,..tK00fF 
B	B	Qw27WW2wWWAwWWAwWW2w	qR1WWWAwWW2wWW2wWWAw	qR1WWW2wWWAwWWAwWW2wWWAwWW2wWW2wWWAw<<''r   c                     [        U 5      [        Ld  [        U 5      S:w  a  [        S5      e[        U5      [        La  [        S5      e[        XUS   5      $ )zRecover the quadrilateral of a text span.

Args:
    line_dir: (tuple) 'line["dir"]' of the owning line.
    span: the span.
Returns:
    The quadrilateral enveloping the span's text.
r   bad line dir argumentbad span argumentr   )r   r  rP   r#   r   r  )r  r]  s     r   recover_quadr    sP     H~U"c(mq&8011Dz,--XT&\::r   r:   spansc           	      2   Uc  U S   n[        U5      S:X  a  [        S5      eU S   nUu  p4[        X!S   5      n[        U5      S:  a  [        X!S   5      nOUnUR                  nUR                  n[
        R                  " Xx5      n	X-  n
[
        R                  R                  5       n[        U Vs/ s H  oS   U(       a  SO
US   US	   -
  -  PM     sn5      n[
        R                  " SU* U
R                  S5      nUR                  nX) -  nU$ s  snf )
a  Calculate the line quad for 'dict' / 'rawdict' text extractions.

The lower quad points are those of the first, resp. last span quad.
The upper points are determined by the maximum span quad height.
From this, compute a rect with bottom-left in (0, 0), convert this to a
quad and rotate and shift back to cover the text of the spans.

Args:
    spans: (list, optional) sub-list of spans to consider.
Returns:
    pymupdf.Quad covering selected spans.
r  r   zbad span listr  r   rX   r  rM  rN  )rP   r#   r  r  r  r   planish_liner  r  rL   r2   r   quad)r:   r  r  r  r  q0q1line_llline_lrmat0x_lrsmallsh	line_rect	line_quads                   r   recover_line_quadr    s    }W
5zQ))E{HHC	ha	)B
5zA~("I.eeGeeG1D >DMM113EQVWQVA65aq}q~'E	GQVW	A QDFFA.III 	Xs   3#Dcharsc                    U c  US   n Uc  [        X5      $ SUR                  5       ;  a  [        S5      e[        XUS   5      n[	        U5      S:  a  [        XUS   5      nOUnUR
                  nUR                  n[        R                  " XV5      nXg-  n[        R                  R                  5       n	US   U	(       a  SO
US   US	   -
  -  n
[        R                  " SU
* UR                  S5      nUR                  nX) -  nU$ )
a*  Calculate the span quad for 'dict' / 'rawdict' text extractions.

Notes:
    There are two execution paths:
    1. For the full span quad, the result of 'recover_quad' is returned.
    2. For the quad of a sub-list of characters, the char quads are
       computed and joined. This is only supported for the "rawdict"
       extraction option.

Args:
    line_dir: (tuple) 'line["dir"]' of the owning line.
    span: (dict) the span.
    chars: (list, optional) sub-list of characters to consider.
Returns:
    pymupdf.Quad covering selected characters.
r  r  z)need 'rawdict' option to sub-select charsr   r   rX   r  rM  rN  )r  r   r#   recover_char_quadrP   r  r  r   r  r  r  r2   r   r  )r  r]  r  r  r  span_llspan_lrr  r  r  r  	span_rect	span_quads                r   recover_span_quadr  K  s    " ;}H++diik!DEE	858	4B
5zA~xuRy9eeGeeG1D>DMM113EVUj)9D<M)MOAQDFFA.IIIr   charc                    U c  US   n [        U 5      [        Ld  [        U 5      S:w  a  [        S5      e[        U5      [        La  [        S5      e[        U5      [        L a  [
        R                  " US   5      nO7[        U5      [        L a  [
        R                  " US   5      nO[        S5      e[        XU5      $ )a$  Recover the quadrilateral of a text character.

This requires the "rawdict" option of text extraction.

Args:
    line_dir: (tuple) 'line["dir"]' of the span's line.
    span: (dict) the span dict.
    char: (dict) the character dict.
Returns:
    The quadrilateral enveloping the character.
r  r   r  r  r   r   )r   r  rP   r#   r   r   r2   r  )r  r]  r  r   s       r   r  r  x  s     ;H~U"c(mq&8011Dz,--DzT||DL)	du	||DG$,--XT22r   )NNNF)NNNFNr   )NNNr   )N)NN)r   engrz   FN)rb   )Cr  typingr   rI   r   r   r   format_gr   r   r   r   r   r	   
ByteStringAttributeErrorbytes	bytearray
memoryviewAnyAnyTypeUnionrM   OptIntOptionalr   OptFloatrz  OptStrr   OptDictOptBytesSequenceOptSeqr   TextPageboollistr'   rC   rh   rm   rs   r   r   r   r  r/  r4  r6  r  r9  rG  r   rV  r^  ro  rv  rr  r|  r{  r  r  r  r  r  r  r   r   r   <module>r     s      	33 
		0""J
 **	c4i	 ??5!		
//$
??:&		) !%
,,
  	
  
H !%L
,,L
L L 	L
 L 
Lb !%r
,,r
r r 	r 	rp "&
,,
  		( !%
,, 	 	
 * Q
,,QQ Q 
	Q
 Q Q Ql j !%j
,,jj 	j
 j j jZ8d 8v7S 7 7# 7tPgll P P# PB@d @$$ $B3 B5 B$c $e $N"*g.. "*c "*e "*J:"$D6*3 &# $0 0 0D/( /(T /( /(7<< /(d;5 ; ; ; *D * * *Z* *T *$ *',, *Z3 3T 3 3',, 3S#       0"Z/J0s3   K K* K; 
K'&K'*
K87K8;LL