
    9ip             !       &   S r SSKrSSKrSSKrSSKrSSKJr  SSKJr  SSK	J
r
  SSKrSSKrSSKJr  / q/ qSq\R"                  r\R&                  rS\R*                  -  \R,                  -  \R.                  -  \R0                  -  rS\R.                  -  \R4                  -  \R6                  -  \R0                  -  r\" \R<                  5      rS r S\!4S	 jr"S
 r#S r$SnS jr% " S S\&5      r'/ SQr(/ SQr)\'" S5      r*Sr+Sr,Sr-Sr.Sr/Sr0Sr1Sr2\
" SSSS5      r3SSSSSSSS.r4S\54S  jr6 " S! S"5      r7 " S# S$5      r8 " S% S&5      r9S'\5S\54S( jr:\Rv                  " \8Rx                  5      Rz                  R}                  5       r?\Rv                  " \95      Rz                  R}                  5       r@S'\5S\74S) jrAS'\5S\B4S* jrC\/4S+\5S\B4S, jjrDSoS'\5S\54S- jjrES. rFS\54S/ jrGS\54S0 jrHS\54S1 jrI   SpS\54S2 jjrJSqS\54S3 jjrKS\L4S4 jrMS\54S5 jrNS6\B4S7 jrOS8\BS\54S9 jrP\+\+4S: jrQS;\B4S< jrR\,4S=\B4S> jjrSS? rTS\L4S@ jrUS\L4SA jrVSB rWSC rX\.4SD\Y4SE jjrZSF r[\-4SD\Y4SG jjr\SrS\L4SH jjr]SI r^SJ r_S\54SK jr` " SL SM5      ra " SN SO\a5      rb " SP SQ5      rc " SR SS5      rd\ " ST SU5      5       re " SV SW5      rf SsSX jrgStSY jrhSZ riS[ rjSS\S\SS\+SS\,SSS\-\.SSSSSSSSSS4S]\BS^\BS_\5S`\5Sa\&Sb\&Sc\&Sd\&Se\&Sf\&Sg\&Sh\&Si\&Sj\&Sk\&Sl\&4 Sm jjrkg)ua  
Copyright (C) 2023 Artifex Software, Inc.

This file is part of PyMuPDF.

PyMuPDF is free software: you can redistribute it and/or modify it under the
terms of the GNU Affero General Public License as published by the Free
Software Foundation, either version 3 of the License, or (at your option)
any later version.

PyMuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
details.

You should have received a copy of the GNU Affero General Public License
along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>

Alternative licensing terms are available from the licensor.
For commercial licensing, see <https://www.artifex.com/> or contact
Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
CA 94129, USA, for further information.

---------------------------------------------------------------------
Portions of this code have been ported from pdfplumber, see
https://pypi.org/project/pdfplumber/.

The ported code is under the following MIT license:

---------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015, Jeremy Singer-Vine

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
---------------------------------------------------------------------
Also see here: https://github.com/jsvine/pdfplumber/blob/stable/LICENSE.txt
---------------------------------------------------------------------

The porting mainly pertains to files "table.py" and relevant parts of
"utils/text.py" within pdfplumber's repository on Github.
With respect to "text.py", we have removed functions or features that are not
used by table processing. Examples are:

* the text search function
* simple text extraction
* text extraction by lines

Original pdfplumber code does neither detect, nor identify table headers.
This PyMuPDF port adds respective code to the 'Table' class as method '_get_header'.
This is implemented as new class TableHeader with the properties:
* bbox: A tuple for the header's bbox
* cells: A tuple for each bbox of a column header
* names: A list of strings with column header text
* external: A bool indicating whether the header is outside the table cells.

    N)Sequence)	dataclass)
itemgetter)mupdfc           	      F   [        S[        U S   US   5      [        U S   US   5      -
  5      n[        S[        U S   US   5      [        U S   US   5      -
  5      nX#-  nU(       d  gU S   U S   -
  U S   U S   -
  -  nUS   US   -
  US   US   -
  -  nXEU-   U-
  -  $ )z2Compute intersection over union of two rectangles.r            maxmin)r1r2ixiyintersectionarea1area2s          M/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/pymupdf/table.py_iour   q   s    	QBqE2a5!C1r!u$55	6B	QBqE2a5!C1r!u$55	6B7LURU]r!ur!u}-EURU]r!ur!u}-E5=<788    returnc                 2   ^ ^ [        U U4S jU 5       5      $ )zMCheck whether any of the words in bbox are cut through by
horizontal line y.
c              3      >#    U  H5  oT;   d  M
  UR                   Ts=:  =(       a    UR                  :  Os  v   M7     g 7fN)y0y1).0rbboxys     r   	<genexpr>%intersects_words_h.<locals>.<genexpr>   s+     @
14iqtta!$$
s
   	A 0A )any)r    r!   
word_rectss   `` r   intersects_words_hr&   }   s     @
@@@r   c                 r    0 n[         R                  R                  U R                  R                  X!5        U$ )zAExtract MuPDF table structure information from a given rectangle.)pymupdfextramake_table_dictthis
m_internal)textpagerect
table_dicts      r   get_table_dict_from_rectr0      s+    JMM!!(--":":JMr   c           
        ^ / n[        X5      nUR                  S5      [        R                  :w  a  U$ [        R
                  " US   5      n[        US   S S9n[        US   S S9nUS   u  p/ n
U HI  u  pUS	:  a  M  [        X[U5      (       a  M   U
(       a  XS
   -
  S:  a  XS
'   M8  U
R                  U5        MK     [        S	[        [        U
5      S-
  S-  5      5      n	U Vs/ s H  oS   U	::  d  M  US	   PM     nnUR                  US
   S-   :  a  UR                  UR                  5        [        [        U
5      S-
  5       H  n[        R
                  " UR                  X   UR                  XS-      5      n[        U Vs/ s H  nUU;   d  M  UPM     snS S9nU V^s/ s H!  m[        U4S jU 5       5      (       a  M  TPM#     nn[        [        U5      S-
  5       HY  n[        R
                  " UU   X   UUS-      XS-      5      nUR                   (       a  M?  UR                  [#        U5      5        M[     GM     U$ s  snf s  snf s  snf )z0Detect table structure within a given rectangle.typer    xposc                     U S   $ Nr    xs    r   <lambda>&make_table_from_bbox.<locals>.<lambda>       qtr   keyyposc                     U S   $ r5   r6   )r!   s    r   r9   r:      r;   r   max_uncertainr   r	   r   gffffff?r
   c                     U R                   $ r   )x0r   s    r   r9   r:      s    RSRVRVr   c              3   t   >#    U  H-  oR                   Ts=:  =(       a    UR                  :  Os  v   M/     g 7fr   rC   x1)r   r   r8   s     r   r"   'make_table_from_bbox.<locals>.<genexpr>   s#     0T)QADD)s   58)r0   getr   FZ_STEXT_BLOCK_GRIDr(   Rectsortedr&   appendr   roundlenrG   rangerC   r$   is_emptytuple)r-   r%   r.   cellsblockr    r3   r>   xmaxuymaxunyposr!   yuncr8   nxposirow_boxr   	row_words	this_xposjcells                `        r   make_table_from_bboxr`      s   E %X4EyyE555<<f&D %-^4D%-^4D )LE E!8dz22a)ma'"ILLO  5#e*q.D012E  14aQ45=TQqT4E1wwrQTWW 3u:>",,tww$''5Q<HzBz!Q'\AzBW	 %U1S0T)0T-TQ	Us9~)*A<<	!eh	!a%8H%TUPU,WD===U4[) + # L! 2 CUs$   )I9	I
I
I
,IIc           	      T   SnU R                  5       S    GH  nUS   S:w  a  M  US   n US   US   :  d$  US   US   :  d  US   US   :  d  US   US   :  a  MG  US	    GH.  nUS   n US   US   :  d$  US   US   :  d  US   US   :  d  US   US   :  a  M<  U(       a  X2(       a  S
OS-  nUS   S:H  =(       d    US   S:H  nUS    GH  n	U	S   n
 U
S   US   :  d$  U
S   US   :  d  U
S   US   :  d  U
S   US   :  a  M<  SnU	S    HU  nUS   n[        R                  " US   5      n[        X-  5      S[        U5      -  :  a  X-  nMD  U[        ;   d  MP  US-  nMW     U(       d  M  U(       d  X;-  nM  SnSnU(       a  U	S   [
        -  (       a
  US-  nSU-   nU	S   [        -  (       a
  US-  nSU-   nU	S   [        R                  -  (       a
  US-  nSU-   nU	S   [        R                  -  (       a
  US-  nSU-   n[        U	S   5      S:  a  UR                  5       n[        U5      =n(       a%  UR                  U5      (       a  USU*  U-   U-   nGM  UR                  5       (       d  US-  nGM  X?U-   U-   -  nGM     GM1     GM     UR                  5       $ )a  Extract text from a rect-like 'cell' as plain or MD styled text.

This function should ultimately be used to extract text from a table cell.
Markdown output will only work correctly if extraction flag bit
TEXT_COLLECT_STYLES is set.

Args:
    textpage: A PyMuPDF TextPage object. Must have been created with
        TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES.
    cell: A tuple (x0, y0, x1, y1) defining the cell's bbox.
    markdown: If True, return text formatted for Markdown.

Returns:
    A string with the text extracted from the cell.
 blocksr2   r   r    r   r
   r	   lines<br>
dir)r   r
   )r
   r   spanscharscg      ? 
char_flagsz~~z**flags_`N)extractRAWDICTr(   rK   abswhite_spacesTEXT_STRIKEOUT	TEXT_BOLDTEXT_FONT_ITALICTEXT_FONT_MONOSPACEDrO   rstripendswithstrip)r-   r_   markdowntextrT   
block_bboxlinelbbox
horizontalspansbbox	span_textchar	this_charr    prefixsuffixlss                     r   extract_cellsr      s     D((*84=A6]
!}tAw&!}tAw&!}tAw&!}tAw&'NDLE8d1g%8d1g%8d1g%8d1g%(4 e.G$u+2GJWVQx$q')Qx$q')Qx$q')Qx$q') 	 MD $S	I"<<V5D4;'#D	/9!.	"l2!S(	 * !%D$|"4~"EdNF!F]F%	1dNF!F]F=7#;#;;cMF 6\F=7#?#??cMF 6\FtG}%) ) 0 0 2I f+%B%4==+@+@":	1F:D$??,, 2V ;;k &# # 5h ::<r   c                       \ rS rSrSrg)
UnsetFloati1  r6   N__name__
__module____qualname____firstlineno____static_attributes__r6   r   r   r   r   1      r   r   )snap_tolerancesnap_x_tolerancesnap_y_tolerancejoin_tolerancejoin_x_tolerancejoin_y_toleranceedge_min_lengthmin_words_verticalmin_words_horizontalintersection_toleranceintersection_x_toleranceintersection_y_tolerance)rd   lines_strictr{   explicitr	   r
   g      @   rC   toprG   bottomffffifflfiflst)u   ﬀu   ﬃu   ﬄu   ﬁu   ﬂu   ﬆu   ﬅc                     [        U [        5      (       a  U $ [        U [        5      (       a  [        U 5      $ [        U S5      (       a  U R	                  S5      nU$ [        U 5      $ )Nto_dictrecords)
isinstancelistr   hasattrr   )
collectionress     r   to_listr   ]  sZ    *d##	J	)	)J	Y	'	'  +
Jr   c            	       J    \ rS rSrSrSSS jjr   SS\S\S\S\4S	 jjr	S
r
g)TextMapii  z
A TextMap maps each unicode character in the text to an individual `char`
object (or, in the case of layout-implied whitespace, `None`).
Nr   c                 b    Xl         SR                  [        [        S5      U5      5      U l        g )Nrb   r   )tuplesjoinmapr   	as_stringselfr   s     r   __init__TextMap.__init__o  s"    Z]F!;<r   
main_groupreturn_groupsreturn_charsc                 6   U R                   UR                  U5      UR                  U5       nU VVs/ s H  u  pgUc  M
  UPM     nnn[        U5      u  ppUR	                  U5      U	U
UUS.nU(       a  UR                  5       US'   U(       a  XS'   U$ s  snnf )N)r{   rC   r   rG   r   groupsri   )r   startendobjects_to_bboxgroupr   )r   mr   r   r   subsetr{   rj   ri   rC   r   rG   r   results                 r   match_to_dictTextMap.match_to_dicts  s     QWWZ01553DE$*<FyaF<-e4 GGJ'
  xxzF8#7O# =s   	B B)r   r   r   r   N)r   TT)r   r   r   r   __doc__r   intbooldictr   r   r6   r   r   r   r   i  sI    
= "!  	
  
 r   r   c                   f    \ rS rSrSrSS jrSSSSS\\SS\SSS4S	\	S
\
S\
S\	S\	S\	S\4S jjrSrg)WordMapi  z
A WordMap maps words->chars.
r   Nc                     Xl         g r   r   r   s     r   r   WordMap.__init__  s    r   Fr   Tlayoutlayout_width_charslayout_height_charsuse_text_flow	presortedexpand_ligaturesc                 f   / n[        U R                  5      (       d  [        U5      $ U(       a  [        O0 nU(       ag  U(       a  U(       a  [	        S5      eO[        [        X&-  5      5      nU(       a  U(       a  [	        S5      eO[        [        X7-  5      5      nS/U-  nO/ nSnU(       d  U(       a  U R                  O[        U R                  S S9nUS   S   nUS   US   -
  n[        [        US	 U
5      5       GHV  u  nnU(       a  US   S   S   UU	-   -
  U-  OSn[        [        US:  5      [        U5      U-
  5      n[        U5       H5  n[        U5      (       a  US
   S   S:X  a  UU-  nUR                  S5        M7     UU-  nSnU(       d  U(       a  UO
[        US S9nU H  u  nnU(       a  US   U-
  U-  OSn[        [        SU5      [        U5      U-
  5      nUS/U-  -  nUU-  nU H<  nUR                  US   US   5      n U  H  n!UR                  U!U45        US-  nM     M>     M     U(       d  GMJ  US/UU-
  -  -  nGMY     U(       aD  UUS-   -
  n"[        U"5       H  nUS:  a  UU-  nUR                  S5        M!     US
   S:X  a  USS
 n[        U5      $ )a  
Given a list of (word, chars) tuples (i.e., a WordMap), return a list of
(char-text, char) tuples (i.e., a TextMap) that can be used to mimic the
structural layout of the text on the page(s), using the following approach:

- Sort the words by (doctop, x0) if not already sorted.

- Calculate the initial doctop for the starting page.

- Cluster the words by doctop (taking `y_tolerance` into account), and
  iterate through them.

- For each cluster, calculate the distance between that doctop and the
  initial doctop, in points, minus `y_shift`. Divide that distance by
  `y_density` to calculate the minimum number of newlines that should come
  before this cluster. Append that number of newlines *minus* the number of
  newlines already appended, with a minimum of one.

- Then for each cluster, iterate through each word in it. Divide each
  word's x0, minus `x_shift`, by `x_density` to calculate the minimum
  number of characters that should come before this cluster.  Append that
  number of spaces *minus* the number of characters and spaces already
  appended, with a minimum of one. Then append the word's text.

- At the termination of each line, add more spaces if necessary to
  mimic `layout_width`.

- Finally, add newlines to the end if necessary to mimic to
  `layout_height`.

Note: This approach currently works best for horizontal, left-to-right
text, but will display all words regardless of orientation. There is room
for improvement in better supporting right-to-left text, as well as
vertical text.
z;`layout_width` and `layout_width_chars` cannot both be set.z=`layout_height` and `layout_height_chars` cannot both be set.)rk   Nr   c                 $    [        U S   S   5      $ Nr   doctopfloatr7   s    r   r9   $WordMap.to_textmap.<locals>.<lambda>  s    51h3Hr   r<   r   r   c                 $    [        U S   S   5      $ r   r   r7   s    r   r9   r     s    uQqT(^/Dr   rA   rf   )rf   Nc                 $    [        U S   S   5      $ )Nr   rC   r   r7   s    r   r9   r     s    eAaDJ.?r   rC   r
   r{   N)rO   r   r   	LIGATURES
ValueErrorr   rN   rL   	enumeratecluster_objectsr   rP   rM   r   rI   )#r   r   layout_widthlayout_heightr   r   	x_density	y_densityx_shifty_shifty_tolerancer   r   r   _textmap
expansions
blank_linenum_newlineswords_sorted_doctop
first_worddoctop_startrZ   wsy_distnum_newlines_prependline_lenline_words_sorted_x0wordri   x_distnum_spaces_prependrj   lettersletternum_newlines_appends#                                      r   
to_textmapWordMap.to_textmap  s   f 4;;8$$"2Y
!$U   
 &)|/G)H%I"" $W  !
 '*%0I*J&K#%);;JJ M KK)HI 	 )+A.
!(+j.??#%Dk
EAr  Aq(#|g'=>)K 
 $'AE
f,	$  /08}}Q4(?
*H- 1
 00LH  B$?@ !  4e?E$t*w.);1%(Q)95=8;S%T"[M,>>>..A(nnQvY&	BG") 4 A #*   4 v[M-?(-JKK]
b "59I"J./q5
*H- 0 ||+#CR=x  r   r   r   )r   r   r   r   r   r   DEFAULT_X_DENSITYDEFAULT_Y_DENSITYDEFAULT_Y_TOLERANCEr   r   r   r  r   r6   r   r   r   r     s    
 "##$##'#!%V!V!
  V! !V! V! V! V! 
V! V!r   r   c            	           \ rS rSr\\SSSSSSS4	S\4S jjrS\4S jr	S	\4S
 jr
S rS rS rS	\4S jrS\S	\4S jrSrg)WordExtractori0  FTNkeep_blank_charsc
                     Xl         X l        X0l        X@l        XPl        X`l        Uc  / OUU l        USL a  [        R                  O
U=(       d    SU l	        U	(       a  [        U l        g 0 U l        g )NTrb   )x_tolerancer   r  r   horizontal_ltrvertical_ttbextra_attrsstringpunctuationsplit_at_punctuationr   r   )
r   r
  r   r  r   r  r  r  r  r   s
             r   r   WordExtractor.__init__1  sm     '& 0*,(!,!42+
 $t+ &," 	! (8)Rr   ordered_charsc           
        ^  [        U5      u  p#pEUS   S   US   S   -
  nUS   S   nU(       a  T R                  (       a  OT R                  (       a  SOSnUS   S   n	Sn
U(       d  U	S   S:  a  [        U5      nSn
U	S   S:  a  U	S	   S:  a  S
n
OU	S   S:  a  Sn
SR	                  U 4S jU 5       5      UUUX6-   UUUU
S.	nT R
                   H  nUS   U   X'   M     U$ )Nr   r   r   uprightr
   rA   matrix  r	      Z   rb   c              3   h   >#    U  H'  nTR                   R                  US    US    5      v   M)     g7fr{   N)r   rI   )r   rj   r   s     r   r"   ,WordExtractor.merge_chars.<locals>.<genexpr>a  s0      CPa##AfIqy99=s   /2)	r{   rC   rG   r   r   r   r  	directionrotation)r   r  r  reversedr   r  )r   r  rC   r   rG   r   
doctop_adjr  r  r  r  r   r=   s   `            r   merge_charsWordExtractor.merge_charsN  s   -m<"1%h/-2B52II
"9-18$---d>O>OAVX	q!(+6!9q=$]3MH!9q=VAY]HAY]H GG CP  &" 
 ##C%a(-DI $ r   r   c                    US   (       aV  U R                   nU R                  nUS   nUS   nU R                  (       a  US   nUS   nUS   n	OhUS   * nUS   * nUS   * n	OUU R                  nU R                   nUS   nUS   nU R                  (       a  US   nUS   nUS   n	OUS   * nUS   * nUS   * n	[	        X:  =(       d    XU-   :  =(       d    XeU-   :  5      $ )a  This method takes several factors into account to determine if
`curr_char` represents the beginning of a new word:

- Whether the text is "upright" (i.e., non-rotated)
- Whether the user has specified that horizontal text runs
  left-to-right (default) or right-to-left, as represented by
  self.horizontal_ltr
- Whether the user has specified that vertical text the text runs
  top-to-bottom (default) or bottom-to-top, as represented by
  self.vertical_ttb
- The x0, top, x1, and bottom attributes of prev_char and
  curr_char
- The self.x_tolerance and self.y_tolerance settings. Note: In
  this case, x/y refer to those directions for non-rotated text.
  For vertical text, they are flipped. A more accurate terminology
  might be "*intra*line character distance tolerance" and
  "*inter*line character distance tolerance"

An important note: The *intra*line distance is measured from the
*end* of the previous character to the *beginning* of the current
character, while the *inter*line distance is measured from the
*top* of the previous character to the *top* of the next
character. The reasons for this are partly repository-historical,
and partly logical, as successive text lines' bounding boxes often
overlap slightly (and we don't want that overlap to be interpreted
as the two lines being the same line).

The upright-ness of the character determines the attributes to
compare, while horizontal_ltr/vertical_ttb determine the direction
of the comparison.
r  r   rC   rG   r   )r
  r   r  r  r   )
r   	prev_char	curr_charr8   r!   aycyaxbxcxs
             r   char_begins_new_word"WordExtractor.char_begins_new_words  s'   N Y  A  A5!B5!B""t_t_t_o%o%o%   A  A4B4B  u%x(u%))&&))W !V !V
 	
r   c              #     ^#    / mSU4S jjnU H  nUS   nU R                   (       d'  UR                  5       (       a  U" S 5       S h  vN   M@  X@R                  ;   a"  U" U5       S h  vN   U" S 5       S h  vN   Mq  T(       a,  U R                  TS   U5      (       a  U" U5       S h  vN   M  TR	                  U5        M     T(       a  Tv   g g  N Nm N_ N.7f)Nc              3   6   >#    T(       a  Tv   U c  / mg U /mg 7fr   r6   )new_charcurrent_words    r   start_next_word:WordExtractor.iter_chars_to_words.<locals>.start_next_word  s!      ""!)!12LzL   r{   rA   r   )r  isspacer  r*  rM   )r   r  r0  r   r{   r/  s        @r   iter_chars_to_words!WordExtractor.iter_chars_to_words  s     	B "D<D((T\\^^*4000222*4000*4000$";";L<Ld"S"S*4000 ##D) ""   1 10 1sH   AC	C
 C*C+C:C;2C-C.)CCCCc              #     #    S[         4S jn[        [        U5      US5       H  nUS   S   nU(       a  SOSn[        U[        U5      U R                  5      nU Hj  nU(       a  SOSn[        U[        U5      S9n	U(       a  U R                  (       d'  OU R                  (       d  [        U	5       S h  vN   M`  U	 S h  vN   Ml     M     g  N N7f)Nr   c                      [        U S   5      * $ Nr  )r   r7   s    r   upright_key2WordExtractor.iter_sort_chars.<locals>.upright_key  s    )%%%r   r   r  r   rC   r<   )	r   r   r   r   r   rL   r  r  r  )
r   ri   r9  upright_clusterr  cluster_keysubclustersscsort_keyto_yields
             r   iter_sort_charsWordExtractor.iter_sort_chars  s     	&c 	&  /tE{KKO%a(3G&-(4K *K!8$:J:JK "#*4!"*X*>? 07+++D<M<M'111''' "  L  2's$   B3C5C6CCCCc              #     #    U R                   (       a  UOU R                  U5      n[        S/U R                  Q76 n[        R
                  " X#5      nU H2  u  pVU R                  U5       H  nU R                  U5      U4v   M     M4     g 7fr8  )r   rA  r   r  	itertoolsgroupbyr4  r   )r   ri   r  grouping_keygrouped_charskeyvals
char_group
word_charss           r   iter_extract_tuples!WordExtractor.iter_extract_tuples  s}     !%!3!39M9Me9T!)?d.>.>?!))-F#0G"66zB
''
3Z@@ C $1s   B
Bc                 H    [        [        U R                  U5      5      5      $ r   )r   r   rK  )r   ri   s     r   extract_wordmapWordExtractor.extract_wordmap  s    tD44U;<==r   ri   c                 H    [        S U R                  U5       5       5      nU$ )Nc              3   *   #    U  H	  u  pUv   M     g 7fr   r6   )r   r   rJ  s      r   r"   .WordExtractor.extract_words.<locals>.<genexpr>  s     R2Q.dT2Q   )r   rK  )r   ri   wordss      r   extract_wordsWordExtractor.extract_words  s"    R$2J2J52QRRr   )	r   r  r  r  r  r   r  r
  r   )r   r   r   r   DEFAULT_X_TOLERANCEr  r   r   r   r   r*  r4  rA  rK  r   rN  rU  r   r6   r   r   r  r  0  s~     ('!&"@ 	@:# #JI
 
	I
V>(0A> >4 D r   r  ri   c                 6    [        S0 UD6R                  U 5      $ )Nr6   )r  rU  )ri   kwargss     r   rU  rU    s    "6"0077r   c           
         UR                  SS05        [        S0 [         Vs0 s H  o"U;   d  M
  X!U   _M     snD6nUR                  U 5      nUR                  " S0 [
         Vs0 s H  o"U;   d  M
  X!U   _M     snD6nU$ s  snf s  snf )Nr   Tr6   )updater  WORD_EXTRACTOR_KWARGSrN  r  TEXTMAP_KWARGS)ri   rY  k	extractorwordmaptextmaps         r   chars_to_textmaprb    s    
MM;%& !6
F!6Av+<1Qi<!6
FI ''.G   !/
?A;<1Qi<
?G N G @s   	B	B%	B2	Bc           
      >   [        U 5      n [        U 5      S:X  a  gUR                  S5      (       a  [        U 40 UD6R                  $ UR                  S[
        5      n[        S0 [         Vs0 s H  o3U;   d  M
  X1U   _M     snD6nUR                  U 5      nU(       a	  US   S   nOSnUS:X  a6  UR                  S S9  S	R                  U Vs/ s H  owS
   PM	     sn5      nU$ US:X  a6  UR                  S S9  S	R                  U Vs/ s H  owS
   PM	     sn5      nU$ [        U[        S5      U5      nSR                  S U 5       5      nUS:X  a2  SR                  [        U5       V	s/ s H  oS:w  a  U	OS	PM     sn	5      nU$ s  snf s  snf s  snf s  sn	f )Nr   rb   r   r   r  r  c                     U S   U S   * 4$ NrG   r   r6   ws    r   r9   extract_text.<locals>.<lambda>0  s    ag%y%9r   r<   rk   r{   r  c                     U S   * U S   4$ re  r6   rf  s    r   r9   rh  3  s    qwh%%9r   r   rf   c              3   R   #    U  H  nS R                  S U 5       5      v   M     g7f)rk   c              3   *   #    U  H	  oS    v   M     g7fr  r6   )r   r   s     r   r"   )extract_text.<locals>.<genexpr>.<genexpr>7  s     &EF|rS  N)r   )r   r}   s     r   r"   extract_text.<locals>.<genexpr>7  s$     XRW$chh&E&EEERWs   %'r  r6   )r   rO   rI   rb  r   r  r  r\  rU  sortr   r   r   r  )
ri   rY  r   r^  r_  rT  r  rg  rd   rj   s
             r   extract_textro    s   ENE
5zQzz(00:::jj0CD! 
%:J%:6k|q)|%:J
	 ''.Qx
+HHr>JJ9J:HH7Ai78E  _JJ9J:HH7Ai78E  $E:h+?MEIIXRWXXE3HUO TOqDy!c"9O TU) K 8 8
 !Us   -	F:	FFF/F
line_charsc                     SnS n[        U [        S5      S9 H"  nUb  US   X1-   :  a  US-  nUS   nX$S   -  nM$     U$ )Nrb   rC   r<   rk   rG   r{   )rL   r   )rp  	tolerancecolllast_x1r   s        r   collate_lineru  >  s]     DGzz$'78d4jG4G&HCKDt*V	 9
 Kr   c                    ^^^ [        SSSS5      m[        SS5      mS[        4UUU4S jjnU" U 5      n[        X0R                  S	9$ )
u   
Removes duplicate chars — those sharing the same text, fontname, size,
and positioning (within `tolerance`) as other characters in the set.
fontnamesizer  r{   r   rC   ri   c              3     >#    [        U TS9n[        R                  " UTS9 HW  u  p#[        [	        U5      [        S5      T5       H/  n[        U[        S5      T5       H  n[        UTS9S   v   M     M1     MY     g 7f)Nr<   r   rC   r   )rL   rD  rE  r   r   r   )	ri   sorted_charsgrp	grp_chars	y_cluster	x_clusterr=   pos_keyrr  s	         r   yield_unique_chars(dedupe_chars.<locals>.yield_unique_charsT  s|     e-'//#FNC,YH!5y	 "1z$/"I !8;;" Gs   A<A?r<   )r   r   rL   index)ri   rr  r  dedupedr=   r  s    `  @@r   dedupe_charsr  L  sM    
 ZF
;C4(G	<$ 	< 	< !'G'{{++r   c                 B    [        U 5      nU S   U S   :X  a  SOSUS'   U$ )Nr   r   hvorientation)r   )r}   edges     r   line_to_edger  c  s+    :D"&u+h"?#cDKr   c           	      \   [        S5       Vs/ s H  n[        U 5      PM     snu  p#pEUR                  SSU S   U S   SS.5        UR                  SSU S   U S   U S	   -   U S
   U S	   -   SS.5        UR                  SSU S   SS.5        UR                  SSU S   SS.5        X#XE/$ s  snf )N   	rect_edger   r   r   r  )object_typeheightr   r   r  r   r  r   )r  r  r   r   r   r  rC   r  )r  widthrG   r  rG   )r  r  rC   r  )rP   r   r[  )r.   r8   r   r   leftrights         r   rect_to_edgesr  i  s    49!H=HqT
H=CJJ&t*5k	
 MM&t*;h/8ntH~5	
	 	KK&t*		
 
LL&t*		
 %%I  >s   B)c                    [        U S   U S   SS  5      nU VVs/ s H  u  p#S[        US   US   5      [        US   US   5      [        US   US   5      [        US   US   5      U S   U S   -
  -   [        US   US   5      [        US   US   -
  5      [        US   US   -
  5      US   US   :X  a  SOUS   US   :X  a  SOS S	.	PM     snn$ s  snnf )
Nptsr
   
curve_edger   r   r   r  r  )	r  rC   rG   r   r   r   r  r  r  )zipr   r   rq   )curvepoint_pairsp0p1s       r   curve_to_edgesr    s   eElE%L$45K " "FB (beRU#beRU#r!ube$"Q%A'5?U5\+IJ"Q%A'AA'"Q%"Q%-("$Q%2a5.3ber!unsRV
	
 "  s   B4Cc                 n    U S   nSU;   a  U /$ US:X  a  [        U 5      /$ [        [        S.U   " U 5      $ )Nr  _edger}   )r.   r  )r  r  r  )objts     r   obj_to_edgesr    sD    MA!|u	
fS!""%?B3GGr   c                 r   ^^^ TS;  a  [        S5      eS[        4UUU4S jjn[        [        X@5      5      $ )N)r  r  NOrientation must be 'v' or 'h'r   c                    > U S   S:X  a  SOSnTb  U S   T:H  OSnTS L =(       d    U S   T:H  n[        U=(       a    U=(       a    X   T:  5      $ )Nr  r  r  r  r  Tr   )edim
et_correctorient_correct	edge_type
min_lengthr  s       r   testfilter_edges.<locals>.test  sb    M*c1hw6?6KQ}%2QU
$,O-0@K0OJL>Lqv7KMMr   )r   r   r   filter)edgesr  r  r  r  s    ``` r   filter_edgesr    s@     **9::N4 N N t#$$r   c                    US:X  a  [        U 5       Vs/ s H  o"/PM     sn$ [        U 5      S:  a  [        U 5       Vs/ s H  o"/PM     sn$ / n[        [        U 5      5      n U S   /nU S   nU SS   H3  nX%U-   ::  a  UR                  U5        OUR                  U5        U/nUnM5     UR                  U5        U$ s  snf s  snf )Nr   r   r
   )rL   rO   r   rM   )xsrr  r8   r   current_grouplasts         r   cluster_listr    s    A~#BZ(ZZ((
2w{#BZ(ZZ((F	fRj	BUGMa5DV	!"  #MM-(CM  MM- M )(s
   B>Cc           
          [        [        [        U 5      5      U5      n[        U5       VVVs/ s H  u  p4U Vs/ s H  oUU4PM     snPM     nnnn[	        [
        R                  " U6 5      $ s  snf s  snnnf r   )r  r   setr   r   rD  chain)valuesrr  clustersrZ   value_clustervalnested_tupless          r   make_cluster_dictr    sq    DV-y9H ENhDWDW0@]+]cq]+DW   	/00 	,s   A3A.A3.A3c           
      D  ^^
 [        T5      (       d  [        T5      m[        TU 5      n[        X25      m
[        S5      [        S5      pT[	        U
U4S jU  5       US9n[
        R                  " XeS9nU VV	s/ s H  u  p[        [        XI5      5      PM     sn	n$ s  sn	nf )Nr   r
   c              3   T   >#    U  H  oTR                  T" U5      5      4v   M     g 7fr   )rI   )r   r8   cluster_dictkey_fns     r   r"   "cluster_objects.<locals>.<genexpr>  s#     Jr!!1!1&)!<=rs   %(r<   )callabler   r   r  rL   rD  rE  r   )r  r  rr  r  get_0get_1cluster_tuplesgroupedr^  r  r  s    `        @r   r   r     s    FF#_F$V7La=*Q-5JrJPUVN:G,34GDADUG444s   8 Baxisc                 D   US;   d   eUS:X  a  SU S   U-   4SU S   U-   4/nUS:X  aE  SU S   U-   4SU S   U-   4/nSU ;   a  USU S   U-   4/-  nS	U ;   a  US	U S	   U-
  4S
U S
   U-
  4/-  nU R                  [        U R                  5       5      [        W5      -   5      $ )N)r  r  r  rC   rG   r  r   r   r   r   r   )	__class__rR   items)r  r  value	new_itemss       r   move_objectr    s    :s{3t9u$%3t9u$%
	 s{CJ&'s8}u,-
	 s?8S]U%:;<<I3;s4y5()s4y5() I ==syy{+eI.>>??r   attrc                    SSSSS.U   n[        U 5      n[        U[        U5      U5      nU Vs/ s H-  n[        [	        [        U5      U5      5      [        U5      -  PM/     nn[        XW5       VVV	s/ s H'  u  phU V	s/ s H  n	[        XXU   -
  5      PM     sn	PM)     n
nnn	[        [        R                  " U
6 5      $ s  snf s  sn	f s  sn	nnf )Nr  r  rC   rG   r   r   )
r   r   r   sumr   rO   r  r  rD  r  )objsr  rr  r  	list_objsr  clusteravgsavgr  snapped_clusterss              r   snap_objectsr    s    S<TBDT
Iy*T*:IFHNVWh7CJt$g./#g,>hDW  //LG =DDGSS$i	0GD/   	!1233 XDs   4C5CCCCc                     / / S.nU  H  nX4S      R                  U5        M     [        US   SU5      n[        US   SU5      nXV-   $ )zg
Given a list of edges, snap any within `tolerance` pixels of one another
to their positional average.
r  r  r  r  rC   r  r   )rM   r  )r  r
  r   by_orientationr  	snapped_v	snapped_hs          r   
snap_edgesr    s_     B'N'(//2  ^C0$DI^C0%EI  r   r=   c                    US;   d   eX   nX#-
  nX4/nUS:X  a$  X S   ::  d   eUR                  SU S   U-
  45        OUS:X  a#  X S   :  d   eUR                  SX S   -
  45        OUS:X  a\  X S   ::  d   eUR                  SU S   U-   45        UR                  SU S   U-
  45        S	U ;   a  UR                  S	U S	   U-
  45        OHUS:X  aB  X S   :  d   eUR                  SU S   U-   45        S
U ;   a  UR                  S
U S
   U-
  45        U R                  [        U R                  5       5      [        U5      -   5      $ )Nr  rC   rG   r  r   r   r   r  r   r   )rM   r  rR   r  )r  r=   r  	old_valuediffr  s         r   resize_objectr  !  s   ////ID	I d{D	!!!'3t9u#456	D	!!!'5t9#456	H%%%(CMD$89:(CMD$89:3;dCI$456	E
"""(CMD$89:3;dCI$456==syy{+eI.>>??r   r  c                 (   US:X  a  Su  p4OUS:X  a  Su  p4O[        S5      e[        [        U [        U5      S95      nUS   /nUSS	  HE  nUS
   nXs   X   U-   ::  a  Xt   X   :  a  [	        XXt   5      US
'   M2  M4  UR                  U5        MG     U$ )zs
Given a list of edges along the same infinite line, join those that
are within `tolerance` pixels of one another.
r  rF   r  )r   r   r  r<   r   r
   NrA   )r   r   rL   r   r  rM   )	r  r  rr  min_propmax_propsorted_edgesjoinedr  r  s	            r   join_edge_groupr  <  s    
 c'(		,(9::u*X*>?@L1oF!"bz;4>I56{T^+*41;Gr
 ,
 MM!  Mr   c                    ^^ S nUS:  d  US:  a  [        XU5      n [        XS9n[        R                  " XeS9nUU4S jU 5       n[	        [        R
                  " U6 5      n U $ )zp
Using the `snap_edges` and `join_edge_group` methods above,
merge a list of edges into a more "seamless" list.
c                 0    U S   S:X  a  SU S   4$ SU S   4$ )Nr  r  r   r  rC   r6   )r  s    r   	get_groupmerge_edges.<locals>.get_groupc  s-    #%e%%d$$r   r   r<   c              3   \   >#    U  H!  u  p[        X!S    US    S:X  a  TOT5      v   M#     g7f)r   r  N)r  )r   r^  r  r   r   s      r   r"   merge_edges.<locals>.<genexpr>n  s?       $HA 	Q4adck*?O	
 	
 $s   ),)r  rL   rD  rE  r   r  )	r  r   r   r   r   r  _sortededge_groupsedge_gens	      ``    r   merge_edgesr  W  sl    % !/!354DEU*G##G;K $	H (+,ELr   c                 (    U S   U S   U S   U S   S.$ )zX
Return the rectangle (i.e a dict with keys "x0", "top", "x1",
"bottom") for an object.
r   r
   r   r	   )rC   r   rG   r   r6   )r    s    r   bbox_to_rectr  x  s#    
 q'$q'aDGLLr   c                 *    [        [        U 5      5      $ )z
Given an iterable of objects, return the smallest rectangle (i.e. a
dict with "x0", "top", "x1", and "bottom" keys) that contains them
all.
)r  r   objectss    r   objects_to_rectr    s     011r   c                 l    [        U 6 u  pp4[        U5      [        U5      [        U5      [        U5      4$ )z_
Given an iterable of bounding boxes, return the smallest bounding box
that contains them all.
)r  r   r   )bboxesrC   r   rG   r   s        r   merge_bboxesr    s1    
 v,BRGSXs2wF44r   c                 4    [        [        [        U 5      5      $ )zX
Given an iterable of objects, return the smallest bounding box that
contains them all.
)r  r   bbox_getterr  s    r   r   r     s    
 K122r   word_thresholdc           
        ^ [        U [        S5      S5      n[        U4S jU5      n[        [	        [
        U5      5      n[        U5      S:X  a  / $ [        [	        [        S5      U5      5      n[        [	        [        S5      U5      5      n/ nU H(  nUUUUS   US   Xe-
  SS.UUUS	   US	   Xe-
  SS./-  nM*     U$ )
z]
Find (imaginary) horizontal lines that connect the tops
of at least `word_threshold` words.
r   r
   c                     > [        U 5      T:  $ r   rO   r8   r   s    r   r9   "words_to_edges_h.<locals>.<lambda>      c!f&>r   r   rC   rG   r  )rC   rG   r   r   r  r  r   )	r   r   r  r   r   r  rO   r   r   )	rT  r   by_toplarge_clustersrectsmin_x0max_x1r  r   s	    `       r   words_to_edges_hr    s    
 UJu$5q9F>GN_n56E
5zQ	Z%u-.FZ%u-.FE xE(" {H+"
 	
 0 Lr   c                     U u  p#pEUu  pgp[        X&5      n
[        XH5      n[        XY5      n[        X75      nX-
  nX-
  nUS:  a  US:  a  X-   S:  a  XX4$ g r5   r   )aba_lefta_topa_righta_bottomb_leftb_topb_rightb_bottomo_lefto_righto_bottomo_topo_widtho_heights                   r   get_bbox_overlapr    ss    '($F7'($F7 F'#G8&HEGH1}A(*<q*@w11r   c           
        ^^ [        U [        S5      S5      n[        U [        S5      S5      nS n[        XS5      nX#-   U-   n[        US S9n[        U4S jU5      n[	        [        [        U5      5      n	/ n
U	 H2  m[        U4S jU
 5       5      nU(       a  M!  U
R                  T5        M4     U
(       d  / $ [        [        U
5      n[	        [        U[        S5      S95      n[        [        [        S5      U5      5      n[        [        [        S	5      U5      5      n[        [        [        S
5      U5      5      nU Vs/ s H  nUS   US   UUUU-
  SS.PM     snUUUUUU-
  SS./-   $ s  snf )zm
Find (imaginary) vertical lines that connect the left, right, or
center of at least `word_threshold` words.
rC   r
   rG   c                 0    [        U S   U S   -   5      S-  $ )NrC   rG   r   r   )r   s    r   
get_center$words_to_edges_v.<locals>.get_center  s    T$Z$t*,-11r   c                     [        U 5      * $ r   r  r7   s    r   r9   "words_to_edges_v.<locals>.<lambda>  s
    c!fWr   r<   c                     > [        U 5      T:  $ r   r  r  s    r   r9   r$    r  r   c              3   <   >#    U  H  n[        TU5      v   M     g 7fr   )r  )r   rj   r    s     r   r"   #words_to_edges_v.<locals>.<genexpr>  s     J9IA&tQ//9Is   r   r   r  rC   rG   r   r   r  r  )r   r   rL   r  r   r   r   r$   rM   r  r   r   )rT  r   by_x0by_x1r!  	by_centerr  sorted_clustersr  r  condensed_bboxesoverlapcondensed_rectssorted_rectsr  min_top
max_bottomr  r    s    `                @r   words_to_edges_vr3    s    E:d#3Q7EE:d#3Q7E2  15I}y(H X+<=O>PN #o~67F J9IJJw##D) 
 	,(89OJt4DEFLZ%|45F#j'67GSH-|<=J 
 A D'D'  7*	
 
   7*	
		  
s   E3c           	        ^ 0 nS V^s/ s H  m[        [        U4S jU 5      5      PM     snu  pV[        U[        SS5      S9 H  n[        U[        SS5      S9 H  nUS   US   U-   ::  d  M  US   US   U-
  :  d  M%  US   US   U-
  :  d  M6  US   US   U-   ::  d  MG  US   US   4n	X;  a  / / S.X9'   X9   S   R	                  U5        X9   S	   R	                  U5        M     M     U$ s  snf )
z]
Given a list of edges, return the points at which they intersect
within `tolerance` pixels.
r  c                    > U S   T:H  $ )Nr  r6   )r8   os    r   r9   (edges_to_intersections.<locals>.<lambda>  s    a.!3r   rC   r   r<   r   rG   r  r  )r   r  rL   r   rM   )
r  r
  r   intersectionsr6  v_edgesh_edgesr  r  vertexs
       `     r   edges_to_intersectionsr<    s   
 MFPFPV3U;<jG GD%!89Zt%<=A5ah45x[QuX%;<tW4;!67tW4;!67D'1U8,.242,>M)%c*11!4%c*11!4 > : !s   #C/c                     [        U 5      $ )z(
Return the bounding box for an object.
)r  )r  s    r   obj_to_bboxr>  (  s     sr   c                 
  ^ ^^^^ S[         4U 4S jjm[        [        T R                  5       5      5      m[	        T5      mS[
        4UU U4S jjmUU4S j[        [	        T5      5       5       n[        [        SU5      5      $ )a   
Given a list of points (`intersections`), return all rectangular "cells"
that those points describe.

`intersections` should be a dictionary with (x0, top) tuples as keys,
and a list of edge objects as values. The edge objects should correspond
to the edges that touch the intersection.
r   c                 $  > S nU S   US   :X  a:  U" TU    S   5      R                  U" TU   S   5      5      n[        U5      (       a  gU S   US   :X  a:  U" TU    S   5      R                  U" TU   S   5      5      n[        U5      (       a  gg)Nc                 4    [        [        [        U 5      5      $ r   )r  r   r>  )r  s    r   edges_to_setCintersections_to_cells.<locals>.edge_connects.<locals>.edges_to_set:  s    s;.//r   r   r  Tr
   r  F)r   rO   )r  p2rB  commonr8  s       r   edge_connects-intersections_to_cells.<locals>.edge_connects9  s    	0 a5BqE>!-"3C"89FF]2.s34F 6{{a5BqE>!-"3C"89FF]2.s34F 6{{r   rZ   c                   > UTS-
  :X  a  g X   nXS-   S  nU Vs/ s H  oDS   US   :X  d  M  UPM     nnU Vs/ s H  oDS   US   :X  d  M  UPM     nnU Hn  nT
" X'5      (       d  M  U HV  nT
" X(5      (       d  M  US   US   4n	U	T;   d  M$  T
" X5      (       d  M3  T
" X5      (       d  MB  US   US   U	S   U	S   4s  s  $    Mp     g s  snf s  snf )Nr
   r   r6   )pointsrZ   ptrestr8   belowr  below_ptright_ptbottom_rightrF  r8  n_pointss             r   find_smallest_cell2intersections_to_cells.<locals>.find_smallest_cellO  s    1Y!eg 2DqaDBqEMD2 2DqaDBqEMD2H ..!$R22 (Xa[9 "]2%l==%l==qE2a5,q/<?KK "	   % 32s   C
C
CCc              3   6   >#    U  H  nT" TU5      v   M     g 7fr   r6   )r   rZ   rQ  rI  s     r   r"   )intersections_to_cells.<locals>.<genexpr>i  s     J7I!"61--7Ir2  N)r   r   rL   keysrO   r   rP   r  )r8  cell_genrF  rQ  rP  rI  s   ` @@@@r   intersections_to_cellsrW  /  sk     & &++-./F6{Hc  4 KuS[7IJHtX&''r   c                   ^ S[         4S jn[        U5      n[        5       m/ n/ n[        U5      (       Ga  [        U5      n[        U5       H  nU" U5      n[        U5      S:X  a2  T[        U5      -  mUR	                  U5        UR                  U5        ML  [        U4S jU 5       5      n	U	S:  d  Mi  T[        U5      -  mUR	                  U5        UR                  U5        M     [        U5      U:X  a:  UR	                  [        U5      5        TR                  5         UR                  5         [        U5      (       a  GM  [        U5      (       a  UR	                  [        U5      5        [        [        U5      S-
  SS5       H  n
[        R                  " 5       n[        5       n[        5       nXZ    H/  nX-  nUR                  US   5        UR                  US   5        M1     [        U5      S:  d=  [        U5      S:  d.  [        R                  U R                  U[        S95      (       d  M  XZ	 M     [!        US	 S
9nU$ )zy
Given a list of bounding boxes (`cells`), return a list of tables that
hold those cells most simply (and contiguously).
r   c                      U u  pp4X4X4X24X444$ r   r6   )r    rC   r   rG   r   s        r   bbox_to_corners(cells_to_tables.<locals>.bbox_to_cornerss  s#    "	B<"RLAAr   r   c              3   ,   >#    U  H	  oT;   v   M     g 7fr   r6   )r   rj   current_cornerss     r   r"   "cells_to_tables.<locals>.<genexpr>  s     "NA#7s   r
   rA   r   r-   c                 &    [        S U  5       5      $ )Nc              3   4   #    U  H  oS    US   4v   M     g7f)r
   r   Nr6   r   rj   s     r   r"   4cells_to_tables.<locals>.<lambda>.<locals>.<genexpr>  s     .GQ!ad|Qs   )r   )r  s    r   r9   !cells_to_tables.<locals>.<lambda>  s    3.GQ.G+Gr   r<   )rR   r   r  rO   rM   remover  clearrP   r(   
EMPTY_RECTaddrr   
issupersetget_textboxTEXTPAGErL   )pagerS   rZ  remaining_cellscurrent_cellstablesinitial_cell_countr_   cell_cornerscorner_countrZ   r   x1_valsx0_valsrj   r  r]  s                   @r   cells_to_tablesru  m  s   B B 5kO
 eOMF
o

 /)D*40L=!Q&3|#44$$T*&&t,  #"N"NN  !##s<'88O!((.#**40# *( }!33MM$}-.!!#!5 o

> =d=)* 3v;?B+ %%AFAKK!KK! 
 L17|a&&  % !   	% ,, V!GHGNr   c                       \ rS rSrS rSrg)	CellGroupi  c                 V   Xl         [        [        [        S5      [	        S U5      5      5      [        [        [        S5      [	        S U5      5      5      [        [        [        S5      [	        S U5      5      5      [        [        [        S5      [	        S U5      5      5      4U l        g Nr   r
   r   r	   )rS   r   r   r   r  r   r    )r   rS   s     r   r   CellGroup.__init__  sx    
JqM6$#678JqM6$#678JqM6$#678JqM6$#678	
	r   )r    rS   N)r   r   r   r   r   r   r6   r   r   rw  rw    s    
r   rw  c                       \ rS rSrSrg)TableRowi  r6   Nr   r6   r   r   r|  r|    r   r   r|  c                       \ rS rSrSrS rSrg)TableHeaderi  z9PyMuPDF extension containing the identified table header.c                 4    Xl         X l        X0l        X@l        g r   )r    rS   namesexternal)r   r    rS   r  aboves        r   r   TableHeader.__init__  s    	

r   )r    rS   r  r  N)r   r   r   r   r   r   r   r6   r   r   r~  r~    s
    Cr   r~  c                       \ rS rSrS r\S 5       r\S\4S j5       r\S\	4S j5       r
\S\	4S j5       rS\4S jrSS	 jrS
 rSS jrSrg)Tablei  c                 F    Xl         X l        U R                  5       U l        g r   )rl  rS   _get_headerheader)r   rl  rS   s      r   r   Table.__init__  s    	
&&(r   c           
         U R                   n[        [        [        S5      U5      5      [        [        [        S5      U5      5      [	        [        [        S5      U5      5      [	        [        [        S5      U5      5      4$ ry  )rS   r   r   r   r   )r   rj   s     r   r    
Table.bbox  sa    JJJqM1%&JqM1%&JqM1%&JqM1%&	
 	
r   r   c                    [        U R                  [        SS5      S9n[        [        [	        [        [        S5      U R                  5      5      5      5      n/ n[        R                  " U[        S5      5       HU  u  pEU Vs0 s H  ofS   U_M
     nn[        U Vs/ s H  oR                  U5      PM     sn5      n	UR                  U	5        MW     U$ s  snf s  snf )Nr
   r   r<   )rL   rS   r   r   r  r   rD  rE  r|  rI   rM   )
r   r  r  rowsr!   	row_cellsr_   xdictr8   rows
             r   r  
Table.rows  s    Aq)9:&SA

;<=>%--gz!}ELA/89yt!Wd]yE9"5"QIIaL"56CKK F  :5s   CC
c                 ,    [        U R                  5      $ r   )rO   r  )r   s    r   	row_countTable.row_count  s    499~r   c                 ~    [        U R                   Vs/ s H  n[        UR                  5      PM     sn5      $ s  snf r   )r   r  rO   rS   )r   r   s     r   	col_countTable.col_count  s+    $))4)QCL)4554s   :c           	         [         n/ nS[        4S jnU R                   H  n/ nU Vs/ s H  ot" XuR                  5      (       d  M  UPM     nnUR                   H  n	U	c  S n
OqU Vs/ s H  ot" Xy5      (       d  M  UPM     nn[        U5      (       a?  U	S   US'   U	S   US'   SU;   a  U	S   U	S   -
  US	'   U	S
   U	S   -
  US'   [        U40 UD6n
OSn
UR                  U
5        M     UR                  U5        M     U$ s  snf s  snf )Nr   c                     U S   U S   -   S-  nU S   U S   -   S-  nUu  pEpg[        X4:  =(       a    X6:  =(       a    X%:  =(       a    X':  5      $ )Nr   r   r   rC   rG   r  )r   r    v_midh_midrC   r   rG   r   s           r   char_in_bbox#Table.extract.<locals>.char_in_bbox  sa    %[4>1Q6E$Z$t*,1E"&BRV5:VELVu~ r   r   r   r
   r   r   r   r   r	   r   rb   )CHARSr   r  r    rS   rO   ro  rM   )r   rY  ri   	table_arrr  r  arrr   	row_charsr_   	cell_text
cell_charss               r   extractTable.extract  s)   		 	 99CC*/P%$<hh3O%IP		< $I *3")2l46N  " :,0Gy),0Gy)#v-59!WtAw5FF>26:1gQ6GF?3$0$Fv$F	$&	

9%# "$ S!- 0 - Q"s   DD(D<Dc                 ^   SnU R                   nU R                  nU R                   VVs/ s H  ofR                   Vs/ s H  owPM     snPM      nnn[	        U5       V	V
s/ s H  n	[	        U5       V
s/ s H  n
SPM     sn
PM!     nn	n
[        U5       H7  u  p[        U5       H#  u  pUc  M
  [        [        X   U	   SS9X   U	'   M%     M9     U(       a  [	        U5       H4  n	[	        US-
  5       H  n
X   U
S-      b  M  X   U
   X   U
S-   '   M!     M6     [	        U5       H4  n
[	        US-
  5       H  n	XS-      U
   b  M  X   U
   XS-      U
'   M!     M6     [        U R                  R                  5       HZ  u  pU(       d  SU
S-    3nUR                  SS5      nU(       a&  [        R                  " UR                  S	S
5      5      nX>S-   -  nM\     US-  nUSSR                  S [	        U R                  5       5       5      -   S-   -  nU R                  R                  (       a  SOSn	XS  H[  nSn[        U5       H>  u  pUc  SnU(       a&  [        R                  " UR                  S	S
5      5      nXS-   -  nM@     US-  nX?-  nM]     US-   $ s  snf s  snnf s  sn
f s  sn
n	f )a  Output table content as a string in Github-markdown format.

If "clean" then markdown syntax is removed from cell content.
If "fill_empty" then cell content None is replaced by the values
above (columns) or left (rows) in an effort to approximate row and
columns spans.

|NT)rz   r
   Colrf   re   -z&#45;c              3   &   #    U  H  nS v   M	     g7f)z---Nr6   )r   rZ   s     r   r"   $Table.to_markdown.<locals>.<genexpr>S  s      F0E10Es   z|
r   rb   )r  r  r  rS   rP   r   r   rk  r  r  replacehtmlescaper   r  )r   clean
fill_emptyoutputr  colsr   rj   
cell_boxesr^   rZ   rS   r  r_   namer}   s                   r   to_markdownTable.to_markdown!  s    ~~~~ 59II>Iq''*'Qq'*I
> 7<DkBkd,1$,kB
+FA$S>#"/ *-"2T#EHQK * ,  4[taxAxA.*/(1+Q ) ! 4[taxAU|A.*/(1+!eQ ) ! !!2!23GAQqSE{<<f-D{{4<<W#=>Sj F 4 	$# Fdnn0E FFFNN %%A1 9CD$S> <D;;t||C'ABDs
" * DLDNF  }m +> -Bs.   JJJ J)3J$?J)J$J)c                     SSK n0 nU R	                  5       nU R
                  nU R
                  R                  n[        U5      n[        U5       H  nXh   n	U	(       a  M  SU 3Xh'   M     U[        [        U5      5      :w  a*  [        U5       H  nXh   n	U	SU 3:w  d  M  U SU	 3Xh'   M     UR                  (       d  USS n[        U5       H>  nXh   n
/ n[        [        U5      5       H  nUR                  XL   U   5        M     XU
'   M@     UR                  U5      $ ! [         a    [        R                  " S5        e f = f)z/Return a pandas DataFrame version of the table.r   Nz!Package 'pandas' is not installedr  r  r
   )pandasModuleNotFoundErrorr(   messager  r  r  rO   rP   r  r  rM   	DataFrame)r   rY  pdpd_dictr  hdrr  hdr_lenrZ   r  r=   r  r^   s                r   	to_pandasTable.to_pandasg  s6   	
 ,,.kk!!e*wA8D4 9   c#e*o%7^xS9$"#AdV}EH $
 ||abkGwA(CE3w<(WZ]+ ) CL   ||G$$= # 	OO?@	s   D! !"Ec                 \  ^( U R                   m(UnU(4S jnU(4S jn U R                  S   nUR                  n[        R                  " UR
                  5      n[        XvU R                  5       S   S5      n[        U R                  5      S:  a  U$ [        U5      S:  a  U$ U R                  S   n	[        S U	R                   5       5      (       a  U$ U" U5      n
U
(       a  U" U	R
                  5      (       d  U$ U" U 5      (       a  U$ USS	  Vs/ s H  ob  US   OSPM     nnU7nSUl        UR                  Ul        T(R                  S
U[        R                  S9S   n[        U VVVs/ s H[  nUS     HN  nUS     HA  n[         R#                  US   5      (       a  M"  US   [        R$                  -  (       a  M?  UPMC     MP     M]     snnnS SS9n/ n/ n/ n['        [        U5      5       GH  nUU   nUS   S   nUUS   S   -
  nUS   [        R(                  -  nUS:X  a5  UR+                  U5        UR+                  U5        UR+                  U5        Mm  US	   nUS	   nUS	   nU(       a	  U(       d    OUU-
  U::  d  [-        UU-
  US   S   -
  5      U::  a+  US   S   UU-
  US   S   U4US'   UUU'   U(       a  UUS	'   M  UU-
  SU-  :  a    O8UR+                  U5        UR+                  U5        UR+                  U5        GM     U/ :X  a  U$ USS nUR                  US   -
  US   :  a  U$ U
(       a  US   (       d  U$ U/ :X  a  U$ [        R.                  " 5       nU Vs/ s H  nUS   S   US	   :  d  M  UPM     sn H  nUUS   -  nM     UR0                  (       d  UnUR                  Ul        T(R                  SUS9 Vs/ s H  n[        R                  " USS 5      PM     nn[        [3        [5        U V s/ s H  n U S   PM
     sn 5      5      SS9n!/ nU! Hb  n"U V#V s/ s H7  n#U#c  M  U  H(  n U S   U":X  d  M  U S   U#:  d  M  U S   U#:  d  M$  U#U 4PM*     M9     n$n#n U$/ :X  a  UR+                  U"5        Mb    O   U/ :X  a  U$ U7n%US	   U%l        U Vs/ s H(  nUb   US   U%R                  US   U%R                  4OSPM*     n&nU R
                  S   U%l        U R
                  S   U%l        U& Vs/ s HG  nUb?  T(R;                  U5      R=                  SS5      R=                  SS5      R?                  5       OSPMI     n'n[        [A        U%5      U&U'S5      $ ! [         a     gf = fs  snf s  snnnf s  snf s  snf s  sn f s  sn n#f s  snf s  snf ) a  Identify the table header.

*** PyMuPDF extension. ***

Starting from the first line above the table upwards, check if it
qualifies to be part of the table header.

Criteria include:
* A one-line table never has an extra header.
* Column borders must not intersect any word. If this happens, all
  text of this line and above of it is ignored.
* No excess inter-line distance: If a line further up has a distance
  of more than 1.5 times of its font size, it will be ignored and
  all lines above of it.
* Must have same text properties.
* Starting with the top table line, a bold text property cannot change
  back to non-bold.

If not all criteria are met (or there is no text above the table),
the first table row is assumed to be the header.
c                 *  > [         R                  " U R                  S   R                  5      nUSUR                  * SUR                  * 4-   nTR                  US9R                  5       S   nTR                  US9R                  5       S   nX4:w  a  gg)z
Compare top row background color with color of same-sized bbox
above. If different, return True indicating that the original
table top row is already the header.
r   clipr
   TF)r(   rK   r  r    r  
get_pixmapcolor_topusage)r   bbox0bboxt
top_color0
top_colortrl  s        r   top_row_bg_color+Table._get_header.<locals>.top_row_bg_color  s     LL1!2!23EQq5<<-@@Ee4CCEaHJe4CCEaHJ'r   c                    > TR                  S[        R                  U S9S   nU VVVs/ s H  o"S     H  o3S     H  oDPM     M     M     nnnn[        S U 5       5      $ s  snnnf )a,  Check if a row contains some bold text.

If e.g. true for the top row, then it will be used as (internal)
column header row if any of the following is true:
* the previous (above) text line has no bold span
* the second table row text has no bold span

Returns True if any spans are bold else False.
r   )rm   r  rc   rd   rh   c              3   L   #    U  H  oS    [         R                  -  v   M     g7f)rm   N)r(   TEXT_FONT_BOLD)r   ss     r   r"   :Table._get_header.<locals>.row_has_bold.<locals>.<genexpr>  s     JEqzG$:$::Es   "$)get_textr(   TEXTFLAGS_TEXTr$   )r    rc   r  lr  rh   rl  s         r   row_has_bold'Table._get_header.<locals>.row_has_bold  sh     ]]61G1Gd]SF !'O1'
1gJqQJQ
QEOJEJJJ Ps   #A#r   NFr   r
   c              3   (   #    U  H  oS L v   M
     g 7fr   r6   rb  s     r   r"   $Table._get_header.<locals>.<genexpr>  s     -*QDy*s   rA   r   r  rm   rc   rd   rh   r{   rm   c                     U S   S   $ )Nr    r	   r6   r  s    r   r9   #Table._get_header.<locals>.<lambda>  s    !F)A,r   T)r=   reverser    r	   g      ?   rT  r  r  )r  rf   rk   z  rb   )!rl  r  rS   r(   rK   r    
IndexErrorr~  r  rO   allr   r   r  r  rL   rr   ri  TEXT_FONT_SUPERSCRIPTrP   r  rM   rq   rg  rQ   r   r  rC   rG   rj  r  ry   rR   ))r   r   y_deltar  r  r  rS   r    header_top_rowrow2top_row_boldrj   col_xr  rc   r  r  r  rh   selectline_heights
line_boldsrZ   r   r  boldr   h0bold0ncliprg  r%   r   	word_topsr   r8   intersectinghdr_bbox	hdr_cells	hdr_namesrl  s)                                           @r   r  Table._get_header  s*   , yy		K"	))A,CIIE<<)D
 %T$,,.2CUK tyy>A!! u:>!! yy|-$**---!! $D) TYY 7 7!!D!! "! ;@*E*Q1D0*E u''vD8N8NO
   	A7A7A ++AfI6  zG$A$AA  $ # 	 '
 
 s5z"AaA61BQvYq\!AW: 6 66D Avb!##A&!!$' Bb!BrNET Bw'!S"r'QvYq\)A%Bg%MvYq\27AfIaL"E&	a%)JrNb38#MM""d#A #D R<!! 77VAY,q/1!! 
1!!B;!! ""$"AUailfRj&@!UAAQvYE B~~D'' 48==t=3TU3Tagll1Ra5)3T
U4:$>:aQqT:$> ?@$O	 C A  $AQ43;	  $%Q4!8	  12!q	 A $    r!c"  R<!!5Rj 
 78mQqT8;;!hkk2M 	 
 iiliil 
  =   #++D#6>>tSIOOQ  	 
 5?Iy$GGO  		B F	N B V$>"

sl   ;U3 V/3V&VV2V	V#VV8VVVV*V//V$AV)3
V ?V )rS   r  rl  N)FT)r	   )r   r   r   r   r   propertyr    r   r  r   r  r  r  r  r  r  r   r6   r   r   r  r    s    )
 
 
 d   3   63 6 6$4 $LDL"%HcHr   r  c                   "   \ rS rSr% Sr\\S'   Sr\\S'   Sr\	\S'   Sr
\	\S'   \r\\S'   \r\\S	'   \r\\S
'   \r\\S'   \r\\S'   \r\\S'   Sr\\S'   \r\\S'   \r\\S'   Sr\\S'   \r\\S'   \r\\S'   Sr\\S'   SS jr\ SS j5       r!Sr"g)TableSettingsiq  rd   vertical_strategyhorizontal_strategyNexplicit_vertical_linesexplicit_horizontal_linesr   r   r   r   r   r   r	   r   r   r   r   r   r   text_settingsc           	      H   [          H*  n[        X5      =(       d    SS:  d  M  [        SU S35      e   S H?  n[        XS-   5      nU[        ;  d  M  [        U SSR	                  [        5       S35      e   U R
                  c  0 U l        S	 H=  nX@R
                  ;  d  M  U R
                  R                  S
S5      U R
                  U'   M?     S
U R
                  ;   a  U R
                  S
	 S H.  u  pE[        X5      [        L d  M  [        X[        X5      5        M0     U $ )ao  Clean up user-provided table settings.

Validates that the table settings provided consists of acceptable values and
returns a cleaned up version. The cleaned up version fills out the missing
values with the default values in the provided settings.

TODO: Can be further used to validate that the values are of the correct
    type. For example, raising a value error when a non-boolean input is
    provided for the key ``keep_blank_chars``.

:param table_settings: User-provided table settings.
:returns: A cleaned up version of the user-provided table settings.
:raises ValueError: When an unrecognised key is provided.
r   zTable setting 'z' cannot be negative)r   vertical	_strategyz_strategy must be one of{,})r
  r   rr  r	   ))r   r   )r   r   )r   r   )r   r   )r   r   )r   r   )	NON_NEGATIVE_SETTINGSgetattrr   TABLE_STRATEGIESr   r  rI   UNSETsetattr)r   settingr  strategyr  fallbacks         r   __post_init__TableSettings.__post_init__  s)     -G&+!q0 ?7);O!PQQ - 6Kt;%>?H// "m $"234B8  6 %!#D 3D---+/+=+=+A+A+q+Q""4( 3 $,,,"";/
ND t"e+GD$;<
 r   c                     Uc  U " 5       $ [        X5      (       a  U$ [        U[        5      (       a?  0 n0 nUR                  5        H  u  pEUS S S:X  a	  XSUSS  '   M  XRU'   M     X2S'   U " S0 UD6$ [        SU 35      e)Nr  text_r  zCannot resolve settings: r6   )r   r   r  r   )clssettingscore_settingsr  r^  r  s         r   resolveTableSettings.resolve  s    5L&&O$''MM (Ra5G#+,!AB%('(!$	 )
 .;/*'''8
CDDr   )r  )r   r  r   )#r   r   r   r   r  str__annotations__r  r  r   r  DEFAULT_SNAP_TOLERANCEr   r   r  r   r   DEFAULT_JOIN_TOLERANCEr   r   r   r   DEFAULT_MIN_WORDS_VERTICALr   DEFAULT_MIN_WORDS_HORIZONTALr   r   r   r   r  r   r
  classmethodr  r   r6   r   r   r  r  q  s    $s$&&$(T(&*t*2NE2#e##e#2NE2#e##e#OU ::">%>$%E%&+e+&+e+M43j E Er   r  c                   6    \ rS rSrSrS	S jrS\4S jrS rSr	g)
TableFinderi  a  
Given a PDF page, find plausible table structures.

Largely borrowed from Anssi Nurminen's master's thesis:
http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3

... and inspired by Tabula:
https://github.com/tabulapdf/tabula-extractor/issues/16
Nc                    [         R                  " U5      U l        [        R	                  U5      U l        U R                  5       U l        [        U R                  U R
                  R                  U R
                  R                  5      U l        [        U R                  5      U l        [        U R                  U R                  5       Vs/ s H  n[        U R                  U5      PM     snU l        g s  snf r   )weakrefproxyrl  r  r  r  	get_edgesr  r<  r   r   r8  rW  rS   ru  r  ro  )r   rl  r  
cell_groups       r   r   TableFinder.__init__  s    MM$'	%--h7^^%
3JJMM22MM22

 ,D,>,>?
 .diiD
D
 $))Z(D
 
s    C4r   c           
         U R                   nS HL  n[        XS-   5      nUS:X  d  M  [        USU-   S-   5      n[        U5      S:  d  M<  [        SU SU S	35      e   UR                  nUR
                  nUS
:X  d  US
:X  a$  [        [        40 UR                  =(       d    0 D6nO/ n/ nUR                  =(       d    /  H  n	[        U	[        5      (       a0  [        U	5       H  n
U
S   S:X  d  M  UR                  U
5        M!     MH  UR                  U	U	U R                  R                  S   U R                  R                  S   U R                  R                  S   U R                  R                  S   -
  SS.5        M     US:X  a  [!        ["        S5      nO;US:X  a  [!        ["        SSS9nO%US
:X  a  [%        XqR&                  S9nOUS:X  a  / nO/ nX-   n/ nUR(                  =(       d    /  H  n	[        U	[        5      (       a0  [        U	5       H  n
U
S   S:X  d  M  UR                  U
5        M!     MH  UR                  U R                  R                  S   U R                  R                  S   U R                  R                  S   U R                  R                  S   -
  U	U	SS.5        M     US:X  a  [!        ["        S5      nO;US:X  a  [!        ["        SSS9nO%US
:X  a  [+        XqR,                  S9nOUS:X  a  / nO/ nX-   n[/        U5      [/        U5      -   n[1        UUR2                  UR4                  UR6                  UR8                  S9n[!        UUR:                  S9$ )N)r  r   r  r   	explicit__linesr   zIf z"_strategy == 'explicit', explicit_zD_lines must be specified as a list/tuple of two or more floats/ints.r{   r  r  r
   r	   r(  rd   r   r}   )r  )r   r  r   )rC   rG   r  r   r   r  )r   r   r   r   )r  )r  r  rO   r   r  r  rU  r  r  r  r   r   r  rM   rl  r.   r  EDGESr3  r   r  r  r   r   r  r   r   r   r   r   )r   r  r  r  rd   v_strath_stratrT  
v_explicitdescr  v_baser  
h_explicith_baser  r  s                    r   r  TableFinder.get_edges  s<   ==5Kx{)BCH:%+*Ch*NOu:>$k] +$$/= 1'(  6 ,,..f6 1!%JH,B,B,HbJEE
44::D$%%%d+A'3."))!, , !!""#yy~~a0"&)).."3"&)).."3diinnQ6G"G'*	 ;" g!%-F&!%?F%e<W<WXF
"FF
66<"<D$%%%d+A'3."))!, , !!"iinnQ/"iinnQ/!%!2TYY^^A5F!F#"&'*	 =" g!%-F&!%?F%&C&CF 
"FFQ$q'!%66%66%66%66
 Eh.F.FGGr   c                     [        U R                  5      nX:  a  [        S5      eUS:  a  X-  nUS:  a  M  U R                  U   $ )Nztable not on pager   )rO   ro  r  )r   rZ   tcounts      r   __getitem__TableFinder.__getitem__K  sF    T[[!;011!eKA !e{{1~r   )rS   r  r8  rl  r  ro  r   )
r   r   r   r   r   r   r   r  r0  r   r6   r   r   r  r    s"    
aH4 aHFr   r  c                 r   U R                   S-   nU R                  R                  nU R                  nU R	                  U[
        S9qU R                  S[        S9S   nX0R                   -  nU GHF  nUS    GH8  nUS   n	[        U	S   S	5      [        U	S   S	5      4n	[        R                  " U	S   U	S   * U	S   U	S   SS5      n
U	S   S:X  a  S
nOSn[        US   S S9 GH  nUS   nUS   n[        R                  " US   5      n[        US   S S9 GH  n[        R                  " US   5      nUU-  n[        R                  " US   5      U-  nUR                  U
l        UR"                  U
l        US   n0 SU(       a  UR&                  UR(                  -
  OUR*                  UR,                  -
  _SUR*                  _SUR,                  U-   _SU_SUR*                  UR,                  -
  _S[/        U
5      _SS_SU_S S!_S"S#_S$U_SU(       a  UOUR*                  UR,                  -
  _S%U_S&S!_S'U_S(UR,                  _S)U_UR&                  UR(                  -
  UR(                  UR&                  UR,                  UR*                  S*.En[0        R3                  U5        GM     GM     GM;     GMI     g!)+z(Extract text as "rawdict" to fill CHARS.r
   r  rawdictr_  rc   rd   rg   r   r  TFrh   c                     U S   S   $ Nr    r   r6   r  s    r   r9   make_chars.<locals>.<lambda>{  s    AfIaLr   r<   fontrx  colorri   c                     U S   S   $ r5  r6   )rj   s    r   r9   r6    s    &	!r   r    originrj   advr   r   rw  r  r  ncs	DeviceRGBnon_stroking_colornon_stroking_patternNr  r   page_numberstroking_colorstroking_patternr{   r   r  )r  rC   rG   r   r   )numberr.   r  transformation_matrixget_textpageFLAGSrk  r  rN   r(   MatrixrL   sRGB_to_pdfrK   Pointr8   r  r!   frG   rC   r   r   rR   r  rM   )rl  r  r@  page_heightctmrc   doctop_baserT   r}   ldirr  r  r   rw  fontsizer8  r   r    bbox_ctmr:  r{   	char_dicts                         r   
make_charsrR  i  s    ++/K))""K

$
$C  d% 8H]]9x]8BF+K'ND;D$q'1%uT!Wa'89D^^DGd1gXtAwQANFAw!|tG}2HI<<++DM:"4=6LMD"<<V5D#czH$]]4>:S@F%xxFH%xxFH9D!Gtww0477AR! $''! !$''K"7! #H	!
 !$''DGG"3! !%-! {! -e! /! &v! &{! G4779J! )%! +D! !  tww!!" "7#!$ "&477!2"gg"gg&kk&kk-!I0 LL+? N	 J # r   c                   ^ ^^^ ^!^"^#^$^% UR                   m$UR                  m%UR                  nUR                  S:H  =(       d    UR                  S:H  m!T R
                  R                  m"T R                  T"-  mT R                  S-   m#T R
                  nT R                  S;   a&  UR                  u  p[        R                  " SSX5      nUb  [        R                  " U5      nOUnU$U%4S jmSUU!U U$U%4S jjn
U
" US9u  pU$U%4S jm UU U"U#4S	 jnU GH.  nUS
   nUS   (       a8  US   S   S:X  a,  US   S   S:X  a   UR                  SUS   S   US   S   45        U GH  nUS   S;  a  M  US   S:X  a<  USS  u  nnU" UUUU5      nU(       a   [        R                  [        U5      5        MR  MT  US   S:X  Ga  US   R                  5       nUR                   U::  a  UR                   UR                  :  a  [#        UR$                  UR&                  -   5      S-  n[        R(                  " UUR*                  5      n[        R(                  " UUR,                  5      nU" UUUU5      nU(       a  [        R                  [        U5      5        GM5  UR                  U::  a  UR                  UR                   :  a  [#        UR,                  UR*                  -   5      S-  n[        R(                  " UR&                  U5      n[        R(                  " UR$                  U5      nU" UUUU5      nU(       a  [        R                  [        U5      5        GM  U" UUR.                  UR0                  U5      nU(       a  [        R                  [        U5      5        U" UUR0                  UR                  U5      nU(       a  [        R                  [        U5      5        U" UUR                  UR2                  U5      nU(       a  [        R                  [        U5      5        U" UUR2                  UR.                  U5      nU(       a!  [        R                  [        U5      5        GM  GM  US   u  nnnnU" UUUU5      nU(       a  [        R                  [        U5      5        U" UUUU5      nU(       a  [        R                  [        U5      5        U" UUUU5      nU(       a  [        R                  [        U5      5        U" UUUU5      nU(       d  GM  [        R                  [        U5      5        GM     GM1     SS SS.nU GH  nU" UUR.                  UR2                  U5      nU(       a  [        R                  [        U5      5        U" UUR0                  UR                  U5      nU(       a  [        R                  [        U5      5        U" UUR.                  UR0                  U5      nU(       a  [        R                  [        U5      5        U" UUR2                  UR                  U5      nU(       d  M  [        R                  [        U5      5        GM     Ub  [5        U[6        [8        45      (       d   eO/ nU Hd  u  nn[        R(                  " U5      n[        R(                  " U5      nU" UUUU5      nU(       d  MF  [        R                  [        U5      5        Mf     Ub  [5        U[6        [8        45      (       d   eO/ nU GH-  n[        R                  " U5      nU" UUR.                  UR0                  U5      nU(       a  [        R                  [        U5      5        U" UUR0                  UR                  U5      nU(       a  [        R                  [        U5      5        U" UUR                  UR2                  U5      nU(       a  [        R                  [        U5      5        U" UUR2                  UR.                  U5      nU(       d  GM  [        R                  [        U5      5        GM0     g )Nr   r
   r  r  r   c                 F  > UR                   T-
  U R                   s=::  a  UR                  T-   ::  d6  O  UR                   T-
  U R                  s=::  a  UR                  T-   ::  al  O  OiUR                  T-
  U R                  s=::  a  UR                  T-   ::  d7  O  UR                  T-
  U R                  s=::  a  UR                  T-   ::  a   g  OgU R                   T-
  UR                   s=::  a  U R                  T-   ::  d6  O  U R                   T-
  UR                  s=::  a  U R                  T-   ::  al  O  gU R                  T-
  UR                  s=::  a  U R                  T-   ::  d7  O  U R                  T-
  UR                  s=::  a  U R                  T-   ::  a   g  ggg)a  Detect whether r1, r2 are neighbors.

Defined as:
The minimum distance between points of r1 and points of r2 is not
larger than some delta.

This check supports empty rect-likes and thus also lines.

Note:
This type of check is MUCH faster than native Rect containment checks.
TF)rC   rG   r   r   )r   r   snap_xsnap_ys     r   are_neighbors!make_edges.<locals>.are_neighbors  s6    EEFNbee5ruuv~5uuv~8"%%&.8EEFNbee5ruuv~5uuv~8"%%&.8 9 EEFNbee5ruuv~5uuv~8"%%&.8 	 EEFNbee5ruuv~5uuv~8"%%&.8 9  r   c           	        > U c  TR                  5       nOU SS n/ nU HL  nT
(       a1  US   S:X  a(  US   R                  T:  a  US   R                  T:  a  M;  UR                  U5        MN     [	        [        U Vs/ s H  o3S   PM	     sn5      S S9n/ nU(       a  US   nSnU(       ac  S	n[        [        U5      S
-
  SS5       H;  nT	" XdU   5      (       d  M  XdU   R                  -  nXdU   R                  -  nXH	 SnM=     U(       a  Mc  [        R                  TR                  U[        S95      (       d  UR                  U5        US	 U(       a  M  XR4$ s  snf )z:Detect and join rectangles of "connected" vector graphics.Nr2   rJ  r.   c                 2    U R                   U R                  4$ r   )r   rC   rD   s    r   r9   4make_edges.<locals>.clean_graphics.<locals>.<lambda>  s    add|r   r<   r   TFr
   rA   r_  )get_drawingsr  r  rM   rL   r  rP   rO   tlbrrr   ri  rj  rk  )npathsallpathspathspprects	new_rectsprect0repeatrZ   rX  r   rl  rV  rW  s            r   clean_graphics"make_edges.<locals>.clean_graphics  s^   >((*HayHA fI$fIOOf,fI$$v-LLO  61vY67=ST	 AYFFs6{Q26A$VAY77),,.),,."I!% 7 &  **4+;+;FX+;+VWW  (q	! f$ 3 7s   =E)r`  c                    > [        U R                  UR                  -
  5      T::  d&  [        U R                  UR                  -
  5      T::  a  gg)z'Check if line is roughly axis-parallel.TF)rq   r8   r!   )r  rD  rV  rW  s     r   is_parallelmake_edges.<locals>.is_parallel	  s:    rttbdd{v%RTTBDD[)9V)Cr   c                 d  > T" X5      (       d  0 $ [        UR                  UR                  5      n[        UR                  UR                  5      n[        UR                  UR                  5      n[        UR                  UR                  5      nXCR                  :  d-  XSR
                  :  d  XcR                  :  d  XsR                  :  a  0 $ XCR
                  :  a  UR
                  nXSR                  :  a  UR                  nXcR                  :  a  UR                  nXsR                  :  a  UR                  nXT-
  nXv-
  n	Xs=:X  a  S:X  a   0 $   0 SU_STU-
  _SU_STU-
  _SU_SU	_SXF4XW4/_S	U S   _S
S_SS_SS_SU S   (       a  U S   OU S   _SS_SS_ST_SS_SS_UUUT-   S.En
U
$ )z;Given 2 points, make a line dictionary for table detection.r   rC   r   rG   r   r  r  r  	linewidthstrokeTfillFevenoddrA  r8  r>  Nr  r}   r@  rB  r?  )r   r   r   )r   r8   r   r!   rG   rC   r   r   )rc  r  rD  r  rC   rG   r   r   r  r  	line_dictdoctop_basisrk  rK  r@  s              r   	make_linemake_edges.<locals>.make_line	  s   2""Irtt_rtt_rtt_rtt_ <2<2<2<I<B<B<B<BaI  
"
+"
 "
 +"	

 U
 f
 RHrh'
 7
 d
 E
 u
 AgJajAfI
 !$
 6
 ;
  !
" #D#
$ <')
	, r   r  	closePathr  rA   r   )r  requrw  )r   r   r   )r8  rp  r  r   )r   r   r   r  r  r.   r  rC  r  r_  r(   rK   rM   r%  r  	normalizer  rq   rG   rC   rI  r   r   r^  bltrr   rR   r   )&rl  r  tsetrb  	add_lines	add_boxesr  prectrg  r  rh  r  rt  rc  r  rZ   r  rD  rr  r.   r8   r!   ulurlllrpathr    boxr   rX  rs  rk  r   rK  r@  rV  rW  s&   `                             @@@@@@@@r   
make_edgesr    sd   ""F""F%%J.0 	6##~5  ))""K;;,L++/KIIE}}	!xxQ1(||D!@-  - ^ #%0MF4 4l '
 [>eAhqkS0U2Yq\S5HLL#uRy|U1Xa[9:At,,ts{12B%aR6	LLi!89  1 t~~' JJ*,dkk1IDGGdgg-.2A q$''2B q$''2B )!RT :I \)%<= KK:-$++

2JDGGdgg-.2A tww2B tww2B )!RT :I \)%<=%a$''4@	LLi!89%a$''4@	LLi!89%a$''4@	LLi!89%a$''4@	LLi!89 
 "#1BB%aR6	LLi!89%aR6	LLi!89%aR6	LLi!89%aR6	9LLi!89O  ` q9DdDGGTWWd;	LLi01dDGGTWWd;	LLi01dDGGTWWd;	LLi01dDGGTWWd;	9LLi01 " )eT]3333	B]]2]]2dBD1	9LLi01  )eT]3333	LLdADD!$$5	LLi01dADD!$$5	LLi01dADD!$$5	LLi01dADD!$$5	9LLi01 r   c           	      B   U R                   nU R                  nU R                   nUS:X  aM  [        R                  " SSSSUR                  UR
                  -
  UR                  -
  UR                  -
  S5      nOUS:X  aM  [        R                  " SSSSSUR
                  UR                  -
  UR                  -
  UR                  -
  5      nO5[        R                  " SSSSSUR                  -  SUR                  -  5      nX@R                  -  nS[        U5      -  n[        R                  R                  XS5      nUS;   a.  Uu  ppXl        Xl        Xl        Xl        U R                  U5        U R                  S5        U R                  nU R                  nX   n XX!4$ )zNullify page rotation.

To correctly detect tables, page rotation must be zero.
This function performs the necessary adjustments and returns information
for reverting this changes.
r  r
   r   r  s   %g %g %g %g %g %g cm rT  )mediaboxr  r(   rG  r   rG   rC   r   derotation_matrixrR   TOOLS_insert_contentsset_mediaboxset_rotationparentrC  )rl  r  rotmbmat0matcmdxrefrC   r   rG   r   docpnos                 r   page_rotation_set0r  	  s^    }}H
--C	B
by~~aAq"%%"%%-"%%*?"%%*GK	~~aAq!RUURUU]RUU-BRUU-JK~~aAq"ruu*b255jA ''
'C
"U3Z
/C==))$Q7D i"a ++C
++C8Ds$$r   c                     U R                   nUR                  US5        U R                  U5        U R                  U5        U R                  nXE   n U $ )zLReset page rotation to original values.

To be used before we return tables.    )r  update_streamr  r  rC  )rl  r  r  r  r  r  s         r   page_rotation_resetr  	  sN     ++CdD!hc
++C8DKr   rd   r  r  vertical_lineshorizontal_linesr   r   r   r   r   r   r   r   r   r   r   r   c           	      :	  ^#^$ [         R                  " 5         / q/ q[	        [         R
                  R                  5       5      n[         R
                  R                  S5        U R                  S:w  a  [        U 5      u  n nnnOSu  nnnUc  [        nUc  [        nU
c  [        n
Uc  [        nUc  [        nUc  [        nUb  UnUn0 SU_SU_SU_SU_SU_S	U_S
U_SU	_SU
_SU_SU_SU_SU_SU_SU_SU_SU_UUS.En[         R
                  R                  5       n U R                  5         U R                  (       a^  [         R
                  R                  S5        U R                   Vs/ s H'  nUS   S:X  d  M  [         R                  " US S 5      PM)     n nO/ n U (       a  OiU R                  b\  [        U 5      n!U![         R
                  R                  U5        Ub  [        U UUU5      n [         R
                  R                  U5        $ [         R#                  US9n"U"U l        ['        XS9  [)        U UU"UUUS9  [        U U"S9n!U (       a?  U!R*                   V#^#s/ s H!  m#[-        U#4S jU  5       5      (       d  M  T#PM#     sn#U!l        U  V$^$s/ s H+  m$[/        U$4S jU!R*                   5       5      (       d  M)  T$PM-     n%n$U%(       aN  [0        R3                  5        V&s/ s H  n&[         R                  " U&S S 5      PM     n'n&U R5                  [6        S9n(U% H5  n)[9        W(W'U)5      n*U!R*                  R;                  [=        U U*5      5        M7      [         R
                  R                  U5        Ub  [        U UUU5      n [         R
                  R                  U5        U!$ s  snf s  sn#f s  sn$f s  sn&f ! [>         a{  n+[         R@                  " S[C        U+5      -  5         S n+A+[         R
                  R                  U5        Ub  [        U UUU5      n [         R
                  R                  U5        g S n+A+ff = f! [         R
                  R                  U5        Ub  [        U UUU5      n [         R
                  R                  U5        f = f) NTr   )NNNr  r  r  r  r   r   r   r   r   r   r   r   r   r   r   r   text_tolerance)text_x_tolerancetext_y_tolerancerA   tabler  )r  r  )r  r|  rb  r}  r~  c              3   V   >#    U  H  n[        TR                  U5      S :  v   M      g7fg333333?Nr   r    )r   r   tabs     r   r"   find_tables.<locals>.<genexpr>t
  s!     ?AtCHHa(C/   &)c              3   V   >#    U  H  n[        TUR                  5      S :  v   M      g7fr  r  )r   r  r   s     r   r"   r  x
  s!     #RzDCHH$5$;zr  )rm   z#find_tables: exception occurred: %s)"r(   _warn_layout_oncer  r%  r   r  set_small_glyph_heightsr  r  r  unset_quad_corrections
get_layoutlayout_informationrK   r  r  r  r  table_settingsrR  r  ro  r$   r  rk  extractWORDSrE  TABLE_DETECTOR_FLAGSr`   rM   r  	Exceptionr  r  ),rl  r  r  r  r  r  r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r}  r~  rb  	old_smallold_xrefold_rotold_mediaboxr  old_quad_correctionsr  boxestbfr|  r  r   my_boxesrg  r%   tp2r.   rS   r  s,                                      ``       r   find_tablesr  
  s   6 EEW]]::<=IMM))$/}}0B40H-h*:''<    '#( '#( $&.2 	"> 	$%5	
 	. 	, 	, 	. 	, 	, 	? 	0 	 4 	!"8 	#$<  	#$<!" 	.#$ -,'H, #==??A9C""MM006-1-D-D-D"QXHX#QrU#-D  E E$$0 d#CL 	--i8&tXwMD,,-ABO $$h$7"4#	
 $. ::%C??? %CJ 
!#Rszz#R RAu 	 
 7?7L7L7NO7N!',,q!u-7NJO##*>#?CD(j$?EJJeD%01  	--i8&tXwMD,,-ABJo>
 P
  =AFG--i8&tXwMD,,-AB 	--i8&tXwMD,,-ABs|   AO -N->N-&O AO "N2N2
O (N7N7	O (#N<AO -O 
Q"Q-Q	 QQ	 	AR)F)r
   )NNr
   )r   )r
   r
   r   )NNNNN)lr   inspectrD  r  r  collections.abcr   dataclassesr   operatorr   r  r(   r   r%  r  rk  FZ_STEXT_BOLDrt   FZ_STEXT_STRIKEOUTrs   r  TEXT_COLLECT_STYLESTEXT_ACCURATE_BBOXESTEXT_MEDIABOX_CLIPrF  TEXT_SEGMENTTEXT_COLLECT_VECTORSr  r  
whitespacerr   r   r   r&   r0   r`   r   r   r   r  r  r  r  r  r  r  rW  r  r  r  r  r   r   r   r   r   r  rU  	signaturer  
parametersrU  r]  r\  rb  r  ro  ru  r  r  r  r  r  r  r  r   r  r   r  r  r  r  r  r  r  r  r  r   r   r  r  r3  r<  r>  rW  ru  rw  r|  r~  r  r  r  rR  r  r  r  r  r6   r   r   <module>r     s%  HT     $ !     	
	))!!" ""#   	!  ""# ""#   	!  6$$%	9At A6reZ	 	   A 1         udH5 		 4 	 # #L^! ^!BT Tn8 8D 8 ""7#5#56AAFFH))-8CCHHJ D w  3 F " 	, ,d ,.%&4 %&PT $H H 	%
 
%$T (1D 15d 5 @3 @,	4S 	4 	4 '&!$@C @6 8N  6BM$ M2 253 3O &C &R 3M :C :z4 2;(|OD Od
 
	y 	 VH VHr YE YE YExC CL*5,z_2D	)%X
 
$&!2""2"" :">$%&*&*
3H H 	H
 H H H H H H H H H H  H  "!H" $#H$ $%Hr   