
    9iz                     B   S SK r S SKrS SKrS SKrS SKJr  S SKJr  S SKJ	r	J
r
JrJrJr  S SKrS SKJr  S SKJrJrJr  SSKJrJrJr  S	\
\\\
   4   S
\
\\\
   4   4S jrS rS rS rS4S jrS r S r!S r"S r# " S S\5      r$ " S S\5      r%Sr&Sr'Sr(Sr)Sr*Sr+Sr,\RZ                  " \, S\, S 3\R\                  S!9r/S"\4S# jr0S$ r1S%\%4S& jr2S'\S
\4S( jr3S)\4S* jr4S"\S
\\S4   4S+ jr5 S5S"\S,\6S-\6S
\\\\\64   S4   4S. jjr7 S6S/\S0\6S1\6S,\6S
\4
S2 jjr8S3 r9g)7    N)Counter)deepcopy)AnyDictListTupleUnion)Image)	BaseModelcomputed_fieldmodel_validator   )calculate_bbox_areacalculate_overlap_ratio"calculate_projection_overlap_ratiolayout_det_resreturnc                    [        U 5      nUS    Vs/ s H  o"S   S:w  d  M  UPM     nn[        5       n[        [        U5      5       H  n[        US-   [        U5      5       H  nXT;   d  Xd;   a  M  [	        X5   S   X6   S   S5      nUS:  d  M.  [        X5   S   5      n[        X6   S   5      n	X5   S   S:X  d  X6   S   S:X  a  X5   S   X6   S   :w  a  Mv  X:  a  UR                  U5        M  UR                  U5        M     M     [        U5       V
Vs/ s H  u  pX;  d  M  UPM     snn
US'   U$ s  snf s  snn
f )	a  
Remove overlapping boxes from layout detection results based on a given overlap ratio.

Args:
    layout_det_res (Dict[str, List[Dict]]): Layout detection result dict containing a 'boxes' list.

Returns:
    Dict[str, List[Dict]]: Filtered dict with overlapping boxes removed.
boxeslabel	reference   
coordinatesmallgffffff?image)r   setrangelenr   r   add	enumerate)r   layout_det_res_filteredboxr   dropped_indexesijoverlap_ratio
box_area_i
box_area_jidxs              n/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/paddlex/inference/pipelines/paddleocr_vl/uilts.pyfilter_overlap_boxesr+   "   s`    '~6.w77w<;;V7 
  eO3u:q1uc%j)A#q';3&(>M s"0,1GH
0,1GH
HW%0EHW4E4Phw'58G+<<+#''*#''*! * & &e,(,0J,(G$ #"5.(s   D7D7D<(D<c                 p    [        U [        R                  5      (       a  U $ [        R                  " U 5      $ )z
Convert the input to a PIL Image.

Args:
    img (PIL.Image or numpy.ndarray): Input image.

Returns:
    PIL.Image: PIL Image object.
)
isinstancer
   	fromarrayimgs    r*   to_pil_imager1   L   s)     #u{{##
??3    c                 p    [        U [        R                  5      (       a  [        R                  " U 5      $ U $ )z
Convert the input to a numpy array.

Args:
    img (PIL.Image or numpy.ndarray): Input image.

Returns:
    numpy.ndarray: Numpy array image.
)r-   r
   nparrayr/   s    r*   to_np_arrayr6   [   s(     #u{{##xx}Jr2   c                     U  Vs/ s H  n[        U5      R                  PM     nnU  Vs/ s H  n[        U5      R                  PM     nn[        U5      n[	        U5      nXE4$ s  snf s  snf )z
Calculate width (max of all) and height (sum) for a vertical merge of images.

Args:
    images (List[PIL.Image or np.ndarray]): List of images.

Returns:
    Tuple[int, int]: (width, height) of merged image.
)r1   widthheightmaxsum)imagesr0   widthsheightswhs         r*   calc_merged_whrA   j   sc     288#l3%%F839:6C|C ''6G:FAGA4K	 9:s
   A$A)c                    U (       d  g[        U 5      S:X  a  [        U S   5      $ [        U[        5      (       a  U/[        U 5      S-
  -  n[        U5      [        U 5      S-
  :w  a  [	        S5      e[        U S   5      n[        S[        U 5      5       H  n[        X   5      nXS-
     n[        UR                  UR                  5      nUR                  UR                  -   n[        R                  " SXg4S5      nUS:X  a#  XbR                  -
  S-  n	XdR                  -
  S-  n
O&US	:X  a  XbR                  -
  n	XdR                  -
  n
OS=pUR                  X)S45        UR                  XJUR                  45        UnM     [        U5      $ )
a  
Merge images vertically with given alignment.

Args:
    images (List[PIL.Image or np.ndarray]): List of images to merge.
    aligns (str or List[str]): Alignment(s) for each merge step ('center', 'right', 'left').

Returns:
    np.ndarray: Merged image as numpy array.
Nr   r   z,The length of aligns must be len(images) - 1RGB   rE   rE   centerr   right)r   r6   r-   str
ValueErrorr1   r   r:   r8   r9   r
   newpaste)r<   alignsmergedr$   img2alignr?   r@   new_imgx1x2s              r*   merge_imagesrS   {   s`    
6{a6!9%%&#S[1_-
6{c&kAo%GHH&)$F1c&k"FI&1udjj)MMDKK'))EA6?;Hll"q(Bjj.Q&Bg\\!BZZBKBf1g&d/0! #" vr2   c                 n	  ^(^) / n0 n[        U 5       H&  u  pEUS   U;   a  XSU'   M  UR                  XE45        M(     / n/ n/ n/ n	S m(U(4S jn
S n[        U5       GH  u  nu  pEU(       d
  U/nU/n/ n	M  X,S-
     u  pUS   nUS   nUS   nUS   n[        UUS5      nUS:H  =(       ai    US	:H  =(       a]    UU:H  =(       aQ    US   US
   :  =(       a?    US   US   :  =(       a-    US   US
   -
  [        US
   US   -
  US
   US   -
  5      S-  :  nUS:  =(       a    US;   =(       a    UU:H  =(       a~    US   US   :  =(       al    [	        US   US   -
  5      [        US   US   -
  US   US   -
  5      S-  :  =(       a/    T(" US   US   5      T(" US
   US
   5      -  =(       a	    U" XMU 5      nU(       a  SnOU(       a
  U
" UU5      nOSnU(       d  U(       a6  UR                  U5        UR                  U5        U	R                  U5        GM  UR                  X45        U/nU/n/ n	GM     U(       a  UR                  X45        / nU H1  u  nn[        U5      [        U5      nnUR                  UUUU45        M3     / n[        5       m)SnU[        U 5      :  Ga
  SnU GH  u  nnnnUU:X  d  M  [        U)4S jU 5       5      (       d  M-  SnU Vs/ s H
  oU   S   PM     nnU(       a  UO/ n[        U5      u  n n!U S:w  a  U!U -  O
[        S5      n"U"S:  a[  [        U5       HK  u  n#n$U U$   R                  5       nU U$   S   US'   SUS'   UR                  U5        T)R                  U$5        MM     Ox[        UU5      n%[        U5       H]  u  n#n$U U$   R                  5       nU#S:X  a  U%OSUS'   U#S:X  a  UOSUS'   US   US'   UR                  U5        T)R                  U$5        M_     / n&[        US-   U5       H  n'U'U;   d  M  U&R                  U'5        M     U& H(  n'UR                  UU'   5        T)R                  U'5        M*     US-   n  O   U(       a  GM  XC;   a*  UT);  a$  UR                  X4   5        T)R                  U5        US-  nU[        U 5      :  a  GM
  U$ s  snf )a/  
Merge blocks based on alignment and overlap logic, except for those with labels in non_merge_labels.

Args:
    blocks (List[Dict]): List of block dicts.
    non_merge_labels (List[str]): Block labels that should not be merged.

Returns:
    List[Dict]: List of processed (and possibly merged) blocks.
r   c                 "    [        X-
  5      S:*  $ )N   abs)a1a2s     r*   
is_aligned merge_blocks.<locals>.is_aligned   s    27|q  r2   c                 Z   > T" U S   US   5      (       a  gT" U S   US   5      (       a  gg)Nr   leftr   rG   rF    )
block_bbox	prev_bboxr[   s     r*   get_alignment#merge_blocks.<locals>.get_alignment   s6    jmYq\22
1y|44r2   c                    X!   S   nX    S   n[        US   US   5      n[        US   US   5      n[        US   US   5      n[        US   US   5      nXVXx/n	[        U5       H$  u  pXU4;   a  M  US   n[        X5      S:  d  M$    g   g)Nr"   r   r   r      TF)minr:   r    r   )	block_idxprev_idxblocksra   r`   rQ   y1rR   y2min_boxr)   other_block
other_bboxs                r*   overlapwith_other_box+merge_blocks.<locals>.overlapwith_other_box   s    $U+	&u-
1z!}-1z!}-1z!}-1z!}-2" )& 1C(++$U+J&w;a? !2 r2   r   r"   
horizontalr   textr   re   g333333?)rr         ?rF   NFc              3   ,   >#    U  H	  oT;  v   M     g 7fNr_   ).0r$   used_indicess     r*   	<genexpr>merge_blocks.<locals>.<genexpr>  s     #Q=a\$9=s   Tr0   infmerge_alignsgroup_id)r    appendr   r:   rX   rf   r   r   allrA   floatcopyr   rS   r   )*ri   non_merge_labelsblocks_to_mergenon_merge_blocksr)   blockmerged_groupscurrent_groupcurrent_indicescurrent_alignsrb   ro   r$   rh   
prev_blockra   
prev_labelr`   block_labeliou_his_crossis_updown_align
align_modegroup_rangesgroup_indicesrL   startendresult_blocksgroup_foundimgsr{   r?   r@   aspect_ratior%   rg   
merged_imginsert_listn_idxr[   rw   s*                                           @@r*   merge_blocksr      sc    O'
>--$)S!""C<0	 ( MMON!  %_5<C"GM"eON.1u5u%	(
5\
Gn2:y,WQJ Tv%Tz)T 1	!,T 1	!,	T
 1	!,)A,1-z!}z!}/LMPSST 	 AI 
=x'
=z)
= 11-
= JqMIaL01)A,1-z!}z!}/LMPSST	
= :a=)A,7Z]IaL9:
= &cV< 	 !J&z9=JJ  '""3'!!*-  /!BC"GM"eONg 6h o>?L!.v']);sUC?@ "/ M5L
C
F
1=-E3ve|#Q=#Q Q Q"2?@-Qq	%(-@)/vR%d+1()Qq1uE%L1$(1-(@9 &y 1 6 6 8'-i'8'?e04n-%,,U3$((3 )A ".dL!AJ(1-(@9 &y 1 6 6 856!Vze@AQDn-,9!,<j)%,,U3$((3 )A !"519c2E 00#**51 3 )E!(()9%)@A $$U+ ) Ag? 2>@ "s,'>  !1!67S!qO F
P G As   R2c                   ^ SSK mSU4S jjnU Vs/ s H  n[        U5      PM     snu  pVpxXu-
  n	X-
  n
U R                  5       nTR                  XU4Xx4SSS9  TR                  nSnU" X,[        X5      SS	9u  pn[        S
[        R                  " X-  5      5      nXYU-
  S-  -   nXjU-   S-  -   nTR                  UUUU4UUSUTR                  S9  U$ s  snf )a$  
Fill a rectangular area in the image with a white background and write the given token string.

Args:
    image (np.ndarray): Image to paint on.
    box (tuple): (x1, y1, x2, y2) coordinates of rectangle.
    token_str (str): Token string to write.

Returns:
    np.ndarray: Modified image.
r   N?c                    > Su  pEUnXT-
  S:  a>  XE-   S-  nTR                  XUSS9u  u  pn
XU-  :  a  XU-  :  a  UnUnOUnXT-
  S:  a  M>  UWW	4$ )N)g?
   g{Gz?r   r   )	thickness)getTextSize)rr   fontFacesquare_size
fill_ratior^   rG   optimal_scalemidr?   r@   _cv2s              r*   get_optimal_font_scale+paint_token.<locals>.get_optimal_font_scaleN  s     lT!<1$CqIIFQA++*4L0L # lT! a""r2   rD   )colorr      )r   r   r   )r   r   r   )lineType)r   )r   intr   	rectangleFONT_HERSHEY_SIMPLEXrf   r:   mathfloorputTextLINE_AA)r   r"   	token_strr   vrQ   rj   rR   rk   box_wbox_hr0   fontthickness_scale_ratio
font_scaletext_wtext_hfont_thicknesstext_xtext_yr   s                       @r*   paint_tokenr   @  s    #  '**cc!fc*NBBGEGE
**,CMM#Bx"BMO ##D!7U*s"J DJJz'IJKN 6>a''F6>a''FKK	  	 J; +s   Cc                 2   S nSSK nUR                  S5        0 nUu  pgp/ n
U" [        U5      5      nUR                  U5        [	        U5       H  u  pUS   u  pnnX:  d  M  X:  d  M  UU::  d  M$  UU	::  d  M,  U
R                  U5        [        UU-
  UU-
  5      S:  a  MU  X-
  X-
  UU-
  UU-
  /nS[        X   5      -   S-   n[        U UU5      n S	US
    S3UU'   M     [	        U5       VVs/ s H  u  nnUU
;   d  M  US
   PM     nnnXU4$ s  snnf )a  
Replace figures in a table area with tokens, return new image and token map.

Args:
    table_block_img (np.ndarray): Table image.
    table_box (list): Table bounding box [x_min, y_min, x_max, y_max].
    figures (List[Dict]): List of figure dicts (must contain 'coordinate', 'path').

Returns:
    Tuple[np.ndarray, Dict[str, str], List[str]]:
        - New table image,
        - Token-to-img HTML map,
        - List of figure paths dropped.
c                     1 Skn/ nSn[        U5      U :  aC  [        [        U5      5      U-  (       d  UR                  U5        US-  n[        U5      U :  a  MC  U$ )N>   019r   r   )r   r   rH   r}   )numexclude_digitsseqr$   s       r*   gen_random_map0tokenize_figure_of_table.<locals>.gen_random_map  sT    (#hnAK.0

1FA #hn 
r2   r   Ni   r      [F]z
<img src="pathz" >)	randomseedr   shuffler    r}   rf   rH   r   )table_block_img	table_boxfiguresr   r   	token_maptable_x_mintable_y_mintable_x_maxtable_y_max
drop_idxes
random_map	figure_idfigurefigure_x_minfigure_y_minfigure_x_maxfigure_y_maxdraw_boxr   r$   fdrop_figuress                          r*   tokenize_figure_of_tabler   ~  sO     
KKI9B6KkJG-J
NN:&w/	AGAU>L,'+++i(<,.|0KLrQ**{*{*	H s:#899C?I)/8YOO%/v/?s#CIi ) 0* +4G*<P*<$!QZIAfI*<LP|33 Qs   2D	Dc                 B   ^ U4S jnSn[         R                  " X2U 5      $ )z
Replace tokens in a string with their HTML image equivalents.

Args:
    table_res_str (str): Table string with tokens.
    figure_token_map (dict): Mapping from tokens to HTML img tags.

Returns:
    str: Untokenized string.
c                 r   > U R                  S5      nSU S3nTR                  X R                  S5      5      $ )Nr   r   r   r   )groupget)matchtoken_idtokenfigure_token_maps      r*   repl(untokenize_figure_of_table.<locals>.repl  s7    ;;q>XJa ##E;;q>::r2   z
\[F(\d+)\])resub)table_res_strr   r   patterns    `  r*   untokenize_figure_of_tabler     s     ;
 G66'//r2   c                       \ rS rSr% SrSr\\S'   Sr\\S'   \\S'   \\S'   \\S'   \\S	'   \	\S
'   Sr
\\S'   Sr\\S'   Sr\\S'   \" SS9\S\S\4S j5       5       rSrg)	TableCelli  aK  
TableCell represents a single cell in a table.

Attributes:
    row_span (int): Number of rows spanned.
    col_span (int): Number of columns spanned.
    start_row_offset_idx (int): Start row index.
    end_row_offset_idx (int): End row index (exclusive).
    start_col_offset_idx (int): Start column index.
    end_col_offset_idx (int): End column index (exclusive).
    text (str): Cell text content.
    column_header (bool): Whether this cell is a column header.
    row_header (bool): Whether this cell is a row header.
    row_section (bool): Whether this cell is a row section.
r   row_spancol_spanstart_row_offset_idxend_row_offset_idxstart_col_offset_idxend_col_offset_idxrr   Fcolumn_header
row_headerrow_sectionbefore)modedatar   c                 
   [        U[        5      (       am  SU;   a  U$ US   R                  SS5      n[        U5      (       d<  UR	                  SS5      nU(       a  U H  nX$S   S-   -  nM     UR                  5       nX!S'   U$ )z
Create TableCell from dict, extracting 'text' property correctly.

Args:
    data (Any): Input data.

Returns:
    Any: TableCell-compatible dict.
rr   bboxr    text_cell_bboxesN )r-   r   r   r   popstrip)clsr  rr   
text_cellsels        r*   from_dict_formatTableCell.from_dict_format  s     dD!!~<##GR0Dt99!XX&8$?
(7c 11 )zz|Lr2   r_   N)__name__
__module____qualname____firstlineno____doc__r   r   __annotations__r   rH   r   boolr   r  r   classmethodr   r  __static_attributes__r_   r2   r*   r   r     s      HcHc
IM4JK(#C C   $r2   r   c                   x    \ rS rSr% Sr/ r\\   \S'   Sr	\
\S'   Sr\
\S'   \\S\\\      4S j5       5       rS	rg
)	TableDatai  z
TableData holds a table's cells, row and column counts, and provides a grid property.

Attributes:
    table_cells (List[TableCell]): List of table cells.
    num_rows (int): Number of rows.
    num_cols (int): Number of columns.
table_cellsr   num_rowsnum_colsr   c                 D   [        U R                  5       VVs/ s H:  n[        U R                  5       Vs/ s H  n[        SUUS-   UUS-   S9PM     snPM<     nnnU R                   H  n[        [        UR                  U R                  5      [        UR                  U R                  5      5       HY  n[        [        UR                  U R                  5      [        UR                  U R                  5      5       H
  nXCU   U'   M     M[     M     U$ s  snf s  snnf )zn
Returns a 2D grid of TableCell objects for the table.

Returns:
    List[List[TableCell]]: Table as 2D grid.
r  r   )rr   r   r   r   r   )
r   r  r  r   r  rf   r   r   r   r   )selfr$   r%   
table_datacells        r*   gridTableData.grid  s   ( 4==)
 * t}}-	 .A )*'(1u)*'(1u .	 * 	 
 $$DD--t}}=D++T]]; 114==A//?A (,qM!$		 % -	
s   DDDDr_   N)r  r  r  r  r  r  r   r   r  r  r   r  r   propertyr#  r  r_   r2   r*   r  r    sU     $&Ki%HcHcd4	?+   r2   r  z<nl>z<fcel>z<ecel>z<lcel>z<ucel>z<xcel>z+(?:<fcel>|<ecel>|<nl>|<lcel>|<ucel>|<xcel>)z.*?(?=z|$))flagssc           	      $   SSR                  [        [        [        [        [
        [        /5      -   S-   n[        R                  " X5      n[        R                  " X5      nU Vs/ s H  oDR                  5       (       d  M  UPM     nnX#4$ s  snf )z
Extract OTSL tags and text parts from the input string.

Args:
    s (str): OTSL string.

Returns:
    Tuple[List[str], List[str]]: (tokens, text_parts)
(|))joinOTSL_NL	OTSL_FCEL	OTSL_ECEL	OTSL_LCEL	OTSL_UCEL	OTSL_XCELr   findallsplitr  )r'  r   tokens
text_partsr   s        r*   otsl_extract_tokens_and_textr7  C  sz     	
))WiIy)T
U	V
	 
 ZZ#F'%J%/AZE;;=%ZJA Bs   &BBc                   ^ [         m[        R                  " UU4S j5       VVs/ s H  u  p#U(       a  M  [        U5      PM     nnn/ nSnSnU(       Ga%  [	        S U 5       5      nU H:  n	[        U	5      U:  d  M  U	R                  [        5        [        U	5      U:  a  M&  M<     / n
SnU H  n	U	 H  nU
R                  U5        U[        U 5      :  d  M%  X   U:X  d  M/  US-  nU[        U 5      :  d  ME  X   [         [        [        [        [        [        4;  d  Mm  U
R                  X   5        US-  nM     U
R                  [         5        U[        U 5      :  d  M  X   [         :X  d  M  US-  nM     U
n S nS n[        U 5       GH=  u  nnSnU[        [        4;   a  SnSnSnU[        :w  a	  XS-      nSnUU-   [        U 5      :  a  XU-      OSnSnUS-   [        U5      :  a  U[        XFS-      5      :  a
  XFS-      U   nU[        [        4;   a  UU" XGS-   U[        [        /5      -  nU[        [        4;   a  UU" XGUS-   [        [        /5      -  nUR                  [        UR                  5       UUUUU-   UUU-   S	95        U[        [        [        [        [        4;   a  US-  nU[         :X  d  GM6  US-  nSnGM@     XT4$ s  snnf )
z
Parse OTSL text and tags into TableCell objects and tag structure.

Args:
    texts (List[str]): List of tokens and text.
    tokens (List[str]): List of OTSL tags.

Returns:
    Tuple[List[TableCell], List[List[str]]]: (table_cells, split_row_tokens)
c                    > U T:H  $ ru   r_   )z
split_words    r*   <lambda>"otsl_parse_texts.<locals>.<lambda>f  s	    Zr2   r   c              3   8   #    U  H  n[        U5      v   M     g 7fru   r   rv   rows     r*   rx   #otsl_parse_texts.<locals>.<genexpr>o  s     <+;Cs3xx+;   r   c                 x    SnUnX   U   U;   a*  US-  nUS-  nU[        X   5      :  a  U$ X   U   U;   a  M*  U$ Nr   r   r?  )r5  c_idxr_idxwhich_tokensspan
c_idx_iters         r*   count_right%otsl_parse_texts.<locals>.count_right  sY    
mJ'<7!OJAIDS//	 mJ'<7
 r2   c                 t    SnUnX   U   U;   a(  US-  nUS-  nU[        U 5      :  a  U$ X   U   U;   a  M(  U$ rE  r?  )r5  rF  rG  rH  rI  
r_idx_iters         r*   
count_down$otsl_parse_texts.<locals>.count_down  sX    
 '<7!OJAIDS[(	  '<7
 r2   r  r   )rr   r   r   r   r   r   r   )r-  	itertoolsgroupbylistr:   r   r}   r/  r.  r0  r1  r2  r    r   r  )textsr5  xysplit_row_tokensr  rG  rF  max_colsrA  	new_textstext_idxr   rK  rO  r$   rr   	cell_textr   r   right_offsetnext_right_cellnext_bottom_cellr;  s                          @r*   otsl_parse_textsr_  X  s    J %%f.GHHDA 	QH  
 KEE <+;<<#Cc(X%

9% c(X% $ 	#C  'c%j(U_-EMH#e*,!!!!!I 2 "((9 A  W%#e*$G)CA# $$  U#4	Iy))HHLy !a%L	  ,-|+;c%j+H,&'b   "qy3/003/	:;;'7	'B5'I$9i"88K$aiI8N   Iy#99J$UQYI8N  "*%%).',x'7).',x'7
 Iy)Y	JJQJE7?QJEEU $V ((Is
   KKr!  c           
      V   U R                   nU R                  n[        U R                  5      S:X  a  gSnU R                  n[        U5       H  nUS-  n[        U5       H  nXE   U   nUR                  UR                  pUR                  UR                  pX:w  d  X:w  a  MD  [        R                  " UR                  R                  5       5      nUR                  (       a  SOSnU nUS:  a	  USU S3-  nU
S:  a	  US	U
 S3-  nUS
U SU SU S3-  nM     US-  nM     SU S3nU$ )zv
Export TableData to HTML table.

Args:
    table_data (TableData): TableData object.

Returns:
    str: HTML string.
r   r  z<tr>thtdr   z
 rowspan=""z
 colspan="<>z</z</tr>z<table>z</table>)r  r  r   r  r#  r   r   r   r   r   htmlescaperr   r  r   )r!  nrowsncolsbodyr#  r$   r%   r"  rowspanrowstartcolspancolstartcontentcelltagopening_tags                  r*   export_to_htmlrr    s>    EE
:!!"a'D??D5\uA"gajD!%0I0IX!%0I0IX}kk$))//"34G"00ddG$IK{G9A66{G9A66a}AgYb	;;D  	! " TF(#DKr2   otsl_strc                   ^ [        U [        5      (       d   eU R                  5       n [        U ;  a	  U [        -   $ U R	                  [        5      n/ nU H  nU(       d  M  [
        R                  U5      nU(       d  M*  [        U5      nSn[        U5       H&  u  pxUR                  [        5      (       d  M!  US-   nM(     UR                  XEUS.5        M     U(       d  [        $ U(       a  [        S U 5       5      OSn	U(       a  [        S U 5       5      OSn
U	n[        X5      n[        S5      nUn[        XS-   5       H#  m[        U4S jU 5       5      nX:  d  M  UnTnM%     / nU HQ  nUS   n[        U5      nUU:  a  US	U nO[         /UU-
  -  nUU-   nUR                  S
R#                  U5      5        MS     [        R#                  U5      [        -   $ )z
Pad OTSL string to a square (rectangular) format, ensuring each row has equal number of cells.

Args:
    otsl_str (str): OTSL string.

Returns:
    str: Padded OTSL string.
r   r   )	raw_cells	total_lenmin_lenc              3   *   #    U  H	  oS    v   M     g7f)rw  Nr_   r@  s     r*   rx   %otsl_pad_to_sqr_v2.<locals>.<genexpr>  s     >Xcy>X   c              3   *   #    U  H	  oS    v   M     g7frv  Nr_   r@  s     r*   rx   ry    s     =HSK(Hrz  rz   c              3   F   >#    U  H  n[        US    T-
  5      v   M     g7fr|  rW   )rv   rA  r8   s     r*   rx   ry    s#      S(3S%5%=!>!>(s   !ru  Nr  )r-   rH   r  r-  r4  OTSL_FIND_PATTERNr3  r   r    
startswithr.  r}   r:   r   r   r;   r/  r,  )rs  linesrow_datalineru  rv  rw  r$   cell_strglobal_min_widthmax_total_lensearch_start
search_endmin_total_costoptimal_widthcurrent_total_costrepaired_linesrA  cellscurrent_len	new_cellspaddingr8   s                         @r*   otsl_pad_to_sqr_v2r    s    h$$$$~~Hh'!!NN7#EH%--d3		N	$Y/KA""9--a% 0 	#P	
  BJs>X>>PQAIC=H==qM#L%5J5\NM|!^4  S( SS./N!M	 5 NK %j&n}-I k][%@AGIbggi01  <<''11r2   otsl_contentc                     [        U 5      n [        U 5      u  p[        X!5      u  p4[        [	        U5      U(       a  [        S U 5       5      OSUS9n[        U5      $ )z
Convert OTSL-v1.0 string to HTML. Only 6 tags allowed: <fcel>, <ecel>, <nl>, <lcel>, <ucel>, <xcel>.

Args:
    otsl_content (str): OTSL string.

Returns:
    str: HTML table.
c              3   8   #    U  H  n[        U5      v   M     g 7fru   r?  r@  s     r*   rx   'convert_otsl_to_html.<locals>.<genexpr>9  s     ;*:3c#hh*:rC  r   )r  r  r  )r  r7  r_  r  r   r:   rr  )r  r5  mixed_textsr  rW  r!  s         r*   convert_otsl_to_htmlr  *  s_     &l3L6|DF$4[$I!K%&?O#;*:;;UVJ
 *%%r2   c                     [        U 5      n[        SUS-  S-   5       H!  nX-  S:X  d  M  U SU nX1U-  -  U :X  d  M  Us  $    g)z
Find the shortest substring that repeats to form the entire string.

Args:
    s (str): Input string.

Returns:
    str or None: Shortest repeating substring, or None if not found.
r   r   r   N)r   r   )r'  nr$   	substrings       r*   !find_shortest_repeating_substringr  ?  sT     	AA1a1fqj!5A:"1IF#q(  	 "
 r2   rw  min_repeatsc                 0   [        [        U 5      U-  US-
  S5       Hv  nX* S nU R                  XB-  5      (       d  M"  SnU nUR                  U5      (       a#  USU*  nUS-  nUR                  U5      (       a  M#  [        U 5      XS-  -
  nU SU XE4s  $    g)a  
Detect if string ends with a repeating phrase.

Args:
    s (str): Input string.
    min_len (int): Minimum length of unit.
    min_repeats (int): Minimum repeat count.

Returns:
    Tuple[str, str, int] or None: (prefix, unit, count) if found, else None.
r   r   Nr   )r   r   endswith)r'  rw  r  r$   unitcounttemp_sstart_indexs           r*   find_repeating_suffixr  R  s     3q6k*GaK<v::d())EF//$''!
 //$'' a&EI.K\k?D// = r2   ro  line_thresholdchar_thresholdc                    U R                  5       nU(       d  U $ SU;  aF  [        U5      S:  a7  [        USSS9nU(       a%  Uu  pgn[        U5      U-  [        U5      S-  :  a  U$ SU;  a?  [        U5      U:  a0  [        U5      nU(       a  [        U5      [        U5      -  nX:  a  U$ U R	                  S5       V	s/ s H)  oR                  5       (       d  M  U	R                  5       PM+     n
n	U
(       d  U $ [        U
5      nX:  a  U $ [        U
5      nUR                  S5      S   u  pX:  a
  X-  S	:  a  U$ U $ s  sn	f )
as  
Detect and truncate character-level, phrase-level, or line-level repetition in content.

Args:
    content (str): Input text.
    line_threshold (int): Min lines for line-level truncation.
    char_threshold (int): Min repeats for char-level truncation.
    min_len (int): Min length for char-level check.

Returns:
    Union[str, str]: (truncated_content, info_string)

d      rV   )rw  r  rs   r   r   g?)r  r   r  r  r4  r   most_common)ro  r  r  rw  stripped_contentsuffix_matchprefixrepeating_unitr  r  r  total_linesline_countsmost_common_lines                 r*   truncate_repetitive_contentr  m  sD    }} ##,<(=(C,-=qVWX,8)FE>"U*S1A-BS-HH ##,<(=(G:;KL()S-@@E&%% '.mmD&9J&9dZZ\\TZZ\&9EJe*K#%.K)55a8;E$7C#?N Ks   >D=D=c                 f   SS K n[        U R                  5      S:X  a  UR                  XR                  5      nOU R                  5       nUR                  [        R                  :w  a  UR                  [        R                  5      nUR                  5       nUR                  5       nX4:X  a  U $ X$-
  X4-
  -  S-  nUR                  [        R                  5      nUR                  USSUR                  5      u  pgUR                  U5      nUc  U $ UR                  U5      u  ppX
X-   2XU-   24   nU$ )Nr   re   rE      )r   r   shapecvtColorCOLOR_BGR2GRAYr   dtyper4   uint8astyper:   rf   	thresholdTHRESH_BINARY_INVfindNonZeroboundingRect)r0   r   graymax_valmin_valr  r   binarycoordsrU  rV  r?   r@   croppeds                 r*   crop_marginr    s   
399~||C!3!34xxzzzRXX{{288$hhjGhhjG
Nw01C7D;;rxx DdCc.C.CDIA__V$F~
!!&)JA!ae)QQY&'GNr2   )rF   )r  rV   )r   r   r   ):rf  rQ  r   r   collectionsr   r   r   typingr   r   r   r   r	   numpyr4   PILr
   pydanticr   r   r   layout_parsing.utilsr   r   r   rH   r+   r1   r6   rA   rS   r   r   r   r   r   r  r-  r.  r/  r0  r1  r2  NON_CAPTURING_TAG_GROUPcompileDOTALLr~  r7  r_  rr  r  r  r  r   r  r  r  r_   r2   r*   <module>r     s      	   0 0   ? ? '#d4j)'#	#tDz/'#T "%PZz;|84v0*3	 3l.	 .d 					G JJv&=%>cB")) 
C *p)f"y "J72 72 72t&s &* sDy1A ( 23
+.
5c3%&8 VX//"%/<?/OR//dr2   