
    9i0                     6    S SK r S rS rS rS rS rS rS rg)	    Nc                 $   X   S:w  a  gUS:  a  XS-
     OSnUS-   [        U 5      :  a  XS-      OSnUR                  5       (       d  UR                  5       (       a  gUR                  5       (       d  UR                  5       (       a  gUS;   a  gg)z@
Check if the given character is a sentence ending punctuation.
.Fr       )r    	
"'u   ”u   ’)u   】u   」u   》T)lenisdigitisalpha)textiprevnexts       s/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/paddlex/inference/pipelines/pp_doctranslation/utils.py_is_sentence_dotr      s~    
 w#~a%4A;RDa%#d)+4A;D||~~||~~VV    c                 >   [        U 5      S-  n/ SQn[        U[        U 5      5       H  nX   U;   ad  US-   nU[        U 5      :  a&  X   S;   a  US-  nU[        U 5      :  a
  X   S;   a  M  U[        U 5      :  a  [        U SU 5      U::  a  XE4s  $ Mm  Mo  X   S:X  d  My  [        X5      (       d  M  US-   nU[        U 5      :  a&  X   S;   a  US-  nU[        U 5      :  a
  X   S;   a  M  U[        U 5      :  d  M  [        U SU 5      U::  d  M  XE4s  $    [        USS5       H  nX   U;   aS  US-   nU[        U 5      :  a&  X   S;   a  US-  nU[        U 5      :  a
  X   S;   a  M  [        U SU 5      U::  a  XE4s  $ M^  X   S:X  d  Mh  [        X5      (       d  Mz  US-   nU[        U 5      :  a&  X   S;   a  US-  nU[        U 5      :  a
  X   S;   a  M  [        U SU 5      U::  d  M  XE4s  $    [        U[        U 5      5      [        U[        U 5      5      4$ )	z
Find the position to split the text into two chunks.

Args:
    text (str): The original text to be split.
    chunk_size (int): The maximum size of each chunk.

Returns:
    int: The index where the text should be split.
   )r	   u   。;u   ；!u   ！?u   ？r   z 	
Nr   r   )r   ranger   min)r   
chunk_sizecentersplit_charsr   js         r   _find_split_posr#   )   s    Y!^FCK 63t9%7k!AAc$i-DGw$6Q c$i-DGw$63t9}T"1X*!<t "=}W^ 0 9 9AAc$i-DGw$6Q c$i-DGw$63t9}T"1X*!<t &  61b!7k!AAc$i-DGw$6Q c$i-DGw$648}
*t +W^ 0 9 9AAc$i-DGw$6Q c$i-DGw$648}
*t " z3t9%s:s4y'AAAr   c                     U R                  5       n [        U 5      U::  a  U" U 5      $ [        X5      u  p4U SU nXS nXU nU(       a  [        XQU5      nU(       a  [        XaU5      n	WU-   W	-   $ )aB  
Split the text recursively and translate each chunk.

Args:
    text (str): The original text to be split.
    chunk_size (int): The maximum size of each chunk.
    translate_func (callable): A function that translates a single chunk of text.
    results (list): A list to store the translated chunks.

Returns:
    None
N)stripr   r#   split_text_recursive)
r   r   translate_func	split_posend_whitespaceleftright
whitespace	left_text
right_texts
             r   r&   r&   Z   s     ::<D
4yJd##$3D$E!	JY_%N3
,T~NI-ePJ:%
22r   c                    U R                  5       R                  S5      nUS   R                  S5      (       d  US   R                  S5      (       an  US   nUS   R                  S5      (       d  US   R                  S5      (       a  US   OSnU(       a  SR                  USS 5      OSR                  USS 5      nOSnSnU n[	        XqU5      nUR                  S5       V	s/ s HQ  n	U	R                  5       R                  S5      (       a  M)  U	R                  5       R                  S5      (       a  MO  U	PMS     n
n	SR                  U
5      nU(       a
  U SU SU 3OUnUR                  U5        gs  sn	f )	aW  
Translate a code block and append the result to the results list.

Args:
    code_block (str): The code block to be translated.
    chunk_size (int): The maximum size of each chunk.
    translate_func (callable): A function that translates a single chunk of text.
    results (list): A list to store the translated chunks.

Returns:
    None
r	   r   z```z~~~r   r   r   N)r%   split
startswithjoinr&   append)
code_blockr   r'   resultslinesheaderfootercode_contenttranslated_code_lineslinefiltered_code_linestranslated_coderesults                r   translate_code_blockr?   x   sj    $$T*EQx5!!U1X%8%8%?%?q b	$$U++uRy/C/CE/J/J "I 	
 28tyyq-TYYuQRy=Q!0. *//55D

''. 	26**,2I2I%2P 	5  
 ii 34O9?xr/*"VH5_FNN6s   %(E5"E57E5c                    SSK Jn  SSKnU R                  S5      S:  ab  U R                  S5      S:  aM  U R                  S5      U R                  S5      :X  a)  [	        U 5      U:  a  U" U 5      nUR                  U5        gU" U S5      n[        5       n/ n	/ n
UR                  SSS	9 H  nUR                  S
S/5      nU(       d  M  [        U5      U;  d  M0  UR                  5       R                  5       nU(       a"  U	R                  U5        U
R                  U5        UR                  [        U5      5        M     UnSnU[	        U	5      :  Ga)  / n/ nSnU[	        U	5      :  at  U[	        X   5      -   U::  a`  UR                  X   5        UR                  X   5        U[	        X   5      -  nUS-  nU[	        U	5      :  a  U[	        X   5      -   U::  a  M`  SnUR                  U5      nU" U5      nUR                  U5      n[        UU5       HR  u  nnUR!                  5         U" US5      nUR"                   H#  nUR                  UR%                  U5      5        M%     MT     U[	        U	5      :  a  GM)  / nUR                  SSS	9 HE  nUR                  S
S/5      (       a  M  UR                  5       (       d  M4  UR                  U5        MG     Sn[	        U5      nUU:  Gad  / n/ n[	        S5      nUU:  a  UU   R                  5       n[	        U5      U:  a'  ['        UX5      n UU   R)                  U 5        US-  nMO  SU S3n!U[	        U!5      -   U:  a  O@UR                  UU   5        UR                  U!5        U[	        U!5      -  nUS-  nUU:  a  M  U(       d(  UUS-
     R                  5       nSU S3n!UUS-
     /nU!/nU(       an  SSR                  U5      -   S-   nU" U5      nU" US5      n"U"R                  S5      n#[        UU#5       H%  u  n$n%U$R)                  U%R                  5       5        M'     UU:  a  GMd  UR                  [+        U5      5        g)aW  
Translate a HTML block and append the result to the results list.

Args:
    html_block (str): The HTML block to be translated.
    chunk_size (int): The maximum size of each chunk.
    translate_func (callable): A function that translates a single chunk of text.
    results (list): A list to store the translated chunks.

Returns:
    None
r   BeautifulSoupN<   >html.parserT)string	recursivetdthr   __TD__z	<ol></ol>z<li>z</li>z<ol>r   z</ol>li)bs4rB   copycountr   r3   setfind_allfind_parentiddecode_contentsr%   addr2   r0   zipclearcontentsdeepcopyr&   replace_withstr)&
html_blockr   r'   r5   rB   rN   
translatedsouptd_seentd_batch_nodestd_batch_textsnode	parent_tdtd_text
batch_sizer   batch_nodesbatch_textscurrent_lengthplaceholder
batch_texttranslated_batchtranslated_linestd_noder;   fragchild
text_nodesidxtotalli_texts	node_texttranslated_textli_str
trans_souptranslated_lis	orig_nodeli_tags&                                         r   translate_html_blockr{      sC    " 	!S!A%S!Z%5%5c%::
Oj(#J/
z"]3DeGNN TT:$$dD\2	9Ig5//1779G%%i0%%g.KK9& ; J	A
c.!
!#n%%.3~?P;Q*QU_*_~01~01c."344NFA	 #n%%.3~?P;Q*QU_*_  %%k2
)*5+11+> .>?MGTMMO }5Dt}}U34 ' @# c.!
!0 JTT:t--$**,,d# ; C
OE
+[)Ek"3--/I9~
*"6y*"]3,,_=qI;e,FF+j8z#/OOF#c&k)N1HC Ek "37+113II;e,F%cAg./KxH"''("33g=J'
3J&z=AJ'006N%(n%E!	6&&v'='='?@ &F= +B NN3t9r   c                 F   SSK Jn  U" U S5      n/ n/ nSnSnSnUR                   Hx  n[        US5      (       aV  UR                  bI  [        U5      n	UR                  U5      n
UR                  U	5        UR                  U
5        Xj-  nUS-  nMj  U[        U5      -  nMz     / n[        X5      nSn[        U5       Ha  u  pUu  nnU[        U5      :  d  M  XL   U;   d  M%  UR                  XL   X<   5      nUS-  nSU4X'   U[        U5      :  d  MW  XL   U;   a  M<  Mc     U$ )	z&
Split the original text into chunks.
r   rA   rF   z<<HTML_BLOCK_{}>>r   namer   html)rM   rB   rX   hasattrr}   r[   formatr3   split_and_append_text	enumerater   replace)r   rB   r^   html_blockshtml_placeholdersplaceholder_fmttext_after_placeholderindexelemhtml_strri   splited_blockcurrent_indexrq   block_contents                    r   split_original_textsr     sJ    " }-DK)OE4  TYY%:4yH)007Kx($$[1"1"QJE"c$i/"  M)-PM M.

7C 122!0G;oo!0+2LG QM"('!2M C 122!0G;	 / r   c                    UR                  5       (       GaY  [        R                  " S[        R                  5      nSnUR	                  U5       H  nUR                  5       U:  aj  XUR                  5        n[        R                  " SU5      nU H;  nUR                  5       (       d  M  U R                  SUR                  5       45        M=     U R                  SUR                  5       45        UR                  5       nM     U[        U5      :  a\  XS n[        R                  " SU5      nU H;  nUR                  5       (       d  M  U R                  SUR                  5       45        M=     U $ )z
Split the text and append the result to the result list.

Args:
    result (list): The current result list.
    text_content (str): The text content to be processed.

Returns:
    list: The updated result list after processing the text content.
z(```.*?\n.*?```|~~~.*?\n.*?~~~)r   z\n{2,}r   codeN)r%   recompileDOTALLfinditerstartr0   r3   groupendr   )r>   text_contentcode_patternlast_posmnon_code
paragraphsps           r   r   r   C  s    zz"DbiiP&&|4Awwy8#'1779=XXi:
#Awwyyvqwwy&9: $ MM61779-.uuwH 5 c,''#I.H)X6J7799MM61779"56   Mr   )r   r   r#   r&   r?   r{   r   r    r   r   <module>r      s3    
..Bb3<(Vod+\!r   