
    9i,                         S r SSKJrJrJr  SSKrSSKrSSKrS rS r	S r
S rS r " S	 S
\5      r " S S\5      r " S S\5      rS rS rS rg)zTokenization classes.    )absolute_importdivisionprint_functionNc                    [         R                  (       aU  [        U [        5      (       a  U $ [        U [        5      (       a  U R                  SS5      $ [        S[        U 5      -  5      e[         R                  (       aU  [        U [        5      (       a  U R                  SS5      $ [        U [        5      (       a  U $ [        S[        U 5      -  5      e[        S5      e)zGConverts `text` to Unicode (if it's not already), assuming utf-8 input.utf-8ignoreUnsupported string type: %s#Not running on Python2 or Python 3?)
sixPY3
isinstancestrbytesdecode
ValueErrortypePY2unicodetexts    q/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/multi_modal/diffusion/tokenizer.pyconvert_to_unicoder      s    
wwdC  Ke$$;;w11:d4jIJJ	dC  ;;w11g&&K:d4jIJJ>??    c                    [         R                  (       aU  [        U [        5      (       a  U $ [        U [        5      (       a  U R                  SS5      $ [        S[        U 5      -  5      e[         R                  (       aT  [        U [        5      (       a  U $ [        U [        5      (       a  U R                  S5      $ [        S[        U 5      -  5      e[        S5      e)zAReturns text encoded in a way suitable for print or `tf.logging`.r   r   r	   r
   )r   r   r   r   r   r   r   r   r   r   encoder   s    r   printable_textr   +   s    
 wwdC  Ke$$;;w11:d4jIJJ	dC  Kg&&;;w'':d4jIJJ>??r   c                     [         R                  " 5       nSn[        U SSS9 n [        UR	                  5       5      nU(       d  OUR                  5       nX!U'   US-  nM<  SSS5        U$ ! , (       d  f       U$ = f)z*Loads a vocabulary file into a dictionary.r   rr   )encoding   N)collectionsOrderedDictopenr   readlinestrip)
vocab_filevocabindexreadertokens        r   
load_vocabr+   B   sy    ##%EE	j#	0F&v'89EKKME %LQJE  
1 L 
1	0 Ls   >A++
A:c                 B    / nU H  nUR                  X   5        M     U$ )z7Converts a sequence of tokens into ids using the vocab.)append)r'   tokensidsr*   s       r   convert_tokens_to_idsr0   Q   s$    
C

5<  Jr   c                 X    U R                  5       n U (       d  / $ U R                  5       nU$ )z@Runs basic whitespace cleaning and splitting on a piece of text.)r%   split)r   r.   s     r   whitespace_tokenizer3   Y   s%    ::<D	ZZ\FMr   c                   4    \ rS rSrSrS	S jrS rS rS rSr	g)
FullTokenizerb   zRuns end-to-end tokenziation.c                     [        U5      U l        U R                  R                  5        VVs0 s H  u  p4XC_M	     snnU l        [	        US9U l        [        U R                  S9U l        g s  snnf )Ndo_lower_case)r'   )r+   r'   items	inv_vocabBasicTokenizerbasic_tokenizerWordpieceTokenizerwordpiece_tokenizer)selfr&   r9   kvs        r   __init__FullTokenizer.__init__e   sZ    
+
+/::+;+;+=>+=41!$+=>-MJ#5DJJ#G  ?s   A,c                     / nU R                   R                  U5       H6  nU R                  R                  U5       H  nUR                  U5        M     M8     U$ N)r=   tokenizer?   r-   )r@   r   split_tokensr*   	sub_tokens        r   rG   FullTokenizer.tokenizek   sT    ))2248E!55>>uE	##I. F 9 r   c                 .    [        U R                  U5      $ rF   )r0   r'   )r@   r.   s     r   r0   #FullTokenizer.convert_tokens_to_idss   s    $TZZ88r   c                 J    U Vs/ s H  o R                   U   PM     sn$ s  snf rF   )r;   )r@   r/   is      r   convert_ids_to_tokens#FullTokenizer.convert_ids_to_tokensv   s!    +./3aq!3///s    )r=   r;   r'   r?   NT)
__name__
__module____qualname____firstlineno____doc__rC   rG   r0   rO   __static_attributes__ r   r   r5   r5   b   s    'H90r   r5   c                   F    \ rS rSrSrSS jrS rS rS rS r	S r
S	 rS
rg)r<   z   zDRuns basic tokenization (punctuation splitting, lower casing, etc.).c                     Xl         g)zVConstructs a BasicTokenizer.

Args:
  do_lower_case: Whether to lower case the input.
Nr8   )r@   r9   s     r   rC   BasicTokenizer.__init__}   s
     +r   c                 d   [        U5      nU R                  U5      nU R                  U5      n[        U5      n/ nU HU  nU R                  (       a!  UR                  5       nU R                  U5      nUR                  U R                  U5      5        MW     [        SR                  U5      5      nU$ )zTokenizes a piece of text. )
r   _clean_text_tokenize_chinese_charsr3   r9   lower_run_strip_accentsextend_run_split_on_puncjoin)r@   r   orig_tokensrH   r*   output_tokenss         r   rG   BasicTokenizer.tokenize   s    !$'% ++D1)$/ E!!//6 7 7 >?	 ! ,CHH\,BCr   c                     [         R                  " SU5      n/ nU H2  n[         R                  " U5      nUS:X  a  M!  UR                  U5        M4     SR	                  U5      $ )z$Strips accents from a piece of text.NFDMn )unicodedata	normalizecategoryr-   re   )r@   r   outputcharcats        r   rb   !BasicTokenizer._run_strip_accents   sY    $$UD1D&&t,Cd{MM$	 
 wwvr   c                 j   [        U5      nSnSn/ nU[        U5      :  am  X#   n[        U5      (       a  UR                  U/5        SnO.U(       a  UR                  / 5        SnUS   R                  U5        US-  nU[        U5      :  a  Mm  U Vs/ s H  nSR	                  U5      PM     sn$ s  snf )z&Splits punctuation on a piece of text.r   TFr    rl   )listlen_is_punctuationr-   re   )r@   r   charsrN   start_new_wordrp   rq   xs           r   rd   !BasicTokenizer._run_split_on_punc   s    T
#e*n8Dt$$tf%!%!MM"%!&r
!!$'FA #e*n %++Fq
F+++s   B0c                    / nU Hj  n[        U5      nU R                  U5      (       a5  UR                  S5        UR                  U5        UR                  S5        MY  UR                  U5        Ml     SR                  U5      $ )z)Adds whitespace around any CJK character.r^   rl   )ord_is_chinese_charr-   re   r@   r   rp   rq   cps        r   r`   &BasicTokenizer._tokenize_chinese_chars   sk    DTB$$R((c"d#c"d#  wwvr   c                     US:  a  US::  dT  US:  a  US::  dH  US:  a  US::  d<  US:  a  US::  d0  US	:  a  US
::  d$  US:  a  US::  d  US:  a  US::  d  US:  a  US::  a  gg)z6Checks whether CP is the codepoint of a CJK character.i N  i  i 4  iM  i   iߦ i  i? i@ i i  i i   i  i  i TFrX   )r@   r   s     r   r   BasicTokenizer._is_chinese_char   sq     6\bFlfv'MbGm'MbGm'MbGm'MbGm&LR6\'MbGmr   c                     / nU H`  n[        U5      nUS:X  d  US:X  d  [        U5      (       a  M,  [        U5      (       a  UR                  S5        MO  UR                  U5        Mb     SR	                  U5      $ )zBPerforms invalid character removal and whitespace cleanup on text.r   i  r^   rl   )r~   _is_control_is_whitespacer-   re   r   s        r   r_   BasicTokenizer._clean_text   sg    DTBQw",+d*;*;d##c"d#  wwvr   r8   NrQ   )rR   rS   rT   rU   rV   rC   rG   rb   rd   r`   r   r_   rW   rX   r   r   r<   r<   z   s(    N+,	,(*r   r<   c                   (    \ rS rSrSrSS jrS rSrg)r>      zRuns WordPiece tokenization.c                 (    Xl         X l        X0l        g rF   )r'   	unk_tokenmax_input_chars_per_word)r@   r'   r   r   s       r   rC   WordpieceTokenizer.__init__   s    
"(@%r   c                 N   [        U5      n/ n[        U5       GH  n[        U5      n[        U5      U R                  :  a  UR                  U R                  5        ME  SnSn/ nU[        U5      :  ax  [        U5      nSn	Xh:  a<  SR                  XFU 5      n
US:  a  SU
-   n
XR                  ;   a  U
n	OUS-  nXh:  a  M<  U	c  SnO$UR                  U	5        UnU[        U5      :  a  Mx  U(       a  UR                  U R                  5        M  UR                  U5        GM
     U$ )a  Tokenizes a piece of text into its word pieces.

This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.

For example:
  >>> input = "unaffable"
  >>> output = ["un", "##aff", "##able"]

Args:
  text: A single token or whitespace separated tokens. This should have
    already been passed through `BasicTokenizer.

Returns:
  A list of wordpiece tokens.
Fr   Nrl   z##r    T)
r   r3   rv   rw   r   r-   r   re   r'   rc   )r@   r   rg   r*   ry   is_badstart
sub_tokensend
cur_substrsubstrs              r   rG   WordpieceTokenizer.tokenize   s#   $ "$'(.EKE5zD999$$T^^4FEJ#e*$%j!
kWWU%56Fqy!%+%+
1HC k %!F!!*- #e*$" $$T^^4$$Z0; /< r   )r   r   r'   N)z[UNK]d   )rR   rS   rT   rU   rV   rC   rG   rW   rX   r   r   r>   r>      s    &A
3r   r>   c                 p    U S:X  d  U S:X  d  U S:X  d  U S:X  a  g[         R                  " U 5      nUS:X  a  gg)z1Checks whether `chars` is a whitespace character.r^   	
TZsF)rm   ro   rq   rr   s     r   r   r   (  s=     s{ddlddlddl


t
$C
d{r   c                     U S:X  d  U S:X  d  U S:X  a  g[         R                  " U 5      nUR                  S5      (       a  gg)z.Checks whether `chars` is a control character.r   r   r   FCT)rm   ro   
startswithr   s     r   r   r   4  s?     t|tt|tt|


t
$C
~~cr   c                     [        U 5      nUS:  a  US::  d$  US:  a  US::  d  US:  a  US::  d  US:  a  US::  a  g	[        R                  " U 5      nUR                  S
5      (       a  g	g)z2Checks whether `chars` is a punctuation character.!   /   :   @   [   `   {   ~   TPF)r~   rm   ro   r   )rq   r   rr   s      r   rx   rx   @  sg    	TB
 
rbBhB"HrbR2X29s


t
$C
~~cr   )rV   
__future__r   r   r   r!   rm   r   r   r   r+   r0   r3   objectr5   r<   r>   r   r   rx   rX   r   r   <module>r      sk     @ @   
@(@.0F 00mV m`; ;|		r   