
"""CLIP Tokenizer."""
import gzip
import html
import os
from functools import lru_cache

import ftfy
import regex as re
import torch


@lru_cache()
def default_bpe():
    return os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'bpe_simple_vocab_16e6.txt.gz')


@lru_cache()
def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    # Bytes that already correspond to printable, non-whitespace characters keep
    # their own value; every other byte is remapped to a code point above 255.
    bs = list(range(ord('!'), ord('~') + 1)) + list(
        range(ord('¡'), ord('¬') + 1)) + list(range(ord('®'), ord('ÿ') + 1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))
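

# Illustration (doctest-style sketch, assuming this module is importable): the
# table built above is a bijection over all 256 byte values, so any UTF-8 byte
# string can be rendered as printable characters and inverted exactly.
#
#     >>> b2u = bytes_to_unicode()
#     >>> len(b2u), len(set(b2u.values()))
#     (256, 256)
#     >>> b2u[ord('A')]   # printable ASCII bytes map to themselves
#     'A'
#     >>> b2u[0]          # control bytes are remapped past code point 255
#     'Ā'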


def get_pairs(word):
    """Return set of symbol pairs in a word.
    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


def basic_clean(text):
    text = ftfy.fix_text(text)
    text = html.unescape(html.unescape(text))
    return text.strip()


def whitespace_clean(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text


class SimpleTokenizer(object):

    def __init__(self, bpe_path: str = default_bpe()):
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        merges = gzip.open(bpe_path).read().decode('utf-8').split('\n')
        merges = merges[1:49152 - 256 - 2 + 1]
        merges = [tuple(merge.split()) for merge in merges]
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v + '</w>' for v in vocab]
        for merge in merges:
            vocab.append(''.join(merge))
        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {
            '<|startoftext|>': '<|startoftext|>',
            '<|endoftext|>': '<|endoftext|>'
        }
        self.pat = re.compile(
            r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
            re.IGNORECASE)

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token[:-1]) + (token[-1] + '</w>', )
        pairs = get_pairs(word)

        if not pairs:
            return token + '</w>'

        error_list = []
        while True:
            # Repeatedly apply the highest-ranked known merge until none remains.
            bigram = min(
                pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except Exception as err:
                    # Record the unexpected failure and keep the rest of the word.
                    error_list.append(err)
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[
                        i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        if len(error_list) > 0:
            print(error_list[0])
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        bpe_tokens = []
        text = whitespace_clean(basic_clean(text)).lower()
        for token in re.findall(self.pat, text):
            token = ''.join(self.byte_encoder[b]
                            for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token]
                              for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode(
            'utf-8', errors='replace').replace('</w>', ' ')
        return text
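

# Sketch of the tokenizer round trip (illustrative only; the exact subword
# splits and ids depend on the merges file loaded in __init__):
#
#     tok = SimpleTokenizer()
#     tok.bpe('lighthouse')      # space-separated subwords, e.g. 'light house</w>'
#     ids = tok.encode('A photo of a cat.')   # lowercased, then BPE-encoded to vocab ids
#     tok.decode(ids)                         # -> 'a photo of a cat . ' (each word ends in a space)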


def clip_tokenize(tokenizer, texts, context_length=77, truncate=True):
    """
    Returns the tokenized representation of given input string(s)
    Parameters
    ----------
    texts : Union[str, List[str]]
        An input string or a list of input strings to tokenize
    context_length : int
        The context length to use; all CLIP models use 77 as the context length
    truncate: bool
        Whether to truncate the text in case its encoding is longer than the context length
    Returns
    -------
    A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length].
    We return LongTensor when torch version is <1.8.0, since older index_select requires indices to be long.
    """
    if isinstance(texts, str):
        texts = [texts]

    sot_token = tokenizer.encoder['<|startoftext|>']
    eot_token = tokenizer.encoder['<|endoftext|>']
    all_tokens = [[sot_token] + tokenizer.encode(text) + [eot_token]
                  for text in texts]
    result = torch.zeros(len(all_tokens), context_length, dtype=torch.int)

    for i, tokens in enumerate(all_tokens):
        if len(tokens) > context_length:
            if truncate:
                tokens = tokens[:context_length]
                tokens[-1] = eot_token
            else:
                raise RuntimeError(
                    f'Input {texts[i]} is too long for context length '
                    f'{context_length}')
        result[i, :len(tokens)] = torch.tensor(tokens)

    return result