
    9i                     z    S SK r S SKrS SKJr  S SKrS SKrS SKr\" 5       S 5       rS r	S r
S r " S S\5      rg)	    N)	lru_cachec            	         [        [        [        S5      [        S5      S-   5      5      [        [        [        S5      [        S5      S-   5      5      -   [        [        [        S5      [        S5      S-   5      5      -   n U S S  nSn[        S	5       H4  nX0;  d  M
  U R                  U5        UR                  S	U-   5        US-  nM6     U Vs/ s H  n[	        U5      PM     nn[        [        X5      5      $ s  snf )
N!~      ¡   ¬   ®   ÿr      )listrangeordappendchrdictzip)bscsnbs       n/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/multi_modal/soonet/tokenizer.pybytes_to_unicoder      s    	eCHHqL" 
#%)%D	D	A+' &(
( +/uSY47IM0C +D
DB
 
AB	A4[;IIaLIIdQhFA	 
 	"Q#a&"B	B 
s   C:c                 d    [        5       nU S   nU SS   H  nUR                  X#45        UnM     U$ )Nr   r   )setadd)wordpairs	prev_charchars       r   	get_pairsr!      s=    EEQIQR		9#$	  L    c                     [         R                  " U 5      n [        R                  " [        R                  " U 5      5      n U R	                  5       $ N)ftfyfix_texthtmlunescapestriptexts    r   basic_cleanr,   (   s3    ==D==t,-D::<r"   c                 V    [         R                  " SSU 5      n U R                  5       n U $ )Nz\s+ )resubr)   r*   s    r   whitespace_cleanr1   .   s$    66&#t$D::<DKr"   c                   6    \ rS rSrS rS rS rS rS	S jrSr	g)
SimpleTokenizer4   c           
         [        5       U l        U R                  R                  5        VVs0 s H  u  p#X2_M	     snnU l        [        R
                  " U5      R                  5       R                  S5      R                  S5      nUSS nU Vs/ s H  n[        UR                  5       5      PM     nn[        [        5       R                  5       5      nXf Vs/ s H  o3S-   PM	     sn-   nU H#  nUR                  SR                  U5      5        M%     UR                  SS/5        [        [!        U[#        [%        U5      5      5      5      U l        U R&                  R                  5        VVs0 s H  u  p#X2_M	     snnU l        [        [!        U[#        [%        U5      5      5      5      U l        SSS	.U l        [.        R0                  " S
[.        R2                  5      U l        g s  snnf s  snf s  snf s  snnf )Nutf-8
r   i  </w> <|startoftext|><|endoftext|>)r:   r;   z[<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+)r   byte_encoderitemsbyte_decodergzipopenreaddecodesplittupler   valuesr   joinextendr   r   r   lenencoderdecoder	bpe_rankscacher/   compile
IGNORECASEpat)selfbpe_pathkvmergesmergevocabs          r   __init__SimpleTokenizer.__init__6   s   ,..2.?.?.E.E.GH.GdaQT.GH8$))+227;AA$G-.4:;F5%&F;%'..01U3UVU33ELL( '9:CuSZ'89:)-););)=>)=)=>c&%F*<=>0,

 ::nMM! I <3
 ?s   G)#G/G41G9c                   ^  UT R                   ;   a  T R                   U   $ [        US S 5      US   S-   4-   n[        U5      nU(       d  US-   $  [        UU 4S jS9nUT R                  ;  a  OUu  pV/ nSnU[        U5      :  a   UR                  XX5      n	UR                  X(U	 5        U	nX(   U:X  a7  U[        U5      S-
  :  a%  UUS-      U:X  a  UR                  XV-   5        US-  nOUR                  X(   5        US-  nU[        U5      :  a  M  [        U5      nUn[        U5      S:X  a  O[        U5      nM  SR                  U5      nUT R                   U'   U$ ! [         a    UR                  X(S  5         Ml  f = f)	Nr8   c                 N   > TR                   R                  U [        S5      5      $ )Ninf)rK   getfloat)pairrP   s    r   <lambda>%SimpleTokenizer.bpe.<locals>.<lambda>W   s    (:(:4u(Nr"   )keyr   r      r.   )rL   rD   r!   minrK   rH   indexrG   
ValueErrorr   rF   )
rP   tokenr   r   bigramfirstsecondnew_wordijs
   `         r   bpeSimpleTokenizer.bpeL   s   DJJ::e$$U3BZ E"I$6#99$6>!NPFT^^+"MEHAc$i-

5,AOOD1I.A
 7e#CIM(9dA?"(?)OOEN3FAOODG,FA c$i-  XHD4yA~!$; < xx~ 

5' " OODH-s   &E" "FFc                 f  ^  / n[        [        U5      5      R                  5       n[        R                  " T R
                  U5       Hf  nSR                  U 4S jUR                  S5       5       5      nUR                  U 4S jT R                  U5      R                  S5       5       5        Mh     U$ )Nr9   c              3   B   >#    U  H  nTR                   U   v   M     g 7fr$   )r<   ).0r   rP   s     r   	<genexpr>)SimpleTokenizer.encode.<locals>.<genexpr>{   s#      <%: !--a0%:   r6   c              3   B   >#    U  H  nTR                   U   v   M     g 7fr$   )rI   )rr   	bpe_tokenrP   s     r   rs   rt   }   s"      K/I) #ll95/Iru   r.   )r1   r,   lowerr/   findallrO   rF   encoderG   rn   rC   )rP   r+   
bpe_tokensrg   s   `   r   rz   SimpleTokenizer.encodew   s    
D 1288:ZZ$/EGG <%*\\'%:< <E K/3xx/D/DS/IK K 0
 r"   c                    SR                  U Vs/ s H  o R                  U   PM     sn5      n[        U Vs/ s H  o@R                  U   PM     sn5      R	                  SSS9R                  SS5      nU$ s  snf s  snf )Nr9   r6   replace)errorsr8   r.   )rF   rJ   	bytearrayr>   rB   r~   )rP   tokensrg   r+   cs        r   rB   SimpleTokenizer.decode   s{    ww@U+@A=1++A.=>EEI F ''.wvs'; 	 A=s
   A7A<c                    [        U[        5      (       a  U/nU R                  S   nU R                  S   nU Vs/ s H  nU/U R                  U5      -   U/-   PM     nn[        R
                  " [        U5      U[        R                  S9n[        U5       HB  u  p[        U	5      U:  a	  U	S U n	XIS'   [        R                  " U	5      XxS [        U	5      24'   MD     U$ s  snf )Nr:   r;   )dtyperZ   )

isinstancestrrI   rz   torchzerosrH   int	enumeratetensor)
rP   textscontext_length	sot_token	eot_tokenr+   
all_tokensresultrl   r   s
             r   tokenizeSimpleTokenizer.tokenize   s    eS!!GELL!23	LL1	"')"'$ !kDKK$55C"' 	 )S_nEIIN":.IA6{^+0&r
&+ll6&:Fls6{l?# / )s   #C )rK   r>   r<   rL   rJ   rI   rO   N)M   )
__name__
__module____qualname____firstlineno__rW   rn   rz   rB   r   __static_attributes__ r"   r   r3   r3   4   s    ,)Vr"   r3   )r?   r'   	functoolsr   r%   regexr/   r   r   r!   r,   r1   objectr3   r   r"   r   <module>r      sL           "df dr"   