
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import collections
import json
import logging
import os
import sys
import unicodedata

import regex as re


def clean_string(string):
    replace_mp = {
        ' - ': '-',
        " ' ": "'",
        " n't": "n't",
        " 'm": "'m",
        ' do not': " don't",
        " 's": "'s",
        " 've": "'ve",
        " 're": "'re"
    }
    for k, v in replace_mp.items():
        string = string.replace(k, v)
    return string


class Tokenizer(object):

    def __init__(self, vocab_path, special_tokens=[], tokenizer_type='Bert'):
        self.tokenizer_type = tokenizer_type
        if tokenizer_type == 'Bert':
            # Extra special tokens are remapped onto BERT's reserved
            # '[unusedN]' vocabulary slots.
            self.spec_convert_dict = {
                '[BOS]': '[unused0]',
                '[EOS]': '[unused1]'
            }
            for token in special_tokens:
                if token not in self.spec_convert_dict and token not in (
                        '[PAD]', '[UNK]'):
                    self.spec_convert_dict[
                        token] = f'[unused{len(self.spec_convert_dict)}]'
            self.spec_revert_dict = {
                v: k
                for k, v in self.spec_convert_dict.items()
            }
            special_tokens = [
                self.spec_convert_dict.get(token, token)
                for token in special_tokens
            ]
            self.special_tokens = ('[UNK]', '[SEP]', '[PAD]', '[CLS]',
                                   '[MASK]')
            self.special_tokens += tuple(
                x for x in special_tokens if x not in self.special_tokens)
            self._tokenizer = BertTokenizer(
                vocab_path, never_split=self.special_tokens)
            for tok in self.special_tokens:
                assert tok in self._tokenizer.vocab, \
                    f"special token '{tok}' is not in the vocabulary"
            self.vocab_size = len(self._tokenizer.vocab)
        elif tokenizer_type == 'GPT2':
            self.spec_convert_dict = {'[UNK]': '<unk>'}
            self.spec_revert_dict = {
                v: k
                for k, v in self.spec_convert_dict.items()
            }
            special_tokens = [
                tok for tok in special_tokens
                if tok not in self.spec_convert_dict
            ]
            vocab_file = os.path.join(vocab_path, 'vocab.json')
            merges_file = os.path.join(vocab_path, 'merges.txt')
            self._tokenizer = GPT2Tokenizer(
                vocab_file, merges_file, special_tokens=special_tokens)
            self.num_specials = len(special_tokens)
            self.vocab_size = len(self._tokenizer)
        else:
            raise ValueError

    def tokenize(self, text):
        return self._tokenizer.tokenize(text)

    def convert_tokens_to_ids(self, tokens):
        if self.tokenizer_type == 'Bert':
            tokens = [self.spec_convert_dict.get(tok, tok) for tok in tokens]
            ids = self._tokenizer.convert_tokens_to_ids(tokens)
            return ids
        else:
            tokens = [self.spec_convert_dict.get(tok, tok) for tok in tokens]
            ids = self._tokenizer.convert_tokens_to_ids(tokens)
            ids = [(i + self.num_specials) % self.vocab_size for i in ids]
            return ids

    def convert_ids_to_tokens(self, ids):
        if self.tokenizer_type == 'Bert':
            tokens = self._tokenizer.convert_ids_to_tokens(ids)
            tokens = [self.spec_revert_dict.get(tok, tok) for tok in tokens]
            return tokens
        else:
            ids = [(i - self.num_specials) % self.vocab_size for i in ids]
            tokens = self._tokenizer.convert_ids_to_tokens(ids)
            tokens = [self.spec_revert_dict.get(tok, tok) for tok in tokens]
            return tokens

    def decode(self, ids, ignore_tokens=[]):
        tokens = self.convert_ids_to_tokens(ids)
        if len(ignore_tokens) > 0:
            ignore_tokens = set(ignore_tokens)
            tokens = [tok for tok in tokens if tok not in ignore_tokens]
        if self.tokenizer_type == 'Bert':
            string = ' '.join(tokens).replace(' ##', '')
        else:
            string = ''.join(tokens)
            string = bytearray([self._tokenizer.byte_decoder[c]
                                for c in string]).decode('utf-8')
        string = clean_string(string)
        return string


logger = logging.getLogger(__name__)


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    index = 0
    with open(vocab_file, 'r', encoding='utf-8') as reader:
        while True:
            token = reader.readline()
            if not token:
                break
            token = token.strip()
            vocab[token] = index
            index += 1
    return vocab


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens


class BertTokenizer(object):
    """Runs end-to-end tokenization: punctuation splitting + wordpiece"""

    def __init__(self,
                 vocab_file,
                 do_lower_case=True,
                 max_len=None,
                 do_basic_tokenize=True,
                 never_split=('[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]')):
        """Constructs a BertTokenizer.

        Args:
          vocab_file: Path to a one-wordpiece-per-line vocabulary file
          do_lower_case: Whether to lower case the input
                         Only has an effect when do_wordpiece_only=False
          do_basic_tokenize: Whether to do basic tokenization before wordpiece.
          max_len: An artificial maximum length to truncate tokenized sequences to;
                         Effective maximum length is always the minimum of this
                         value (if specified) and the underlying BERT model's
                         sequence length.
          never_split: List of tokens which will never be split during tokenization.
                         Only has an effect when do_wordpiece_only=False
        """
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the "
                'vocabulary from a Google pretrained model use `tokenizer = '
                'BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`'.format(
                    vocab_file))
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])
        self.do_basic_tokenize = do_basic_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case, never_split=never_split)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
        self.max_len = max_len if max_len is not None else int(1e12)

    def tokenize(self, text):
        split_tokens = []
        if self.do_basic_tokenize:
            for token in self.basic_tokenizer.tokenize(text):
                for sub_token in self.wordpiece_tokenizer.tokenize(token):
                    split_tokens.append(sub_token)
        else:
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        """Converts a sequence of tokens into ids using the vocab."""
        ids = []
        for token in tokens:
            ids.append(self.vocab[token])
        if len(ids) > self.max_len:
            logger.warning(
                'Token indices sequence length is longer than the specified '
                'maximum  sequence length for this BERT model ({} > {}). '
                'Running this sequence through BERT will result in indexing '
                'errors'.format(len(ids), self.max_len))
        return ids

    def convert_ids_to_tokens(self, ids):
        """Converts a sequence of ids in wordpiece tokens using the vocab."""
        tokens = []
        for i in ids:
            tokens.append(self.ids_to_tokens[i])
        return tokens


class BasicTokenizer(object):
    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

    def __init__(self,
                 do_lower_case=True,
                 never_split=('[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]')):
        """Constructs a BasicTokenizer.

        Args:
          do_lower_case: Whether to lower case the input.
        """
        self.do_lower_case = do_lower_case
        self.never_split = never_split

    def tokenize(self, text):
        """Tokenizes a piece of text."""
        text = self._clean_text(text)
        text = self._tokenize_chinese_chars(text)
        orig_tokens = whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            if self.do_lower_case and token not in self.never_split:
                token = token.lower()
                token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token))

        output_tokens = whitespace_tokenize(' '.join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize('NFD', text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == 'Mn':
                continue
            output.append(char)
        return ''.join(output)

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        if text in self.never_split:
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return [''.join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(' ')
                output.append(char)
                output.append(' ')
            else:
                output.append(char)
        return ''.join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # Anything inside the CJK Unicode blocks counts as a CJK character.
        if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF)
                or (cp >= 0x20000 and cp <= 0x2A6DF)
                or (cp >= 0x2A700 and cp <= 0x2B73F)
                or (cp >= 0x2B740 and cp <= 0x2B81F)
                or (cp >= 0x2B820 and cp <= 0x2CEAF)
                or (cp >= 0xF900 and cp <= 0xFAFF)
                or (cp >= 0x2F800 and cp <= 0x2FA1F)):
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(' ')
            else:
                output.append(char)
        return ''.join(output)


class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token='[UNK]', max_input_chars_per_word=100):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.

        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.

        For example:
          >>> input = "unaffable"
          >>> output = ["un", "##aff", "##able"]

        Args:
          text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer`.

        Returns:
          A list of wordpiece tokens.
        """
        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = ''.join(chars[start:end])
                    if start > 0:
                        substr = '##' + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens


def _is_whitespace(char):
    """Checks whether `chars` is a whitespace character."""
    # \t, \n and \r are technically control characters but are treated
    # as whitespace here.
    if char == ' ' or char == '\t' or char == '\n' or char == '\r':
        return True
    cat = unicodedata.category(char)
    if cat == 'Zs':
        return True
    return False


def _is_control(char):
    """Checks whether `chars` is a control character."""
    # \t, \n and \r count as whitespace, not as control characters.
    if char == '\t' or char == '\n' or char == '\r':
        return False
    cat = unicodedata.category(char)
    if cat.startswith('C'):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `chars` is a punctuation character."""
    cp = ord(char)
    # All non-letter/number ASCII characters are treated as punctuation,
    # even those (like "^", "$" and "`") outside the Unicode "P" classes.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64)
            or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith('P'):
        return True
    return False


try:
    from functools import lru_cache
except ImportError:

    def lru_cache():
        return lambda func: func


@lru_cache()
def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    # `unichr` only exists on Python 2; the conditional keeps it
    # unevaluated on Python 3.
    _chr = unichr if sys.version_info[0] == 2 else chr
    bs = list(range(ord('!'), ord('~') + 1)) + list(
        range(ord('¡'), ord('¬') + 1)) + list(range(ord('®'), ord('ÿ') + 1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [_chr(n) for n in cs]
    return dict(zip(bs, cs))


def get_pairs(word):
    """Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


class GPT2Tokenizer(object):
    """
    GPT-2 BPE tokenizer. Peculiarities:
        - Byte-level BPE
    """

    def __init__(self,
                 vocab_file,
                 merges_file,
                 errors='replace',
                 special_tokens=None,
                 max_len=None):
        self.max_len = max_len if max_len is not None else int(1e12)
        self.encoder = json.load(open(vocab_file, encoding='utf-8'))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        bpe_data = open(
            merges_file, encoding='utf-8').read().split('\n')[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}

        self.pat = re.compile(
            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
        )

        self.special_tokens = {}
        self.special_tokens_decoder = {}
        self.set_special_tokens(special_tokens)

    def __len__(self):
        return len(self.encoder) + len(self.special_tokens)

    def set_special_tokens(self, special_tokens):
        """Add a list of additional tokens to the encoder.
        The additional tokens are indexed starting from the last index of the
        current vocabulary in the order of the `special_tokens` list.
        """
        if not special_tokens:
            self.special_tokens = {}
            self.special_tokens_decoder = {}
            return
        self.special_tokens = dict((tok, len(self.encoder) + i)
                                   for i, tok in enumerate(special_tokens))
        self.special_tokens_decoder = {
            v: k
            for k, v in self.special_tokens.items()
        }
        logger.info('Special tokens {}'.format(self.special_tokens))

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            bigram = min(
                pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[
                        i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def tokenize(self, text):
        """Tokenize a string. """
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = ''.join(self.byte_encoder[ord(b)] for b in token
                            if ord(b) in self.byte_encoder)
            if token == '':
                continue
            bpe_tokens.extend(
                bpe_token for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def convert_tokens_to_ids(self, tokens):
        """Converts a sequence of tokens into ids using the vocab. """
        ids = []
        python_version_3 = isinstance(tokens, str)
        python_version_2 = sys.version_info[0] == 2 and isinstance(
            tokens, unicode)  # `unicode` only exists on Python 2
        if python_version_3 or python_version_2:
            if tokens in self.special_tokens:
                return self.special_tokens[tokens]
            else:
                return self.encoder.get(tokens, 0)
        for token in tokens:
            if token in self.special_tokens:
                ids.append(self.special_tokens[token])
            else:
                ids.append(self.encoder.get(token, 0))
        if len(ids) > self.max_len:
            logger.warning(
                'Token indices sequence length is longer than the specified '
                'maximum  sequence length for this OpenAI GPT model '
                '({} > {}). Running this sequence through the model will '
                'result in indexing errors'.format(len(ids), self.max_len))
        return ids

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """Converts a sequence of ids in BPE tokens using the vocab."""
        tokens = []
        for i in ids:
            if i in self.special_tokens_decoder:
                if not skip_special_tokens:
                    tokens.append(self.special_tokens_decoder[i])
            else:
                tokens.append(self.decoder[i])
        return tokens

    def encode(self, text):
        return self.convert_tokens_to_ids(self.tokenize(text))

    def decode(self, tokens):
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode(
            'utf-8', errors=self.errors)
        return text