
    ?KiZ                    2   S r SSKrSSKJr  SSKJr  SSKJr  SSKJ	r	J
r
JrJrJrJrJr  SSKJrJrJr  SSKJr  S	S
KJrJrJrJr  S	SKJr  \R8                  " \5      rSS jrS\ S\!4S jr"S r# " S S5      r$ " S S\$5      r%S\!S\ 4S jr& " S S5      r' " S S\'5      r( " S S\'5      r) " S S\'5      r* " S S \'5      r+ " S! S"\'5      r, " S# S$\'5      r- " S% S&\'5      r. " S' S(\'5      r/ " S) S*\'5      r0 " S+ S,\'5      r1 " S- S.\'5      r2 " S/ S0\'5      r3 " S1 S2\35      r4 " S3 S4\35      r5 " S5 S6\35      r6 " S7 S8\35      r7 " S9 S:\35      r8 " S; S<\35      r9 " S= S>\35      r: " S? S@\35      r; " SA SB\35      r< " SC SD\35      r= " SE SF\35      r> " SG SH\35      r? " SI SJ\35      r@ " SK SL\35      rA " SM SN\35      rB " SO SP\35      rC " SQ SR\'5      rD " SS ST\35      rE " SU SV\'5      rF " SW SX\'5      rG " SY SZ\'5      rH " S[ S\\35      rI " S] S^\35      rJ " S_ S`\35      rK " Sa Sb\'5      rL " Sc Sd\35      rM " Se Sf\35      rN " Sg Sh\35      rOSi rP " Sj Sk5      rQ " Sl Sm5      rR0 Sn\4_So\0_Sp\5_Sq\(_Sr\E_Ss\H_St\6_Su\F_Sv\-_Sw\(_Sx\2_Sy\7_Sz\(_S{\(_S|\(_S}\(_S~\(_0 S\4_S\*_S\-_S\._S\(_S\(_S\0_S\<_S\0_S\0_S\(_S\L_S\8_S\9_S\+_S\(_S\0_E0 S\:_S\,_S\A_S\/_S\(_S\>_S\?_S\(_S\0_S\1_S\;_S\(_S\B_S\C_S\D_S\<_S\=_E\)\I\K\K\J\KS.ErSSS\4S jjrTg)z
Utilities to convert slow tokenizers in their fast tokenizers counterparts.

All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
allow to make our dependency on SentencePiece optional.
    N)	lru_cache)Optional)version)
AddedTokenRegex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPEUnigram	WordPiece)tqdm   )is_protobuf_availableis_sentencepiece_availableloggingrequires_backends)PROTOBUF_IMPORT_ERRORc                 8   [        5       (       a  SSKJn  U$ [        5       (       aV  SS Kn[
        R                  " UR                  R                  5      [
        R                  " S5      :  a  SSK	Jn  U$ SSK	J
n  U$ [        [        R                  " U 5      5      e)Nr   )sentencepiece_model_pb2z4.0.0)sentencepiece_model_pb2_new)r   sentencepiecer   r   google.protobufr   parseprotobuf__version__transformers.utilsr   ImportErrorr   format)error_messager   googles      e/var/www/html/dynamic-report/venv/lib/python3.13/site-packages/transformers/convert_slow_tokenizer.pyimport_protobufr%   %   sr    !##9&&==445g8NNB '& b&&/66}EFF    add_prefix_spacereturnc                 H    U (       a  Sn[        USS5      (       d  SnU$ SnU$ )NalwayslegacyTfirstnever)getattr)r'   original_tokenizerprepend_schemes      r$   _get_prepend_schemer1   6   s4    !)8T::$N  !r&   c                   ^  US LnU(       a  [        U5      OT n/ nUR                  5        Hm  u  pE/ n[        S[        U5      5       H.  nUS U XGS  pUT ;   d  M  U	T ;   d  M  UR	                  XU45        M0     [        UU 4S jS9nUR                  U5        Mo     [        US US9nU V
s/ s H  oS   U
S   4PM     nn
U$ s  sn
f )Nr   c                 $   > TU S      TU S      4$ Nr   r    )xvocabs    r$   <lambda>!generate_merges.<locals>.<lambda>K   s    U1Q4[%!+,Fr&   keyc                 B    U S   [        U S   5      [        U S   5      4$ )N   r   r   )lenvals    r$   r8   r9   N   s    SVSQ[#c!f+,Nr&   r;   reverser   )dictitemsranger>   appendsortedextend)r7   vocab_scoresrB   mergesmergepiece_scorelocalindexpiece_lpiece_rr@   s   `          r$   generate_mergesrQ   @   s    $&G)04%eLF*0021c%j)E$Ve}eFmW%Gu$4g<= * u"FGe 3 F NX_`F*01&31vs1v&F1M 2s   -Cc                   R    \ rS rSrSrS\4S jrS	S\\\\	4   \
\   4   4S jjrSrg)
SentencePieceExtractorS   zd
Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
modelc                 v    [        U S5        SSKJn  U" 5       U l        U R                  R	                  U5        g )Nr   r   )SentencePieceProcessor)r   r   rW   spLoad)selfrU   rW   s      r$   __init__SentencePieceExtractor.__init__X   s)    $08(*Ur&   Nr(   c                     U R                   n[        UR                  5       5       Vs0 s H  o2R                  U5      U_M     nn[	        XA5      nXE4$ s  snf )
By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to
order the merges with respect to the piece scores instead.
)rX   rE   GetPieceSizeid_to_piecerQ   rZ   rI   rX   rN   r7   rJ   s         r$   extractSentencePieceExtractor.extract_   sS    
 WW;@AR;ST;S%&-;ST 5}	 Us   A)rX   N)__name__
__module____qualname____firstlineno____doc__strr[   tuplerC   intlistrb   __static_attributes__r5   r&   r$   rS   rS   S   s:    c 
E$sCx.$u+2M,N 
 
r&   rS   c                   @    \ rS rSrSS\\\\4   \\   4   4S jjr	Sr
g)GemmaSentencePieceExtractorl   Nr(   c                     U R                   n[        UR                  5       5       Vs0 s H  o2R                  U5      U_M     nnSU;  a  UR	                  S5      US'   [        XA5      nXE4$ s  snf )r^   	<0x09>)rX   rE   r_   r`   getrQ   ra   s         r$   rb   #GemmaSentencePieceExtractor.extractm   so    
 WW;@AR;ST;S%&-;ST u))H-E$K 5} Us   A-r5   rd   )re   rf   rg   rh   rk   rC   rj   rl   rm   rb   rn   r5   r&   r$   rp   rp   l   s)    E$sCx.$u+2M,N  r&   rp   piecec                 z    [        U 5      S:  =(       d'    U S   S:g  =(       d    U S   R                  5       (       + $ )Nr=   ,)r>   isdigit)rw   s    r$   check_number_commar}   }   s3    u:>HU2Y#-HU2Y5F5F5H1HHr&   c                   (    \ rS rSrS rS\4S jrSrg)	Converter   c                     Xl         g rd   r/   )rZ   r/   s     r$   r[   Converter.__init__   s    "4r&   r(   c                     [        5       erd   )NotImplementedErrorrZ   s    r$   	convertedConverter.converted   s    !##r&   r   N)re   rf   rg   rh   r[   r   r   rn   r5   r&   r$   r   r      s    5$9 $r&   r   c                   "    \ rS rSrS\4S jrSrg)BertConverter   r(   c           	      b   U R                   R                  n[        [        U[	        U R                   R
                  5      S95      nSnSnSn[        U R                   S5      (       a`  U R                   R                  R                  nU R                   R                  R                  nU R                   R                  R                  n[        R                  " SUUUS9Ul        [        R                  " 5       Ul        [	        U R                   R"                  5      n[	        U R                   R$                  5      nU R                   R&                  nU R                   R(                  n	[*        R,                  " U SU S3U SU SU S	3Xh4Xy4/S
9Ul        [0        R                  " SS9Ul        U$ )N	unk_tokenFbasic_tokenizerT
clean_texthandle_chinese_charsstrip_accents	lowercase:0 $A:0 :0:0 $B:1 :1singlepairspecial_tokens##prefixr/   r7   r   r   rj   r   hasattrr   tokenize_chinese_charsr   do_lower_caser
   BertNormalizer
normalizerr   BertPreTokenizerpre_tokenizer	cls_token	sep_tokencls_token_idsep_token_idr   TemplateProcessingpost_processorr	   decoder
rZ   r7   	tokenizerr   r   r   clssepr   r   s
             r$   r   BertConverter.converted      ''--iT=T=T=^=^9_`a	!&4**,=>>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5##$
	  %..d;	r&   r5   Nre   rf   rg   rh   r   r   rn   r5   r&   r$   r   r          #9 #r&   r   c                   "    \ rS rSrS\4S jrSrg)SplinterConverter   r(   c           
      v   U R                   R                  n[        [        U[	        U R                   R
                  5      S95      nSnSnSn[        U R                   S5      (       a`  U R                   R                  R                  nU R                   R                  R                  nU R                   R                  R                  n[        R                  " SUUUS9Ul        [        R                  " 5       Ul        [	        U R                   R"                  5      n[	        U R                   R$                  5      n[	        U R                   R&                  5      nSn	U R                   R(                  n
U R                   R*                  nU R                   R,                  nU R                   R/                  S5      nU R                   R0                  S:X  a  U SU S	U	 S	U S
U S3
nOU SU S
U S	U	 S	U S3
n[2        R4                  " U SU S3UXj4X{4X4X4/S9Ul        [8        R                  " SS9Ul        U$ )Nr   Fr   Tr   .rightr    r   r   r   r   r   r   )r/   r7   r   r   rj   r   r   r   r   r   r   r
   r   r   r   r   r   r   r   question_tokenr   r   question_token_idconvert_tokens_to_idspadding_sider   r   r   r	   r   )rZ   r7   r   r   r   r   r   r   questiondotr   r   r   dot_token_idr   s                  r$   r   SplinterConverter.converted   s   ''--iT=T=T=^=^9_`a	!&4**,=>>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334t..==>..;;..;; 33EE..DDSI""//7:U(8*AcU!C5RHDU(3%xz3%qRHD#-#@#@U(3%r*##-#		$
	  %..d;	r&   r5   Nr   r5   r&   r$   r   r      s    .9 .r&   r   c                   "    \ rS rSrS\4S jrSrg)FunnelConverter   r(   c           	      b   U R                   R                  n[        [        U[	        U R                   R
                  5      S95      nSnSnSn[        U R                   S5      (       a`  U R                   R                  R                  nU R                   R                  R                  nU R                   R                  R                  n[        R                  " SUUUS9Ul        [        R                  " 5       Ul        [	        U R                   R"                  5      n[	        U R                   R$                  5      nU R                   R&                  nU R                   R(                  n	[*        R,                  " U SU S3U SU SU S	3Xh4Xy4/S
9Ul        [0        R                  " SS9Ul        U$ )Nr   Fr   Tr   z:2 $A:0 r   r   r   r   r   r   r   r   s
             r$   r   FunnelConverter.converted   r   r&   r5   Nr   r5   r&   r$   r   r      r   r&   r   c                   "    \ rS rSrS\4S jrSrg)MPNetConverteri	  r(   c                 h   U R                   R                  n[        [        U[	        U R                   R
                  5      S95      nSnSnSn[        U R                   S5      (       a`  U R                   R                  R                  nU R                   R                  R                  nU R                   R                  R                  n[        R                  " SUUUS9Ul        [        R                  " 5       Ul        [	        U R                   R"                  5      n[	        U R                   R$                  5      nU R                   R&                  nU R                   R(                  n	[*        R,                  " U SU S3U SU SU S	U S
3Xh4Xy4/S9Ul        [0        R                  " SS9Ul        U$ )Nr   Fr   Tr   r   r   z:0 r   r   r   r   r   r   r   s
             r$   r   MPNetConverter.converted
  s   ''--iT=T=T=^=^9_`a	!&4**,=>>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5SXcU"=##$
	  %..d;	r&   r5   Nr   r5   r&   r$   r   r   	  r   r&   r   c                   "    \ rS rSrS\4S jrSrg)OpenAIGPTConverteri0  r(   c                    U R                   R                  n[        U R                   R                  R	                  5       5      nU R                   R
                  n[        [        UUS [        U5      SSS95      nUR                  [        U5      5      b  UR                  [        U5      /5        [        R                  " SS9Ul        [        R                  " 5       Ul        ["        R$                  " SS9Ul        U$ )N</w>F)r7   rJ   dropoutr   end_of_word_suffixfuse_unkT)r   suffix)r/   encoderrm   	bpe_rankskeysr   r   r   rj   token_to_idadd_special_tokensr
   r   r   r   r   r   r	   
BPEDecoderr   rZ   r7   rJ   r   r   s        r$   r   OpenAIGPTConverter.converted1  s    ''//d--77<<>?++55	i.#)	
	   Y0<((#i.)9:*99DI	"0"A"A"C	$//v>	r&   r5   Nr   r5   r&   r$   r   r   0  s    9 r&   r   c            	       V    \ rS rSr SS\\\\4      S\\\	\\4         S\
4S jjrSrg)	GPT2ConverteriK  Nr7   rJ   r(   c                 Z   U(       d  U R                   R                  nU(       d  [        U R                   R                  5      n[	        [        UUS SSSS95      n[        U R                   SS5      n[        R                  " US9Ul	        [        R                  " 5       Ul        [        U R                   SS5      (       aQ  U R                   R                  nU R                   R                  n[        R                  " U S3U S3XV4/S	9Ul        U$ [        R                  " SS
9Ul        U$ )N Fr7   rJ   r   continuing_subword_prefixr   r   r'   r'   add_bos_tokenz:0 $A:0z:0 $A:0 $B:1r   trim_offsets)r/   r   rm   r   r   r   r.   r   	ByteLevelr   r	   r   	bos_tokenbos_token_idr   r   r   )rZ   r7   rJ   r   r'   bosr   s          r$   r   GPT2Converter.convertedL  s    ++33E$11;;<F*,#%	
	 #4#:#:<NPUV"0":":L\"]	$..0	4**OUCC))33C22??L'1'D'DguL)' (I$  (2';';'OI$r&   r5   NNre   rf   rg   rh   r   rC   rj   rl   rm   rk   r   r   rn   r5   r&   r$   r   r   K  sI    `d$d38n-$>FtERUWZRZOG\>]$	$ $r&   r   c                   "    \ rS rSrS\4S jrSrg)HerbertConverteris  r(   c           
      ~   SnSnU R                   R                  n[        U R                   R                  R	                  5       5      nXS   S   ;   a  USS  n[        [        UUS U R                   R                  US95      n[        R                  " SSS9Ul
        [        R                  " 5       Ul        [        R                  " US9Ul        ["        R$                  " U R                   R&                  U R                   R(                  4U R                   R*                  U R                   R,                  4S	9Ul        U$ )
Nz	#version:r   r   r   )r   r   r   F)r   r   r   )r   r   )r/   r   rm   r   r   r   r   r   r
   r   r   r   r   r   r	   r   r   r   BertProcessingr   r   r   r   r   )rZ   tokenizer_info_strtoken_suffixr7   rJ   r   s         r$   r   HerbertConverter.convertedt  s   (''//d--77<<>?1-ABZF11;;#/
	  +99EY^_	"0"A"A"C	$//|D	#-#<#<((22D4K4K4X4XY((22D4K4K4X4XY$
	 
 r&   r5   Nr   r5   r&   r$   r   r   s      9 r&   r   c            	       V    \ rS rSr SS\\\\4      S\\\	\\4         S\
4S jjrSrg)	Qwen2Converteri  Nr7   rJ   r(   c                 8   U(       d  U R                   R                  nU(       d-  [        U R                   R                  R	                  5       5      n[        [        UUS S SSSSS95      n[        R                  " 5       Ul	        [        R                  " [        R                  " [        S5      SSS9[        R                  " [        U R                   SS5      SS9/5      Ul        ["        R                  " 5       Ul        [&        R                  " SS	9Ul        U$ )
Nr   F)r7   rJ   r   r   r   r   r   byte_fallbackzn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+isolatedbehaviorinvertr'   r'   	use_regexr   )r/   r   rm   r   r   r   r   r
   NFCr   r   SequenceSplitr   r   r.   r   r	   r   r   r   )rZ   r7   rJ   r   s       r$   r   Qwen2Converter.converted  s    ++33E$11;;@@BCF*,#%#	
	  +0	"0"9"9$$ N (  ((%,T-D-DFXZ_%`##
	  %..0	#-#7#7U#K	 r&   r5   r   r   r5   r&   r$   r   r     sI    `d*d38n-*>FtERUWZRZOG\>]*	* *r&   r   c                   "    \ rS rSrS\4S jrSrg)RobertaConverteri  r(   c                    U R                   nUR                  n[        UR                  R	                  5       5      n[        [        UUS SSSS95      n[        R                  " UR                  S9Ul
        [        R                  " 5       Ul        [        R                  " UR                  UR                   4UR"                  UR$                  4UR                  SS9Ul        U$ )Nr   Fr   r   Tr   r   r'   r   )r/   r   rm   r   r   r   r   r   r   r'   r   r	   r   r   RobertaProcessingr   r   r   r   r   rZ   otr7   rJ   r   s        r$   r   RobertaConverter.converted  s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#?#?r/r/00	$
	  r&   r5   Nr   r5   r&   r$   r  r        9 r&   r  c                   "    \ rS rSrS\4S jrSrg)RoFormerConverteri  r(   c           	      J   SSK Jn  U R                  R                  n[	        [        U[        U R                  R                  5      S95      nSnSn[        U R                  S5      (       a@  U R                  R                  R                  nU R                  R                  R                  n[        R                  " SSUUS9Ul        [        R                   R#                  U" U5      5      Ul        [        U R                  R&                  5      n[        U R                  R(                  5      nU R                  R*                  nU R                  R,                  n	[.        R0                  " U SU S	3U SU S
U S3Xh4Xy4/S9Ul        [4        R
                  " SS9Ul        U$ )Nr   )JiebaPreTokenizerr   Fr   Tr   r   r   r   r   r   r   r   )"models.roformer.tokenization_utilsr  r/   r7   r   r   rj   r   r   r   r   r   r
   r   r   r   PreTokenizercustomr   r   r   r   r   r   r   r   r	   r   )
rZ   r  r7   r   r   r   r   r   r   r   s
             r$   r   RoFormerConverter.converted  sx   I''--iT=T=T=^=^9_`a	4**,=>> 33CCQQM 33CCQQM*99!&'#	 
	 #1"="="D"DEVW\E]"^	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5##$
	  %..d;	r&   r5   Nr   r5   r&   r$   r  r    r   r&   r  c                   "    \ rS rSrS\4S jrSrg)DebertaConverteri  r(   c                    U R                   nUR                  n[        UR                  R	                  5       5      n[        [        UUS SSSS95      n[        R                  " UR                  S9Ul
        [        R                  " 5       Ul        [        R                  " SSSU R                   R                  S5      4SU R                   R                  S5      4/S	9Ul        U$ )
Nr   Fr   r   [CLS]:0 $A:0 [SEP]:0![CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1[CLS][SEP]r   )r/   r   rm   r   r   r   r   r   r   r'   r   r	   r   r   r   r   r   r  s        r$   r   DebertaConverter.converted  s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#@#@)4$11GGPQ$11GGPQ$
	  r&   r5   Nr   r5   r&   r$   r  r    r   r&   r  c                   l   ^  \ rS rSrSr\r0 rU 4S jrS r	S r
S rS rS rS	 rS
 rS\4S jrSrU =r$ )SpmConverteri#  Fc                   > [        U S5        [        TU ]  " U6   [        5       nUR	                  5       n[        U R                  R                  S5       nUR                  UR                  5       5        S S S 5        X0l
        U R                  R                  R                  (       a)  U R                  (       d  [        R                  " S5        g g g ! , (       d  f       Nc= f)Nr   rba  The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.)r   superr[   r%   
ModelProtoopenr/   
vocab_fileParseFromStringreadprototrainer_specr   handle_byte_fallbackwarningswarn)rZ   args	model_pb2mf	__class__s        r$   r[   SpmConverter.__init__(  s    $
+$ $%	  "$))44d;qaffh' <
::""009R9RMMe :S0	 <;s    C
C c                 p    UR                    Vs/ s H  o"R                  UR                  4PM     sn$ s  snf rd   piecesrw   scorerZ   r*  rw   s      r$   r7   SpmConverter.vocab=  s)    8=Euekk*EEEs   !3c                 .    UR                   R                  $ rd   )r+  unk_idrZ   r*  s     r$   r<  SpmConverter.unk_id@  s    !!(((r&   c                    UR                   R                  nU R                  U5      nUS:X  a.  [        [	        UU R                  U5      U R                  S95      nOUS:X  a  U R                  U R                  R                  5      R                  U5      u  pV[        U5       VVV	s0 s H
  u  nu  pX_M     n
nnn	[        [        U
UUR                   R                  SU R                  S S95      nO[        S5      e[        UR                  5       VVs/ s HR  u  pUR                   S;   d  M  XR"                  UR                   S:H  =(       d    UR"                  U R$                  ;   4PMT     nnnUR'                  [)        US	 S
9 VVVs/ s H  u  pn[+        USUS9PM     snnn5        U$ s  sn	nnf s  snnf s  snnnf )Nr   r<  r   r=   Tr   r   r   r   z]You're trying to run a `Unigram` model but you're file was trained with a different algorithm      rC  c                     U S   $ Nr   r5   r6   s    r$   r8   (SpmConverter.tokenizer.<locals>.<lambda>o      QRSTQUr&   r:   F
normalizedspecial)r+  
model_typer7   r   r   r<  r,  SpmExtractorr/   r'  rb   	enumerater   	unk_piece	Exceptionr7  typerw   r   
add_tokensrG   r   )rZ   r*  rM  rI   r   _rJ   iwordr8  	bpe_vocabidpspm_added_tokenstokenrL  s                   r$   r   SpmConverter.tokenizerC  s   ''22
zz%(?! ;;u-"&";";I 1_))$*A*A*L*LMUUVbcIA9B<9PQ9P%5Q9PIQ!#00::!"&";"; 	I o  #5<<0
0vv IR!&&A+GD4G4G)GH0 	 

 	 +11A~*V*V&Bw 5UGD*V	
 C R*
s   'F.F5/?F5F;c                 .   UR                   R                  n[        R                  " SSS9[        R                  " [        S5      S5      /nU(       d  [        R                  " U5      $ [        R                  " [        R                  " U5      /U-   5      $ )NFT)leftr    {2,}   ▁)normalizer_specprecompiled_charsmapr
   StripReplacer   r  PrecompiledrZ   r*  rb  _normalizerss       r$   r   SpmConverter.normalizeru  s{    $44II55g6
 $''55'')@)@AU)V(WZf(fggr&   c                 T    [        X R                  5      n[        R                  " XS9$ Nreplacementr0   )r1   r/   r   	MetaspacerZ   rl  r'   r0   s       r$   r   SpmConverter.pre_tokenizer  s$    ,-=?V?VW''K__r&   c                     g rd   r5   r   s    r$   r   SpmConverter.post_processor  s    r&   c                 T    [        X R                  5      n[        R                  " XS9$ rj  )r1   r/   r	   rm  rn  s       r$   r   SpmConverter.decoder  s$    ,-=?V?VW!!kYYr&   r(   c                    U R                  U R                  5      nU R                  U R                  5      nUb  X!l        SnSn[        U R                  S5      (       a  U R                  R
                  nU R                  X45      nUb  XQl        U R                  X45      Ul        U R                  5       nU(       a  Xal        U$ )Nr`  Tr'   )	r   r*  r   r   r/   r'   r   r   r   )rZ   r   r   rl  r'   r   r   s          r$   r   SpmConverter.converted  s    NN4::.	 __TZZ0
!#- 4**,>??#66GG**;I$&3# LLG	,,.'5$r&   r*  )re   rf   rg   rh   r,  rS   rN  r   r[   r7   r<  r   r   r   r   r   r   r   rn   __classcell__)r3  s   @r$   r!  r!  #  sQ     )LN*F)0d	h`Z9  r&   r!  c                   &    \ rS rSrS rS rS rSrg)AlbertConverteri  c                     UR                    Vs/ s HP  n[        UR                  5      (       a  UR                  UR                  4OUR                  UR                  S-
  4PMR     sn$ s  snf Nd   r7  r}   rw   r8  r9  s      r$   r7   AlbertConverter.vocab  f     
% +=U[[*I*IU[[%++&PUP[P[]b]h]hkn]nOoo%
 	
 
   AA)c                    [         R                  " SS5      [         R                  " SS5      /nU R                  R                  (       dH  UR	                  [         R
                  " 5       5        UR	                  [         R                  " 5       5        U R                  R                  (       a$  UR	                  [         R                  " 5       5        UR                  R                  nU(       a%  UR	                  [         R                  " U5      5        UR	                  [         R                  " [        S5      S5      5        [         R                  " U5      $ Nz``"z''r_  r   r
   rd  r/   keep_accentsrF   NFKDStripAccentsr   	Lowercasera  rb  re  r   r  rZ   r*  list_normalizersrb  s       r$   r   AlbertConverter.normalizer      c*c*
 &&33##K$4$4$67##K$<$<$>?""00##K$9$9$;<$44II##K$;$;<P$QR 3 3E'NC HI##$455r&   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ Nr  r  r  r  r   r   r   r/   r   r   s    r$   r   AlbertConverter.post_processor  R    ,,)4$11GGPQ$11GGPQ
 	
r&   r5   Nre   rf   rg   rh   r7   r   r   rn   r5   r&   r$   ry  ry        
6&
r&   ry  c                        \ rS rSrS rS rSrg)BarthezConverteri  c                 
    SnU$ NrC  r5   rZ   r*  r<  s      r$   r<  BarthezConverter.unk_id      r&   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ Nz<s> $A </s>z<s> $A </s> </s> $B </s><s></s>r   r  r   s    r$   r   BarthezConverter.post_processor  R    ,, +//EEeLM00FFvNO
 	
r&   r5   N)re   rf   rg   rh   r<  r   rn   r5   r&   r$   r  r    s    
r&   r  c                   &    \ rS rSrS rS rS rSrg)CamembertConverteri  c                     / SQnX!R                   SS   Vs/ s H  o3R                  UR                  4PM     sn-  nUS/-  nU$ s  snf )N))z
<s>NOTUSED        <pad>r  )z</s>NOTUSEDr  z<unk>r  )z<unk>NOTUSEDir   z<mask>r  r6  rZ   r*  r7   rw   s       r$   r7   CamembertConverter.vocab  sR    
 	,,qr:JK:J;;,:JKK/"" L   !Ac                     gr  r5   r=  s     r$   r<  CamembertConverter.unk_id  s    r&   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ r  r  r   s    r$   r   !CamembertConverter.post_processor  r  r&   r5   Nre   rf   rg   rh   r7   r<  r   rn   r5   r&   r$   r  r    s    
r&   r  c                   &    \ rS rSrS rS rS rSrg)DebertaV2Converteri  c                    / nU R                   R                  (       a#  UR                  [        R                  " SS95        [        X R                   5      nUR                  [        R                  " XS95        [        R                  " U5      $ )Nr   )r   rk  )r/   split_by_punctrF   r   Punctuationr1   rm  r  )rZ   rl  r'   list_pretokenizersr0   s        r$   r    DebertaV2Converter.pre_tokenizer  sl    ""11%%n&@&@*&UV,-=?V?VW!!.":":{"rs&&'9::r&   c                    / nU R                   R                  (       a$  UR                  [        R                  " 5       5        UR                  [        R
                  " 5       5        UR                  R                  nU(       a%  UR                  [        R                  " U5      5        UR                  [        R                  " [        S5      S5      5        [        R                  " U5      $ )Nr_  r   )r/   r   rF   r
   r  rc  ra  rb  re  rd  r   r  r  s       r$   r   DebertaV2Converter.normalizer  s    ""00##K$9$9$;< 1 1 34$44II##K$;$;<P$QR 3 3E'NC HI##$455r&   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ r  r  r   s    r$   r   !DebertaV2Converter.post_processor  r  r&   r5   N)re   rf   rg   rh   r   r   r   rn   r5   r&   r$   r  r    s    ;6
r&   r  c                   &    \ rS rSrS rS rS rSrg)MBartConverteri  c                     / SQnX!R                   SS   Vs/ s H  o3R                  UR                  4PM     sn-  nU/ SQ-  nUS/-  nU$ s  snf )Nr  r  r  r  r  r  rC  )ar_ARr  cs_CZr  de_DEr  en_XXr  es_XXr  et_EEr  fi_FIr  fr_XXr  gu_INr  hi_INr  it_ITr  ja_XXr  kk_KZr  ko_KRr  lt_LTr  lv_LVr  my_MMr  ne_NPr  nl_XXr  ro_ROr  ru_RUr  si_LKr  tr_TRr  vi_VNr  zh_CNr  r  r6  r  s       r$   r7   MBartConverter.vocab  sc    
 	,,qr:JK:J;;,:JKK 
 	
6 	/""; L   !Ac                     gr  r5   r=  s     r$   r<  MBartConverter.unk_id>      r&   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ )Nz$A </s> en_XXz$A $B </s> en_XXr  r  r   r  r   s    r$   r   MBartConverter.post_processorA  R    ,,"#$11GGPQ00FFvNO
 	
r&   r5   Nr  r5   r&   r$   r  r    s    $L
r&   r  c                   &    \ rS rSrS rS rS rSrg)MBart50ConverteriL  c                     / SQnX!R                   SS   Vs/ s H  o3R                  UR                  4PM     sn-  nU/ SQ-  nUS/-  nU$ s  snf )Nr  rC  )4r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )af_ZAr  )az_AZr  )bn_INr  )fa_IRr  )he_ILr  )hr_HRr  )id_IDr  )ka_GEr  )km_KHr  )mk_MKr  )ml_INr  )mn_MNr  )mr_INr  )pl_PLr  )ps_AFr  )pt_XXr  )sv_SEr  )sw_KEr  )ta_INr  )te_INr  )th_THr  )tl_XXr  )uk_UAr  )ur_PKr  )xh_ZAr  )gl_ESr  )sl_SIr  r  r6  r  s       r$   r7   MBart50Converter.vocabM  sc    
 	,,qr:JK:J;;,:JKK  R  	R/"" Lr  c                     gr  r5   r=  s     r$   r<  MBart50Converter.unk_idY  r  r&   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ )Nzen_XX $A </s>zen_XX $A $B </s>r  r  r   r  r   s    r$   r   MBart50Converter.post_processor\  r  r&   r5   Nr  r5   r&   r$   r  r  L  s    

r&   r  c                   &    \ rS rSrS rS rS rSrg)NllbConverterig  c                     / SQnX!R                   SS   Vs/ s H  o3R                  UR                  4PM     sn-  nU$ s  snf )Nr  rC  r6  r  s       r$   r7   NllbConverter.vocabh  E    
 	,,qr:JK:J;;,:JKK L   !>c                     gr  r5   r=  s     r$   r<  NllbConverter.unk_idr  r  r&   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ )Nzeng_Latn $A </s>zeng_Latn $A $B </s>eng_Latnr  r   r  r   s    r$   r   NllbConverter.post_processoru  sR    ,,%&T44JJ:VW00FFvNO
 	
r&   r5   Nr  r5   r&   r$   r   r   g  s    
r&   r   c                   &    \ rS rSrS rS rS rSrg)SeamlessM4TConverteri  c                     / SQnX!R                   SS   Vs/ s H  o3R                  UR                  4PM     sn-  nU$ s  snf )N)r  r  r  r  rC  r6  r  s       r$   r7   SeamlessM4TConverter.vocab  r#  r$  c                 .    U R                   R                  $ rd   )r/   unk_token_idr=  s     r$   r<  SeamlessM4TConverter.unk_id  s    &&333r&   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ )Nz__eng__ $A </s>z__eng__ $A $B </s>__eng__r  r   r  r   s    r$   r   #SeamlessM4TConverter.post_processor  sR    ,,$%D33II)TU00FFvNO
 	
r&   r5   Nr  r5   r&   r$   r+  r+    s    4
r&   r+  c                   &    \ rS rSrS rS rS rSrg)XLMRobertaConverteri  c                     / SQnX!R                   SS   Vs/ s H  o3R                  UR                  4PM     sn-  nUS/-  nU$ s  snf )Nr  rC  r  r6  r  s       r$   r7   XLMRobertaConverter.vocab  sR    
 	,,qr:JK:J;;,:JKK/"" Lr  c                 
    SnU$ r  r5   r  s      r$   r<  XLMRobertaConverter.unk_id  r  r&   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ r  r  r   s    r$   r   "XLMRobertaConverter.post_processor  r  r&   r5   Nr  r5   r&   r$   r5  r5        	
r&   r5  c                   &    \ rS rSrS rS rS rSrg)XLNetConverteri  c                     UR                    Vs/ s HP  n[        UR                  5      (       a  UR                  UR                  4OUR                  UR                  S-
  4PMR     sn$ s  snf r{  r}  r9  s      r$   r7   XLNetConverter.vocab  r  r  c                    [         R                  " SS5      [         R                  " SS5      /nU R                  R                  (       dH  UR	                  [         R
                  " 5       5        UR	                  [         R                  " 5       5        U R                  R                  (       a$  UR	                  [         R                  " 5       5        UR                  R                  nU(       a%  UR	                  [         R                  " U5      5        UR	                  [         R                  " [        S5      S5      5        [         R                  " U5      $ r  r  r  s       r$   r   XLNetConverter.normalizer  r  r&   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ )Nz$A:0 <sep>:0 <cls>:2z!$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2z<sep>z<cls>r   r  r   s    r$   r   XLNetConverter.post_processor  r  r&   r5   Nr  r5   r&   r$   r>  r>    r  r&   r>  c                       \ rS rSrSrg)ReformerConverteri  r5   Nre   rf   rg   rh   rn   r5   r&   r$   rF  rF        r&   rF  c                        \ rS rSrS rS rSrg)RemBertConverteri  c                    [         R                  " SS5      [         R                  " SS5      [         R                  " [        S5      S5      /nU R                  R                  (       dH  UR                  [         R                  " 5       5        UR                  [         R                  " 5       5        U R                  R                  (       a$  UR                  [         R                  " 5       5        UR                  R                  nU(       a%  UR                  [         R                  " U5      5        [         R                  " U5      $ r  )r
   rd  r   r/   r  rF   r  r  r   r  ra  rb  re  r  r  s       r$   r   RemBertConverter.normalizer  s    c*c*g4

 &&33##K$4$4$67##K$<$<$>?""00##K$9$9$;<$44II##K$;$;<P$QR##$455r&   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ r  r  r   s    r$   r   RemBertConverter.post_processor  r  r&   r5   N)re   rf   rg   rh   r   r   rn   r5   r&   r$   rJ  rJ    s    6&
r&   rJ  c                       \ rS rSrSrg)BertGenerationConverteri  r5   NrG  r5   r&   r$   rP  rP    rH  r&   rP  c                   ,    \ rS rSrS rS rS rS rSrg)PegasusConverteri  c                    U R                   R                  S4U R                   R                  S4/nU R                   R                  b  X R                   R                  S4/-  nU R                   R                  bI  U R                   R
                  U R                   R                  :  a  X R                   R                  S4/-  nU[        SU R                   R                  5       Vs/ s H  nSU S3S4PM     sn-  nX!R                  SS   Vs/ s H  oDR                  UR                  4PM     sn-  nU$ s  snf s  snf )Nr  r=   z<unk_>g      Y)r/   	pad_token	eos_tokenmask_token_sent
mask_tokenmask_token_idoffsetrE   r7  rw   r8  )rZ   r*  r7   rU  rw   s        r$   r7   PegasusConverter.vocab  s)   $$..4$$..4

 ""22>..>>DEEE ##..:''558O8O8V8VV..993?@@E%4;R;R;Y;Y2Z[2ZQU1#Q<(2Z[[,,qr:JK:J;;,:JKK \Ks   &D6!D;c                 \    UR                   R                  U R                  R                  -   $ rd   )r+  r<  r/   rZ  r=  s     r$   r<  PegasusConverter.unk_id  s%    !!((4+B+B+I+IIIr&   c                     [        X R                  5      n[        R                  " [        R                  " 5       [        R
                  " XS9/5      $ rj  )r1   r/   r   r  WhitespaceSplitrm  rn  s       r$   r   PegasusConverter.pre_tokenizer  sE    ,-=?V?VW&&..0(([`
 	
r&   c                     U R                   R                  nXR                   R                  4/n[        R                  " SU/SSU/US9$ )N$A$Br   )r/   rV  eos_token_idr   r   )rZ   eosr   s      r$   r   PegasusConverter.post_processor!  sP    %%//))667
 ,,T3KtTSVFWhvwwr&   r5   N)	re   rf   rg   rh   r7   r<  r   r   rn   r5   r&   r$   rR  rR    s    &J
xr&   rR  c                        \ rS rSrS rS rSrg)T5Converteri)  c                     U R                   R                  nUR                   Vs/ s H  o3R                  UR                  4PM     nnU[        US-
  SS5       Vs/ s H  nSU S3S4PM     sn-  nU$ s  snf s  snf )Nr   ry   z
<extra_id_rT  r  )r/   
_extra_idsr7  rw   r8  rE   )rZ   r*  num_extra_idsrw   r7   rU  s         r$   r7   T5Converter.vocab*  s|    //::9>F++u{{+FE-!:KRQS4TU4TqZs!$c*4TUU GUs   !A4A9c                 n    [         R                  " SS// SQSU R                  R                  S5      4/S9$ Nrb  r  )rb  r  rc  r  r   r  r   s    r$   r   T5Converter.post_processor0  =    ,,&>-00FFvNO
 	
r&   r5   N)re   rf   rg   rh   r7   r   rn   r5   r&   r$   rh  rh  )  s    
r&   rh  c                       \ rS rSrS rSrg)UdopConverteri:  c                 n    [         R                  " SS// SQSU R                  R                  S5      4/S9$ rn  r  r   s    r$   r   UdopConverter.post_processor;  rp  r&   r5   Nre   rf   rg   rh   r   rn   r5   r&   r$   rr  rr  :  s    
r&   rr  c                   "    \ rS rSrS\4S jrSrg)WhisperConverteriE  r(   c                    U R                   R                  n[        U R                   R                  R	                  5       5      n[        [        UUS SSSS95      n[        R                  " U R                   R                  S9Ul
        [        R                  " 5       Ul        U R                   R                  nU R                   R                  U5      nU R                   R                  nU R                   R                   nSR#                  U Vs/ s H  o S3PM	     sn5      n	[$        R&                  " U	 SU S3U	 SU S	3Xg4/[)        XT5      QS
9Ul        U$ s  snf )Nr   Fr   r   r   r   z $A:0 z $A:0 $B:1 r   r   )r/   r   rm   r   r   r   r   r   r   r'   r   r	   r   prefix_tokensconvert_ids_to_tokensrV  rd  joinr   r   zipr   )
rZ   r7   rJ   r   prefix_token_idsprefixesre  rd  r[  prefix_templates
             r$   r   WhisperConverter.convertedF  sO   ''//d--77<<>?*,#%	
	 #1":":DLcLcLtLt"u	$..0	22@@**@@AQR%%//..;;((h#GhUgRLh#GH#-#@#@%&fSE4#$KuB7#X0$
	   $Hs   Er5   Nr   r5   r&   r$   rw  rw  E  s     9  r&   rw  c                       \ rS rSrS rSrg)BigBirdConverterii  c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ r  r  r   s    r$   r   BigBirdConverter.post_processorj  r  r&   r5   Nru  r5   r&   r$   r  r  i  s    
r&   r  c                   "    \ rS rSrS\4S jrSrg)CLIPConverteriu  r(   c                 j   U R                   R                  n[        U R                   R                  R	                  5       5      nU R                   R
                  n[        [        UUS SSS[        U5      S95      n[        R                  " [        R                  " 5       [        R                  " [        S5      S5      [        R                  " 5       /5      Ul        [         R                  " [         R"                  " [        S5      SS	S
9[         R$                  " SS9/5      Ul        [(        R$                  " 5       Ul        [,        R.                  " U R                   R0                  U R                   R2                  4U R                   R4                  U R                   R6                  4SSS9Ul        U$ )Nr   r   Fr7   rJ   r   r   r   r   r   z\s+r   z9's|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+removedTr   r   r
  )r/   r   rm   r   r   r   r   r   rj   r
   r  r  rd  r   r  r   r   r  r   r   r	   r   r   r  rV  rd  r   r   r   r   s        r$   r   CLIPConverter.convertedv  sk   ''//d--77<<>?++55	*,#)i.

	  +33__ 3 3E&M3 GI^I^I`a 
	 #1"9"9$$Z[&
 ((%@	#
	 %..0	 $.#?#?((22D4K4K4X4XY((22D4K4K4X4XY"	$
	  r&   r5   Nr   r5   r&   r$   r  r  u  s    '9 'r&   r  c                   "    \ rS rSrS\4S jrSrg)LayoutLMv2Converteri  r(   c           	      b   U R                   R                  n[        [        U[	        U R                   R
                  5      S95      nSnSnSn[        U R                   S5      (       a`  U R                   R                  R                  nU R                   R                  R                  nU R                   R                  R                  n[        R                  " SUUUS9Ul        [        R                  " 5       Ul        [	        U R                   R"                  5      n[	        U R                   R$                  5      nU R                   R&                  nU R                   R(                  n	[*        R,                  " U SU S3U SU SU S	3Xh4Xy4/S
9Ul        [0        R                  " SS9Ul        U$ )Nr   FTr   r   r   r   r   r   r   r   r   r   r   s
             r$   r   LayoutLMv2Converter.converted  s   ''--iT=T=T=^=^9_`a	!&4**,=>>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5##$
	  %..d;	r&   r5   Nr   r5   r&   r$   r  r    r   r&   r  c                   "    \ rS rSrS\4S jrSrg)BlenderbotConverteri  r(   c                    U R                   nUR                  n[        UR                  R	                  5       5      n[        [        UUS SSSS95      n[        R                  " UR                  S9Ul
        [        R                  " 5       Ul        [        R                  " SUR                   S3UR                  UR                   4/S9Ul        U$ )Nr   Fr   r   z$A:0 r   )r   r   )r/   r   rm   r   r   r   r   r   r   r'   r   r	   r   r   r   rV  rd  r   r  s        r$   r   BlenderbotConverter.converted  s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#@#@2<<.+r/$
	  r&   r5   Nr   r5   r&   r$   r  r    r  r&   r  c                   &    \ rS rSrS rS rS rSrg)XGLMConverteri  c                     / SQnX!R                   SS   Vs/ s H  o3R                  UR                  4PM     sn-  nU/ SQ-  nU$ s  snf )Nr  rC  ))z<madeupword0>r  )z<madeupword1>r  )z<madeupword2>r  )z<madeupword3>r  )z<madeupword4>r  )z<madeupword5>r  )z<madeupword6>r  r6  r  s       r$   r7   XGLMConverter.vocab  sV    
 	,,qr:JK:J;;,:JKK  z  	z Ls   !Ac                 
    SnU$ r  r5   r  s      r$   r<  XGLMConverter.unk_id  r  r&   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ )Nz</s> $Az</s> $A </s> </s> $Br  r  r   r  r   s    r$   r   XGLMConverter.post_processor  sR    ,,'//EEeLM00FFvNO
 	
r&   r5   Nr  r5   r&   r$   r  r    r<  r&   r  c                   D    \ rS rSrSr\rSS1r S rS r	S r
S rS	 rS
rg)GemmaConverteri  Tz<start_of_turn>z<end_of_turn>c                 0    [         R                  " SS5      $ Nr   r`  )r
   rd  r=  s     r$   r   GemmaConverter.normalizer  s    ""3..r&   c                    U R                   R                  S4U R                   R                  S4U R                   R                  S4/nX!R                  SS   Vs/ s H  o3R
                  UR                  4PM     sn-  n[        S U 5       5      (       d#  [        S [        U5       5       S 5      nUb  SX$'   U$ s  snf )Nr  rC  c              3   0   #    U  H  oS    S:H  v   M     g7f)r   rs   Nr5   ).0r6   s     r$   	<genexpr>'GemmaConverter.vocab.<locals>.<genexpr>  s     /AQ44<s   c              3   @   #    U  H  u  pUS    S:X  d  M  Uv   M     g7f)r   rt   Nr5   )r  rU  r6   s      r$   r  r    s!     "V1AQqTXEU111As   	)rs   r  )
r/   rU  rV  r   r7  rw   r8  anynextrO  )rZ   r*  r7   rw   override_indexs        r$   r7   GemmaConverter.vocab  s    $$..4$$..4$$..4

 	,,qr:JK:J;;,:JKK ////!"V51A"VX\]N)(3% Ls   !B;c                 0    [         R                  " SS5      $ )Nr   merged_with_previous)r   r  rZ   rl  r'   s      r$   r   GemmaConverter.pre_tokenizer"  s    ##C)?@@r&   c                 
    SnU$ r  r5   r  s      r$   r<  GemmaConverter.unk_id%  r  r&   c                     [         R                  " [         R                  " SS5      [         R                  " 5       [         R                  " 5       /5      $ )Nr`  r   )r	   r  rd  ByteFallbackFuser  s      r$   r   GemmaConverter.decoder)  s?        ,%%'
 	
r&   r5   N)re   rf   rg   rh   r,  rp   rN  r   r   r7   r   r<  r   rn   r5   r&   r$   r  r    s6    .L'9N/ A
r&   r  c                   <    \ rS rSrSrS rS rS rS rS r	S r
S	rg
)LlamaConverteri3  Tc                 *   U R                   R                  S5      S4U R                   R                  S5      S4U R                   R                  S5      S4/nX!R                  SS   Vs/ s H  o3R                  UR                  4PM     sn-  nU$ s  snf )Nr   r  r   r=   rC  )r/   rz  r7  rw   r8  r  s       r$   r7   LlamaConverter.vocab6  s    $$::1=sC$$::1=sC$$::1=sC

 	,,qr:JK:J;;,:JKK Ls   (!Bc                 
    SnU$ rF  r5   r  s      r$   r<  LlamaConverter.unk_id?  r  r&   c                     [         R                  " SS5      [         R                  " 5       [         R                  " 5       /nU(       a  U[         R                  " SSS9/-  n[         R
                  " U5      $ Nr`  r   r   )contentr^  r	   rd  r  r  rc  r  rZ   rl  r'   sequences       r$   r   LlamaConverter.decoderC  \    UC(!!#MMO

 !<==H  **r&   c                    [        U R                  SS5      (       ae  / n[        U R                  SS5      (       a  U[        R                  " SS9/-  nU[        R                  " SSS9/-  n[        R
                  " U5      $ g )Nr+   Tr'   r`  )prependr   )patternr  )r.   r/   r
   Prependrd  r  )rZ   r*  r  s      r$   r   LlamaConverter.normalizerM  sx    4**Hd;;Ht..0BDII[00?@@,,S%HIIH''11r&   c                     [        U R                  SS5      (       d*  [        X R                  5      n[        R                  " XSS9$ g )Nr+   TFrl  r0   split)r.   r/   r1   r   rm  rn  s       r$   r   LlamaConverter.pre_tokenizerV  s?    t..$??01ACZCZ[N!++joppr&   c                     g rd   r5   r   s    r$   r   LlamaConverter.post_processor\  s    r&   r5   N)re   rf   rg   rh   r,  r7   r<  r   r   r   r   rn   r5   r&   r$   r  r  3  s&    +r&   r  c                   "    \ rS rSrS\4S jrSrg)MarkupLMConverteria  r(   c                 z   U R                   nUR                  n[        UR                  R	                  5       5      n[        [        UUS SSSU R                   R                  S95      n[        R                  " UR                  S9Ul        [        R                  " 5       Ul        [        U R                   R                  5      n[        U R                   R                   5      nU R                   R"                  nU R                   R$                  n[&        R(                  " U SU 3U SU SU 3XW4Xh4/S9Ul        U$ )Nr   Fr  r   z $A z $B r   )r/   r   rm   r   r   r   r   r   r   r   r'   r   r	   r   rj   r   r   r   r   r   r   r   )	rZ   r  r7   rJ   r   r   r   r   r   s	            r$   r   MarkupLMConverter.convertedb  s(   $$

bll'')**,#%11;;

	 #1":":BL_L_"`	$..0	$))334$))334..;;..;;#-#@#@U$se$5SEcU+##$
	  r&   r5   Nr   r5   r&   r$   r  r  a  s    "9 "r&   r  c                   4    \ rS rSrSrS	S jrS rS rS rSr	g)
MoshiConverteri  TNc                    [        U S5        [        R                  X5        [        5       nUR	                  5       n[        US5       nUR                  UR                  5       5        S S S 5        XPl        g ! , (       d  f       N= fNr   r#  	r   r   r[   r%   r%  r&  r(  r)  r*  )rZ   r'  model_max_lengthkwargsr0  r1  r2  s          r$   r[   MoshiConverter.__init__  se    $
+4, $%	  "*d#qaffh' $
 $#    A77
Bc                     UR                   R                  n[        R                  " SS5      /nU(       d  [        R                  " U5      $ [        R                  " [        R
                  " U5      /U-   5      $ r  )ra  rb  r
   rd  r  re  rf  s       r$   r   MoshiConverter.normalizer  sg    $44IIU+
 $''55'')@)@AU)V(WZf(fggr&   c                     [         R                  " SS5      [         R                  " 5       [         R                  " 5       /nU(       a  U[         R                  " SSS9/-  n[         R
                  " U5      $ r  r  r  s       r$   r   MoshiConverter.decoder  r  r&   c                 0    Sn[         R                  " XSS9$ )Nr,   Fr  )r   rm  rn  s       r$   r   MoshiConverter.pre_tokenizer  s     ''Kfkllr&   rv  rd   )
re   rf   rg   rh   r,  r[   r   r   r   rn   r5   r&   r$   r  r    s    h+mr&   r  c                   L    \ rS rSrSrSS jrS rS rS rS r	S	 r
S
 rS rSrg)HeliumConverteri  TNc                    [        U S5        [        R                  X5        [        5       nUR	                  5       n[        US5       nUR                  UR                  5       5        S S S 5        X@l        g ! , (       d  f       N= fr  r  )rZ   r'  r  r0  r1  r2  s         r$   r[   HeliumConverter.__init__  sc    $
+4,#%	  "*d#qaffh' $
 $#r  c                 R   U R                  U5      n[        [        UU R                  U5      U R                  S95      n[        UR                  5       VVs/ s HR  u  pEUR                  S;   d  M  XER                  UR                  S:H  =(       d    UR                  U R                  ;   4PMT     nnnUR                  [        US S9 VVVs/ s H  u  pGn[        USUSS9PM     snnn5        UR                  [        S	SSS
9/5        UR                  SSS9  U$ s  snnf s  snnnf )Nr@  rB  rC  c                     U S   $ rF  r5   rG  s    r$   r8   +HeliumConverter.tokenizer.<locals>.<lambda>  rI  r&   r:   FT)rK  rL  single_word
rJ  r  )rU  pad_id)r7   r   r   r<  r,  rO  r7  rR  rw   r   rS  rG   r   enable_padding)	rZ   r*  rI   r   rX  rY  rZ  r[  rL  s	            r$   r   HeliumConverter.tokenizer  s    zz%({{5)"77
	 #5<<0
0vv IR!&&A+GD4G4G)GH0 	 

 	 +11A~*V*V&Bw 5UGQUV*V	
 	j%OPQ  71 =
s   D1?DD"c                     / nUR                    HB  nUR                  S:X  a  USUR                  4/-  nM'  X#R                  UR                  4/-  nMD     U$ )Nz<0x0A>r  r6  r  s       r$   r7   HeliumConverter.vocab  sV    \\E{{h&4-..;;455	 "
 r&   c                 
    SnU$ rF  r5   r  s      r$   r<  HeliumConverter.unk_id  r  r&   c                     [         R                  " SS5      [         R                  " 5       [         R                  " 5       /nU[         R                  " SSS9/-  n[         R
                  " U5      $ r  r  r  s       r$   r   HeliumConverter.decoder  sY    UC(!!#MMO

 	X^^Ca899  **r&   c                     [         R                  " [         R                  " S5      [         R                  " SS5      /5      $ r  )r
   r  r  rd  r=  s     r$   r   HeliumConverter.normalizer  s2    ##[%8%8%={?R?RSWY^?_$`aar&   c                 Z    [         R                  " [         R                  " SS5      /5      $ )Nr  
contiguous)r   r  r  r  s      r$   r   HeliumConverter.pre_tokenizer  s#    &&(<(<T<(P'QRRr&   c                 8    [         R                  " SS// SQS/S9$ )Nr  rb  )r  rb  r  rc  )r  r   r   )r   r   r   s    r$   r   HeliumConverter.post_processor  s/    ,, 
 	
r&   rv  rd   )re   rf   rg   rh   r,  r[   r   r7   r<  r   r   r   r   rn   r5   r&   r$   r  r    s2    
8+bS
r&   r  c                   (    \ rS rSrSrSS jrS rSrg)ParakeetConverteri	  TNc                    Xl         [        U S5        [        R                  X5        [	        5       nUR                  5       n[        US5       nUR                  UR                  5       5        S S S 5        X@l	        g ! , (       d  f       N= fr  )
r'  r   r   r[   r%   r%  r&  r(  r)  r*  )rZ   r'  r/  r0  r1  r2  s         r$   r[   ParakeetConverter.__init__  sh    $$
+4,#%	  "*d#qaffh' $
 $#s    A==
Bc                    U R                  U5      nU R                  U R                  5      R                  U5      u  p4[	        U5       VVVs0 s H
  u  nu  pgXe_M     nnnn[        [        UUUR                  R                  SU R                  S S95      n	[	        UR                  5       V
Vs/ s HR  u  pUR                  S;   d  M  XR                  UR                  S:H  =(       d    UR                  U R                  ;   4PMT     nn
nU	R                  [        US S9 V
VVs/ s H  u  pn[!        USUS9PM     snnn
5        U	$ s  snnnf s  snn
f s  snnn
f )	NTrA  rB  rC  c                     U S   $ rF  r5   rG  s    r$   r8   -ParakeetConverter.tokenizer.<locals>.<lambda>2  rI  r&   r:   FrJ  )r7   rN  r'  rb   rO  r   r   r+  rP  r,  r7  rR  rw   r   rS  rG   r   )rZ   r*  rI   rT  rJ   rU  rV  r8  rW  r   rX  rY  rZ  r[  rL  s                  r$   r   ParakeetConverter.tokenizer  sG   zz%(%%doo6>>|L	5>|5LM5L!1MTTW5L	M,,66"77	
	 #5<<0
0vv IR!&&A+GD4G4G)GH0 	 

 	 +11A~*V*V&Bw 5UGD*V	
 3 N
s   E/E	?E%E)r*  r'  rd   )re   rf   rg   rh   r,  r[   r   rn   r5   r&   r$   r  r  	  s    r&   r  c            	         [        [        [        S5      [        S5      S-   5      5      [        [        [        S5      [        S5      S-   5      5      -   [        [        [        S5      [        S5      S-   5      5      -   n U SS nS	n[        S
5       H4  nX0;  d  M
  U R                  U5        UR                  S
U-   5        US-  nM6     U Vs/ s H  n[	        U5      PM     nn[        [        X5      5      $ s  snf )a  
Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
characters the bpe code barfs on.

The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
tables between utf-8 bytes and unicode strings.
!~r      ¡   ¬   ®   ÿNr      )rm   rE   ordrF   chrrC   r|  )bscsnbs       r$   bytes_to_unicoder  :  s     	U3s8SX\*+d5TCIPQM3R.SSVZ[`adeiajloptluxyly[zV{{  
AB	A4[;IIaLIIdQhFA	 
 	"Q#a&"B	B 
s   C:c                   L    \ rS rSrSr    SS jrS\4S jrS rS\	4S	 jr
S
rg)TikTokenConverteriR  z
A general tiktoken converter.
Nc                     Xl         X l        X0l        [        U[        5      (       a  UR                  5       U l        g UU l        g rd   r'  r  r'   
isinstancerC   r   additional_special_tokensrZ   r'  r  r'   r  r  s         r$   r[   TikTokenConverter.__init__W  G     % 0 3T:: &**, 	& + 	&r&   tiktoken_urlc                 >  ^^  SSK Jn  U" U5      m[	        5       mU4S jn/ n0 nTR                  5        H  u  pgXuU" U5      '   [        U5      S:X  a  M   / n[        S[        U5      5       H8  n	US U	 XiS  pU
T;   d  M  UT;   d  M  X-   T;   d  M%  UR                  XU45        M:     [        UU4S jSS9nUR                  U5        M     [        US	 SS9nU Vs/ s H  o" US   5      U" US   5      4PM     nnXT4$ ! [         a    [        S5      ef = fs  snf )
Nr   )load_tiktoken_bpezY`tiktoken` is required to read a `tiktoken` file. Install it with `pip install tiktoken`.c           	         > SR                  U R                  S5       Vs/ s H  nT[        U5         PM     sn5      $ s  snf Nr   zlatin-1r{  decoder	  r  charbyte_encoders     r$   token_bytes_to_stringPTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_strings  s8    77@ST@SLT3@STUUT   ?r   c                 $   > TU S      TU S      4$ r4   r5   r6   r   s    r$   r8   CTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>  s    1Q4)AaD/0Rr&   FrA   c                     U S   $ Nr=   r5   r?   s    r$   r8   r(        Ar&   )tiktoken.loadr  rQ  
ValueErrorr  rD   r>   rE   rF   rG   rH   )rZ   r  r  r#  rJ   r7   r[  rankrM   rN   rO   rP   r@   r   r"  s                @@r$   extract_vocab_merges_from_model1TikTokenConverter.extract_vocab_merges_from_modelh  sC   	7 &l3	')	V $??,KE26'./5zQEq#e*-#(%=%-i'Gy,@gFW\eEeLL'D!9: . 5&R\abEMM%  - $6F\bc\bUX(Q02GA2OP\bc}5  	k 	2 ds   D DDc                     U R                  U R                  5      u  p[        [        XSS95      n[	        UR
                  S5      (       a  SUR
                  l        U$ NF)r   ignore_mergesTr/  r'  r   r   r   rU   r3  rZ   rI   rJ   r   s       r$   r   TikTokenConverter.tokenizer  M    #CCDOOTc,GH	9??O44,0IOO)r&   r(   c                    U R                  5       n[        R                  " [        R                  " [	        U R
                  5      SSS9[        R                  " U R                  SS9/5      Ul        [        R                  " 5       Ul
        UR                  U R                   Vs/ s H  n[        USSS9PM     sn5        [        R                  " SS9Ul        U$ s  snf )Nr   Fr   r  TrJ  r   )r   r   r  r  r   r  r   r'   r   r	   r   r   r  r   r   r   )rZ   r   r[  s      r$   r   TikTokenConverter.converted  s    NN$	"0"9"9$$U4<<%8:V[\(($:O:O[`a#
	 %..0	$$LPLjLjkLj5Z%>Ljk	
 $.#7#7U#K	  ls   %Cr'   r  r  r'  Nzs(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+FN)re   rf   rg   rh   ri   r[   rj   r/  r   r   r   rn   r5   r&   r$   r  r  R  s:      K"&
"C >9 r&   r  c                   H    \ rS rSr    S
S jrS\4S jrS rS\4S jr	S	r
g)MistralConverteri  Nc                     Xl         X l        X0l        [        U[        5      (       a  UR                  5       U l        g UU l        g rd   r  r  s         r$   r[   MistralConverter.__init__  r  r&   r  c                   ^^ SS K nSS Kn[        U R                  SSS9 nUR	                  U5      nS S S 5        WS   S   U l        US    Vs/ s H  n[        US   US	   S
9PM     snU l        US   m[        5       m[        U4S j5       n/ n0 n	[        U R                  5       H  u  pXUR                  '   M     T Vs/ s H  obR                  US   5      PM     snm[        T5      n[        [        TSS95       H  u  pXU" U5      '   [        U5      S:X  a  M   / n[!        S[        U5      5       H;  nUS U XS  nnUU;   d  M  UU;   d  M  UU-   U;   d  M'  UR#                  UUU45        M=     [%        UU4S jSS9nUR'                  U5        M     [%        US SS9nU Vs/ s H  nU" US   5      U" US   5      4PM     nnX4$ ! , (       d  f       GN= fs  snf s  snf s  snf )Nr   rzutf-8)encodingconfigr  r   	token_str
is_control)rL  r7   c           	         > SR                  U R                  S5       Vs/ s H  nT[        U5         PM     sn5      $ s  snf r  r  r   s     r$   r#  OMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string  s8    77@ST@SLT3@STUUTr%  token_bytesz(Converting tekken.json to tokenizer.json)descr   c                 T   > TR                  U S   5      TR                  U S   5      4$ r4   )rN   r'  s    r$   r8   BMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>  s'    11F	XYZ[X\H]0^r&   FrA   c                     U S   $ r*  r5   r?   s    r$   r8   rK    r+  r&   )base64jsonr&  r'  loadr  r   r  r  r   rO  r  	b64decodesetr   r>   rE   rF   rG   rH   )rZ   r  rM  rN  r2  untypedkr#  rJ   r7   idxr[  rank_setr.  rM   rN   rO   rP   r@   r   r"  s                      @@r$   r/  0MistralConverter.extract_vocab_merges_from_model  s   $//39QiilG :x(3IPQaIb*
IbAJq~q?Ib*
& G$	')		V 
	V #D$B$BCJC#&%--  DAJKA%%a&67K	y>$T):d%efKD26'./5zQEq#e*-#(%=%-h&7h+>GgDUZbCbLL'7D!9: . 5&^hmnEMM%  g $6F\bc\bUX(Q02GA2OP\bc}A :9*
 L ds   GG!G&* G+
Gc                     U R                  U R                  5      u  p[        [        XSS95      n[	        UR
                  S5      (       a  SUR
                  l        U$ r2  r4  r5  s       r$   r   MistralConverter.tokenizer  r7  r&   r(   c                    U R                  5       n[        R                  " [        R                  " [	        U R
                  5      SSS9[        R                  " U R                  SS9/5      Ul        [        R                  " 5       Ul
        UR                  U R                  5        [        R                  " SS9Ul        U$ )Nr   Fr   r  r   )r   r   r  r  r   r  r   r'   r   r	   r   rS  r  r   r   )rZ   r   s     r$   r   MistralConverter.converted  s    NN$	"0"9"9$$U4<<%8:V[\(($:O:O[`a#
	 %..0	T;;<#-#7#7U#K	 r&   r:  r;  )re   rf   rg   rh   r[   rj   r/  r   r   r   rn   r5   r&   r$   r=  r=    s6      K"&
"$C $L9 r&   r=  AlbertTokenizerBartTokenizerBarthezTokenizerBertTokenizerBigBirdTokenizerBlenderbotTokenizerCamembertTokenizerCLIPTokenizerCodeGenTokenizerConvBertTokenizerDebertaTokenizerDebertaV2TokenizerDistilBertTokenizerDPRReaderTokenizerDPRQuestionEncoderTokenizerDPRContextEncoderTokenizerElectraTokenizerFNetTokenizerFunnelTokenizerGPT2TokenizerHerbertTokenizerLayoutLMTokenizerLayoutLMv2TokenizerLayoutLMv3TokenizerLayoutXLMTokenizerLongformerTokenizerLEDTokenizerLxmertTokenizerMarkupLMTokenizerMBartTokenizerMBart50TokenizerMPNetTokenizerMobileBertTokenizerMvpTokenizerNllbTokenizerOpenAIGPTTokenizerPegasusTokenizerQwen2TokenizerRealmTokenizerReformerTokenizerRemBertTokenizerRetriBertTokenizerRobertaTokenizerRoFormerTokenizerSeamlessM4TTokenizerSqueezeBertTokenizerT5TokenizerUdopTokenizerWhisperTokenizerXLMRobertaTokenizerXLNetTokenizer)SplinterTokenizerXGLMTokenizerLlamaTokenizerCodeLlamaTokenizerGemmaTokenizerPhi3Tokenizerc                 <   U R                   R                  nU[        ;   a&  U(       d  [        U   nU" U 5      R                  5       $ U R                  R                  S5      (       a>  X l        [        R                  S5        [        U R                  5      R                  5       $  [        R                  S5        [        U R                  U R                  S9R                  5       $ ! [         a*    [        S[        [        R                  5       5       35      ef = f)a\  
Utilities to convert a slow tokenizer instance in a fast tokenizer instance.

Args:
    transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
        Instance of a slow tokenizer to convert in the backend tokenizer for
        [`~tokenization_utils_base.PreTrainedTokenizerFast`].
   from_tiktoken (bool, optional): Whether to use the `tiktoken` library to convert the tokenizer instead of sentencepiece.
        Defaults to False.

Return:
    A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
    [`~tokenization_utils_base.PreTrainedTokenizerFast`]
ztekken.jsonz#Converting from Mistral tekken.jsonzConverting from Tiktoken)r'  r  zConverting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: )r3  re   SLOW_TO_FAST_CONVERTERSr   r'  endswithr/   loggerinfor=  r  r  rQ  r-  rm   r   )transformer_tokenizerfrom_tiktokentokenizer_class_nameconverter_classs       r$   convert_slow_tokenizerr  -  s      1::CC66}12FG45??AA		)	)	2	2=	A	A3H09: 5 @ @AKKMM	KK23$0;;*?*Y*Y ik  	>>BCZC_C_Ca>b=ce 	s   &A C' '4D)r   )F)Uri   r-  	functoolsr   typingr   	packagingr   
tokenizersr   r   r   r	   r
   r   r   tokenizers.modelsr   r   r   r   utilsr   r   r   r   utils.import_utilsr   
get_loggerre   r  r%   boolrj   r1   rQ   rS   rp   r}   r   r   r   r   r   r   r   r   r   r  r  r  r!  ry  r  r  r  r  r  r   r+  r5  r>  rF  rJ  rP  rR  rh  rr  rw  r  r  r  r  r  r  r  r  r  r  r  r  r  r=  r  r  r5   r&   r$   <module>r     s       f f f 5 5  ` ` 5 
		H	%G"$ s & 2"8 "Ic Id I$ $$I $N/	 /d$i $N$Y $N 6%I %Py >+Y +\y :$	 $Ny >~9 ~B"
l "
J
| 
 
 
:
 
B2
\ 2
j
| 
6
L 
2
< 
2
, 
6"
\ "
J	 	
| 
@	l 	%x| %xP
, 
"
L 
!y !H	
| 	
(I (V$) $N) :
L 
61
\ 1
h+\ +\#	 #L&m\ &mRV
l V
r- -b0L L^L L^::%: (: ]	:
 (: .: ,: ]: : : (: ,: =: -: "=:  !-!:" #:$ _%:& ':( ]):* (+:, -:. =/:0 +1:2 -3:4 +5:6 $7:8 }9:: *;:< n=:> (?:@ nA:B =C:D $E:F ]G:H ,I:J (K:L nM:N mO:P *Q:R (S:T -U:V (W:X *Y:Z 0[:\ M]:^ ;_:` ]a:b (c:d .e:f ng:h +"$($#s: z$) $r&   