
    9i"                         S r SSKJrJrJrJr  SSKrSSKrSSKrSSK	r	SSK
Jr  SSKJrJrJrJr  SSKrSSKrSSKJrJr  SSKJr  \" 5       rSS	0r " S
 S\5      rg)zTokenization classes for QWen.    )absolute_importdivisionprint_functionunicode_literalsN)open)ListOptionalTupleUnion)
AddedTokenPreTrainedTokenizer)
get_logger
vocab_fileqwen.tiktokenc                   D  ^  \ rS rSrSr \r         SU 4S jjrS rS r	S r
S\S\\   4S	 jrS
\S\\   4S jrS\\   S\4S jr\S 5       rS\S\4S jrS\S\4S jr\S\\   4S j5       r\S\\   4S j5       rS r SS\\\\   4   S\S\4S jjrSrU =r$ )QWenTokenizer   zQWen tokenizer.c           
        > [        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUn[        TU ]  UUUUUUU	S9  Xl        Ub  UO
[        S5      U l        X l        SnSnSnSnU
(       a5  UUUS	S
SSS4[        [        S5       Vs/ s H	  nSU S3PM     sn5      -   nOXU4nSnS[        SS4S jnU" U5      n[        U[        U5      S9 VVs0 s H	  u  nnUU_M     nnnUU l        [        R                  " UUUUS9n[        U5      [        U5      -   UR                   :X  d,   [        U5      [        U5      -    SUR                    S35       eUU l        U R"                  U l        U R$                  R'                  5        VVs0 s H	  u  nnUU_M     snnU l        UU l        U R*                  R,                  U l        UU   U l        UU   U l        g s  snf s  snnf s  snnf )NF)lstriprstrip)errors	unk_token	bos_token	eos_token	pad_tokenadd_prefix_spaceadd_bos_tokeng   mBQwen<|endoftext|>z<|im_start|>z
<|im_end|>z<R>z<S>z<X>z<mask>z<sep>   z<extra_>zn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+tiktoken_bpe_filereturnzdict[bytes, int]c                     [        U S5      R                  5       nS UR                  5        5        VVs0 s H%  u  p#[        R                  " U5      [        U5      _M'     snn$ s  snnf )Nrbc              3   R   #    U  H  o(       d  M  UR                  5       v   M     g 7fN)split).0lines     g/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/nlp/qwen/tokenization.py	<genexpr>DQWenTokenizer.__init__.<locals>.load_tiktoken_bpe.<locals>.<genexpr>c   s$      $O0E %1DJJLL0Es   
'')r   read
splitlinesbase64	b64decodeint)r"   contentstokenranks       r+   load_tiktoken_bpe1QWenTokenizer.__init__.<locals>.load_tiktoken_bpe_   sn    -t499;H$O080C0C0E$O$OKE   'T2$O  s   ,A%)start)pat_strmergeable_ranksspecial_tokensz != z in encoding)
isinstancestrr   super__init__r   r2   max_lenr   tuplerange	enumeratelenr;   tiktokenEncodingn_vocabr:   encoderitemsdecoder	tokenizer	eot_tokeneod_idim_start_id	im_end_id)selfr   r   r@   r   r   r   r   r   r   add_more_sp_tokenskwargsname	ENDOFTEXTIMSTARTIMENDir;   PAT_STRr6   r:   indexr4   enckv	__class__s                             r+   r?   QWenTokenizer.__init__!   s    BL3B  B Jyu=%. 	 BL3B  B Jyu=%. 	 BL3B  B Jyu=%. 	 BL3B  B Jyu=%. 	 	-' 	 	
 +")"5w3t9#	 	 uSz:z!1~z:;	<N (%8NF 		 	9K 	 ,J7 !*c/&:!<
!<u 5L!< 	 

 -+)	
  3~#66#++E	X/"S%889ckk],W	XE  /++)-););)=>)=A1)=>nn..)'2'.O ;"
$ ?s   (H:
3H?*Ic                 .    U R                   R                  $ r'   rK   rG   rP   s    r+   __len__QWenTokenizer.__len__   s    ~~%%%    c                     U R                   $ r'   )r:   ra   s    r+   	get_vocabQWenTokenizer.get_vocab   s    ###rd   c                    / n[        U[        5      (       a9  XR                  ;   a  U R                  U   $ U R                  R	                  U5      $ U H\  nX0R                  ;   a   UR                  U R                  U   5        M2  UR                  U R                  R	                  U5      5        M^     [        U5      U R                  :  a8  [        R                  SR                  [        U5      U R                  5      5        U$ )NzToken indices sequence length is longer than the specified maximum  sequence length for this model ({} > {}). Running this sequence through the model will result in indexing errors)r<   r=   r;   rH   getappendrD   r@   loggerwarningformat)rP   tokensidsr4   s       r+   convert_tokens_to_ids#QWenTokenizer.convert_tokens_to_ids   s    fc"",,,**622||''//E+++

4..u56

4<<++E23	 
 s8dll"NNM s3x.	0
 
rd   save_directoryr#   c                 l   [         R                  R                  US5      n[        USSS9 nU R                  R                  5        HM  u  pV[        R                  " U5      R                  S5      S-   [        U5      -   S-   nUR                  U5        MO     SSS5        U4$ ! , (       d  f       U4$ = f)z~
Save only the vocabulary of the tokenizer (vocabulary + added tokens).

Returns:
    `Tuple(str)`: Paths to the files saved.
r   wutf8)encoding 
N)ospathjoinr   r:   rI   r0   	b64encodedecoder=   write)rP   rr   rR   	file_pathrt   r[   r\   r*   s           r+   save_vocabularyQWenTokenizer.save_vocabulary   s     GGLLA	)S62a,,224''*11&9C?#a&H4O 5 3 }	 32 }s   A,B##
B3textc                     / n[         R                  " SU5      nU R                  R                  U5       H!  nUR	                  U R
                  U   5        M#     U$ )a  
Converts a string in a sequence of tokens, replacing unknown tokens with the `unk_token`.

Args:
    text (`str`):
        The sequence to be encoded.
    kwargs (additional keyword arguments, *optional*):
        Will be passed to the underlying model specific encode method. See details in
        [`~PreTrainedTokenizerBase.__call__`]

Returns:
    `List[str]`: The list of tokens.
NFC)unicodedata	normalizerK   encode_ordinaryrj   rJ   )rP   r   rR   rn   ts        r+   tokenizeQWenTokenizer.tokenize   sM     $$UD1//5AMM$,,q/* 6rd   rn   c                     SR                  U5      n[        U Vs/ s H  o0R                  U   PM     sn5      R                  SU R                  S9nU$ s  snf )z
Converts a sequence of tokens in a single string. The most simple way to do it is `" ".join(tokens)` but we
often want to remove sub-word tokenization artifacts at the same time.
 zutf-8)r   )r{   	bytearraybyte_decoderr}   r   )rP   rn   r   cs       r+   convert_tokens_to_string&QWenTokenizer.convert_tokens_to_string   sW    
 wwv=1++A.=>EEDKK F ) >s   Ac                 .    U R                   R                  $ r'   r`   ra   s    r+   
vocab_sizeQWenTokenizer.vocab_size   s    ~~%%%rd   rY   c                     XR                   R                  :  a  U R                  $ U R                   R                  U/5      $ r'   )rK   rG   r   r}   )rP   rY   s     r+   _convert_id_to_token"QWenTokenizer._convert_id_to_token   s3    NN***>>!~~$$eW--rd   r4   c                     U R                   R                  UR                  S5      U R                  R                  U R                  SS9S   5      $ )z*Converts a token to an id using the vocab.zUTF-8all)allowed_specialr   )rH   ri   encoderK   r   )rP   r4   s     r+   _convert_token_to_id"QWenTokenizer._convert_token_to_id   sE    ||LL!NN!!$..%!HK
 	
rd   c                 v    U R                   R                  5        Vs/ s H  n[        U5      PM     nnU$ s  snf )z
`List[str]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.

Convert tokens of `tokenizers.AddedToken` type to string.
)r;   keysr=   )rP   sall_tokss      r+   all_special_tokens QWenTokenizer.all_special_tokens   s8     %)$7$7$<$<$>?$>qCF$>? @s   6c                 b    U R                   R                  5        Vs/ s H  oPM     nnU$ s  snf )zi
`List[int]`: List the ids of the special tokens(`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
)r;   values)rP   r\   all_idss      r+   all_special_idsQWenTokenizer.all_special_ids   s3    
 #1188:;:1:; <s   ,c                     [         e)z
Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).

Do NOT take care of added tokens.
)NotImplementedError)rP   r   rR   s      r+   	_tokenizeQWenTokenizer._tokenize   s
     "!rd   	token_idsskip_special_tokensc                     [        U[        5      (       a  U/nU(       a#  U Vs/ s H  oDU R                  ;  d  M  UPM     nnU R                  R	                  U5      $ s  snf r'   )r<   r2   r   rK   r}   )rP   r   r   rR   rW   s        r+   _decodeQWenTokenizer._decode   sU     i%%"I$-OIq$:N:N1NIIO~~$$Y// Ps
   AA)r   rJ   rH   rM   r   rO   rN   r@   r:   r;   rK   )	replaceNr   r   r   NFFT)F)__name__
__module____qualname____firstlineno____doc__VOCAB_FILES_NAMESvocab_files_namesr?   rb   rf   rp   r=   r
   r   r   r   r   propertyr   r2   r   r   r   r   r   r   boolr   __static_attributes____classcell__)r]   s   @r+   r   r      s<   X)
 !!!]/~&$*c c
 S tCy (tCy S  & &.# .# .

# 
# 
 DI   c  " %*
0d3i(
0 "
0
 

0 
0rd   r   )r   
__future__r   r   r   r   r0   loggingry   r   ior   typingr   r	   r
   r   jsonrE   transformersr   r   modelscope.utils.loggerr   rk   r   r    rd   r+   <module>r      sV   
 %* *   	   / /   8 .	!?3 a0' a0rd   