
    9i'                         S SK r S SKJrJrJrJr  S SKJr  S SKJ	r	  S SK
JrJr  S SKJr   " S S5      r " S	 S
\	5      rg)    N)DictListOptionalUnion)SentencePieceProcessor)PreTrainedTokenizer)BatchEncodingEncodedInput)PaddingStrategyc            
           \ rS rSrS\4S jrS\4S jr  SS\S\S\S\\	   4S	 jjr
S
\\	   S\4S jrS\\   S\4S jrS rS rSrg)SPTokenizer
   
model_pathc                    [         R                  R                  U5      (       d   U5       e[        US9U l        U R                  R                  5       U l        U R                  R                  5       U l        U R                  R                  5       U l        U R                  R                  5       U l
        U R                  R                  5       U R                  R                  5       :X  d   e/ SQn0 U l        0 U l        U HI  nU R                  U R                  U'   X0R                  U R                  '   U =R                  S-  sl        MK     g )N)
model_file)z[MASK][gMASK]z[sMASK]sopeop   )ospathisfiler   sp_model
vocab_sizen_wordsbos_ideos_idunk_idpad_idget_piece_sizespecial_tokensindex_special_tokens)selfr   r!   tokens       k/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/nlp/chatglm2/tokenization.py__init__SPTokenizer.__init__   s    ww~~j))5:5).*E !MM446==//1==//1==//1}}'')T]]-I-I-KKKKG $&!#E)-D&6;%%dll3LLAL $    sc                 8    U R                   R                  U5      $ N)r   EncodeAsPieces)r#   r)   s     r%   tokenizeSPTokenizer.tokenize    s    }}++A..r(   boseosreturnc                     [        U5      [        L d   eU R                  R                  U5      nU(       a  U R                  /U-   nU(       a  X@R
                  /-   nU$ r+   )typestrr   encoder   r   )r#   r)   r/   r0   ts        r%   r5   SPTokenizer.encode#   sQ     Aw#~~MM  #!A[[M!Ar(   r6   c                 8    U R                   R                  U5      $ r+   )r   decode)r#   r6   s     r%   r9   SPTokenizer.decode/   s    }}##A&&r(   tokensc                 <    U R                   R                  U5      nU$ r+   )r   DecodePieces)r#   r;   texts      r%   decode_tokensSPTokenizer.decode_tokens2   s    }}))&1r(   c                 t    XR                   ;   a  U R                   U   $ U R                  R                  U5      $ z1Converts a token (str) in an id using the vocab. )r!   r   	PieceToIdr#   r$   s     r%   convert_token_to_idSPTokenizer.convert_token_to_id6   s4    '''&&u--}}&&u--r(   c                     XR                   ;   d-  UU R                  U R                  U R                  4;   d  US:  a  gU R                  R                  U5      $ )=Converts an index (integer) in a token (str) using the vocab.r    )r"   r   r   r   r   	IdToPiecer#   indexs     r%   convert_id_to_tokenSPTokenizer.convert_id_to_token<   sL    ---T[[$++;
 2
QY}}&&u--r(   )r   r   r"   r   r   r   r!   N)FF)__name__
__module____qualname____firstlineno__r4   r&   r-   boolr   intr5   r9   r?   rE   rM   __static_attributes__ r(   r%   r   r   
   s    3 (/# /
 ! 


 
 &*#Y
'S	 'c 'DI # ..r(   r   c                     ^  \ rS rSrSS0r/ SQrS!U 4S jjrS r\S\	4S j5       r
\S	 5       r\S\	4S
 j5       r\S 5       r\S 5       rS rS rS rS rS\\	   S\	4S jrS"S jrS rS"S jr S"S\\   S\\\      S\\   4S jjrS\R8                  SS4S\\\	\4   \ 4   S\\   S\S\\   S\\!   S\"4S jjr#S r$U =r%$ )#ChatGLM2TokenizerE   
vocab_fileztokenizer.model)	input_idsattention_maskposition_idsc                    > SU l         Xl        [        U5      U l        U R                  R                  U R                  R
                  U R                  R                  S.U l        [        TU ]$  " SSU0UD6  g )NGLMTokenizer)z<bos><eos><pad>padding_siderV   )
namerZ   r   	tokenizerr   r   r   r!   superr&   )r#   rZ   rb   kwargs	__class__s       r%   r&   ChatGLM2Tokenizer.__init__J   sc    "	$$Z0^^**^^**^^**

 	=l=f=r(   c                     XR                   ;   a  U R                   U   $ XR                  R                   ;   d   U SU R                   35       eU R                  R                   U   $ )Nz is not a special token for )r!   rd   rc   rD   s     r%   get_commandChatGLM2Tokenizer.get_commandV   sc    '''&&u--555h%@\]a]f]f\g7hh5~~,,U33r(   r1   c                     g)Nz<unk>rV   r#   s    r%   	pad_tokenChatGLM2Tokenizer.pad_token\   s    r(   c                 $    U R                  S5      $ )Nra   rj   rm   s    r%   pad_token_idChatGLM2Tokenizer.pad_token_id`       ((r(   c                     g)Nz</s>rV   rm   s    r%   	eos_tokenChatGLM2Tokenizer.eos_tokend   s    r(   c                 $    U R                  S5      $ )Nr`   rq   rm   s    r%   eos_token_idChatGLM2Tokenizer.eos_token_idh   rt   r(   c                 .    U R                   R                  $ r+   )rd   r   rm   s    r%   r   ChatGLM2Tokenizer.vocab_sizel   s    ~~%%%r(   c                     [        U R                  5       Vs0 s H  nU R                  U5      U_M     nnUR                  U R                  5        U$ s  snf )zReturns vocab as a dict )ranger   _convert_id_to_tokenupdateadded_tokens_encoder)r#   ivocabs      r%   	get_vocabChatGLM2Tokenizer.get_vocabp   s[     4??+
+ %%a(!++ 	 
 	T../
s   Ac                 8    U R                   R                  U5      $ r+   )rd   r-   )r#   r>   rf   s      r%   	_tokenizeChatGLM2Tokenizer._tokenizey   s    ~~&&t,,r(   c                 8    U R                   R                  U5      $ rB   )rd   rE   rD   s     r%   _convert_token_to_id&ChatGLM2Tokenizer._convert_token_to_id|       ~~11%88r(   c                 8    U R                   R                  U5      $ )rH   )rd   rM   rK   s     r%   r   &ChatGLM2Tokenizer._convert_id_to_token   r   r(   r;   c                 8    U R                   R                  U5      $ r+   )rd   r?   )r#   r;   s     r%   convert_tokens_to_string*ChatGLM2Tokenizer.convert_tokens_to_string   s    ~~++F33r(   Nc                    [         R                  R                  U5      (       a.  [         R                  R                  UU R                  S   5      nOUn[        U R                  S5       nUR                  5       nSSS5        [        US5       nUR                  W5        SSS5        U4$ ! , (       d  f       N7= f! , (       d  f       U4$ = f)a9  
Save the vocabulary and special tokens file to a directory.

Args:
    save_directory (`str`):
        The directory in which to save the vocabulary.
    filename_prefix (`str`, *optional*):
        An optional prefix to add to the named of the saved files.

Returns:
    `Tuple(str)`: Paths to the files saved.
rZ   rbNwb)	r   r   isdirjoinvocab_files_namesopenrZ   readwrite)r#   save_directoryfilename_prefixrZ   fin	proto_strwriters          r%   save_vocabulary!ChatGLM2Tokenizer.save_vocabulary   s     77==((n&*&<&<\&JLJ (J$//4(C
I ) *d#vLL# $ ~ )( $# ~s   +B-B>-
B;>
Cc                 J    U R                  S5      U R                  S5      /nU$ )Nr   r   rq   )r#   prefix_tokenss     r%   get_prefix_tokens#ChatGLM2Tokenizer.get_prefix_tokens   s(    )))4d6F6Fu6MNr(   c                     Uc  / nSn[        U5       H   u  nu  pVUSR                  US-   XV5      -  nM"     USR                  [        U5      S-   U5      -  nU$ )NrI   u    [Round {}]

问：{}

答：{}

r   u   [Round {}]

问：{}

答：)	enumerateformatlen)r#   queryhistorypromptr   	old_queryresponses          r%   build_promptChatGLM2Tokenizer.build_prompt   sq    ?G(1'(:$A$	>EEAy, ,F ); 	4;;CL1<LeTTr(   token_ids_0token_ids_1c                 b    U R                  5       nX1-   nUb  X-   U R                  S5      /-   nU$ )a6  
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:

- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`

Args:
    token_ids_0 (`List[int]`):
        List of IDs to which the special tokens will be added.
    token_ids_1 (`List[int]`, *optional*):
        Optional second list of IDs for sequence pairs.

Returns:
    `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
r`   )r   rj   )r#   r   r   r   s       r%    build_inputs_with_special_tokens2ChatGLM2Tokenizer.build_inputs_with_special_tokens   sH    ( ..0#1"%3  )7 K r(   encoded_inputs
max_lengthpadding_strategypad_to_multiple_ofreturn_attention_maskc                 :   U R                   S:X  d   eXR                  S      n[        U5      nU[        R                  :X  a  [        U5      nUb  Ub  X$-  S:w  a
  X$-  S-   U-  nU[        R
                  :g  =(       a    [        U5      U:g  nSU;  a	  S/U-  US'   SU;  a  [        [        U5      5      US'   U(       aZ  U[        U5      -
  n	SU;   a  S/U	-  US   -   US'   SU;   a  S/U	-  US   -   US'   U R                  /U	-  U-   XR                  S   '   U$ )a  
Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

Args:
    encoded_inputs:
        Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
    max_length: maximum length of the returned list and optionally padding length (see below).
        Will truncate by taking into account the special tokens.
    padding_strategy: PaddingStrategy to use for padding.

        - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
        - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
        - PaddingStrategy.DO_NOT_PAD: Do not pad
        The tokenizer padding sides are defined in self.padding_side:

            - 'left': pads on the left of the sequences
            - 'right': pads on the right of the sequences
    pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
        This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
        `>= 7.5` (Volta).
    return_attention_mask:
        (optional) Set to False to avoid returning attention mask (default: set to model specifics)
leftr   r   r\   r]   )	rb   model_input_namesr   r   LONGEST
DO_NOT_PADlistr~   rr   )
r#   r   r   r   r   r   required_input
seq_lengthneeds_to_be_padded
differences
             r%   _padChatGLM2Tokenizer._pad   s   @   F***'(>(>q(AB(
666^,J!&8&D/141Q6:LMJ .1K1KK *PSQ)Q* >101sZ/?N+,/-1%
2C-DN>*#c.&99J>144!/0@!A4B/0 /22!/!?2@~. (()J6G 11  r(   )rc   r!   rd   rZ   )r   r+   )&rO   rP   rQ   rR   r   r   r&   rj   propertyr4   rn   rr   rv   ry   r   r   r   r   r   r   r   r   r   r   rT   r   r   r   r   r   r   r
   r	   rS   dictr   rU   __classcell__)rg   s   @r%   rX   rX   E   s   %'89G
>4 3   ) ) 3   ) ) & &-994tCy 4S 46 04c "$s), 9=S	> %),;,F,F,004Ed3#45}DEE SME *	E
 %SME  (~E 
E Er(   rX   )r   typingr   r   r   r   sentencepiecer   transformersr   $transformers.tokenization_utils_baser	   r
   transformers.utilsr   r   rX   rV   r(   r%   <module>r      s4    	 . . 0 , L .8. 8.vL+ Lr(   