"""Tokenization classes for ChatGLM."""
import os
from typing import Dict, List, Optional, Union

import numpy as np
import sentencepiece as spm
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.tokenization_utils_base import BatchEncoding, EncodedInput
from transformers.utils import PaddingStrategy

from modelscope.utils import logger as logging

logger = logging.get_logger()

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {'THUDM/chatglm-6b': 2048}


class TextTokenizer:

    def __init__(self, model_path):
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(model_path)
        self.num_tokens = self.sp.vocab_size()

    def encode(self, text):
        return self.sp.EncodeAsIds(text)

    def decode(self, ids: List[int]):
        return self.sp.DecodeIds(ids)

    def tokenize(self, text):
        return self.sp.EncodeAsPieces(text)

    def convert_tokens_to_ids(self, tokens):
        return [self.sp.PieceToId(token) for token in tokens]

    def convert_token_to_id(self, token):
        return self.sp.PieceToId(token)

    def convert_id_to_token(self, idx):
        return self.sp.IdToPiece(idx)

    def __len__(self):
        return self.num_tokens
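
# Editorial note, not in the original file: TextTokenizer is a thin wrapper
# around a raw sentencepiece model. SPTokenizer below shifts every
# sentencepiece id up by `num_image_tokens`, reserving the low id range for
# `<image_...>` placeholder tokens.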


class SPTokenizer:

    def __init__(self,
                 vocab_file,
                 num_image_tokens=20000,
                 max_blank_length=80,
                 byte_fallback=True):
        assert vocab_file is not None
        self.vocab_file = vocab_file
        self.num_image_tokens = num_image_tokens
        self.special_tokens = [
            '[MASK]', '[gMASK]', '[sMASK]', '<unused_0>', '<sop>', '<eop>',
            '<ENC>', '<dBLOCK>'
        ]
        self.max_blank_length = max_blank_length
        self.byte_fallback = byte_fallback
        self.text_tokenizer = TextTokenizer(vocab_file)

    def _get_text_tokenizer(self):
        return self.text_tokenizer

    @staticmethod
    def get_blank_token(length: int):
        assert length >= 2
        return f'<|blank_{length}|>'

    @staticmethod
    def get_tab_token():
        return '<|tab|>'

    @property
    def num_text_tokens(self):
        return self.text_tokenizer.num_tokens

    @property
    def num_tokens(self):
        return self.num_image_tokens + self.num_text_tokens

    @staticmethod
    def _encode_whitespaces(text: str, max_len: int = 80):
        text = text.replace('\t', SPTokenizer.get_tab_token())
        for i in range(max_len, 1, -1):
            text = text.replace(' ' * i, SPTokenizer.get_blank_token(i))
        return text

    def _preprocess(self, text: str, linebreak=True, whitespaces=True):
        if linebreak:
            text = text.replace('\n', '<n>')
        if whitespaces:
            text = self._encode_whitespaces(
                text, max_len=self.max_blank_length)
        return text

    def encode(self,
               text: str,
               linebreak=True,
               whitespaces=True,
               add_dummy_prefix=True) -> List[int]:
        """
        @param text: Text to encode.
        @param linebreak: Whether to encode newline (\n) in text.
        @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
        @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
        """
        text = self._preprocess(text, linebreak, whitespaces)
        if not add_dummy_prefix:
            text = '<n>' + text
        tmp = self._get_text_tokenizer().encode(text)
        tokens = [x + self.num_image_tokens for x in tmp]
        return tokens if add_dummy_prefix else tokens[2:]

    def decode(self, text_ids: List[int]) -> str:
        ids = [int(_id) - self.num_image_tokens for _id in text_ids]
        ids = [_id for _id in ids if _id >= 0]
        text = self._get_text_tokenizer().decode(ids)
        text = text.replace('<n>', '\n')
        text = text.replace(SPTokenizer.get_tab_token(), '\t')
        for i in range(2, self.max_blank_length + 1):
            text = text.replace(self.get_blank_token(i), ' ' * i)
        return text

    def tokenize(self,
                 text: str,
                 linebreak=True,
                 whitespaces=True,
                 add_dummy_prefix=True) -> List[str]:
        """
        @param text: Text to tokenize.
        @param linebreak: Whether to encode newline (\n) in text.
        @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
        @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
        """
        text = self._preprocess(text, linebreak, whitespaces)
        if not add_dummy_prefix:
            text = '<n>' + text
        tokens = self._get_text_tokenizer().tokenize(text)
        return tokens if add_dummy_prefix else tokens[2:]

    def __getitem__(self, x: Union[int, str]):
        if isinstance(x, int):
            if x < self.num_image_tokens:
                return '<image_{}>'.format(x)
            else:
                return self.text_tokenizer.convert_id_to_token(
                    x - self.num_image_tokens)
        elif isinstance(x, str):
            if x.startswith('<image_') and x.endswith(
                    '>') and x[7:-1].isdigit():
                return int(x[7:-1])
            else:
                return self.text_tokenizer.convert_token_to_id(
                    x) + self.num_image_tokens
        else:
            raise ValueError('The key should be str or int.')
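
# Editorial sketch, not part of the original module: how SPTokenizer's text
# preprocessing and id mapping behave (the vocab path is hypothetical):
#
#     sp = SPTokenizer('ice_text.model')
#     sp._preprocess('a\tb\n  c')
#     # -> 'a<|tab|>b<n><|blank_2|>c': tabs, newlines and runs of 2..80
#     #    spaces are mapped to dedicated tokens before sentencepiece runs.
#     sp[0]            # -> '<image_0>'  (ids below num_image_tokens)
#     sp['<image_7>']  # -> 7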
 8$# % % %   . . < < #      $	:: *.c	:&tCy S   !"&	:: ,09	:$>U38_ >r   rI   c                     ^  \ rS rSrSrSS0r\r/ SQr         S  S!U 4S jjjr	\
S\\   4S	 j5       r\
S\\   4S
 j5       r\
S 5       rS rS rS r  S"S\\\\   4   S\S\S\4S jjrS rS rS#S jr S#S\\   S\\\      S\\   4S jjrS\R:                  SS4S\\\\4   \ 4   S\\   S\S\\   S\\   S\!4S jjr"Sr#U =r$$ )$ChatGLMTokenizer   a  
    """
    Construct a ChatGLM tokenizer. Based on byte-level Byte-Pair-Encoding.

    Args:
        vocab_file: Path to the vocabulary file.
        do_lower_case: Whether to lower-case the input.
        remove_space: Whether to remove redundant spaces.
        bos_token: The bos token.
        eos_token: The eos token.
        end_token: The end token.
        mask_token: The mask token.
        gmask_token: The gmask token.
        padding_side: The padding side.
        num_image_tokens: The `num_image_tokens` in `SPTokenizer`.
    """

    vocab_files_names = {'vocab_file': 'ice_text.model'}
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ['input_ids', 'attention_mask', 'position_ids']

    def __init__(self,
                 vocab_file,
                 do_lower_case=False,
                 remove_space=False,
                 bos_token='<sop>',
                 eos_token='<eop>',
                 end_token='</s>',
                 mask_token='[MASK]',
                 gmask_token='[gMASK]',
                 padding_side='left',
                 num_image_tokens=20000,
                 **kwargs) -> None:
        self.sp_tokenizer = SPTokenizer(
            vocab_file, num_image_tokens=num_image_tokens)
        super().__init__(
            do_lower_case=do_lower_case,
            remove_space=remove_space,
            padding_side=padding_side,
            bos_token=bos_token,
            eos_token=eos_token,
            end_token=end_token,
            mask_token=mask_token,
            gmask_token=gmask_token,
            num_image_tokens=num_image_tokens,
            **kwargs)

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.vocab_file = vocab_file

        self.bos_token = bos_token
        self.eos_token = eos_token
        self.end_token = end_token
        self.mask_token = mask_token
        self.gmask_token = gmask_token

    @property
    def gmask_token_id(self) -> Optional[int]:
        if self.gmask_token is None:
            return None
        return self.convert_tokens_to_ids(self.gmask_token)
    @property
    def end_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the end of context token in the vocabulary.
        Returns `None` if the token has not been set.
        """
        if self.end_token is None:
            return None
        return self.convert_tokens_to_ids(self.end_token)

    @property
    def vocab_size(self):
        """Returns vocab size."""
        return self.sp_tokenizer.num_tokens

    def get_vocab(self):
        """Returns vocab as a dict."""
        vocab = {
            self._convert_id_to_token(i): i
            for i in range(self.vocab_size)
        }
        vocab.update(self.added_tokens_encoder)
        return vocab

    def preprocess_text(self, inputs):
        if self.remove_space:
            outputs = ' '.join(inputs.strip().split())
        else:
            outputs = inputs
        if self.do_lower_case:
            outputs = outputs.lower()
        return outputs

    def _tokenize(self, text, **kwargs):
        """Returns a tokenized string."""
        text = self.preprocess_text(text)
        seq = self.sp_tokenizer.tokenize(text)
        return seq

    def _decode(self,
                token_ids: Union[int, List[int]],
                skip_special_tokens: bool = False,
                clean_up_tokenization_spaces: bool = True,
                **kwargs) -> str:
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        if len(token_ids) == 0:
            return ''
        if self.pad_token_id in token_ids:  # remove pad
            token_ids = list(filter(self.pad_token_id.__ne__, token_ids))
        return self.sp_tokenizer.decode(token_ids)

    def _convert_token_to_id(self, token):
        """Converts a token (str) into an id using the vocab."""
        return self.sp_tokenizer[token]

    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocab."""
        return self.sp_tokenizer[index]
    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the names of the saved files.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(save_directory,
                                      self.vocab_files_names['vocab_file'])
        else:
            vocab_file = save_directory

        with open(self.vocab_file, 'rb') as fin:
            proto_str = fin.read()

        with open(vocab_file, 'wb') as writer:
            writer.write(proto_str)

        return (vocab_file, )
    def build_inputs_with_special_tokens(
            self,
            token_ids_0: List[int],
            token_ids_1: Optional[List[int]] = None) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for
        sequence classification tasks by concatenating and adding special
        tokens. A ChatGLM sequence has the following format:

        - single sequence: `X [gMASK] <sop>`
        - pair of sequences: `A [gMASK] <sop> B <eop>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        mask_ids = self.sp_tokenizer[self.mask_token]
        gmask_ids = self.sp_tokenizer[self.gmask_token]
        eos_id = self.sp_tokenizer[self.eos_token]
        if mask_ids not in token_ids_0 and gmask_ids not in token_ids_0:
            token_ids_0 += [gmask_ids]

        if token_ids_0[-1] != mask_ids and token_ids_0[-1] != gmask_ids:
            token_ids_0 += [self.sp_tokenizer[self.end_token]]

        token_ids_0 += [self.sp_tokenizer[self.bos_token]]

        if token_ids_1 is not None:
            if not token_ids_1 or token_ids_1[-1] != eos_id:
                token_ids_1 += [eos_id]
            token_ids_0 += token_ids_1

        return token_ids_0
    def _pad(self,
             encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
             max_length: Optional[int] = None,
             padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
             pad_to_multiple_of: Optional[int] = None,
             return_attention_mask: Optional[bool] = None) -> dict:
        """
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch).

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:

                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta).
            return_attention_mask:
                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        """
        # Load from model defaults
        bos_token_id = self.sp_tokenizer[self.bos_token]
        mask_token_id = self.sp_tokenizer[self.mask_token]
        gmask_token_id = self.sp_tokenizer[self.gmask_token]
        assert self.padding_side == 'left'

        required_input = encoded_inputs[self.model_input_names[0]]
        seq_length = len(required_input)

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        if max_length is not None and pad_to_multiple_of is not None and (
                max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of) + 1) \
                * pad_to_multiple_of

        needs_to_be_padded = (padding_strategy != PaddingStrategy.DO_NOT_PAD
                              and len(required_input) != max_length)

        # Initialize attention mask and 2D position ids if not present.
        if max_length is not None:
            if 'attention_mask' not in encoded_inputs:
                if bos_token_id in required_input:
                    context_length = required_input.index(bos_token_id)
                else:
                    context_length = seq_length
                attention_mask = np.ones((1, seq_length, seq_length))
                attention_mask = np.tril(attention_mask)
                attention_mask[:, :, :context_length] = 1
                attention_mask = np.bool_(attention_mask < 0.5)
                encoded_inputs['attention_mask'] = attention_mask

            if 'position_ids' not in encoded_inputs:
                if bos_token_id in required_input:
                    context_length = required_input.index(bos_token_id)
                else:
                    context_length = seq_length
                position_ids = np.arange(seq_length, dtype=np.int64)
                mask_token = (mask_token_id if mask_token_id in required_input
                              else gmask_token_id)
                if mask_token in required_input:
                    mask_position = required_input.index(mask_token)
                    position_ids[context_length:] = mask_position
                block_position_ids = np.concatenate([
                    np.zeros(context_length, dtype=np.int64),
                    np.arange(
                        1, seq_length - context_length + 1, dtype=np.int64)
                ])
                encoded_inputs['position_ids'] = np.stack(
                    [position_ids, block_position_ids], axis=0)

        if needs_to_be_padded:
            difference = max_length - len(required_input)

            if 'attention_mask' in encoded_inputs:
                encoded_inputs['attention_mask'] = np.pad(
                    encoded_inputs['attention_mask'],
                    pad_width=[(0, 0), (difference, 0), (difference, 0)],
                    mode='constant',
                    constant_values=True)
            if 'token_type_ids' in encoded_inputs:
                encoded_inputs['token_type_ids'] = (
                    [self.pad_token_type_id] * difference
                    + encoded_inputs['token_type_ids'])
            if 'special_tokens_mask' in encoded_inputs:
                encoded_inputs['special_tokens_mask'] = (
                    [1] * difference + encoded_inputs['special_tokens_mask'])
            if 'position_ids' in encoded_inputs:
                encoded_inputs['position_ids'] = np.pad(
                    encoded_inputs['position_ids'],
                    pad_width=[(0, 0), (difference, 0)])
            encoded_inputs[self.model_input_names[0]] = (
                [self.pad_token_id] * difference + required_input)

        return encoded_inputs