
    9i                         S r SSKJrJrJrJrJrJr  SSKJ	r	  SSK
Jr  SSKJrJr  SSKJr  SSKJr  \" 5       rS\R(                  0rS0 0rS	S	S
.rSS0SS0S
.r " S S\5      rg)zTokenization classes for PoNet     )TYPE_CHECKINGAnyDictListOptionalUnion)PaddingStrategy)BertTokenizer)BatchEncodingEncodedInput)	ModelFile)
get_logger
vocab_filei   )z nlp_ponet_fill-mask_chinese-basez nlp_ponet_fill-mask_english-basedo_lower_caseTc                       \ rS rSrSr\r\r\	r
\rS\R                  SS4S\\\\4   \4   S\\   S\S\\   S\\   S	\4S
 jjrSrg)PoNetTokenizer0   am  
Construct an PoNet tokenizer. Based on BertTokenizer.

This tokenizer inherits from :class:`~transformers.BertTokenizer` which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.

Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
parameters.
Nencoded_inputs
max_lengthpadding_strategypad_to_multiple_ofreturn_attention_maskreturnc                 L   Uc  SU R                   ;   nXR                   S      nU[        R                  :X  a  [        U5      nUb  Ub  X$-  S:w  a
  X$-  S-   U-  nU[        R                  :g  =(       a    [        U5      U:g  nU(       Gaz  U[        U5      -
  nU R
                  S:X  a  U(       a  S/[        U5      -  S/U-  -   US'   SU;   a  US   U R                  /U-  -   US'   SU;   a  US   S/U-  -   US'   SU;   a  US   US   S   S-   /U-  -   US'   X`R                  /U-  -   XR                   S   '   U$ U R
                  S	:X  a  U(       a  S/U-  S/[        U5      -  -   US'   SU;   a  U R                  /U-  US   -   US'   SU;   a  US   S   S-   /U-  US   -   US'   SU;   a  S/U-  US   -   US'   U R                  /U-  U-   XR                   S   '   U$ [        S
[        U R
                  5      -   5      eU(       a  SU;  a  S/[        U5      -  US'   U$ )a  
Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

Args:
    encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or
    batch of tokenized inputs (`List[List[int]]`).
    max_length: maximum length of the returned list and optionally padding length (see below).
        Will truncate by taking into account the special tokens.
    padding_strategy: PaddingStrategy to use for padding.

        - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
        - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
        - PaddingStrategy.DO_NOT_PAD: Do not pad
        The tokenizer padding sides are defined in self.padding_side:

            - 'left': pads on the left of the sequences
            - 'right': pads on the right of the sequences
    pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
        This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
        >= 7.5 (Volta).
    return_attention_mask: (optional) Set to False to avoid returning
    attention mask (default: set to model specifics)
attention_maskr      righttoken_type_idsspecial_tokens_masksegment_idsleftzInvalid padding strategy:)
model_input_namesr	   LONGESTlen
DO_NOT_PADpadding_sidepad_token_type_idpad_token_id
ValueErrorstr)	selfr   r   r   r   r   required_inputneeds_to_be_padded
differences	            h/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/nlp/ponet/tokenization.py_padPoNetTokenizer._pad@   s2   @ !($48N8N$N!'(>(>q(AB666^,J!&8&D/141Q6:LMJ .1K1KK *PSQ)Q* #c.&99J  G+(89sS&>( 8(+,#
*:8;N#34#~5&'78112Z?@ ##34 )N:<J-=/23z1A=BN#89 N2)7)F*=9"=AJ&J' *' #%'
 ),=,=+>+KK 55  4 1 ""f,(89sZ7GKN+K, 8,N#34 $~5..8"8#%34D%E8FN#34 !N25CM5RSU5VYZ5Z4[^h4h4B=4Q5RN=1(N:="=#%34I%J=KN#89  ,,-
:^K 55    !!<#&t'8'8#9": ; ;"'7~'M01sS5H/HN+,     )__name__
__module____qualname____firstlineno____doc__VOCAB_FILES_NAMESvocab_files_namesPRETRAINED_VOCAB_FILES_MAPpretrained_vocab_files_map&PRETRAINED_POSITIONAL_EMBEDDINGS_SIZESmax_model_input_sizesPRETRAINED_INIT_CONFIGURATIONpretrained_init_configurationr	   r&   r   r   r+   r   r   r   intbooldictr1   __static_attributes__r4   r3   r0   r   r   0   s     *!;B$A!
 %),;,F,F,004\d3#45}DE\ SM\ *	\
 %SM\  (~\ 
\ \r3   r   N)r9   typingr   r   r   r   r   r   transformers.file_utilsr	   *transformers.models.bert.tokenization_bertr
   transformers.tokenization_utilsr   r   modelscope.utils.constantr   modelscope.utils.loggerr   logger
VOCAB_FILEr:   r<   r>   r@   r   r4   r3   r0   <module>rN      s     & B B 3 D G / .	!9#7#78 *B/  ),(+* & 	) 	)	! l] lr3   