# modelscope/models/nlp/gpt_moe/tokenizer.py
from tokenizers import Tokenizer


class JiebaBPETokenizer:
    """SentencePiece BPE tokenizer with Jieba integration"""

    def __init__(self, tokenizer_json_file):
        self.name = 'Jieba BPE Tokenizer'

        self.tokenizer = Tokenizer.from_file(tokenizer_json_file)
        self.eod_id = self.tokenizer.token_to_id('<|endoftext|>')
        try:
            import jieba
        except ImportError:
            raise ImportError(
                'You need to install rjieba to use JiebaTokenizer. '
                'See https://pypi.org/project/rjieba/ for installation.')
        self.jieba = jieba
        self.new_line = self.vocab['\n']
        self.sep_token = self.vocab['<sep>']

    @property
    def vocab_size(self):
        return self.tokenizer.get_vocab_size(with_added_tokens=True)

    @property
    def vocab(self):
        return self.tokenizer.get_vocab(with_added_tokens=True)

    @property
    def inv_vocab(self):
        # Invert the token -> id mapping into id -> token.
        vocab = self.vocab
        inv_vocab = dict()
        for key, val in vocab.items():
            inv_vocab[val] = key
        return inv_vocab

    def tokenize(self, text, is_code=False):
        if not is_code:
            # Natural-language text is pre-segmented with jieba, then the
            # word list is BPE-encoded as a pre-tokenized sequence.
            seg_list = [x for x in self.jieba.cut(text)]
            return self.tokenizer.encode(
                seg_list, is_pretokenized=True, add_special_tokens=True).ids
        else:
            # Code is BPE-encoded directly, without jieba segmentation.
            return self.tokenizer.encode(
                text, is_pretokenized=False, add_special_tokens=True).ids

    def detokenize(self, token_ids):
        text = self.tokenizer.decode(token_ids, skip_special_tokens=False)
        return text

    @property
    def eod(self):
        return self.eod_id
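

if __name__ == '__main__':
    # Minimal usage sketch (an assumption for illustration, not part of the
    # library): expects the path to a tokenizer.json whose vocabulary contains
    # the '<|endoftext|>', '\n' and '<sep>' tokens the constructor looks up.
    import sys

    tok = JiebaBPETokenizer(sys.argv[1])
    print('vocab size:', tok.vocab_size)

    ids = tok.tokenize('今天天气很好')  # jieba pre-segmentation, then BPE
    code_ids = tok.tokenize('print("hello")', is_code=True)  # raw BPE, no jieba
    print('text ids:', ids)
    print('code ids:', code_ids)
    print('round trip:', tok.detokenize(ids))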