
from typing import List

from tokenizers import Tokenizer


class JiebaBPETokenizer:
    """SentencePiece BPE tokenizer with Jieba integration"""

    def __init__(self, tokenizer_json_file):
        self.name = 'Jieba BPE Tokenizer'

        # Load the Hugging Face `tokenizers` BPE model from its JSON file.
        self.tokenizer = Tokenizer.from_file(tokenizer_json_file)
        self.eod_id = self.tokenizer.token_to_id('<|endoftext|>')
        try:
            import jieba
            import logging
            jieba.setLogLevel(logging.INFO)
        except ImportError:
            raise ImportError(
                'You need to install jieba to use JiebaTokenizer. '
                'See https://pypi.org/project/jieba/ for installation.')
        self.jieba = jieba
        self.new_line = self.vocab['\n']
        self.sep_token = self.vocab['<sep>']

    @property
    def vocab_size(self):
        return self.tokenizer.get_vocab_size(with_added_tokens=True)

    @property
    def vocab(self):
        return self.tokenizer.get_vocab(with_added_tokens=True)

    @property
    def inv_vocab(self):
        # Inverse mapping: token id -> token string.
        vocab = self.vocab
        inv_vocab = dict()
        for key, val in vocab.items():
            inv_vocab[val] = key
        return inv_vocab

    def tokenize(self, text: str, is_code: bool = False) -> List[int]:
        """Encode text into token ids.

        Natural-language text is first segmented with Jieba and encoded as a
        pre-tokenized sequence; code is passed to the BPE tokenizer directly.
        """
        if not is_code:
            seg_list = [x for x in self.jieba.cut(text)]
            return self.tokenizer.encode(
                seg_list, is_pretokenized=True,
                add_special_tokens=True).ids
        return self.tokenizer.encode(
            text, is_pretokenized=False, add_special_tokens=True).ids

    def detokenize(self, token_ids: List[int], early_stop: bool = True) -> str:
        # With early_stop, drop everything from the first <|endoftext|> onward.
        if early_stop and self.eod_id in token_ids:
            token_ids = token_ids[:token_ids.index(self.eod_id)]
        text = self.tokenizer.decode(token_ids, skip_special_tokens=True)
        return text

    @property
    def eod(self):
        return self.eod_id
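# Minimal usage sketch: shows the Jieba-then-BPE encode path and the
# early-stopping decode path. The tokenizer.json path and sample strings
# below are hypothetical, not shipped with this module.
#
#   tokenizer = JiebaBPETokenizer('tokenizer.json')
#   ids = tokenizer.tokenize('今天天气不错')                  # Jieba segmentation + BPE
#   code_ids = tokenizer.tokenize('print(1)', is_code=True)   # BPE only, no Jieba
#   text = tokenizer.detokenize(ids)                           # truncates at <|endoftext|>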