
    i                     H    S SK r S SKrS SKrS SKJr  S SKJr   " S S5      rg)    N)List)SentencePieceProcessorc            	           \ rS rSrS\4S jrS\S\S\S\\   4S jr	S	\\   S\4S
 jr
S	\S\4S jrSS.S\S\SS4S jjrSrg)Llama2cTokenizer   
model_pathc                 ,   [         R                  R                  U5      (       d
   SU 35       e[        US9U l        Xl        U R                  R                  5       U l        U R                  R                  5       U l        U R                  R                  5       U l	        [        R                  " SU R                   SU R                   SU R                   35        U R                  R                  5       U R                  R                  5       :X  d   eg )Nz*Need a valid tokenizer model path but got )
model_filez#words: z - BOS ID: z - EOS ID: )ospathisfiler   sp_modelr   
vocab_sizen_wordsbos_ideos_idlogginginfoget_piece_size)selfr   s     Y/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/pytorch_tokenizers/llama2c.py__init__Llama2cTokenizer.__init__   s    ww~~j)) 	
8E	
) /*E$ !MM446==//1==//1t||nK}K}U	
 }}'')T]]-I-I-KKKK    sboseosreturnc                     [        U5      [        L d   eU R                  R                  U5      nU(       a  U R                  /U-   nU(       a  X@R
                  /-   nU$ N)typestrr   encoder   r   )r   r   r   r   ts        r   r#   Llama2cTokenizer.encode#   sO    Aw#~~MM  #!A[[M!Ar   r$   c                 8    U R                   R                  U5      $ r    r   decoder   r$   s     r   r(   Llama2cTokenizer.decode-       }}##A&&r   c                 8    U R                   R                  U5      $ r    r'   r)   s     r   decode_tokenLlama2cTokenizer.decode_token1   r+   r   F)prepend_paddingoutput_pathr/   Nc                   / / pCU(       a1  UR                  SR                  S5      5        UR                  S5        [        U R                  5       H  nU R                  R                  U5      nU R                  R                  U5      nXPR                  :X  a  SnOXPR                  :X  a  SnUR                  SS5      nUR                  S5      nUR                  U5        UR                  U5        M     U(       d  SO[        S	 U 5       5      n	[        US
5       n
U
R                  [        R                  " SU R                  U R                  U R                  U	5      5        [        X45       HF  u  pU
R                  [        R                  " SU[!        U5      5      5        U
R                  U5        MH     SSS5        ["        R$                  " SU 35        g! , (       d  f       N(= f)a   
Export tokenizer.model to another serialization format. Here we did some lightweight
processing such as supporting prepend padding token, prepend max token length and
replace '_' back to empty space.

The binary format is:
1. vocab size: int32
2. bos token id: int32
3. eos token id: int32
4. max token length: int32
5. score: float32, len of bytes: int32, token bytes: [byte] for each token

:param output_path: output path of the new binary.
:param prepend_padding: a boolean to control if we want to prepend a padding token.

:return: None
z<pad>zutf-8z<s>z</s>u   ▁ r   c              3   8   #    U  H  n[        U5      v   M     g 7fr    )len).0r$   s     r   	<genexpr>*Llama2cTokenizer.export.<locals>.<genexpr>b   s     3KFqCFFFs   wbIIIIfINzWrote tokenizer to )appendr#   ranger   r   id_to_piece	get_scorer   r   replacemaxopenwritestructpackzipr5   r   r   )r   r0   r/   tokensscoresir$   r   bmax_token_lengthfbytesscores                r   exportLlama2cTokenizer.export5   st   ( RMM'..12MM"t||$A ))!,A''*AKKkk!		%%A!AMM!MM! %$ %+13KF3K0K +t$GGDLL$++t{{DT
 !$F 3D%U<= !4 % 	*;-89 %$s   BG
G*)r   r   r   r   r   )__name__
__module____qualname____firstlineno__r"   r   boolr   intr#   r(   r-   rO   __static_attributes__ r   r   r   r      s    L3 L$ $ T d3i 'S	 'c ''c 'c ' CH ::# ::4 ::D :: ::r   r   )r   r   rD   typingr   sentencepiecer   r   rX   r   r   <module>r[      s"     	   J_: _:r   