
    i                         S SK r S SKJr  S SKJr  S SKJrJrJrJ	r	J
r
JrJrJrJr  S SKrS SKJr  SSKJrJr  \" \5      rSrS	rSq " S
 S5      rg)    N)	getLogger)Path)	AbstractSetcast
CollectionIteratorListLiteralOptionalSequenceUnion)load_tiktoken_bpe   )CL100K_PAT_STRLLAMA_SPECIAL_TOKENSi ia  c                      \ rS rSrSr\S 5       r\\4S\	S\	S\
\	   4S jjrSS	S
.S\	S\S\S\\\S   \\	   4      S\\S   \\	   4   S\
\   4S jjrSS\\   S\S\	4S jjrS\S\	4S jr\S\	S\S\\	   4S j5       rSrg)TiktokenTokenizer,   z
Tokenizing and encoding/decoding text using the Tiktoken tokenizer.
WARNING: The regex and special tokens are hardcoded from Llama 3+.
c                     [         cJ  [        [        R                  R	                  [        R                  R                  [        5      S5      5      q [         $ )Nztokenizer.model)	_INSTANCEr   ospathjoindirname__file__)clss    Z/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/pytorch_tokenizers/tiktoken.pyget_instanceTiktokenTokenizer.get_instance2   s<     )RWW__X68IJI     
model_pathpat_strspecial_tokensc                    [         R                  R                  U5      (       d   U5       e[        U5      n[	        U5      n[        U5       VVs0 s H
  u  pgXuU-   _M     snnU l        [        R                  " [        U5      R                  UUU R                  S9U l        U[	        U5      -   U l        U R                  S   U l        U R                  S   U l        gs  snnf )zt
Initializes the Tokenizer with a Tiktoken model.

Args:
    model_path (str): The path to the Tiktoken model file.
)namer"   mergeable_ranksr#   z<|begin_of_text|>z<|end_of_text|>N)r   r   isfiler   len	enumerater#   tiktokenEncodingr   r%   modeln_wordsbos_ideos_id)selfr!   r"   r#   r&   num_base_tokensitokens           r   __init__TiktokenTokenizer.__init__<   s     ww~~j))5:5)+J7o. 8A7P
7P81EQ&&7P
 &&j!&&+..	

 ,c..AA../BC../@A
s   CN allowed_specialdisallowed_specialsboseosr8   allr9   returnc          	        ^ ^ Uc
  [        5       n[        T5      [        L d   eUU 4S j[        S[	        T5      [
        5       5       n/ nU H-  nUR                  T R                  R                  UUUS95        M/     U(       a  UR                  ST R                  5        U(       a  UR                  T R                  5        U$ )a0  
Encodes a string into a list of token IDs.

Args:
    s (str): The input string to be encoded.
    bos (bool): Whether to prepend the beginning-of-sequence token.
    eos (bool): Whether to append the end-of-sequence token.
    allowed_special ("all"|set[str]): allowed special tokens in string
    disallowed_special ("all"|set[str]): special tokens that raise an error when in string

Returns:
    list[int]: A list of token IDs.

By default, setting disallowed_special=() encodes a string by ignoring
special tokens. Specifically:
- Setting `disallowed_special` to () will cause all text corresponding
  to special tokens to be encoded as natural text (insteading of raising
  an error).
- Setting `allowed_special` to "all" will treat all text corresponding
  to special tokens to be encoded as special tokens.
c              3   v   >#    U  H.  nTR                  TX[        -    [        5        H  nUv   M	     M0     g 7f)N)$_split_whitespaces_or_nonwhitespacesTIKTOKEN_MAX_ENCODE_CHARSMAX_NO_WHITESPACES_CHARS).0r2   substrr:   r0   s      r   	<genexpr>+TiktokenTokenizer.encode.<locals>.<genexpr>~   sC      
@CC!3346N  @s   69r   r7   )settypestrranger(   rB   extendr,   encodeinsertr.   appendr/   )	r0   r:   r;   r<   r8   r9   substrstrE   s	   ``       r   rM   TiktokenTokenizer.encode\   s    < "!eOAw#~~
1c!f&?@
 FHH

!!$3'9 "   HHQ$HHT[[!r    rQ   skip_special_tokensc                 b    U R                   R                  [        [        [           U5      5      $ )z
Decodes a list of token IDs into a string.

Args:
    t (List[int]): The list of token IDs to be decoded.

Returns:
    str: The decoded string.
)r,   decoder   r	   int)r0   rQ   rS   s      r   rU   TiktokenTokenizer.decode   s$     zz  d3i!344r    c                 V    U R                   R                  U5      R                  S5      $ )z
Decodes a single token ID into a string.

Args:
    t (int): The token ID to be decoded.

Returns:
    str: The decoded string.
zutf-8)r,   decode_single_token_bytesrU   )r0   rQ   s     r   decode_tokenTiktokenTokenizer.decode_token   s$     zz33A6==gFFr    max_consecutive_slice_lenc              #     #    Sn[        U 5      S:  a  U S   R                  5       OSnSn[        [        U 5      5       H:  nX   R                  5       nX6-  (       a  SnUnM$  US-  nX!:  d  M0  XU v   UnSnM<     XS v   g7f)z
Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
consecutive whitespaces or consecutive non-whitespaces.
r   Fr   N)r(   isspacerK   )r:   r\   current_slice_lencurrent_slice_is_spaceslice_startr2   is_now_spaces          r   rA   6TiktokenTokenizer._split_whitespaces_or_nonwhitespaces   s      36q6A:15s1vA4<<>L%4$%!)5&!Q&!$@**"#K()%  os   A)B/B)r.   r/   r,   r-   r#   )F)__name__
__module____qualname____firstlineno____doc__classmethodr   r   r   rJ   r	   r4   boolr   r   r
   r   r   rV   rM   r   rU   rZ   staticmethodr   rA   __static_attributes__r6   r    r   r   r   ,   s)   
   &$8	BB B S		BL NREG66 	6
 6 "%C8H(H"IJ6 "'%.*S/"AB6 
c6p5 5D 5S 5
Gc 
Gc 
G +.	# r    r   )r   loggingr   pathlibr   typingr   r   r   r   r	   r
   r   r   r   r*   tiktoken.loadr   	constantsr   r   rd   loggerrB   rC   r   r   r6   r    r   <module>rs      sW    
  
 
 
  + ;	8	
 $ 
 "  	Y Yr    