
    9i                         S SK JrJr  S SKrS SKJr  S SKJr  S\S\4S jr	S\
S\S\4S	 jr " S
 S\5      r " S S\5      rg)    )ListUnionN)AutoTokenizer)GPT2TokenizerFaststart_extra_idmax_lenc                    ^^ S[         S[        4UU4S jjnSnSnU  H0  nUS:X  a  US-  nUT:X  a  U" XE5      nSnM   M"  U" XE5      nSnXV-   nM2     U" XE5      nU$ )zEncode whitespaces to extra tokens in GPT-J.

>>> encode_whitespaces('a\n  b\n   c', 10, 10)
'a\n<|extratoken_10|>b\n<|extratoken_11|>c'
acc_lentextc                 t   > U S:X  a  U$ U S:X  a  US-   $ U T::  d   ST SU  35       eTS-
  U -   nSU S3nX-   $ )	Nr       zMax whitespace run length z, but found    <|extratoken_|> )r
   r   extra_idextra_tokenr   r   s       h/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/nlp/codegeex/tokenizer.pypush_acc_space*encode_whitespaces.<locals>.push_acc_space   si    a<Ka<#:'!^%?yU\T]#^^!!A%/%hZr2!!    r    r   r   )intstr)r   r   r   r   r
   reschs    ``    r   encode_whitespacesr   	   s    " "3 " " G
C9qLG'!$W2 " !.CG(C  
&CJr   r   c                 x    [        SUS-   5       H&  nUS-
  U-   nSU S3nU R                  USU-  5      n M(     U $ )zDecode the whitespace-encoded strings produced by encode_whitespace.

>>> text = 'a\n  b\n   c'
>>> s, l = 10, 10
>>> text == decode_whitespaces(encode_whitespaces(text, s, l), s, l)
True
r   r   r   r   r   )rangereplace)r   r   r   ltoken_idtokens         r   decode_whitespacesr%   ,   sQ     1gk"!A%)z,||E37+ # Kr   c                       \ rS rSr  SS\S\\   S\4S jjrS\4S jrS\4S	 jr	S
\S\4S jr
S rS
\4S jrS\4S jrS\\\4   4S jrS rS rSrg)Code13BDictionary;   N	dict_fileextra_token_idspad_to_vocab_sizec                    [        5       U l        [        5       U l        SU l        / U l        U R                  SS5        U R                  SS5        U R                  SS5        U R                  SS5        U R                  U5        Uc%  [        SS5       Vs/ s H  n[        U5      PM     nnU H  nU R                  US5        M     US:  a  U R                  U5        g g s  snf )Nr   z<s>z<pad>z</s>z<unk>iQ  i  )
dict_idx_count_num_symbols_symbols_add_symbol
_load_dictr    r   _pad_to_vocab_size)selfr)   r*   r+   xr#   s         r   __init__Code13BDictionary.__init__=   s     F	f"!$#!$	""/4UE/B !/B!s1v/BO ! (HXq) ( q ##$56 !!s   C"
vocab_sizec                     U[        U 5      -
  nUS::  a  g [        SUS-   5       H$  nU R                  SR                  U5      S5        M&     g )Nr   r   zvocab_pad_token{})lenr    r2   format)r5   r9   num_padis       r   r4   $Code13BDictionary._pad_to_vocab_sizeX   sK    s4y(a<q'A+&A077:A> 'r   c                    [        US5       nU H^  nUR                  5       nUS:X  d  UR                  S5      (       a  M1  UR                  5       u  pEU R	                  U[        U5      5        M`     S S S 5        g ! , (       d  f       g = f)Nrr   #)openstrip
startswithsplitr2   r   )r5   r)   flinesymcounts         r   r3   Code13BDictionary._load_dict_   sh    )S!Qzz|2:!5!5!ZZ\
  c%j1  "!!s   A%A;;
B	rI   rJ   c                     U R                   U R                  U'   X R                  U'   U R                  R	                  U5        U =R                   S-  sl         g )Nr   )r0   r.   r/   r1   append)r5   rI   rJ   s      r   r2   Code13BDictionary._add_symbolh   sD    **		# CS!Qr   c                     U R                   $ N)r0   )r5   s    r   __len__Code13BDictionary.__len__n   s       r   c                      U R                   U   $ rP   )r.   )r5   rI   s     r   indexCode13BDictionary.indexq   s    yy~r   idxc                      U R                   U   $ rP   )r1   )r5   rV   s     r   stringCode13BDictionary.stringt   s    }}S!!r   r$   c                 d    [        U[        5      (       a  [        U5      nU R                  U5      $ rP   )
isinstancer   r   rT   )r5   r$   s     r   	map_tokenCode13BDictionary.map_tokenw   s'    eS!!JEzz%  r   c                 N    U Vs/ s H  o R                  U5      PM     sn$ s  snf rP   )r\   )r5   tokensr$   s      r   
map_tokensCode13BDictionary.map_tokens|   s!    39:6%u%6:::s   "c                     U Vs/ s H  nUS:X  a  SOU R                  U5      PM     nnU Vs/ s H%  oDR                  S5      (       a  M  [        U5      PM'     sn$ s  snf s  snf )NiP  50256vocab_pad_token)rX   rE   r   )r5   r_   r$   decodedr6   s        r   decode_tokensCode13BDictionary.decode_tokens   sj      
 ~G4;;u+== 	 
 !(O1||<M/NAOO	
 Ps   #AA"A")r/   r.   r0   r1   )N)__name__
__module____qualname____firstlineno__r   r   r   r7   r4   r3   r2   rQ   rT   rX   r   r\   r`   rf   __static_attributes__r   r   r   r'   r'   ;   s    
 &*!#	77 c7 	76?S ?2C 2s 3 ! "# "!uS#X !
;Pr   r'   c                   V    \ rS rSr      SS\S\S\S\S\4
S jjrS	\4S
 jrS r	Sr
g)CodeGeeXTokenizer   N	tokenizertokenizer_pathr   r   r)   c                     Ub  UO[         R                  " U5      U l        US;  a  [        SU S35      eX0l        X@l        XPl        Ub"  U R                  S:X  a
  [        USS9OS U l        OS U l        U R                  R                  U l	        g )N)codegeex-13bcodegeex-python-13bzInvalid mode z5, choose from ['codegeex-13b', 'codegeex-python-13b']ru   i   )r+   )
r   from_pretrainedrq   
ValueErrorr   r   moder'   	code_dicteos_token_id)r5   rq   rr   r   r   rx   r)   s          r   r7   CodeGeeXTokenizer.__init__   s     '0&;A^A^B>>v%Z[  -	  33 /U9= N "DN NN77r   codec                    U R                   S:X  a<  [        XR                  U R                  5      nU R	                  USS9R
                  nU$ U R                   S:X  az  [        XR                  U R                  5      nU R                  R                  U R                  R                  U5      5      n[        R                  " U5      R                  SS5      nW$ )Nrt   F)is_split_into_wordsru   r   rh   )rx   r   r   r   rq   	input_idsry   r`   encodetorch
LongTensorreshape)r5   r|   r   s      r   encode_codeCodeGeeXTokenizer.encode_code   s    99&%d,?,?ND% ' 11:   YY//%d,?,?ND11$..2G2G2MNI((3;;ArBIr   c                    U R                   S:X  a<  U R                  R                  USS9n[        X R                  U R
                  5      nU$ U R                   S:X  ag  U R                  R                  UR                  5       S   5      /nU R                  R                  USS9n[        X R                  U R
                  5      nW$ )Nrt   F)skip_special_tokensru   r   )	rx   rq   decoder%   r   r   ry   rf   tolist)r5   r   r   output_codes       r   decode_codeCodeGeeXTokenizer.decode_code   s    99&>>(((ND,T3F3F-1\\;K  YY//55i6F6F6H6KLMI>>(((ND,T3F3F-1\\;K r   )ry   rz   r   rx   r   rq   )NzEleutherAI/gpt-j-6B
   r   rt   N)ri   rj   rk   rl   r   r   r   r7   r   r   rm   r   r   r   ro   ro      s\     (,3 8$8 8 	8
 8 84 r   ro   )typingr   r   r   transformersr   transformers.models.gpt2r   r   r   r   r%   objectr'   ro   r   r   r   <module>r      s\      & 6 S  3  FS #  IP IPX4 4r   