
import math
import os
from typing import Optional, Union

import addict
import torch
from torch import nn
from torch.nn import functional as F
from transformers.modeling_utils import PreTrainedModel

from modelscope.outputs import TokenGeneratorOutput
from modelscope.utils.constant import ModelFile

from .configuration import GPT3Config
from .distributed_gpt3 import sample


class GPT3SelfAttention(nn.Module):
    """Parallel self-attention layer abstract class.

    Self-attention layer takes input with size [s, b, h]
    and returns output of the same size.
    """

    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_attention_heads = config.num_attention_heads
        self.hidden_size_per_attention_head = \
            self.hidden_size // self.num_attention_heads

        # Fused projection that produces query, key and value in one matmul.
        self.query_key_value = nn.Linear(self.hidden_size,
                                         3 * self.hidden_size)
        self.softmax = nn.Softmax(dim=-1)
        self.attention_dropout = nn.Dropout(
            config.attention_probs_dropout_prob)

        self.dense = nn.Linear(self.hidden_size, self.hidden_size)
        self.output_dropout = nn.Dropout(config.hidden_dropout_prob)

    def _transpose_for_scores(self, tensor):
        """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with
        size [b, np, s, hn].
        """
        new_tensor_shape = tensor.size()[:-1] + (
            self.num_attention_heads, self.hidden_size_per_attention_head)
        tensor = tensor.view(*new_tensor_shape)
        return tensor.permute(0, 2, 1, 3)

    def _split_tensor_along_last_dim(self,
                                     tensor,
                                     num_partitions,
                                     contiguous_split_chunks=False):
        last_dim = tensor.dim() - 1
        last_dim_size = tensor.size()[last_dim] // num_partitions
        tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
        if contiguous_split_chunks:
            return tuple(chunk.contiguous() for chunk in tensor_list)
        return tensor_list

    def forward(self, hidden_states, ltor_mask, is_infer=False):
        # ltor_mask: causal mask with ones on and below the diagonal.
        tgt_len = hidden_states.size(1)
        ltor_mask = torch.reshape(ltor_mask, [1, 1, tgt_len, tgt_len])

        # Attention heads: [b, s, 3 * h] -> 3 x [b, np, s, hn].
        mixed_x_layer = self.query_key_value(hidden_states)
        (mixed_query_layer, mixed_key_layer,
         mixed_value_layer) = self._split_tensor_along_last_dim(
             mixed_x_layer, 3)
        query_layer = self._transpose_for_scores(mixed_query_layer)
        key_layer = self._transpose_for_scores(mixed_key_layer)
        value_layer = self._transpose_for_scores(mixed_value_layer)
        previous_type = value_layer.type()

        # Raw attention scores: [b, np, s, s].
        attention_scores = torch.matmul(query_layer,
                                        key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(
            self.hidden_size_per_attention_head)

        if is_infer:
            # Rebuild the causal mask over the full key length at inference.
            src_len = key_layer.size(2)
            ltor_mask = torch.tril(
                torch.ones((1, tgt_len, src_len),
                           device=hidden_states.device)).view(
                               1, 1, tgt_len, src_len).type(previous_type)

        # Keep allowed positions and push masked ones to a large negative
        # value before the softmax.
        converted_mask = 10000.0 * (1.0 - ltor_mask)
        attention_scores = (torch.mul(attention_scores, ltor_mask)
                            - converted_mask).type(previous_type)

        attention_probs = self.softmax(attention_scores)
        attention_probs = self.attention_dropout(attention_probs)

        # Context: [b, np, s, hn] -> [b, s, np, hn] -> [b, s, h].
        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (
            self.hidden_size, )
        context_layer = context_layer.view(*new_context_layer_shape)

        output = self.dense(context_layer)
        output = self.output_dropout(output)
        return output
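

# Illustrative sketch added for clarity (not part of the modelscope API):
# GPT3SelfAttention.forward expects hidden states shaped [batch, seq, hidden]
# and a lower-triangular ltor_mask with ones at allowed (query, key)
# positions.  The toy sizes and the SimpleNamespace stand-in for GPT3Config
# are assumptions made only for this example.
def _self_attention_mask_example():
    from types import SimpleNamespace
    cfg = SimpleNamespace(
        hidden_size=8,
        num_attention_heads=2,
        attention_probs_dropout_prob=0.0,
        hidden_dropout_prob=0.0)
    layer = GPT3SelfAttention(cfg)
    hidden = torch.randn(1, 4, cfg.hidden_size)  # [b, s, h]
    ltor_mask = torch.tril(torch.ones(4, 4))  # causal mask in {0, 1}
    return layer(hidden, ltor_mask).shape  # torch.Size([1, 4, 8])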


class GPT3MLP(nn.Module):
    """MLP.

    MLP will take the input with h hidden state, project it to 4*h
    hidden dimension, perform nonlinear transformation, and project the
    state back into h hidden dimension.
    """

    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size
        # Project to 4h, apply gelu, project back to h.
        self.dense_h_to_4h = nn.Linear(hidden_size, 4 * hidden_size)
        self.activation_func = F.gelu
        self.dense_4h_to_h = nn.Linear(4 * hidden_size, hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states):
        intermediate_parallel = self.dense_h_to_4h(hidden_states)
        intermediate_parallel = self.activation_func(intermediate_parallel)
        output = self.dense_4h_to_h(intermediate_parallel)
        output = self.dropout(output)
        return output


class GPT3TransformerLayer(nn.Module):
    """A single transformer layer.

    Transformer layer takes input with size [s, b, h] and returns an
    output of the same size.
    """

    def __init__(self, config):
        super().__init__()
        # Layernorm on the input data.
        self.input_layernorm = nn.LayerNorm(
            config.hidden_size, eps=config.layernorm_epsilon)
        # Self attention.
        self.attention = GPT3SelfAttention(config)
        # Layernorm after the attention.
        self.post_attention_layernorm = nn.LayerNorm(
            config.hidden_size, eps=config.layernorm_epsilon)
        # MLP.
        self.mlp = GPT3MLP(config)

    def forward(self, hidden_states, ltor_mask):
        # Pre-LN residual block around the attention.
        layernorm_output = self.input_layernorm(hidden_states)
        attention_output = self.attention(layernorm_output, ltor_mask)
        layernorm_input = hidden_states + attention_output
        # Pre-LN residual block around the MLP.
        layernorm_output = self.post_attention_layernorm(layernorm_input)
        mlp_output = self.mlp(layernorm_output)
        output = layernorm_input + mlp_output
        return output


class GPT3Transformer(nn.Module):
    """Transformer class."""

    def __init__(self, config):
        super().__init__()
        self.input_tensor = None
        # Number of layers.
        self.num_layers = config.num_hidden_layers
        # Transformer layers.
        self.layers = torch.nn.ModuleList(
            [GPT3TransformerLayer(config) for _ in range(self.num_layers)])
        # Final layer norm before output.
        self.final_layernorm = nn.LayerNorm(
            config.hidden_size, eps=config.layernorm_epsilon)

    def _get_layer(self, layer_number):
        return self.layers[layer_number]

    def forward(self, hidden_states, attention_mask):
        for index in range(self.num_layers):
            layer = self._get_layer(index)
            hidden_states = layer(hidden_states, attention_mask)
        output = self.final_layernorm(hidden_states)
        return output
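

# Note on the language-model head below: GPT3TransformerLanguageModel does not
# allocate a separate output projection.  Logits are computed with
# F.linear(transformer_output, word_embeddings.weight), i.e. the input word
# embedding matrix [vocab_size, hidden_size] is reused (weight tying), so the
# logits have shape [batch, seq, vocab_size].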


class GPT3TransformerLanguageModel(nn.Module):
    """Transformer language model.

    Arguments:
        transformer_hparams: transformer hyperparameters
        vocab_size: vocabulary size
        max_sequence_length: maximum size of sequence. This
                             is used for positional embedding
        embedding_dropout_prob: dropout probability for embeddings
        num_tokentypes: size of the token-type embeddings. 0 value
                        will ignore this embedding
    """

    def __init__(self, config):
        super().__init__()
        # Embeddings.
        self.word_embeddings = nn.Embedding(config.vocab_size,
                                            config.hidden_size)
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size)
        self.embedding_dropout = nn.Dropout(config.hidden_dropout_prob)
        # Transformer.
        self.transformer = GPT3Transformer(config)

    def forward(self, input_ids, attention_mask, position_ids):
        words_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        embeddings = words_embeddings + position_embeddings
        transformer_input = self.embedding_dropout(embeddings)
        transformer_output = self.transformer(transformer_input,
                                              attention_mask)
        # Output logits are tied to the input word embeddings.
        logits = F.linear(transformer_output, self.word_embeddings.weight)
        return logits


class GPT3Model(PreTrainedModel):

    config_class = GPT3Config

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(
                mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(
                mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def __init__(self, config):
        super().__init__(config)
        self.language_model = GPT3TransformerLanguageModel(config)

    def forward(self,
                input_ids,
                attention_mask=None,
                position_ids=None,
                labels=None,
                **kwargs):
        seq_length = input_ids.size(1)
        # Left-to-right (causal) mask shared by every sample in the batch.
        attention_mask = torch.tril(
            torch.ones((1, 1, seq_length, seq_length),
                       dtype=torch.long,
                       device=input_ids.device))
        if position_ids is None:
            position_ids = torch.arange(
                seq_length, dtype=torch.long, device=input_ids.device)
            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

        logits = self.language_model(input_ids, attention_mask, position_ids)
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(
                logits.view(-1, self.config.vocab_size), labels.view(-1))
        return addict.Dict(loss=loss, logits=logits)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[
            str, os.PathLike]]):
        config = cls.config_class.from_pretrained(
            pretrained_model_name_or_path)
        model = cls(config)
        state_dict_file = os.path.join(pretrained_model_name_or_path,
                                       ModelFile.TORCH_MODEL_BIN_FILE)
        state_dict = torch.load(state_dict_file)
        if 'state_dict' in state_dict:
            state_dict = state_dict['state_dict']
        # Strip the training-time wrapper prefix from checkpoint keys.
        state_dict = {
            k.replace('model.language_model', 'language_model'): v
            for k, v in state_dict.items()
        }
        model.load_state_dict(state_dict)
        return model

    def streaming_generate(self, tokens, temperature=1.0, **kwargs):
        top_k = kwargs.pop('top_k', self.config.top_k)
        top_p = kwargs.pop('top_p', self.config.top_p)
        max_length = kwargs.pop('max_length', tokens.size(1) + 100)
        batch_size = tokens.size(0)
        lengths = kwargs.pop(
            'prompt_length',
            torch.tensor([tokens.size(1)], device=tokens.device))
        min_prompt_length = lengths.min().item()
        max_sequence_length = min(max_length,
                                  self.config.max_position_embeddings)

        if min_prompt_length >= max_sequence_length:
            raise ValueError('context length too large')

        # Pad the prompt batch up to the maximum generation length.
        pad_length = max_sequence_length - tokens.size(1)
        if pad_length > 0:
            pads = torch.zeros(
                batch_size, pad_length, device=tokens.device).long()
            tokens = torch.cat((tokens, pads), dim=-1)

        termination_id = self.config.eod_id
        # Whether a termination id has been produced for each sample.
        is_generation_done = torch.zeros(
            batch_size, dtype=torch.uint8, device=tokens.device)

        with torch.no_grad():
            for context_length in range(min_prompt_length,
                                        max_sequence_length):
                # Re-run the full prefix each step (no key/value cache).
                tokens2use = tokens[:, :context_length]
                logits = self(tokens2use).logits
                last_token_logits = logits[:, -1, :]
                # Top-k/top-p sampling helper imported from distributed_gpt3.
                new_sample = sample(
                    last_token_logits,
                    top_k=top_k,
                    top_p=top_p,
                    temperature=temperature)

                # Only update samples whose prompt has been fully consumed.
                started = lengths <= context_length
                tokens[started, context_length] = new_sample[started]
                yield TokenGeneratorOutput(
                    sequences=tokens[:, :context_length + 1])

                done_token = (new_sample == termination_id).byte() \
                    & started.byte()
                is_generation_done = is_generation_done | done_token
                done = torch.all(is_generation_done)
                if done:
                    break

    def generate(self, tokens, temperature=1.0, **kwargs):
        last_output = None
        for output in self.streaming_generate(tokens, temperature, **kwargs):
            last_output = output
        return last_output
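

# Minimal smoke-test sketch added for illustration; it is not part of the
# original module.  It assumes GPT3Config accepts these transformers-style
# keyword arguments and provides defaults for the remaining fields read above
# (dropout probabilities, layernorm_epsilon, initializer_range, eod_id, ...).
if __name__ == '__main__':
    config = GPT3Config(
        vocab_size=1000,
        hidden_size=64,
        num_hidden_layers=2,
        num_attention_heads=4,
        max_position_embeddings=128)
    model = GPT3Model(config)
    input_ids = torch.randint(0, 1000, (2, 16))
    outputs = model(input_ids, labels=input_ids)
    print(outputs.logits.shape, outputs.loss.item())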