
    9i                     ~   S SK r S SKrS SKJr  S SKJrJrJrJrJ	r	  S SK
r
S SKJrJr  S SKJr  S SKJrJrJrJr  S SKJr  S SK
Jr  S S	KJr  S S
KJr  S SKJr  S SKJ r   S SK!J"r"J#r#  S SK$J%r%  S SK&J'r'  S SK(J)r)   " S S\RT                  5      r+ " S S\RT                  5      r, " S S\RT                  5      r-S r. " S S\RT                  5      r/ " S S\RT                  5      r0 " S S5      r1S r2S r3\
Rh                  Rj                  S \
Rl                  S!\
Rl                  S"\
Rl                  S#\7S$\
Rl                  4
S% j5       r8\
Rh                  Rj                  S \
Rl                  S!\
Rl                  S"\
Rl                  S#\7S$\
Rl                  4
S& j5       r9 " S' S(\RT                  5      r: " S) S*\RT                  5      r; " S+ S,\RT                  5      r<S- r=S. r> " S/ S0\5      r?S1 r@S2 rAS?S3 jrB " S4 S55      rCS6 rDS7\\E\
Rl                  4   S8\?S9\FS$\\E\
Rl                  4   4S: jrG " S; S<\\)5      rH " S= S>5      rIg)@    N)OrderedDict)CallableDictListOptionalUnion)get_argsmpu)get_global_memory_buffer)AttnMaskTypeFloat16Module	LayerNormbias_gelu_impl)FusedScaleMaskSoftmax)nn)
functional)PreTrainedModel)
TorchModel)
GPT3Config)TextGenerationModelOutputTokenGeneratorOutput)init_megatron_util)pre_load)StreamingOutputMixinc                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )GPT3ParallelMLP'   zMLP.

MLP will take the input with h hidden state, project it to 4*h
hidden dimension, perform nonlinear transformation, and project the
state back into h hidden dimension.
c                 2  > [         TU ]  5         [        R                  " UR                  UR
                  SUSS9U l        UR                  U l        [        R                  U l
        [        R                  " UR
                  UR                  SUSS9U l        g )NFT)gather_outputinit_methodskip_bias_addinput_is_parallelr    r!   )super__init__r
   ColumnParallelLinearhidden_sizeffn_hidden_sizedense_h_to_4hbias_gelu_fusionFgeluactivation_funcRowParallelLineardense_4h_to_hselfconfigr    output_layer_init_method	__class__s       k/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/nlp/gpt3/distributed_gpt3.pyr%   GPT3ParallelMLP.__init__/   s     !55""#  !' 7 7 vv !22"""0     c                     U R                  U5      u  p#U R                  (       a  [        X#5      nOU R                  X#-   5      nU R	                  U5      u  pEXE4$ N)r)   r*   r   r-   r/   )r1   hidden_statesintermediate_parallelbias_paralleloutputoutput_biass         r5   forwardGPT3ParallelMLP.forwardE   sl     04/A/A0,   4D " $$%:%JK " #001FG""r7   )r-   r*   r/   r)   	__name__
__module____qualname____firstlineno____doc__r%   r?   __static_attributes____classcell__r4   s   @r5   r   r   '   s     ,# #r7   r   c                   8   ^  \ rS rSrSrU 4S jrS rS rSrU =r	$ )GPT3EmbeddingW   a  Language model embeddings.

Arguments:
    hidden_size: hidden size
    vocab_size: vocabulary size
    max_sequence_length: maximum size of sequence. This
                         is used for positional embedding
    embedding_dropout_prob: dropout probability for embeddings
    init_method: weight initialization method
    num_tokentypes: size of the token-type embeddings. 0 value
                    will ignore this embedding
c                   > [         TU ]  5         UR                  U l        X l        [        R
                  " UR                  U R                  U R                  S9U l        [        R                  " UR                  U R                  5      U l        U R                  U R                  R                  5        UR                  U l        UR                  U l        [        R                  " UR                   5      U l        g )N)r    )r$   r%   r'   r    r
   VocabParallelEmbedding
vocab_sizeword_embeddingsr   	Embeddingmax_position_embeddingsposition_embeddingsweightfp32_residual_connectionsequence_parallelDropouthidden_dropoutembedding_dropout)r1   r2   r    r4   s      r5   r%   GPT3Embedding.__init__e   s    !--&  #99t//T=M=M O $&<<0N0N040@0@$B  	11889(.(G(G%!'!9!9!#F,A,A!Br7   c                 ,   U R                   R                  R                  R                  S5        SU R                   R                  l        U R
                  R                  R                  R                  S5        SU R
                  R                  l        g)z%Zero out all parameters in embedding.r   TN)rP   rT   datafill_sharedrS   r1   s    r5   zero_parametersGPT3Embedding.zero_parametersz   sj    ##((..q1-1##*  '',,221515  ''.r7   c                    U R                  U5      nU R                  U5      nX4-   nUR                  SS5      R                  5       nU R                  (       a  UR                  5       nU R                  (       aU  [        R                  " U5      n[        R                  " 5       R                  5          U R                  U5      nS S S 5        U$ U R                  U5      nU$ ! , (       d  f       U$ = f)Nr      )rP   rS   	transpose
contiguousrU   floatrV   r
   #scatter_to_sequence_parallel_regionget_cuda_rng_trackerforkrY   )r1   	input_idsposition_idswords_embeddingsrS   
embeddingss         r5   r?   GPT3Embedding.forward   s    //	:"66|D%;
  ))!Q/::<
 ((#))+J !!@@LJ))+002!33J?
 3  //
;J	 32 s   2C!!
C0)rY   rU   r'   r    rS   rV   rP   )
rB   rC   rD   rE   rF   r%   r`   r?   rG   rH   rI   s   @r5   rK   rK   W   s    C*6 r7   rK   c                   8   ^  \ rS rSrU 4S jr   SS jrSrU =r$ )NoopTransformerLayer   c                 .   > [         TU ]  5         Xl        g r9   )r$   r%   layer_number)r1   rs   r4   s     r5   r%   NoopTransformerLayer.__init__   s    (r7   c                 "    UR                  5       $ r9   )clone)r1   r:   attention_maskencoder_outputenc_dec_attn_maskinference_paramss         r5   r?   NoopTransformerLayer.forward   s     ""$$r7   )rs   NNN)rB   rC   rD   rE   r%   r?   rG   rH   rI   s   @r5   rp   rp      s    )  $"&!%% %r7   rp   c                 *    U R                  US5        U $ )Ng     )masked_fill_)attention_scoresrw   s     r5   attention_mask_funcr      s    !!.(;r7   c                   H   ^  \ rS rSr\R
                  4U 4S jjrS rSrU =r	$ )GPT3CoreAttention   c           	        > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR
                  U l        U R                  (       a  SU l        [        SU5      U l        X0l        UR                  U l	        UR                  UR                  -  n[        R                  " 5       n[        R                  " UU5      U l        [        R                  " XAR                  5      U l        [        R                  " UR                  U5      U l        S n[$        R&                  " U R                   5      U l        U R                  (       a!  U R                  nU =R(                  U-  sl        [+        U R                  U R                  U R                  UR,                  [.        U R
                  U5      U l        [2        R4                  " UR6                  5      U l        g )NTrc   )r$   r%   fp16bf16apply_query_key_layer_scalingattention_softmax_in_fp32maxrs   attn_mask_typerV   kv_channelsnum_attention_headsr
   $get_tensor_model_parallel_world_sizedividehidden_size_per_partitionhidden_size_per_attention_head!num_attention_heads_per_partitionmathsqrtnorm_factorr   masked_softmax_fusionr   scale_mask_softmaxr   rW   attention_dropout)r1   r2   rs   r   projection_size
world_sizecoeffr4   s          r5   r%   GPT3CoreAttention.__init__   s~    	KK	KK	-3-Q-Q*)/)I)I&---1D*<0,!'!9!9 ,,v/I/II ==?
),O4>*@&.1jj77/9+14&&
24. 99T%H%HI--%%E%"7IItyy$"5"5((*=**E#3 "$F,D,D!Er7   c                 j   UR                  S5      UR                  S5      UR                  S5      UR                  S5      4nUR                  US   US   US   -  S5      nUR                  US   US   US   -  S5      n[        5       R                  US   US   -  US   US   4UR                  S5      n[
        R                  " UUR                  SS5      UR                  SS5      R                  SS5      SSU R                  -  S	9nUR                  " U6 nU R                  UU5      n	U R                  (       d>  [        R                  " 5       R                  5          U R                  U	5      n	S S S 5        OU R                  U	5      n	UR                  S5      UR                  S5      UR                  S5      UR                  S5      4nUR                  UR                  S5      US   US   -  S5      nU	R                  US   US   -  US   S5      n	[
        R                  " XR                  SS5      5      n
U
R                  " U6 n
U
R!                  SSSS5      R#                  5       n
U
R                  5       S S
 U R$                  4-   nU
R                  " U6 n
U
$ ! , (       d  f       GN$= f)Nrc      r      r
                 ?)betaalpha)sizeviewr   
get_tensordtypetorchbaddbmmrd   r   r   rV   r
   rh   ri   r   bmmpermutere   r   )r1   query_layer	key_layervalue_layerrw   output_sizematmul_input_buffermatmul_resultr   attention_probscontext_layernew_context_layer_shapes               r5   r?   GPT3CoreAttention.forward   s    #''*K,<,<Q,?"''*INN1,=? "&&{1~'21~A'FL NN;q>#.q>KN#BBH	 78CC^k!n,k!nk!nMu&
 !!!Q'1%//15))), )--{; 112B2@B %%))+002"&"8"8"I 32 #44_EO #''*K,<,<Q,?"''*K,<,<Q,?A "&&QQ+a.!@"F *..{1~A/N/:1~rC 		/3H3HA3NO &**K8 &--aAq9DDF #0"4"4"6s";++-#.%**,CDM 32s   "J##
J2)r   r   r   r   r   r   r   r   rs   r   r   r   rV   )
rB   rC   rD   rE   r   paddingr%   r?   rG   rH   rI   s   @r5   r   r      s!    
 !- 4 4)FVS Sr7   r   c                   <   ^  \ rS rSrSrU 4S jrS rSS jrSrU =r	$ )GPT3ParallelAttentioni/  zParallel self-attention layer abstract class.

Self-attention layer takes input with size [s, b, h]
and returns output of the same size.
c                 ,  > [         TU ]  5         [        SU5      U l        UR                  U l        UR
                  UR                  -  n[        R                  " 5       n[        R                  " XQR                  5      U l
        [        R                  " UR                  U5      U l        [        R                  " UR                  SU-  SUS9U l        [        XR                  5      U l        [        R"                  " UUR                  SUSS9U l        g )Nrc   r   F)r   r    Tr"   )r$   r%   r   rs   params_dtyper   r   r
   r   r   r   r   r&   r'   query_key_valuer   core_attentionr.   dense)r1   r2   r    r3   rs   r   r   r4   s          r5   r%   GPT3ParallelAttention.__init__6  s    <0"// ,,v/I/II ==?
.1jj77/9+14&&
24.  #77#	 % 08I8IJ **"0 
r7   c           	          [         R                  " UUU R                  U R                  U R                  [         R
                  R                  5       S9$ )Nr   device)r   emptyr   r   r   cudacurrent_device)r1   inference_max_sequence_len
batch_sizes      r5   _allocate_memory&GPT3ParallelAttention._allocate_memoryV  sD    {{&22//##::,,.0 	0r7   c                    U(       a  U R                   UR                  ;  aU  UR                  nUR                  nU R	                  XE5      nU R	                  XE5      nXg4UR                  U R                   '   OUR                  U R                      u  pgU R                  U5      u  pUR                  5       S S U R                  SU R                  -  4-   n
UR                  " U
6 n[        R                  " US5      u  pnU(       a  UR                  nXR                  S5      -   nUWR                  S5      ::  d   eUR                  nUUR                  S5      -   nUUR                  S5      ::  d   eUUUU2X2S4'   UWUU2X2S4'   US U2X2S4   nUS U2X2S4   nU R                  XX5      nU R                  U5      u  nnUU4$ )Nr   r   rc   r   .)rs   key_value_memory_dictmax_sequence_lenmax_batch_sizer   r   r   r   r   r   r
   split_tensor_along_last_dimbatch_size_offsetsequence_len_offsetr   r   )r1   r:   rw   rz   inf_max_seq_leninf_max_batch_sizeinference_key_memoryinference_value_memorymixed_x_layer_new_tensor_shaper   r   r   batch_start	batch_endsequence_startsequence_endr   r=   biass                        r5   r?   GPT3ParallelAttention.forward_  sG      (8(N(NN"2"C"C%5%D%D"'+'<'<#(9$)-)>)>#*9& )MB 66t7H7HI %::4;L;LM =$  //> )--/43344466 &**,<= 77qI		 *<<K#nnQ&77I 4 9 9! <<<<-AAN)INN1,==L#7#<#<Q#???? @I !!<!,!6"< = BM #>,#>#.#8#$> ?,]l]-8-BC.H II0,1<1F2L MK ++K,7I zz-0t|r7   )r   r   r   rs   r   r   r   r9   )
rB   rC   rD   rE   rF   r%   r   r?   rG   rH   rI   s   @r5   r   r   /  s     @0F Fr7   r   c                   *    \ rS rSrSS jrS rS rSrg)nullcontexti  Nc                     Xl         g r9   enter_result)r1   r   s     r5   r%   nullcontext.__init__  s    (r7   c                     U R                   $ r9   r   r_   s    r5   	__enter__nullcontext.__enter__  s       r7   c                     g r9    )r1   excinfos     r5   __exit__nullcontext.__exit__  s    r7   r   r9   )rB   rC   rD   rE   r%   r   r   rG   r   r7   r5   r   r     s    )!r7   r   c                 <    [         R                  " X-   X4S9nX%-   nU$ )N)ptraining)r+   dropout)xr   residualprobr   outs         r5   bias_dropout_addr     s     
))AH
8C
.CJr7   c                    ^  U 4S jnU$ )Nc                    > [        XX#T5      $ r9   r   )r   r   r   r   r   s       r5   _bias_dropout_add/get_bias_dropout_add.<locals>._bias_dropout_add  s    BBr7   r   )r   r   s   ` r5   get_bias_dropout_addr     s    C r7   r   r   r   r   returnc                     [        XX#S5      $ )NTr   r   r   r   r   s       r5   bias_dropout_add_fused_trainr     s     AXT::r7   c                     [        XX#S5      $ )NFr   r   s       r5    bias_dropout_add_fused_inferencer     s     AXU;;r7   c                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )GPT3ParallelTransformerLayeri  zwA single transformer layer.

Transformer layer takes input with size [s, b, h] and returns an
output of the same size.
c                 8  > [         TU ]  5         X@l        UR                  U l        UR                  U l        UR
                  U l        [        UR                  UR                  UR                  UR                  S9U l        [        XUU5      U l        UR                  U l        UR                  U l        [        UR                  UR                  UR                  UR                  S9U l        [#        XU5      U l        ['        [(        R*                  R-                  S5      S   5      n['        [(        R*                  R-                  S5      S   5      nUS:  =(       d    US:H  =(       a    US:  nU(       a  [.        U l        g [(        R0                  U l        g )Nepsno_persist_layer_normrV   .r   rc   
   )r$   r%   rs   (apply_residual_connection_post_layernormr   rU   r   r'   layernorm_epsilonr  rV   input_layernormr   self_attentionrX   bias_dropout_fusionpost_attention_layernormr   mlpintr   __version__splitr   enable_gradbias_dropout_add_exec_handler)	r1   r2   r    r3   rs   TORCH_MAJORTORCH_MINORuse_nvfuserr4   s	           r5   r%   %GPT3ParallelTransformerLayer.__init__  s|    	( == 	5 KK	(.(G(G%  )(("(">">$66	 8 4F4L4@B %33#)#=#=  )2(("(">">$66	)8% #6#;= %++11#6q9:%++11#6q9:!Ao A+*: +@.9R.? 	 'K 	*,1,=,= 	*r7   c                    U R                  U5      nU R                  UUUS9u  pVU R                  (       a  UnOUnU R                  (       a  U R                  (       a  [
        nO[        nO[        U R                  5      nU R                  5          U" XVR                  U5      UU R                  5      n	S S S 5        U R                  W	5      nU R                  U5      u  pU R                  (       a  UnOU	nU R                  5          U" U
UR                  U5      XpR                  5      nS S S 5        [        R                  " WUR                  SS9nU$ ! , (       d  f       N= f! , (       d  f       NA= f)Nrz   T)inprequires_grad
keep_graph)r  r  r	  r  r   r   r   r   r  	expand_asrX   r  r  r
   make_viewless_tensorr  )r1   r:   rw   rz   layernorm_outputattention_outputattention_biasr   bias_dropout_add_funclayernorm_input
mlp_outputmlp_biasr=   s                r5   r?   $GPT3ParallelTransformerLayer.forward  sW     //>  !1   3 	) 88'H$H##}}(D%(H%$8$G!//13 ":":8"Dh##%O 2  88I  $xx(89
88'H&H//1*:+3+=+=h+G+35H5HJF 2 ))f&:&:tM = 21  21s   $E$E%
E"%
E3)r	  r   r  r  rU   rX   r  rs   r  r  r  r9   rA   rI   s   @r5   r  r    s    +>Z7 7r7   r  c                   F   ^  \ rS rSrSr   SU 4S jjrS rSS jrSrU =r	$ )	GPT3ParallelTransformeri?  zTransformer class.c           	        >^^^ [         T	U ]  5         TR                  U l        TR                  U l        X@l        XPl        X`l        S U l        TR                  U l        TR                  U l
        UUU4S jnU R                  S:X  a6  SU l
        [        R                  R                  [        S5      /5      U l        OP[        R                  R                  [!        U R                  5       Vs/ s H  o" US-   5      PM     sn5      U l        U R                  (       aL  U R                  (       a:  [#        TR$                  TR&                  TR(                  TR                  S9U l        g g g s  snf )Nc                     > [        TTTU 5      $ r9   )r  )rs   r2   r    r3   s    r5   build_layer5GPT3ParallelTransformer.__init__.<locals>.build_layerX  s    /0H0<> >r7   r   rc   r  )r$   r%   r   rU   post_layer_normpre_processpost_processinput_tensorrV   num_hidden_layers
num_layersr   r   
ModuleListrp   layersranger   r'   r
  r  final_layernorm)
r1   r2   r    r3   r.  r/  r0  r,  ir4   s
    ```     r5   r%    GPT3ParallelTransformer.__init__B  s(    	KK	(.(G(G%.&( !'!9!9 !22	>
 ??aDO((--/CA/F.GHDK((---24??-CD-CQU#-CDFDK !5!5#,"",,&,&B&B"(":":	$<D  "6 Es   .E,c                      U R                   U   $ r9   )r5  )r1   rs   s     r5   
_get_layer"GPT3ParallelTransformer._get_layerl  s    {{<((r7   c                    U R                   (       d  U R                  n[        R                  " USSS9nU R                  (       a$  [        R
                  " 5       R                  5       nO
[        5       nU   [        U R                  5       H  nU R                  U5      nU" UUUS9nM     S S S 5        U R                  (       a"  U R                  (       a  U R                  U5      nU$ ! , (       d  f       NC= f)NT)r  r  r  )r/  r1  r
   r  rV   rh   ri   r   r6  r3  r;  r0  r.  r7  )r1   r:   rw   rz   rng_contextindexlayers          r5   r?   GPT3ParallelTransformer.forwardo  s      --M" 00
 !!22499;K%-Kt/. %!"%5!7 0  !5!5 00?M [s   56C((
C6)
r   r7  rU   r1  r5  r3  r.  r0  r/  rV   )TTTr9   )
rB   rC   rD   rE   rF   r%   r;  r?   rG   rH   rI   s   @r5   r)  r)  ?  s&     "&!"(<T). .r7   r)  c                   :   ^  \ rS rSrSrU 4S jr  SS jrSrU =r$ )GPT3TransformerLanguageModeli  a  Transformer language model.

Arguments:
    transformer_hparams: transformer hyperparameters
    vocab_size: vocabulary size
    max_sequence_length: maximum size of sequence. This
                         is used for positional embedding
    embedding_dropout_prob: dropout probability for embeddings
    num_tokentypes: size of the token-type embeddings. 0 value
                    will ignore this embedding
c                    > [         TU ]  5         UR                  U l        X l        S U l        [        XR                  5      U l        [        UU R                  U5      U l        g r9   )	r$   r%   r'   r    encoder_hidden_staterK   	embeddingr)  encoderr0   s       r5   r%   %GPT3TransformerLanguageModel.__init__  s[    !--&$(! 'v/?/?@ /$
r7   c                     U R                  X5      nUc/  U R                  b  U R                  UUUS9nU$ U R                  n U$ UR                  UR                  5      nU$ )Nr  )rF  rG  rE  tor   )r1   enc_input_idsenc_position_idsenc_attn_maskrz   enc_hidden_statesencoder_inputrx   s           r5   r?   $GPT3TransformerLanguageModel.forward  s{     }G $||'!%!!%5 ". "7 	 "&!:!:  /11-2E2EFNr7   )rF  rG  rE  r'   r    )NNrA   rI   s   @r5   rC  rC    s    

* "&"& r7   rC  c                    ^  U 4S jnU$ )z!Init method based on N(0, sigma).c                 B   > [         R                  R                  U STS9$ Nr   )meanstdr   initnormal_)tensorsigmas    r5   init_!init_method_normal.<locals>.init_  s    wwvCU;;r7   r   )rZ  r[  s   ` r5   init_method_normalr]    s    < Lr7   c                 L   ^ U [         R                  " SU-  5      -  mU4S jnU$ )z3Init method based on N(0, sigma/sqrt(2*num_layers).g       @c                 B   > [         R                  R                  U STS9$ rS  rV  )rY  rU  s    r5   r[  (scaled_init_method_normal.<locals>.init_  s    wwvCS99r7   )r   r   )rZ  r3  r[  rU  s      @r5   scaled_init_method_normalra    s'    
$))C*,-
-C: Lr7   c                   T   ^  \ rS rSr\rU 4S jrS r\S 5       r	    SS jr
SrU =r$ )	GPT3Modeli  c                    > [         TU ]  U5        [        U[        UR                  5      [        UR                  UR                  5      5      U l        g r9   )r$   r%   rC  r]  init_method_stdra  r2  language_model)r1   r2   r4   s     r5   r%   GPT3Model.__init__  sI     :&v'='=>%f&<&<&,&>&>@Ar7   c                 V    U R                   R                  R                  R                  $ r9   )rf  rF  rP   rT   r_   s    r5   word_embeddings_weight GPT3Model.word_embeddings_weight  s     "",,<<CCCr7   c                 <   U R                  S5      n[        R                  " [        R                  " SSX4U R                  S95      nUS:  n[        R
                  " U[        R                  U R                  S9nUR                  S5      R                  U 5      nX#4$ )Nrc   r   g      ?r   r   )	r   r   trilonesr   arangelong	unsqueezer  )tokens
seq_lengthrw   rk   s       r5   %build_attention_mask_and_position_ids/GPT3Model.build_attention_mask_and_position_ids  s    [[^
JJ1j5fmmLN(3.||ejj@#--a0::6B++r7   c                 D   Uc  Uc  U R                  U5      u  p#U R                  UUUUS9n[        R                  R	                  XpR                  5       S SSU R                  R                  5      nS n	Ubs  UR                  SS5      R                  5       n[        R                  " UR                  5       R                  5       U5      n	U	R                  SS5      R                  5       n	[        R                  " U5      n
U
R                  SS5      R                  5       n
X4$ )Nr  FTr   rc   )rt  rf  r
   /LinearWithGradAccumulationAndAsyncCommunicationapplyri  r2   rV   rd   re   vocab_parallel_cross_entropyrv   rf   (gather_from_tensor_model_parallel_region)r1   rj   rw   rk   rz   labelskwargs	lm_outputlogits_parallellosseslogitss              r5   r?   GPT3Model.forward  s    !l&:::9E )N ''-	 ( /	 MMSS224dE4KK))+ %%a+668F55%%'--/9F %%a+668F ==oN!!!Q'224~r7   )rf  )NNNN)rB   rC   rD   rE   r   config_classr%   ri  staticmethodrt  r?   rG   rH   rI   s   @r5   rc  rc    s@    LAD 
, 
,  $!!%# #r7   rc  c                 x    U [         R                  " X5      S   S   :  nU R                  U[        S5      5        g)z-Set the logits for none top-k values to -inf.r   ).r   N-InfN)r   topkr~   rf   )r  top_kfilter_s      r5   !modify_logits_for_top_k_filteringr  ,  s5     uzz&03MBBG
v/r7   c                    [         R                  " U SS9u  p#UR                  SS9R                  SS9nXA:  nUSS2SS24   R	                  5       USS2SS24'   SUS'   UR                  SX55      nU R                  U[        S	5      5        g)
z-Set the logits for none top-p values to -inf.T
descendingr   dimNrc   r   ).r   r  )r   sortsoftmaxcumsumrv   scatterr~   rf   )r  top_psorted_logitssorted_indicescumulative_probsr  s         r5   !modify_logits_for_top_p_filteringr  3  s     %*JJv$$G!M$,,,4;;;C &G
 QV_**,GAqrENGFO ooa9G
v/r7   c                 P   U R                   S:X  d   S5       eUS:X  a#  US:X  d   S5       e[        R                  " U SS9nOU R                  5       n US:w  a  U R	                  U5        US:  aG  US:X  d   S	5       eXR                  S5      ::  d   S
5       eU(       a  X:  d   S5       e[        X5        OUS:  a  US::  d   S5       e[        X5        U R                  SS9n[        R                  " USS9R                  S5      nU(       a  [        R                  " USUS-
  S9nU$ )a   Sample and generate a token.
Note: logits has the dimension [b, v] where b is the batch size
      and v is the vocabulary size.
If vocab_size is provided, we will make sure the sample that is
generated is in [0, vocab-size). This will avoid out of vocabulary
generations due to padding.
r   z*expected the logits to be of [b, v] shape.rc   r   z+cannot set both greedy and top-p samplings.r   r  r   z*cannot set both top-k and top-p samplings.z top-k is larger than logit size.z top-k is larger than vocab size.ztop-p should be in (0, 1].)num_samplesr   )minr   )ndimr   argmaxrv   div_r   r  r  r  multinomialr   clamp)r  r  r  temperaturerO   samplesprobss          r5   sampler  I  s(    ;;!III z|JJJ|,,v2.
 #KK$19C<M!MM<KKN*N,NN*)M+MM)-f<S[C<=!==<-f< 2&##Eq9>>rB ++g1:>CNr7   c                   $    \ rS rSrSrS rS rSrg)InferenceParamsiy  zInference parameters that are passed to the main model in order
to efficienly calculate and store the context during inference.c                 F    X l         Xl        SU l        SU l        0 U l        g)zNote that offsets are set to zero and we always set the
flag to allocate memory. After the first call, make sure to
set this flag to False.r   N)r   r   r   r   r   )r1   r   r   s      r5   r%   InferenceParams.__init__}  s(     !1,#$ !"%'"r7   c                 0   [        U R                  5      S:X  a  [        S5      eU R                  R                  5        HT  nU R                  U   u  p4[        U5      UR                  S   :X  d   eUSS2U4   nUSS2U4   nXV4U R                  U'   MV     g)zswap between batchesr   z"should not swap when dict in emptyrc   N)lenr   
ValueErrorkeysshape)r1   	batch_idxrs   r   r   new_inference_key_memorynew_inference_value_memorys          r5   swap_key_value_dict#InferenceParams.swap_key_value_dict  s    t))*a/ABB 66;;=L;?;U;U<8 y>%9%?%?&   ';AyL'I$)?9)M&(8FD&&|4 >r7   )r   r   r   r   r   N)rB   rC   rD   rE   rF   r%   r  rG   r   r7   r5   r  r  y  s    G(Fr7   r  c                 L   [         R                  R                  U R                  U5      U5      n[         R                  R                  UU5      n[        R
                  " XUS9n/ n[        U5       H.  n[        R                  " XhS U2   US9n	UR                  U	5        M0     U$ )Nr  )	r
   utilsr   r   r   r  r6  catappend)
rY  num_partitionspartition_dimstrideper_partition_sizeper_partition_per_stride_sizepartitions_list
partitionsr8  	partitions
             r5   split_into_partitionsr    s    ))M"N4$'II$4$45G5;%=!kk=BOJ>"II-~-.MC	)$ # r7   
state_dictmodelr  c                    US:X  a  U $ [         R                  " 5       nUR                  5        HY  u  pEUR                  X   R                  :X  a  M#  [	        UR
                  S5      nUR                  n[        X   UXg5      U   X'   M[     U $ )Nrc   r   )r
   get_tensor_model_parallel_ranknamed_parametersr  r   r  partition_strider  )r  r  r  rankname
parametersr  r  s           r5   split_state_dictr    s    Q224D!224z/555***A.,,01A:14>>BD
 5 r7   c            
       ^  ^  \ rS rSr SSS.U 4S jjjrSS\4U 4S jjjr     SS jr    SS jrSS	 jr	\
R                  " 5       SS
 j5       r\
R                  " 5       S 5       rSS jr SSSS\4S jjr   SS\\\R&                  4   S\\\\   4   S\S\\   4U 4S jjjrSrU =r$ )DistributedGPT3i  N)megatron_cfgc                *  > [         TU ]  " U/UQ70 UD6  [        XAUS9  [        R                  " U5      U l        [        U R
                  5      nUR                  5        H  n[        R                  " U5        M     UR                  [        R                  R                  5       5        U R
                  R                  (       d  U R
                  R                  (       a  [        XpR
                  5      nXpl        [        R"                  " 5       n	[%        5       R'                  SS 5      n
U
c  U	OU
n
[        R(                  " 5       U
-  U	-  n[+        XUS9n[-        XX-  5      nU R                   R/                  XR'                  SS5      S9  S U l        g )N)r  %checkpoint_tensor_model_parallel_size)tagstrictT)r  )r$   r%   r   r   from_pretrainedr2   rc  r  r
   8set_defaults_if_not_set_tensor_model_parallel_attributesr   r   r   r   r   r   
dist_modelr   r	   getr  r   r  load_state_dictrz   )r1   	model_dirr  path_load_tagr  argsr|  r  param	tensor_wsckpt_ws	ckpt_rank
load_modelr4   s                r5   r%   DistributedGPT3.__init__  s>    	4T4V4<> 00;$++&%%'EHHO ( 	

5::,,./ ;;t{{//!%5E<<>	*..!H$O&)G6687BiO	iF
%j9MN
''zz(D9 	( 	; !%r7   modec                 >   > U(       a  S U l         [        TU ]	  U5      $ r9   )rz   r$   train)r1   r  r4   s     r5   r  DistributedGPT3.train  s    $(D!w}T""r7   c                 8   U R                  UUUU R                  US9u  pxS n	Uc0  U R                  =R                  UR                  S5      -  sl        GO>[        R
                  " UR                  5       [        R                  UR                  S9n
Uc  [        U5       H  u  pSXUS 24'   M     O>[        U5       H  u  pSXUS-
  S 24'   M     [        U5       H  u  pSXS US-
  24'   M     UR                  5       nU
R                  S5      R                  5       n
U
R                  5       nUS:X  a4  [        R                  " UR                  S5      5      R                  5       n	O+[        R                  " UR                  S5      U
-  5      U-  n	[        XyS9$ )N)rz   r{  rc   r   r   r   )r  loss)r  rz   r   r   r   rn  rf   r   	enumerater   sumzero_r   )r1   rr  rw   rk   r{  prompts_len
inputs_lenr  r  r  	loss_maskr8  lmask_sums                 r5   r?   DistributedGPT3.forward  sl    !22 )  >!!55QG5

U[[HI!%k2DA'(Ie$ 3 &j1DA+,IQi( 2%k2DA+,I!a%i( 3 \\^F!r*002I }}H1}yyR1779yyR9!<=H(BBr7   c           	   +     #    UR                  SU R                  R                  5      nUR                  SU R                  R                  5      nUR                  SU R                  R                  5      n	UR                  SUR                  S5      U R                  R                  -   5      n
UR                  S5      nUnUc/  [        R                  " UR                  S5      /UR                  S9nUR                  5       R                  5       n[        U
U R                  R                  5      nX:  a  [        S5      eXR                  S5      -
  nUS:  aD  [        R                  " XUR                  S9R                  5       n[        R                   " UU4S	S
9n[#        UU5      U l        U R                  R&                  n[        R                  " U[        R(                  [        R*                  R-                  5       S9n[.        R1                  U5      u  nnSn[3        X5       GH  nUS S 2UU24   nUS S 2UU24   nUSUU2S U24   nU " UUU5      R4                  nUS S 2S	S S 24   n[7        UUUU	U R                  R8                  S9nUU:*  nUU   UUU4'   [;        US S 2S US-   24   S9v   UnU(       al  US:H  R=                  5       UR=                  5       -  nUS:H  R=                  5       US S 2US-
  4   S:H  R=                  5       -  UR=                  5       -  nUU-  n OyU(       aN  US:H  R=                  5       UR=                  5       -  nUS:H  R=                  5       UR=                  5       -  n!UU!-  n O$UU:H  R=                  5       UR=                  5       -  n UU -  n[        R>                  " U5      n"U(       d  GM  U"(       d  GM    g    g 7f)Nr  r  r  
max_lengthrc   r   rl  -context length + tokens_to_generate too larger   r  r   .)r  r  r  rO   )	sequencesit     ) popr2   r  r  r  r   tokens_to_generater   rY  r   r  itemrR   r  zerosrp  r  r  rz   eod_iduint8r   r   rc  rt  r6  r  r  rO   r   byteall)#r1   rr  r  #use_eod_token_for_early_terminationstop_on_double_eolstop_on_eolr|  r  r  r  r  r   lengthsmin_prompt_lengthmax_sequence_length
pad_lengthpadstermination_idis_generation_donerw   rk   prev_context_lengthcontext_length
tokens2usepositions2useattention_mask2user  last_token_logits
new_samplestartedhit_double_eolhit_two_eols
done_tokenhit_eoldones#                                      r5   r  DistributedGPT3.sample
  s     

7DKK$5$56

7DKK$5$56jj0G0GHZZKKNT[[;;;=
 [[^
?llFKKN#3FMMJG#KKM..0!*"&++"E"EG 3LMM(;;q>9
>;;v}}>>Bdf YY~26F !0
0C!E
 ++ #[[ekk%**2K2K2MO ;;FC 	%#$5KN  #6~#E EFJ(,?,N)NOM!/(7.H"J *&8-HOOF !'q"ax 0!';;113J /G.8.AF7N*+ ' <B^?@>A <B 9B 2C D D #1 "","3!9!9!;glln!L *c 17791)A-. /2567;tv >@G O ,l:
","3!9!9!;glln!L%,224w||~E+g5
(N:@@BLLN#
 "4j!@99/0D22tti Ls   OO	OOc                 
   UR                  S5      nUS:X  d   eUR                  S[        R                  " UR                  S5      /UR                  S95      R                  5       nU R                  R                  n[        R                  " SU R                  R                  UR                  S9R                  5       U-  n[        R                  " X4SS9nUR                  S5      n	[        U	U R                  R                  5      n	Xi:  a  [        S5      e[        UU	5      U l        [#        U5      n
Sn[        R$                  " U[        R&                  [        R(                  R+                  5       S	9R-                  S5      nUR/                  US5      n[0        R3                  U5      u  pSn[5        Xi5       GH  nUS S 2UU24   nUS S 2UU24   nUS
UU2S U24   nU " UUU5      R6                  nUR                  S5      n[8        R:                  " USS9nUS S 2SS S 24   U-   nUU:X  a   [        R<                  " USS S 24   SS9u  nnO'[        R<                  " UR?                  S5      SS9u  nn[        R@                  " US SU-   U5      RC                  5       R                  5       nUS SU-   U-  nUS SU-   n/ n[E        [G        UUU5      5       H|  u  nu  nn n!UR                  5       U:X  a9  UU:  n"U"(       a  M,  U
RI                  UU!   RK                  5       U US-   U-
  5        OURM                  UU U!45        [O        U5      U:X  d  M|    O   U
RQ                  URS                  5       R                  5       US-   U-
  5      (       a  Sn  OURU                  U V#s/ s H  n#U#S   PM
     sn#5      n$UU$S S 24   nURU                  U V#s/ s H  n#U#S   PM
     sn#5      US S 2U4'   URU                  U V#s/ s H  n#U#S   PM
     sn#5      R-                  S5      nU R                   RW                  U$5        UnGM     U(       d?  [5        U5       H0  n!U
RI                  UU!   RK                  5       UU!   WS-   U-
  5        M2     [Y        U
RZ                  S SS9n%[        U[O        U%5      5      n[5        U5       V&s/ s H  n&U%U&   S   PM     nn&[5        U5       V&s/ s H  n&U%U&   S   PM     nn&[        R\                  " USS9n[        R\                  " USS9n[_        XS9$ s  sn#f s  sn#f s  sn#f s  sn&f s  sn&f )Nr   rc   prompt_lengthrl  r   r  r  Fr   .r   Tr  c                     U S   $ )Nr   r   )r   s    r5   <lambda>-DistributedGPT3.beam_search.<locals>.<lambda>  s    1Q4r7   )keyreverse)r  scores)0r   r  r   rY  r   r  r2   r  rn  r  rp  r  r  rR   r  r  rz   BeamHypothesesr  float32r   r   rq  repeatrc  rt  r6  r  r+   log_softmaxr  r   divtruncr  zipaddrv   r  r  is_doner   newr  sortedbeamsstackr   )'r1   rr  	beam_sizenum_return_genr|  r   r  
stop_tokenr  final_sequence_lengthbeam_hypr  r  rw   rk   r  r  r  r  r  r  rO   	log_probs
new_scoressorted_scoresindicesbest_beam_ids
best_wordsbest_scores
next_beamsbeam_token_ranktoken_id
beam_scorebeam_id&is_beam_token_worse_than_top_num_beamsr  best_batchessorted_hypsr8  s'                                          r5   beam_searchDistributedGPT3.beam_searcht  s/   [[^
a 

LL&++a.)&--@BBF$& 	 [[''
zzt{{--==""&$&:6 F>r2 &A #$9$(KK$G$G!I 1LMM !0	0E!G "),U]]::,,.009	! 	 y!,;;FC 	%#MIN  #6~#E EFJ(,?,N)NOM!/(7.H"J *&8-HOOFQJf!4I"1b!8,v5J.).q!t$*7&w */OOB'D*:&w "IIgnq9}&=&0227%'$$&  !i-0:=J'Y7KJDM
K?EA@!@(J==?j0=LPY=Y:= LL!6!6!8*!/!!3m!CE %%xW&EFz?i/EA   1 6 6 8 . 2] BD D!:::&F:4tAw:&FGLL!O,F(.

%/0ZTaZ0)2F1n$%ZZZ @ZTaZ @AKKANF !!55lC #1w J|  +VG_224fWo+a/-?A ,
 X^^N^S-=>-2>-BC-B+a.#-BC-2>-BC-B+a.#-BCV+V+#fDD5 'G 1 @" DCs   (T/
T4
T9
T>(Uc                     U(       a#  S nU R                   " U/UQ70 UD6 H  nUnM     U$ U R                  " U/UQ70 UD6$ r9   )r  r;  )r1   rr  	do_sampler  r|  last_outputr=   s          r5   generateDistributedGPT3.generate  sM    K++f>t>v>$ ?##F<T<V<<r7   c                 .    U R                   " U/UQ70 UD6$ r9   )r  )r1   rr  r  r|  s       r5   stream_generateDistributedGPT3.stream_generate  s    {{63D3F33r7   c                 :    U R                   R                  XU5      $ r9   )r  r  )r1   destinationprefix	keep_varss       r5   r  DistributedGPT3.state_dict  s    ))+yIIr7   r  zOrderedDict[str, torch.Tensor]r  c                 8    U R                   R                  X5      $ r9   )r  r  )r1   r  r  s      r5   r  DistributedGPT3.load_state_dict  s     ..zBBr7   target_foldersave_checkpoint_namessave_functionr2   c                 D  > SUS   S'   US   R                  SS 5        US   R                  SS 5        US   R                  SS 5        US   R                  SS 5        [        5       R                  n[        5       R                  nXg-  US   S	'   [        TU ]  " XX440 UD6$ )
Nzgpt3-generationpipelinetyper  r  r  megatronr  r   )r  r	   tensor_model_parallel_sizepipeline_model_parallel_sizer$   save_pretrained)	r1   rL  rM  rN  r2   r|  tp_sizepp_sizer4   s	           r5   rU  DistributedGPT3.save_pretrained  s     &7z6"wFD)wND1zvt,zFM*77*99+2+<z<(w&}'4H@FH 	Hr7   )r2   r  rz   )r  )T)NNNNN)NTFF)   rc   )N Fr|   )rB   rC   rD   rE   r%   boolr  r?   r  r;  r   no_gradr@  rC  r  r  r   strosPathLiker   r   r   dictrU  rG   rH   rI   s   @r5   r  r    s   
  '$%
 #$% $%L#$ # #  $! &CT  37"' hToEb ]]_= = ]]_4 4J
 (,C$DC $C HL2615	H',S"++-='>H/4S$s)^/DH (0H !)	H Hr7   r  c                       \ rS rSr  SS\S\S\4S jjrS r SS\	R                  S	\S
\\	R                     4S jjrS\S\S\4S jrSrg)r  i  	num_beamslength_penaltyearly_stoppingc                 D    X l         X0l        Xl        / U l        SU l        g)z'
Initialize n-best list of hypotheses.
g    eAN)rc  rd  rb  r%  worst_score)r1   rb  rc  rd  s       r5   r%   BeamHypotheses.__init__  s$     -,"
r7   c                 ,    [        U R                  5      $ )z#
Number of hypotheses in the list.
)r  r%  r_   s    r5   __len__BeamHypotheses.__len__  s     4::r7   Nhypsum_logprobsbeam_indicesc           
         X!R                   S   U R                  -  -  n[        U 5      U R                  :  d  X@R                  :  a  U R
                  R                  XAU45        [        U 5      U R                  :  a\  [        [        U R
                  5       VVVs/ s H  u  nu  n  ovU4PM     snnn5      nU R
                  US   S   	 US   S   U l        g[        X@R                  5      U l        ggs  snnnf )z#
Add a new hypothesis to the list.
r   r   rc   N)
r  rc  r  rb  rf  r%  r  r$  r  r  )	r1   rk  rl  rm  scoreidxsr   sorted_next_scoress	            r5   r!  BeamHypotheses.add%  s     		"t/B/B BCt9t~~%1A1A)AJJu<894y4>>)%+3<TZZ3H-3Hiq!QH3H- &" JJ1!4Q78#5a#8#; #&u.>.>#?  *B-s   C5best_sum_logprobscur_lenr   c                     [        U 5      U R                  :  a  gU R                  (       a  gXU R                  -  -  nU R                  U:  nU$ )z
If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst
one in the heap, then we are done with this sentence.
FT)r  rb  rd  rc  rf  )r1   rt  ru  	cur_scorerets        r5   r"  BeamHypotheses.is_done8  sK     t9t~~%  )T5H5H,HHI""i/CJr7   )r%  rd  rc  rb  rf  )r   Fr9   )rB   rC   rD   rE   r  rf   r[  r%   ri  r   
LongTensorr   r!  r"  rG   r   r7   r5   r  r    s     *-(-!& "& 8<@!!@@ #5#3#34@&   r7   r  )r   r   r   N)Jr   r^  collectionsr   typingr   r   r   r   r   r   megatron_utilr	   r
   megatron_util.global_varsr   megatron_util.modelr   r   r   r   !megatron_util.model.fused_softmaxr   r   torch.nnr   r+   transformers.modeling_utilsr   modelscope.modelsr   modelscope.models.nlp.gpt3r   modelscope.outputsr   r   modelscope.utils.megatron_utilsr   $modelscope.utils.nlp.load_checkpointr   !modelscope.utils.streaming_outputr   Moduler   rK   rp   r   r   r   r   r   r   jitscriptTensorrf   r   r   r  r)  rC  r]  ra  rc  r  r  r  r  r  r]  r  r  r  r  r   r7   r5   <module>r     s,     	 # 8 8  ' >1 1 C  $ 7 ( 1 N > 9 B-#bii -#`>BII >B%299 %
@		 @FvBII vr	 	 ;ELL ; ;+0<<;',;16; ; < <ELL </4||<+0<5:\\< <k299 k\^bii ^B4299 4n? ?D00,-`F F<c5<<&7!8  !$)-c5<<.?)@XHj"6 XHv
5 5r7   