
    9iJ3                     X   S SK r S SKrS SKJrJr  S SKrS SKrS SKJr  S SKJ	r
  S SKJr  S SKJr  SSKJr   " S	 S
\R"                  5      r " S S\R"                  5      r " S S\R"                  5      r " S S\R"                  5      r " S S\R"                  5      r " S S\5      rg)    N)OptionalUnion)nn)
functional)PreTrainedModel)	ModelFile   )GPTMoEConfigc                   H   ^  \ rS rSrSrU 4S jrS r SS jrSS jrSr	U =r
$ )	GPTMoESelfAttention   zParallel self-attention layer abstract class.

Self-attention layer takes input with size [s, b, h]
and returns output of the same size.
c                 .  > [         TU ]  5         UR                  U l        UR                  U l        U R                  U R                  -  U l        [
        R                  " U R                  SU R                  -  5      U l        [
        R                  " SS9U l	        [
        R                  " UR                  5      U l        [
        R                  " U R                  U R                  5      U l        [
        R                  " UR                  5      U l        g )N   dim)super__init__hidden_sizenum_attention_headshidden_size_per_attention_headr   Linearquery_key_valueSoftmaxsoftmaxDropoutattention_probs_dropout_probattention_dropoutdensehidden_dropout_proboutput_dropoutselfconfig	__class__s     f/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/nlp/gpt_moe/backbone.pyr   GPTMoESelfAttention.__init__%   s    !--#)#=#=   8 88 	+  "yy)9)9)*T-=-=)= ?zzb)!#//"1 YYt//1A1AB
 jj)C)CD    c                     UR                  5       SS U R                  U R                  4-   nUR                  " U6 nUR	                  SSSS5      $ )zOTranspose a 3D tensor [b, s, np*hn] into a 4D tensor with
size [b, np, s, hn].
Nr   r      r	   r   )sizer   r   viewpermute)r#   tensornew_tensor_shapes      r&   _transpose_for_scores)GPTMoESelfAttention._transpose_for_scores8   sX     ";;="-$$d&I&I1K K./~~aAq))r(   c                     UR                  5       S-
  nUR                  5       U   U-  n[        R                  " XUS9nU(       a  [	        S U 5       5      $ U$ )Nr	   r   c              3   @   #    U  H  oR                  5       v   M     g 7fN)
contiguous).0chunks     r&   	<genexpr>CGPTMoESelfAttention._split_tensor_along_last_dim.<locals>.<genexpr>L   s     E))++s   )r   r+   torchsplittuple)r#   r.   num_partitionscontiguous_split_chunkslast_dimlast_dim_sizetensor_lists          r&   _split_tensor_along_last_dim0GPTMoESelfAttention._split_tensor_along_last_dimA   sS    
 ::<!#h/>Akk&XF"EEEEr(   c                 V   UR                  S5      n[        R                  " USSXD/5      nU R                  U5      nU R	                  US5      u  pgnU R                  U5      n	U R                  U5      n
U R                  U5      nUR                  5       n[        R                  " U	U
R                  SS5      5      nU[        R                  " U R                  5      -  nU(       af  U
R                  S5      n[        R                  " [        R                  " SXN4UR                  S95      R                  SSXN5      R                  U5      nSSU-
  -  n[        R                   " X5      U-
  R                  U5      nU R#                  U5      nU R%                  U5      n[        R                  " UU5      nUR'                  S	SSS5      R)                  5       nUR                  5       S S U R*                  4-   nUR                  " U6 nU R-                  U5      nU R/                  U5      nU$ )
Nr	   r   r   r*   )deviceg     @      ?r   )r+   r:   reshaper   rB   r0   typematmul	transposemathsqrtr   trilonesrF   r,   mulr   r   r-   r5   r   r   r!   )r#   hidden_states	ltor_maskis_infertgt_lenmixed_x_layermixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerprevious_typeattention_scoressrc_lenconverted_maskattention_probscontext_layernew_context_layer_shapeoutputs                       r&   forwardGPTMoESelfAttention.forwardP   s    
  $$Q'MM)aG-EF	,,];--mQ? 	@	-> 001BC..?	001BC#((* !<<(1(;(;B(CE+dii///1 1 nnQ'G



Aw0"/"6"6899= !W:77;tM7J  !C)O4!II&6B,-.2d=.A 	 ,,'78 00A _kB%--aAq9DDF"/"4"4"6s";?  #  &**,CD M*$$V,r(   )r   r   r   r   r   r!   r   r   )F)__name__
__module____qualname____firstlineno____doc__r   r0   rB   rd   __static_attributes____classcell__r%   s   @r&   r   r      s(    E&* >C6 6r(   r   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )	GPTMoEMLP   zMLP.

MLP will take the input with h hidden state, project it to 4*h
hidden dimension, perform nonlinear transformation, and project the
state back into h hidden dimension.
c                 *  > [         TU ]  5         UR                  n[        R                  " USU-  5      U l        [        R                  U l        [        R                  " SU-  U5      U l	        [        R                  " UR                  5      U l        g )N   )r   r   r   r   r   dense_h_to_4hFgeluactivation_funcdense_4h_to_hr   r    dropout)r#   r$   r   r%   s      r&   r   GPTMoEMLP.__init__   si    ((YY{AOD vvYYq;Dzz&"<"<=r(   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r4   )rs   rv   rw   rx   )r#   rQ   intermediate_parallelrc   s       r&   rd   GPTMoEMLP.forward   sK     !% 2 2= A $ 4 45J K##$9:f%r(   )rv   rw   rs   rx   	rf   rg   rh   ri   rj   r   rd   rk   rl   rm   s   @r&   ro   ro      s    
> r(   ro   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )GPTMoETransformerLayer   zwA single transformer layer.

Transformer layer takes input with size [s, b, h] and returns an
output of the same size.
c                   > [         TU ]  5         [        R                  " UR                  UR
                  S9U l        [        U5      U l        [        R                  " UR                  UR
                  S9U l	        [        U5      U l        g N)eps)r   r   r   	LayerNormr   layernorm_epsiloninput_layernormr   	attentionpost_attention_layernormro   mlpr"   s     r&   r   GPTMoETransformerLayer.__init__   sr      "||F$<$< > -V4 )+F$<$<)>% V$r(   c                     U R                  U5      nU R                  X25      nX-   nU R                  U5      nU R                  U5      nXV-   nU$ r4   )r   r   r   r   )r#   rQ   rR   layernorm_outputattention_outputlayernorm_input
mlp_outputrc   s           r&   rd   GPTMoETransformerLayer.forward   sZ    
  //>>>*:F':88IXX./
 -r(   )r   r   r   r   r}   rm   s   @r&   r   r      s    %" r(   r   c                   8   ^  \ rS rSrSrU 4S jrS rS rSrU =r	$ )GPTMoETransformer   zTransformer class.c                 Z  > [         TU ]  5         S U l        UR                  U l        [
        R                  R                  [        U R                  5       Vs/ s H  n[        U5      PM     sn5      U l
        [        R                  " UR                  UR                  S9U l        g s  snf r   )r   r   input_tensornum_hidden_layers
num_layersr:   r   
ModuleListranger   layersr   r   r   final_layernorm)r#   r$   _r%   s      r&   r   GPTMoETransformer.__init__   s      !22hh))5:4??5KL5K#F+5KLN  "||F$<$< > Ms   B(c                      U R                   U   $ r4   )r   )r#   layer_numbers     r&   
_get_layerGPTMoETransformer._get_layer   s    {{<((r(   c                     [        U R                  5       H  nU R                  U5      nU" X5      nM     U R                  U5      nU$ r4   )r   r   r   r   )r#   rQ   attention_maskindexlayers        r&   rd   GPTMoETransformer.forward   sG     4??+EOOE*E!-@M ,
 ,,];r(   )r   r   r   r   )
rf   rg   rh   ri   rj   r   r   rd   rk   rl   rm   s   @r&   r   r      s    >)
 
r(   r   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )GPTMoETransformerLanguageModel   a  Transformer language model.

Arguments:
    transformer_hparams: transformer hyperparameters
    vocab_size: vocabulary size
    max_sequence_length: maximum size of sequence. This
                         is used for positional embedding
    embedding_dropout_prob: dropout probability for embeddings
    num_tokentypes: size of the token-type embeddings. 0 value
                    will ignore this embedding
c                 L  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  5      U l        [        U5      U l        g r4   )r   r   r   	Embedding
vocab_sizer   word_embeddingsmax_position_embeddingsposition_embeddingsr   r    embedding_dropoutr   transformerr"   s     r&   r   'GPTMoETransformerLanguageModel.__init__  s|      "||F,=,=,2,>,> @#%<<0N0N060B0B$D !#F,F,F!G -V4r(   c                     U R                  U5      nU R                  U5      nXE-   nU R                  U5      nU R                  UU5      n[        R
                  " XR                   R                  5      n	U	$ r4   )r   r   r   r   rt   linearweight)
r#   	input_idsr   position_idswords_embeddingsr   
embeddingstransformer_inputtransformer_outputlogitss
             r&   rd   &GPTMoETransformerLanguageModel.forward  sv    //	:"66|D%;
 22:>!--.?.<> ,.B.B.I.IJr(   )r   r   r   r   r}   rm   s   @r&   r   r      s    
5
 
r(   r   c                      ^  \ rS rSr\rS rU 4S jr   S	S jr\	S\
\\\R                  4      4S j5       rS rSrU =r$ )
GPTMoEModeli  c                    [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR                  b2  UR                  R                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        gg)zInitialize the weightsg        )meanstdNrG   )
isinstancer   r   r   datanormal_r$   initializer_rangebiaszero_r   padding_idxr   fill_)r#   modules     r&   _init_weightsGPTMoEModel._init_weights!  s   fbii(( MM&&dkk;; ' ={{&  &&( '--MM&&dkk;; ' =!!-""6#5#56<<> .--KK""$MM$$S) .r(   c                 D   > [         TU ]  U5        [        U5      U l        g r4   )r   r   r   language_modelr"   s     r&   r   GPTMoEModel.__init__3  s     <VDr(   c                 H   UR                  S5      n[        R                  " [        R                  " SSXf4[        R                  UR
                  S95      nUcN  [        R                  " U[        R                  UR
                  S9nUR                  S5      R                  U5      nU R                  XU5      nS nUbQ  [        R                  " 5       n	U	" UR                  SU R                  R                  5      UR                  S5      5      n[        R                   " XS9$ )Nr	   )dtyperF   r   r   )lossr   )r+   r:   rN   rO   longrF   arange	unsqueeze	expand_asr   r   CrossEntropyLossr,   r$   r   addictDict)
r#   r   r   r   labelskwargs
seq_lengthr   r   loss_fcts
             r&   rd   GPTMoEModel.forward7  s     ^^A&
JJ1j5"ZZ'..01  <<%**Y5E5EGL'11!4>>yIL$$YM**,HB 6 67RJD{{44r(   pretrained_model_name_or_pathc                    U R                   R                  U5      nU " U5      n[        R                  R	                  U[
        R                  5      n[        R                  " U5      nSU;   a  US   nUR                  5        VVs0 s H  u  pgUR                  SS5      U_M     nnnUR                  U5        U$ s  snnf )N
state_dictzmodel.language_modelr   )config_classfrom_pretrainedospathjoinr   TORCH_MODEL_BIN_FILEr:   loaditemsreplaceload_state_dict)clsr   r$   modelstate_dict_filer   kvs           r&   r   GPTMoEModel.from_pretrainedO  s     !!11)+F'',,'D'0'E'EGZZ0
:%#L1J #((*
* II,.>?B* 	 
 	j)
s   B;c                 
    SU0$ )Nr    )r#   r   argsr   s       r&   prepare_inputs_for_generation)GPTMoEModel.prepare_inputs_for_generationb  s    Y''r(   )r   )NNN)rf   rg   rh   ri   r
   r   r   r   rd   classmethodr   r   strr   PathLiker   r   rk   rl   rm   s   @r&   r   r     sg    L*$E  $!	50 08s?A{{@K :L 1M $( (r(   r   )rL   r   typingr   r   r   r:   r   torch.nnr   rt   transformers.modeling_utilsr   modelscope.utils.constantr   configurationr
   Moduler   ro   r   r   r   r   r   r(   r&   <module>r     s      	 "    $ 7 / 'h")) hV		 >)RYY )X		 D$RYY $NF(/ F(r(   