
    9i                        S SK JrJrJrJr  S SKrS SKrS SKrS SKJ	s  J
r  S SKJr  S SKJ	r	  S SKJrJr  SSKJrJr  \R(                  " \5      rS rS	 r\\R                  R                  R2                  \S
.r " S S\	R6                  5      r " S S\	R6                  5      r " S S\	R6                  5      r " S S\	R6                  5      r " S S\	R6                  5      r  " S S\	R6                  5      r! " S S\	R6                  5      r" " S S\	R6                  5      r# " S S\	R6                  5      r$ " S S\	R6                  5      r% " S S \	R6                  5      r& " S! S"\	R6                  5      r' " S# S$\	R6                  5      r( " S% S&\(5      r) " S' S(\	R6                  5      r* " S) S*\	R6                  5      r+ " S+ S,\(5      r, " S- S.\(5      r- " S/ S0\R                  R6                  5      r.g)1    )absolute_importdivisionprint_functionunicode_literalsN)mpu)nn)normal_init_methodscaled_init_method   )PlugNLGConfigPlugNLUConfigc                 n    U S-  S[         R                  " U [        R                  " S5      -  5      -   -  $ )zImplementation of the gelu activation function.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
g      ?      ?g       @)torcherfmathsqrtxs    c/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/nlp/plug/backbone.pygelur   "   s.    
 s7cEIIa$))C.&899::    c                 4    U [         R                  " U 5      -  $ N)r   sigmoidr   s    r   swishr   *   s    u}}Qr   )r   relur   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )BertLayerNorm1   c                    > [         [        U ]  5         [        R                  " [
        R                  " U5      5      U l        [        R                  " [
        R                  " U5      5      U l	        X l
        g)zWConstruct a layernorm module in the TF style (epsilon inside the square root).
        N)superr   __init__r   	Parameterr   onesweightzerosbiasvariance_epsilon)selfhidden_sizeeps	__class__s      r   r#   BertLayerNorm.__init__3   sJ     	mT+-ll5::k#:;LL[!9:	 #r   c                     UR                  SSS9nX-
  R                  S5      R                  SSS9nX-
  [        R                  " X0R                  -   5      -  nU R
                  U-  U R                  -   $ )NT)keepdim   )meanpowr   r   r)   r&   r(   )r*   r   uss       r   forwardBertLayerNorm.forward;   sh    FF2tF$UKKND1Uejj%:%:!:;;{{Q**r   )r(   r)   r&   )g-q=__name__
__module____qualname____firstlineno__r#   r7   __static_attributes____classcell__r-   s   @r   r   r   1   s    $+ +r   r   c                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )BertEmbeddingsB   zLConstruct the embeddings from word, position and token_type embeddings.
    c           	      b  > [         [        U ]  5         [        R                  " UR
                  UR                  [        SUR                  S9S9U l	        [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        UR                   U l        UR"                  U l        UR$                  U l        ['        UR                  UR(                  S9U l        [        R,                  " UR.                  5      U l        g )N        r3   std)init_methodr,   )r"   rB   r#   r   VocabParallelEmbedding
vocab_sizer+   r	   initializer_rangeword_embeddingsr   	Embeddingmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddingsfp32_layernormfp32_embeddingfp32_tokentypesr   layernorm_epsilon	LayerNormDropouthidden_dropout_probdropoutr*   configr-   s     r   r#   BertEmbeddings.__init__F   s    nd,."99*f668 9
 $&<<0N0N060B0B$D %'\\&2H2H282D2D&F"
 %33$33%55&F$<$<>zz&"<"<=r   c                    UR                  S5      nUcN  [        R                  " U[        R                  UR                  S9nUR                  S5      R                  U5      nUc  [        R                  " U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  (       d  XV-   U-   nU R                  (       a!  U R                  (       d  UR                  5       nUR                  5       n	U R                  (       a  UR!                  5       nU R#                  U5      nU R                  (       a3  U R                  (       a  UR                  5       nOUR                  U	5      nOUR!                  5       UR!                  5       -   UR!                  5       -   nU R                  (       a!  U R                  (       d  UR                  5       nUR                  5       n	U R                  (       a  UR!                  5       nU R#                  U5      nU R                  (       a3  U R                  (       a  UR                  5       nOUR                  U	5      nU R%                  U5      nU$ )Nr   )dtypedevicer   )sizer   arangelongr`   	unsqueeze	expand_as
zeros_likerM   rP   rR   rU   rT   rS   halftypefloatrW   rZ   )
r*   	input_idstoken_type_idsposition_ids
seq_lengthwords_embeddingsrP   rR   
embeddingsprevious_types
             r   r7   BertEmbeddings.forward[   s   ^^A&
 <<%**Y5E5EGL'11!4>>yIL!"--i8N//	:"66|D $ : :> J##)?BWWJ""4+>+>'__.
&OO-M""'--/

3J""&&!+!2J!+!?J)//14G4M4M 5 %++-.J##D,?,?'__.
&OO-M""'--/

3J""''!+!2J!+!?J\\*-
r   )rW   rZ   rT   rS   rU   rP   rR   rM   )NN	r:   r;   r<   r=   __doc__r#   r7   r>   r?   r@   s   @r   rB   rB   B   s    >*) )r   rB   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BertSelfOutput   c           	        > [         [        U ]  5         [        US5      (       a1  UR                  (       a   [        SUR                  UR                  S9nO[        SUR                  S9n[        R                  " UR                  UR                  SSSUS9U l        UR                  U l        UR                  (       d$  [        UR                  UR                   S9U l        OS U l        [$        R&                  " UR(                  5      U l        g 	N	deep_initrE   r3   rG   
num_layersrF   Tr   )
input_sizeoutput_sizer(   input_is_parallelstriderH   rI   )r"   ru   r#   hasattrry   r
   rL   num_hidden_layersr	   r   RowParallelLinearr+   denserS   pre_lnr   rV   rW   r   rX   rY   rZ   r*   r\   rH   r-   s      r   r#   BertSelfOutput.__init__   s    nd,.6;''F,<,<,,,!335K
 -f668K**))**"#%
 %33}}*""(@(@BDN "DNzz&"<"<=r   c                 <   U R                  U5      nU R                  U5      nX-   nU R                  bf  UR                  5       nU R                  (       a  UR                  5       nU R                  U5      nU R                  (       a  UR                  U5      nU$ UnU$ r   r   rZ   rW   rh   rS   ri   r*   hidden_statesinput_tensorln_inputrp   s        r   r7   BertSelfOutput.forward       
 

=1]3 />>%$MMOM""#>>+ NN84M"" - 2 2= A  %Mr   rW   r   rZ   rS   r9   r@   s   @r   ru   ru          >2 r   ru   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BertAttention   c                   > [         [        U ]  5         UR                  U l        UR                  (       a$  [        UR                  UR                  S9U l        OS U l        [        R                  " UR                  UR                  UR                  S[        SUR                  S9UR                  S9U l        [#        U5      U l        g )NrI   TrE   rF   )r+   num_attention_headsdropout_proboutput_parallelrH   separate)r"   r   r#   rS   r   r   r+   rV   rW   r   BertParallelSelfAttentionr   attention_probs_dropout_probr	   rL   attn_separater*   ru   outputr[   s     r   r#   BertAttention.__init__   s    mT+-$33==*""(@(@BDN "DN11** & : :<< *f668))+	 %V,r   c                 Z   U R                   by  UnUR                  5       nU R                  (       a  UR                  5       nU R                  U5      nU R                  (       a  UR                  U5      nU R	                  UU5      nOU R	                  UU5      nU R                  UU5      nU$ r   )rW   rh   rS   ri   r*   r   )r*   r   attention_maskr   rp   	ln_outputself_outputattention_outputs           r   r7   BertAttention.forward   s    
 >>%#H(--/M""'--/x0I""%NN=9	))K
 ))K
  ;;
  r   )rW   rS   r   r*   r9   r@   s   @r   r   r      s    -$   r   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BertIntermediate   c                 D  > [         [        U ]  5         [        R                  " UR
                  UR                  SSS[        SUR                  S9S9U l	        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g )NTFr   rE   rF   )r|   r}   r(   gather_outputr   rH   )r"   r   r#   r   ColumnParallelLinearr+   intermediate_sizer	   rL   r   
isinstance
hidden_actstrACT2FNintermediate_act_fnr[   s     r   r#   BertIntermediate.__init__   s    .0--))00*f6689
 &++S11 $*&*;*;#< 7=7H7H 	 r   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   r*   r   s     r   r7   BertIntermediate.forward   s(     

=100?r   r   r9   r@   s   @r   r   r      s    I r   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )
BertOutput   c           	        > [         [        U ]  5         [        US5      (       a1  UR                  (       a   [        SUR                  UR                  S9nO[        SUR                  S9n[        R                  " UR                  UR                  SSSUS9U l        UR                  U l        UR                  (       d$  [!        UR                  UR"                  S9U l        OS U l        [&        R(                  " UR*                  5      U l        g rx   )r"   r   r#   r   ry   r
   rL   r   r	   r   r   r   r+   r   rS   r   r   rV   rW   r   rX   rY   rZ   r   s      r   r#   BertOutput.__init__  s    j$(*6;''F,<,<,,,!335K
 -f668K**//**"#%
 %33}}*""(@(@BDN "DNzz&"<"<=r   c                 <   U R                  U5      nU R                  U5      nX-   nU R                  bf  UR                  5       nU R                  (       a  UR                  5       nU R                  U5      nU R                  (       a  UR                  U5      nU$ UnU$ r   r   r   s        r   r7   BertOutput.forward  r   r   r   r9   r@   s   @r   r   r      r   r   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )	BertLayeri/  c                 $  > [         [        U ]  5         [        U5      U l        [        U5      U l        [        U5      U l        UR                  U l	        UR                  (       a$  [        UR                  UR                  S9U l        g S U l        g NrI   )r"   r   r#   r   	attentionr   intermediater   r   rS   r   r   r+   rV   rW   r[   s     r   r#   BertLayer.__init__1  sq    i')&v.,V4 ($33==*""(@(@BDN "DNr   c                 v   U R                  X5      nU R                  bx  UnUR                  5       nU R                  (       a  UR	                  5       nU R                  U5      nU R                  (       a  UR                  U5      nU R                  U5      nOU R                  U5      nU R                  Xs5      nU$ r   )r   rW   rh   rS   ri   r   r   )	r*   r   r   r   r   rp   r   intermediate_outputlayer_outputs	            r   r7   BertLayer.forward=  s    >>-H>>%'H,113M""+113x0I""%NN=9	"&"3"3I">"&"3"34D"E{{#6Ir   )rW   r   rS   r   r   r9   r@   s   @r   r   r   /  s    
" r   r   c                   8   ^  \ rS rSrU 4S jr   SS jrSrU =r$ )BertEncoderiN  c                 ^  > [         [        U ]  5         [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        UR                  U l	        UR                  (       a$  [        UR                  UR                  S9U l        g S U l        g s  snf r   )r"   r   r#   r   
ModuleListranger   r   layerrS   r   r   r+   rV   rW   r*   r\   _r-   s      r   r#   BertEncoder.__init__P  s    k4)+]](-f.F.F(GH(G1Yv(GHJ
$33==*""(@(@BDN "DN Is   B*c                   ^  / nU 4S jnU(       ac  Sn[        T R                  5      n	Sn
X:  aD  [        R                  " U" XU
-   5      UUS-  5      nXX:X  a  UR	                  5         X-  nX:  a  MD  O[        T R                  5       H  u  pU" X5      nX[:X  a  UR	                  5         U[        T R                  5      S-
  :X  aq  T R                  bd  UR                  5       nT R                  (       a  UR                  5       nT R                  U5      nT R                  (       a  UR                  U5      nU(       d  M  UR                  U5        M     U(       a  U(       a  T R                  bd  UR                  5       nT R                  (       a  UR                  5       nT R                  U5      nT R                  (       a  UR                  U5      nUR                  U5        U$ )Nc                    >^ ^ UUU 4S jnU$ )Nc                  X   > TR                   TT nU S   nU H  nU" X S   5      nM     U$ )Nr   r   r   )inputslayersx_r   endr*   starts       r   custom_forward;BertEncoder.forward.<locals>.custom.<locals>.custom_forwardg  s9    E#.AY#Er!9-B $	r    )r   r   r   r*   s   `` r   custom#BertEncoder.forward.<locals>.custome  s     "!r   r   r   )lenr   r   
checkpointdetach_	enumeraterW   rh   rS   ri   append)r*   r   r   output_all_encoded_layerscheckpoint_activationsdetach_indexall_encoder_layersr   	layer_idxr{   chunk_lengthilayer_modulerp   s   `             r   r7   BertEncoder.forward[  s     		" "ITZZJL( #9,&>?"Q&!(  ,!))+)	 ( $-TZZ#8 ,] K$!))+DJJ!++0J$1$6$6$8M**(5(;(;(=$(NN=$AM**(5(:(:=(I,,&--m< $9 ),B~~) - 2 2 4&&$1$7$7$9M $} =&&$1$6$6}$EM%%m4!!r   )rW   rS   r   )TFr0   r9   r@   s   @r   r   r   N  s    	" #'$9" 9"r   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )
BertPooleri  c                    > [         [        U ]  5         [        R                  " UR
                  UR
                  5      U l        [        R                  " 5       U l        g r   )	r"   r   r#   r   Linearr+   r   Tanh
activationr[   s     r   r#   BertPooler.__init__  s;    j$(*YYv1163E3EF
'')r   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   r   )r*   r   first_token_tensorpooled_outputs       r   r7   BertPooler.forward  s6     +1a40

#566r   )r   r   r9   r@   s   @r   r   r     s    $
 r   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BertPredictionHeadTransformi  c                 x  > [         [        U ]  5         [        R                  " UR
                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     OUR                  U l        [        UR
                  UR                  S9U l        UR                  U l        g r   )r"   r   r#   r   r   r+   r   r   r   r   r   transform_act_fnr   rV   rW   rS   r[   s     r   r#   $BertPredictionHeadTransform.__init__  s    )49;YYv1163E3EF
&++S11 !'v'8'8 97=7H7H 	&F$<$<>$33r   c                    U R                  U5      nU R                  U5      nUR                  5       nU R                  (       a  UR	                  5       nU R                  U5      nU R                  (       a  UR                  U5      nU$ r   )r   r   rh   rS   ri   rW   )r*   r   rp   s      r   r7   #BertPredictionHeadTransform.forward  sr    

=1--m<%**,)//1M}5)..}=Mr   )rW   r   rS   r   r9   r@   s   @r   r   r     s    4	 	r   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BertLMPredictionHeadi  c                 d  >^  [         [        T ]  5         [        U5      T l        UT l        [        R                  " [        R                  " UR                  S5      5      5      T l        ST R                  l        UR                  T l        UR                  T l        U 4S jnUT l        ST l        g )Nr   Tc                 J   > TR                   (       a  U R                  5       $ U $ r   )rT   rg   tensorr*   s    r   convert_to_type6BertLMPredictionHead.__init__.<locals>.convert_to_type  s    ""{{}$r   F)r"   r   r#   r   	transformdecoder_weightr   r$   r   r'   ra   r(   model_parallelrT   rS   type_converter	converted)r*   r\   bert_model_embedding_weightsr  r-   s   `   r   r#   BertLMPredictionHead.__init__  s    "D244V< ;LLKK499!<=?	#'		 $33$33	 .r   c                    U R                   (       dg  SU l         U R                  (       aO  U R                  R                  5         U R                  (       a$  U R                  R
                  R                  5         U R                  U R                  U5      5      n[        R                  " U5      n[        R                  " U R                  U5      U R                  U R                  5      U R                  U R                  5      5      nU$ )NT)r
  rT   r  rg   rS   rW   ri   r	  r   copy_to_model_parallel_regionFlinearr  r(   r   s     r   r7   BertLMPredictionHead.forward  s    ~~!DN""##%&&NN,,224t':':='IJ99-H. 3 34		*, r   )r(   r
  r  rT   rS   r  r	  r9   r@   s   @r   r   r     s    , r   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BertPreTrainingHeadsi  c                    > [         [        U ]  5         [        UU5      U l        [
        R                  " UR                  S5      U l        g )N   )	r"   r  r#   r   predictionsr   r   r+   seq_relationship)r*   r\   r  r-   s      r   r#   BertPreTrainingHeads.__init__  s=    "D24/0LN "		&*<*<a @r   c                     U R                  U5      nU R                  R                  5        H  nUc  M  UR                  U5      nM     U R                  U5      nX54$ r   )r  r  
parameterstype_as)r*   sequence_outputr   prediction_scorespseq_relationship_scores         r   r7   BertPreTrainingHeads.forward  s`     ,,_=&&113Ay)11!4M 4 "&!6!6}!E 88r   )r  r  r9   r@   s   @r   r  r    s    A9 9r   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )PreTrainedBertModeli  zyAn abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
c                   > [         [        U ]  5         [        U[        5      (       dX  [        U[
        5      (       dC  [        SR                  U R                  R                  U R                  R                  5      5      eXl
        g )NzParameter config in `{}(config)` should be an instance of class `BertConfig`. To create a model from a Google pretrained model use `model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`)r"   r"  r#   r   r   r   
ValueErrorformatr-   r:   r\   )r*   r\   r   kwargsr-   s       r   r#   PreTrainedBertModel.__init__  sp    !413&-00:' :'FFLfNN++T^^-D-DGFG G
 r   c                 4   [        U[        R                  [        R                  45      (       a9  UR                  R
                  R                  SU R                  R                  S9  O^[        U[        5      (       aI  UR                  R
                  R                  5         UR                  R
                  R                  S5        [        U[        R                  5      (       a3  UR                  b%  UR                  R
                  R                  5         ggg)z Initialize the weights.
        rE   rF   r   N)r   r   r   rN   r&   datanormal_r\   rL   r   r(   zero_fill_)r*   modules     r   init_bert_weights%PreTrainedBertModel.init_bert_weights  s     fryy",,788 MM&&dkk;; ' =..KK""$MM$$S)fbii((V[[-DKK""$ .E(r   )r\   )	r:   r;   r<   r=   rs   r#   r.  r>   r?   r@   s   @r   r"  r"    s    	% %r   r"  c                   @   ^  \ rS rSrSrU 4S jr     SS jrSrU =r$ )	BertModeli  a
  BERT model ("Bidirectional Embedding Representations from a Transformer").

Params:
    config: a BertConfig class instance with the configuration to build a new model

Inputs:
    `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
        with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
        `extract_features.py`, `run_classifier.py` and `run_squad.py`)
    `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
        types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
        a `sentence B` token (see BERT paper for more details).
    `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
        selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
        input sequence length in the current batch. It's the mask that we typically use for attention when
        a batch has varying length sentences.
    `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as
        described below. Default: `True`.

Outputs: Tuple of (encoded_layers, pooled_output)
    `encoded_layers`: controlled by `output_all_encoded_layers` argument:
        - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
            of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
            encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
        - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
            to the last attention block of shape [batch_size, sequence_length, hidden_size],
    `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
        classifier pretrained on top of the hidden state associated to the first character of the
        input (`CLF`) to train on the Next-Sentence task (see BERT's paper).

Examples:
    >>> # Already been converted into WordPiece token ids
    >>> input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    >>> input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    >>> token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

    >>> config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
    >>>     num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

    >>> model = modeling.BertModel(config=config)
    >>> all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
c                    > [         [        U ]  U5        [        U5      U l        [        U5      U l        [        U5      U l        U R                  U R                  5        g r   )r"   r1  r#   rB   ro   r   encoderr   poolerapplyr.  r[   s     r   r#   BertModel.__init__C  sH    i'/(0"6* (

4))*r   c                 (   Uc  [         R                  " U5      nUc  [         R                  " U5      nUR                  S5      R                  S5      nUR	                  [        U R                  R                  5       5      R                  S9nSU-
  S-  nU R                  X5      nU R                  UUUUUS9n	U	S   n
U R                  R                  5        H  nUc  M  U
R                  U5      n
  O   U
S S 2S4   nU(       a  U(       a  U	S   n	X4$ )	Nr   r2   r_   r        )r   r   r   r0   r   )r   	ones_likerf   rd   tonextr3  r  r_   ro   r4  r  )r*   rj   rk   r   r   r   r   extended_attention_maskembedding_outputencoded_layersr  r  r   s                r   r7   BertModel.forwardJ  s(    !"__Y7N!"--i8N #1":":1"="G"G"J #:"<"<t||..0177 #= #9#&)@#@H"L??9E#&?#9% & ' ),'')Ay-55a8O	 * (1-(,B+B/N,,r   )ro   r3  r4  )NNTFr0   rr   r@   s   @r   r1  r1    s)    )V+ "&$/- /-r   r1  c                   4   ^  \ rS rSrU 4S jr SS jrSrU =r$ )DecodeLayeri|  c           
      \  >^  [         [        T ]  5         [        SUR                  S9n[        SUR                  UR                  S9n[        R                  " UR                  UR                  UR                  UR                  UUS9T l        [        R                  " UR                  UR                  UR                  UR                  USUS9T l        [!        UR                  UR"                  S9T l        [!        UR                  UR"                  S9T l        [!        UR                  UR"                  S9T l        [        R*                  " UR                  UR,                  SUS9T l        [1        UR2                  [4        5      (       a  [6        UR2                     OUR2                  T l        [        R:                  " UR,                  UR                  S	US
9T l        [>        R@                  RC                  UR                  5      T l"        URF                  T l#        U 4S jnUT l$        g )NrE   rF   rz   )r+   r   attention_dropout_proboutput_dropout_probrH   output_layer_init_methodF)r+   r   rD  rE  rH   r   rF  rI   )r   rH   T)r~   rH   c                 J   > TR                   (       a  U R                  5       $ U $ r   )rS   ri   r  s    r   r  -DecodeLayer.__init__.<locals>.convert_to_type  s    ""||~%r   )%r"   rB  r#   r	   rL   r
   r   r   GPT2ParallelSelfAttentionr+   r   r   rY   r   PalmParallelCrossAttentioncross_attentionr   rV   input_layernormpost_attention_layernormpost_cross_attention_layernormr   r   r   r   r   r   r   r   r   r   r   r   rX   rZ   rS   r	  )r*   r\   rH   rF  r  r-   s   `    r   r#   DecodeLayer.__init__~  s   k4)+(&224#5((//$1 
 66** & : :#)#F#F & : :#%=
  #==** & : :#)#F#F & : :#%= 
  -F$<$< >(5F$<$<)>%.;F$<$</>+  44$$#	
 &++S11 $*&*;*;#<7=7H7H 	 ++$$"0	
 xx''(B(BC$33	 .r   c                    UnUR                  5       nU R                  U R                  U5      5      nU R                  (       a  UR                  U5      nU R	                  XUS9nXa-   nUnU R                  U R                  U5      5      nU R                  (       a  UR                  U5      nU R                  XU5      nXa-   nUnU R                  U R                  U5      5      nU R                  (       a  UR                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nXa-   nU$ )Nis_infer)rh   rL  r	  rS   r   rM  rK  rN  r   r   r   rZ   )r*   r   enc_hidden_statesenc_attn_maskdec_attn_maskrR  residualrp   s           r   r7   DecodeLayer.forward  sO    !%**,,,.0)..}=M8 ' = !0 55.0)..}=M,,]-:< 0 ;;.0)..}=M))-800?M2]3 0r   )r   rK  rZ   rS   rL  r   r   r   rM  rN  r	  )Fr9   r@   s   @r   rB  rB  |  s    ;.F % %r   rB  c                   8   ^  \ rS rSrU 4S jr   SS jrSrU =r$ )BertDecoderi  c                 ,  > [         [        U ]  5         [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        [        UR                  UR                  S9U l        UR                  U l        g s  snf r   )r"   rY  r#   r   r   r   dec_hidden_layersrB  r   r   r+   rV   final_layernormrS   r   s      r   r#   BertDecoder.__init__  sy    k4)+]]*/0H0H*IJ*IQ[ *IJL
  -F$<$< >$33	 Ks   Bc           	        ^ ^^ UUU 4S jnUR                   n	U(       aT  Sn
[        T R                  5      nSnX:  a5  [        R                  " U" XU-   5      UX#S-  5      nXl         X-  n
X:  a  M5  O([        T R                  5       H  u  pU" UUUTTS9nM     UR                  5       nT R                  (       a  UR                  5       nT R                  U5      nT R                  (       a  UR                  U5      nU/$ )Nc                     >^ ^ UUUUU 4S jnU$ )Nc            	      h   > TR                   TT nU S   nU H  nU" UU S   U S   TS-  TS9nM     U$ )Nr   r   r2   rQ  r   )	r   r   r   r   rU  r   rR  r*   r   s	       r   r   ;BertDecoder.forward.<locals>.custom.<locals>.custom_forward  sS    E#.AY#Eq	q	%)!)+B $ 	r   r   )r   r   r   rU  rR  r*   s   `` r   r   #BertDecoder.forward.<locals>.custom  s    
 
 "!r   r   r   rQ  )
r)  r   r   r   r   r   rh   rS   ri   r\  )r*   r   rS  rT  rU  r   r   rR  r   pre_enc_hiddenr   r{   r   r   r   rp   s   `   `  `        r   r7   BertDecoder.forward  s    	"  +//!ITZZJL( #9,&>?%q'8!: *8&)	 ( $-TZZ#8 ,!%!!%!' $9 &**,)//1M,,];)..}=Mr   )r\  rS   r   )FFFr9   r@   s   @r   rY  rY    s    4 (-*/4 4r   rY  c                   <   ^  \ rS rSrU 4S jr     SS jrSrU =r$ )DecodeModeli&  c                    > [         [        U ]  U5        [        U5      U l        U R                  U R                  5        g r   )r"   rf  r#   rY  decoderr5  r.  r[   s     r   r#   DecodeModel.__init__(  s0    k4)&1"6*

4))*r   c	           	         UR                  S5      R                  S5      n	U	R                  [        U R                  R	                  5       5      R
                  S9n	SU	-
  S-  n	U" U5      n
U R                  U
UU	USUS9nUS   $ )	Nr   r2   r8  r   r9  Fr   rR  r0   )rd   r;  r<  rh  r  r_   )r*   ro   r  decode_input_idsrl   rT  rU  r   rR  r=  r>  s              r   r7   DecodeModel.forward-  s     #0"9"9!"<"F"Fq"I"9"<"<t||..0177 #= #9#&)@#@H"L%&67,,##( '  r""r   )rh  )NNNFFr9   r@   s   @r   rf  rf  &  s#    + """',# #r   rf  c                   F   ^  \ rS rSrU 4S jr          SS jrSrU =r$ )PalmForPreTrainingiF  c                   > [         [        U ]  U5        [        U5      U l        [        XR                  R                  R                  R                  5      U l	        [        U5      U l        U R                  U R                  5        g r   )r"   ro  r#   r1  bertr  ro   rM   r&   clsrf  rh  r5  r.  r[   s     r   r#   PalmForPreTraining.__init__H  sc     $08f%	'II((88??A"6*

4))*r   c                    U
c)  U R                  UUUSUS9u  pU R                  X5      u  pO<S nU
R                  [        U R                  R                  5       5      R                  S9n
Uc  [        R                  " U5      nU R	                  U R                   R                  U
UUUUUU	S9n[        R                  " U5      n[        R                  " UU R                   R                  R                  R                  5      nU(       a  UU4$ U	(       a  U[        R                   " U5      U
4$ U[        R                   " U5      4$ )NF)r   r   r8  rk  )rq  rr  r;  r<  rh  r  r_   r   r:  ro   r   r  r  r  rM   r&   !gather_from_model_parallel_region)r*   rj   rk   r   rl  rl   decode_attention_mask	lm_labelsr   rR  r  parallel_outputr   r  r  decode_outputtransformer_output_parallellogits_parallels                     r   r7   PalmForPreTraining.forwardP  s]    "-1YY*/'= .7 .?*O 9=9055 !%-004<<2245;; 1 =O!"__Y7NII  !#9 %  '*&G&G'# ((#>#'99#7#7#G#G#N#NP $o55$c&K&K'!"12 2 #"G"G#  	r   )rq  rr  rh  )
NNNNNNFFNTr9   r@   s   @r   ro  ro  F  s2    +  $#!%!&*', $ $1 1r   ro  c                   \   ^  \ rS rSrSrU 4S jr         SS jrS	S jrS
S jrSr	U =r
$ )	PlugModeli  a  
The bare Plug Model transformer outputting raw hidden-states without any specific head on top.
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
    config ([`PlugNLGConfig`]): Model configuration class with all the parameters of the model.
        Initializing with a config file does not load the weights associated with the model, only the
        configuration. Check out the [`~DistributedPlug.initialize_model`] method to load the model weights.
Examples:

>>> # The PLUG model has 27B parameters and usually need to run on multiple GPUs. The example given
>>> # here only initializes a slice of the model on a single GPU.
>>> # Check out the [`~DistributedPipeline.__init__`] method to initialize entire PLUG model.
>>> from modelscope.models.nlp.plug import PlugNLGConfig, PlugModel

>>> # Initializing a Plug configuration
>>> configuration = PlugNLGConfig()

>>> # Initializing a model from the configuration
>>> model = PlugModel(configuration)
c                 j   > [         [        U ]  5         Xl        [	        U R                  5      U l        g r   )r"   r~  r#   r\   ro  modelr[   s     r   r#   PlugModel.__init__  s%    i')'4
r   c                 2    U R                  UUUUUUUUU	U
S9
$ )aM  
Parameters:
    input_tokens (`torch.LongTensor` of shape `(batch_size, input_tokens_length)`):
        `input_tokens_length` = `sequence_length`. Indices of input sequence tokens in the vocabulary.
        Indices can be obtained using transformers [`BertTokenizer`]. See
        [`TextGenerationPreprocessor.__call__`] for details.
    token_type_ids (`torch.LongTensor` of shape `(batch_size, input_tokens_length)`, *optional*, defaults to
    None):
       Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
       1]`:

       - 0 corresponds to a *sentence A* token,
       - 1 corresponds to a *sentence B* token.

    attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*, defaults to None):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

    target_tokens (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*, defaults to None):
        Target token ids(labels) for language modeling. Note that the labels **are shifted** inside the model,
        i.e. you can set `target_tokens = input_tokens` Indices are selected in
        `[-100, 0, ..., config.vocab_size]` All labels set to `-100` are ignored (masked), the loss is only
        computed for labels in `[0, ..., config.vocab_size]`

    position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*, defaults to None):
        Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
        `[0, config.max_position_embeddings - 1]`.

    decode_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*, defaults
    to None):
        Mask to avoid performing attention on padding token indices of target tokens. Mask values selected in
        `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

    checkpoint_activations (`boolean`, *optional*, defaults to `False`):
        Whether gradient checkpointing is activated for this model or not.
    is_infer (`boolean`, *optional*, defaults to `False`):
        Whether or not to perform single inference.
    sequence_output (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*,
    defaults to None):
        Also known as last_hidden_state. Sequence of hidden-states at the output of the last layer of the
        model. A single forward() call can produce one single token. To generate the current token, the
        sequence_output generated by the `forward()` of the previous token is required.
    parallel_output (`boolean`, *optional*, defaults to `True`):
        To parallel return output, or gather it before return.


)r   rR  r  rx  )r  )r*   input_tokensrk   r   target_tokensrl   rv  r   rR  r  rx  s              r   r7   PlugModel.forward  s9    ~ zz!#9++  
- 
	-r   c                 6    U R                   R                  XUS9$ )N)destinationprefix	keep_vars)r  
state_dict)r*   r  r  r  s       r   r  PlugModel.state_dict  s%    zz$$#i % I 	Ir   c                 4    U R                   R                  XS9$ )N)strict)r  load_state_dict)r*   r  r  s      r   r  PlugModel.load_state_dict  s    zz))*)DDr   )r\   r  )	NNNNNFFNT)N F)T)r:   r;   r<   r=   rs   r#   r7   r  r  r>   r?   r@   s   @r   r~  r~    sC    .5  $#"!&*', $ $I-VIE Er   r~  )/
__future__r   r   r   r   loggingr   r   torch.nn.functionalr   
functionalr  megatron_utilr    modelscope.utils.nlp.distributedr	   r
   configurationr   r   	getLoggerr:   loggerr   r   r   r   Moduler   rB   ru   r   r   r   r   r   r   r   r   r  r"  r1  rB  rY  rf  ro  r~  r   r   r   <module>r     s  "* *       B 7			8	$;   3 3 8 85	I+BII +"BRYY BJ,RYY ,^/ BII / dryy 0, ,^		 >F"")) F"R  ")) .%299 %P9299 9$%")) %>b-# b-Je")) eP?")) ?D#% #@;, ;|mE mEr   