
    9i                        S r SSKJrJrJr  SSKrSSKrSSKrSSKr	SSK
r
SSKrSSKJr  SSKJs  Jr  SSKrSSKJr  S r " S S\5      r " S S	\R,                  5      r " S
 S\R,                  5      r " S S\R,                  5      rS-S jr " S S\R,                  5      r " S S\R,                  5      r " S S\R,                  5      r " S S\R,                  5      r " S S\R,                  5      r " S S\R,                  5      r  " S S\R,                  5      r! " S S\R,                  5      r" " S S \R,                  5      r# " S! S"\R,                  5      r$ " S# S$\R,                  5      r% " S% S&\R,                  5      r& " S' S(\R,                  5      r' " S) S*\R,                  5      r( " S+ S,\R,                  5      r)g).zPyTorch BERT model.    )absolute_importdivisionprint_functionN)CrossEntropyLossc                 n    U S-  S[         R                  " U [        R                  " S5      -  5      -   -  $ )zImplementation of the gelu activation function.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
g      ?      ?g       @)torcherfmathsqrt)xs    r/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/multi_modal/diffusion/structbert.pygelur      s.    
 s7cEIIa$))C.&899::    c                       \ rS rSrSr                          S
S jr\S 5       r\S 5       rS r	S r
Srg	)
BertConfig&   zEConfiguration class to store the configuration of a `BertModel`.
    c                 `   Xl         X l        X0l        X@l        XPl        X`l        Xpl        Xl        Xl        Xl	        Xl
        Xl        Xl        Xl        Xl        UU l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        g)a  Constructs BertConfig.

Args:
    vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
    hidden_size: Size of the encoder layers and the pooler layer.
    num_hidden_layers: Number of hidden layers in the Transformer encoder.
    num_attention_heads: Number of attention heads for each attention layer in
        the Transformer encoder.
    intermediate_size: The size of the "intermediate" (i.e., feed-forward)
        layer in the Transformer encoder.
    hidden_act: The non-linear activation function (function or string) in the
        encoder and pooler.
    hidden_dropout_prob: The dropout probability for all fully connected
        layers in the embeddings, encoder, and pooler.
    attention_probs_dropout_prob: The dropout ratio for the attention
        probabilities.
    max_position_embeddings: The maximum sequence length that this model might
        ever be used with. Typically set this to something large just in case
        (e.g., 512 or 1024 or 2048).
    type_vocab_size: The vocabulary size of the `token_type_ids` passed into
        `BertModel`.
    initializer_range: The stdev of the truncated_normal_initializer for
        initializing all weight matrices.
N)
vocab_sizehidden_sizeemb_sizenum_hidden_layerstransformer_typetransition_functionweighted_transformernum_rolled_layersnum_attention_heads
hidden_actintermediate_sizehidden_dropout_probattention_probs_dropout_probmax_position_embeddingstype_vocab_sizeinitializer_rangeattention_typerezeropre_lnsqueeze_excitationtransfer_matrixdim_dropoutset_mask_zeroroberta_style
init_scale
safer_fp16grad_checkpoint)selfr   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r,   r+   r-   r.   r/   s                               r   __init__BertConfig.__init__*   s    h %& !2 0#6 $8!!2#6 $!2#6 ,H)'>$.!2,"4.&**$$.r   c                 r    [        SS9n[        R                  " U5       H  u  p4XBR                  U'   M     U$ )zAConstructs a `BertConfig` from a Python dictionary of parameters.N)r   )r   six	iteritems__dict__)clsjson_objectconfigkeyvalues        r   	from_dictBertConfig.from_dictz   s4     t,MM+6LS#(OOC  7r   c                     [        USSS9 nUR                  5       nSSS5        U R                  [        R                  " W5      5      $ ! , (       d  f       N3= f)z9Constructs a `BertConfig` from a json file of parameters.rzutf-8)encodingN)openreadr<   jsonloads)r7   	json_filereadertexts       r   from_json_fileBertConfig.from_json_file   sC     )S73v;;=D 4}}TZZ-.. 43s   A


Ac                 F    [         R                  " U R                  5      nU$ )z0Serializes this instance to a Python dictionary.)copydeepcopyr6   r0   outputs     r   to_dictBertConfig.to_dict   s    t}}-r   c                 P    [         R                  " U R                  5       SSS9S-   $ )z*Serializes this instance to a JSON string.   T)indent	sort_keys
)rC   dumpsrO   )r0   s    r   to_json_stringBertConfig.to_json_string   s     zz$,,.dCdJJr   )r!   r%   r*   r   r/   r   r    r   r-   r$   r   r"   r   r   r   r'   r&   r,   r.   r+   r(   r)   r   r   r#   r   r   N)i      originallinearr      rZ   i   r   皙?r^   i      g{Gz?r0   FFFFFFFFFF)__name__
__module____qualname____firstlineno____doc__r1   classmethodr<   rH   rO   rW   __static_attributes__ r   r   r   r   &   s    
 !#%",%-&'#$%'#'"%(.1),!##' &$)!&"$$!!!&7N/`   / /
Kr   r   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )BERTLayerNorm   c                 R  > [         [        U ]  5         Xl        Ub  UOUR                  n[
        R                  " [        R                  " U5      5      U l	        [
        R                  " [        R                  " U5      5      U l        UR                  (       d  X l        gSU l        g)zWConstruct a layernorm module in the TF style (epsilon inside the square root).
        Ngh㈵>)superri   r1   r9   r   nn	Parameterr	   onesgammazerosbetar,   variance_epsilon)r0   r9   rs   special_sizer   	__class__s        r   r1   BERTLayerNorm.__init__   sr     	mT+-&2&>lFDVDV\\%**["9:
LL[!9:	8>8L8L 0RVr   c                    UR                  5       nU R                  R                  (       a  UR                  5       nUR	                  SSS9nX-
  R                  S5      R	                  SSS9nX-
  [        R                  " X@R                  -   5      -  nU R                  R                  (       a+  U R                  U-  U R                  -   R                  U5      $ U R                  U-  U R                  -   $ )NrY   TkeepdimrR   )typer9   r.   floatmeanpowr	   r   rs   rp   rr   )r0   r   previous_typeuss        r   forwardBERTLayerNorm.forward   s    ;;!!	AFF2tF$UKKND1Uejj%:%:!:;;;;!!JJNTYY.44]CC::>DII--r   )rr   r9   rp   rs   )g-q=Nr`   ra   rb   rc   r1   r   rf   __classcell__ru   s   @r   ri   ri      s    W
. 
.r   ri   c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )BERTEmbeddings   c                   > [         [        U ]  5          UR                  S:  a  UR                  OUR                  n[
        R                  " UR                  UUR                  (       a  SOS S9U l	        [
        R                  " UR                  UUR                  (       a  SOS S9U l        [
        R                  " UR                  U5      U l        Xl        UR                  S:  a  S O*[
        R                  " UR                  UR                  5      U l        [#        XS9U l        [
        R&                  " UR(                  5      U l        g )Nr      )padding_idx)rt   )rl   r   r1   r   r   rm   	Embeddingr   r,   word_embeddingsr"   position_embeddingsr#   token_type_embeddingsr9   Linearprojri   	LayerNormDropoutr    dropout)r0   r9   r   ru   s      r   r1   BERTEmbeddings.__init__   s    nd,.	,2OOa,?f((V__!||#11t = $&<<**#11t$=  &(\\&2H2H2=&?""OOa/DRYYOOV//61	 'vHzz&"<"<=r   c                    UR                  S5      nU R                  R                  (       dO  [        R                  " U[        R
                  UR                  S9nUR                  S5      R                  U5      nOWUR                  S5      R                  5       n[        R                  " USS9R                  U5      U-  R                  5       S-   nUc  [        R                  " U5      nUc  U R                  U5      OUnU R                  R                  (       a  SXqS:H  '   U R!                  U5      nU R#                  U5      n	U R                  R                  (       d  Xx-   U	-   n
OXx-   n
U R%                  U
5      n
U R'                  U
5      n
U R(                  b#  U R)                  U
5      n
U R'                  U
5      n
g X4$ )Nr   )dtypedevicer   dim        g   )sizer9   r,   r	   arangelongr   	unsqueeze	expand_asneintcumsumtype_as
zeros_liker   r+   r   r   r   r   r   )r0   	input_idstoken_type_idsadv_embedding
seq_lengthposition_idsmaskwords_embeddingsr   r   
embeddingss              r   r   BERTEmbeddings.forward   s   ^^A&
{{(( <<%**Y5E5EGL'11!4>>yIL<<?&&(D!LL15==dC"#$(DFQ/L!"--i8N (/  //5B 	;;$$13#-."66|D $ : :> J{{(()?BWWJ)?J^^J/
\\*-
99 :.Jj1J//r   )r   r9   r   r   r   r   r   )NNr   r   s   @r   r   r      s    >.0 0r   r   c                   4   ^  \ rS rSrU 4S jrS rS rSrU =r$ )BERTFactorizedAttention   c                   > [         [        U ]  5         UR                  UR                  -  S:w  a$  [        SUR                  UR                  4-  5      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  5      U l        g Nr   zLThe hidden size (%d) is not a multiple of the number of attention heads (%d))rl   r   r1   r   r   
ValueErrorr   attention_head_sizeall_head_sizerm   r   queryr:   r;   r   r!   r   r0   r9   ru   s     r   r1    BERTFactorizedAttention.__init__   s   %t57 : ::a?##V%?%?@AB B $*#=#= #&v'9'9)/)C)C(D $E !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EFr   c                     UR                  5       S S U R                  U R                  4-   nUR                  " U6 nUR	                  U5      $ )NrY   r   r   r   viewpermute)r0   r   r   new_x_shapes       r   transpose_for_scores,BERTFactorizedAttention.transpose_for_scores   sL    ffhsmt'?'?'+'?'?'A AFFK yyr   c                 x   U R                  U5      nU R                  U5      nU R                  U5      nU R                  USSSS5      nU R                  USSSS5      nU R                  USSSS5      nXb-   n	[        R
                  " SS9" U	5      n
U R                  U
5      n
[        R
                  " SS9" U5      n[        R                  " X5      n[        R                  " X5      nUR                  SSSS5      R                  5       nUR                  5       S S U R                  4-   nUR                  " U6 nU$ )Nr   rR   r]   r   rY   r   )r   r:   r;   r   rm   Softmaxr   r	   matmulr   
contiguousr   r   r   )r0   hidden_statesattention_maskmixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layers_attention_scoress_attention_probsc_attention_probss_context_layercontext_layernew_context_layer_shapes                  r   r   BERTFactorizedAttention.forward   s7    JJ}5((=1 JJ}5//0A1aAN--oq!QJ	//0A1aAN(9JJ2./AB LL):;JJ2.y9,,'8F%6H%--aAq9DDF"/"4"4"6s";?" #"%**,CDr   )r   r   r   r:   r   r   r;   	r`   ra   rb   rc   r1   r   r   rf   r   r   s   @r   r   r      s    G$ r   r   c                 &   U(       a  US:X  a  U $ SU-
  nU R                   R                  U R                  5       5      R                  5       S-   n[        R
                  " XE-  5      nXfR                  U5      [        R                  " XbSS9-  -  U -  $ )Nr   r   T)r   ry   )datanewr   zero_r	   	bernoullisum)r   pr   trainingabdropout_masks          r   r*   r*     s    qAv	
QA	
AFFH		#	#	%	)A??15)L,,S1EIIt5- - .012 2r   c                   8   ^  \ rS rSrU 4S jrS rSS jrSrU =r$ )BERTSelfAttentioni"  c                   > [         [        U ]  5         UR                  UR                  -  S:w  a$  [        SUR                  UR                  4-  5      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  5      U l        Xl        UR$                  (       a  ['        U5      U l        g g r   )rl   r   r1   r   r   r   r   r   r   rm   r   r   r:   r;   r   r!   r   r9   r'   ri   r   r   s     r   r1   BERTSelfAttention.__init__$  s3   /1 : ::a?##V%?%?@AB B $*#=#= #&v'9'9)/)C)C(D $E !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF==*62DN r   c                     UR                  5       S S U R                  U R                  4-   nUR                  " U6 nUR	                  SSSS5      $ )NrY   r   rR   r   r]   r   )r0   r   r   s      r   r   &BERTSelfAttention.transpose_for_scores9  sS    ffhsmt'?'?'+'?'?'A AFFK yyAq!$$r   c                    U R                   R                  (       a  U R                  U5      nU R                  U5      nU R	                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      n	[        R                  " UUR                  SS5      5      n
U
[        R                  " U R                  5      -  n
Ub>  U R                  (       d-  [        U5       H  u  pX;   S:X  d  M  SU
S S 2US S 2S S 24'   M      X-   n
[        R                   " SS9" U
5      nU R                   R"                  (       d  U R%                  U5      nO*[#        UU R                   R&                  SU R                  S9n[        R                  " X5      nUR)                  SSSS	5      R+                  5       nUR-                  5       S S U R.                  4-   nUR0                  " U6 nU$ )
NrY   r   r   r   r   )r   r   r   r   rR   r]   )r9   r'   r   r   r:   r;   r   r	   r   	transposer   r   r   r   	enumeraterm   r   r*   r   r!   r   r   r   r   r   )r0   r   r   	head_maskr   r   r   r   r   r   attention_scoresir   attention_probsr   r   s                   r   r   BERTSelfAttention.forward?  s   ;; NN=9M JJ}5((=1 JJ}5//0AB--o>	//0AB !<<(1(;(;B(CE+dii$$/& &  $Y/<1$35$Q1aZ0 0 ,< **,-=> {{&&"ll?;O)++::	(O _B%--aAq9DDF"/"4"4"6s";?" #"%**,CDr   )	r   r   r   r9   r   r:   r   r   r;   Nr   r   s   @r   r   r   "  s    3*%* *r   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BERTSelfOutputil  c                   > [         [        U ]  5         Xl        [        R
                  " UR                  UR                  5      U l        UR                  (       d!  UR                  (       d  [        U5      U l        [        R                  " UR                  5      U l        UR                  (       a  [        R                  " [         R"                  " S5      R%                  S5      R'                  [)        U R+                  5       5      R,                  S95      U l        [        R                  " [         R0                  " S5      R'                  [)        U R+                  5       5      R,                  S95      U l        g g )Nr   Gz?r   )rl   r   r1   r9   rm   r   r   denser'   r&   ri   r   r   r    r   rn   r	   Tensorfill_tonext
parametersr   
res_factorro   factorr   s     r   r1   BERTSelfOutput.__init__n  s    nd,.YYv1163E3EF
}}V]]*62DNzz&"<"<=== llQ%%d+..t0177 / 9:DO ,,

1  tDOO,='>'D'D EGDK	 r   c                 >   U R                  U5      nU R                  U5      nU R                  R                  (       d0  U R                  R                  (       d  U R                  X-   5      nU$ U R                  R                  (       a  XR                  U-  -   nU$  U$ r   )r   r   r9   r&   r'   r   r   r0   r   input_tensors      r   r   BERTSelfOutput.forward|  s    

=1]3{{!!$++*<*< NN=+GHM
 	 [[)KK,,FFM  r   )r   r9   r   r   r   r   r   r   s   @r   r   r   l  s    G	 	r   r   c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )BERTAttentioni  c                 N  > [         [        U ]  5         UR                  R	                  5       S:X  a  [        U5      U l        OSUR                  R	                  5       S:X  a  [        U5      U l        O$[        SR                  UR                  5      5      e[        U5      U l        g )Nr0   
factorizedz5Attention type must in [self, factorized], but got {})rl   r  r1   r%   lowerr   r0   r   r   formatr   rN   r   s     r   r1   BERTAttention.__init__  s    mT+-  &&(F2)&1DI""((*l:/7DIGNN))+, , %V,r   c                 L    U R                  XU5      nU R                  XA5      nU$ r   rM   )r0   r  r   r   self_outputattention_outputs         r   r   BERTAttention.forward  s'    iiiH;;{Ar   )rN   r0   r   r   r   s   @r   r  r    s    
-   r   r  c                   <   ^  \ rS rSr     SU 4S jjrS rSrU =r$ )DepthwiseSeparableConv1di  c                    > [         [        U ]  5         US-
  S-  n[        R                  " UUUUUUUUS9U l        [        R                  " XSSSSSUS9U l        g )Nr   rR   )groupsbiasr   )r  )rl   r  r1   rm   Conv1d	depthwise	pointwise)	r0   in_channelsout_channelskernel_sizestridepaddingdilationr  ru   s	           r   r1   !DepthwiseSeparableConv1d.__init__  sk     	&68?q( q!Q14Ar   c                 J    U R                  U5      nU R                  U5      nU$ r   r  r  )r0   r   s     r   r    DepthwiseSeparableConv1d.forward  s#    NN1NN1r   r  )r   r   r   r   Fr   r   s   @r   r  r    s$    
 A, r   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BERTIntermediatei  c                 F  > [         [        U ]  5         Xl        U R                  R                  (       a  [        U5      U l        [        U l        UR                  R                  5       S:X  a1  [        R                  " UR                  UR                  5      U l        g UR                  R                  5       S:X  a(  [!        UR                  SUR                  -  SS9U l        g UR                  R                  R                  5       S:X  a  [%        S5      e['        S5      e)	Nr\   cnn      r  rnn.rnn transition function is not implemented yetOnly support linear/cnn/rnn)rl   r!  r1   r9   r'   ri   r   r   intermediate_act_fnr   r  rm   r   r   r   r   r  r#  NotImplementedErrorr   r   s     r   r1   BERTIntermediate.__init__  s    .0;;*62DN#' %%++-96#5#5#)#;#;=DJ''--/58/""A(:(:$:KDH]]&&,,.%7%@B B :;;r   c                    U R                   R                  (       a  U R                  U5      nU R                   R                  R	                  5       S:X  a  U R                  U5      nO[U R                   R                  R	                  5       S:X  a2  U R                  UR                  SS5      5      R                  SS5      nO U R                  U5      nU$ Nr\   r#  rY   r   )	r9   r'   r   r   r  r   r#  r   r*  )r0   r   s     r   r   BERTIntermediate.forward  s    ;; NN=9M;;**002h> JJ}5M[[,,224= HH]%<%<R=?&A BBK)ACRCI  00?r   )r   r#  r9   r   r*  r   r   s   @r   r!  r!    s    <$ r   r!  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SqueezeExcitationBlocki  c                    > [         [        U ]  5         [        R                  " UR
                  UR
                  S-  5      U l        [        R                  " UR
                  S-  UR
                  5      U l        g )Nr$  )rl   r1  r1   rm   r   r   down_samplingup_samplingr   s     r   r1   SqueezeExcitationBlock.__init__  s`    $d46YYv'9'9'-'9'9Q'>@99V%7%71%<%+%7%79r   c           	          [         R                  " USSS9n[         R                  " U R                  [	        U R                  U5      5      5      5      nX-  $ )Nr   Trx   )r	   r|   sigmoidr4  r   r3  )r0   r   squeeze
excitations       r   r   SqueezeExcitationBlock.forward  sJ    **]At<]]T$"4"4W"=>?A
))r   )r3  r4  r   r   s   @r   r1  r1    s    9* *r   r1  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )
BERTOutputi  c                   > [         [        U ]  5         Xl        UR                  R                  5       S:X  a1  [        R                  " UR                  UR                  5      U l
        OUR                  R                  5       S:X  a(  [        SUR                  -  UR                  SS9U l        O>UR                  R                  R                  5       S:X  a  [        S5      e[        S5      eUR                  (       d!  UR                   (       d  [#        U5      U l        [        R&                  " UR(                  5      U l        UR,                  (       a  [/        U5      U l        UR                   (       a  [        R2                  " [4        R6                  " S	5      R9                  S
5      R;                  [=        U R?                  5       5      R@                  S95      U l!        [        R2                  " [4        RD                  " S	5      R;                  [=        U R?                  5       5      R@                  S95      U l#        g g )Nr\   r#  r$  r%  r&  r'  r(  r)  r   r   r   )$rl   r<  r1   r9   r   r  rm   r   r   r   r   r  r#  r+  r   r'   r&   ri   r   r   r    r   r(   r1  SEblockrn   r	   r   r   r   r   r   r   r   ro   r   r   s     r   r1   BERTOutput.__init__  s   j$(*%%++-96#;#;#)#5#57DJ''--/58/F&&&(:(:KDH]]&&,,.%7%@B B :;;}}V]]*62DNzz&"<"<=$$1&9DL== llQ%%d+..t0177 / 9:DO ,,

1  tDOO,='>'D'D EGDK	 r   c                    U R                   R                  R                  5       S:X  a  U R                  U5      nO[U R                   R                  R                  5       S:X  a2  U R	                  UR                  SS5      5      R                  SS5      nO U R                  U5      nU R                   R                  (       a  U R                  U5      nU R                   R                  (       d0  U R                   R                  (       d  U R                  X-   5      nU$ U R                   R                  (       a  XR                  U-  -   nU$  U$ r.  )r9   r   r  r   r#  r   r   r(   r>  r&   r'   r   r   r   s      r   r   BERTOutput.forward	  s   ;;**002h> JJ}5M[[,,224= HH]%<%<R=?&A BBK)ACRCI  ]3;;)) LL7M{{!!$++*<*< NN=+GHM
 	 [[)KK,,FFM  r   )r   r>  r#  r9   r   r   r   r   r   r   s   @r   r<  r<    s    G4 r   r<  c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )	BERTLayeri  c                    > [         [        U ]  5         [        U5      U l        [        U5      U l        [        U5      U l        g r   )	rl   rC  r1   r  	attentionr!  intermediater<  rN   r   s     r   r1   BERTLayer.__init__   s5    i')&v.,V4 (r   c                 p    U R                  XU5      nU R                  U5      nU R                  XT5      nXF4$ r   rE  rF  rN   )r0   r   r   r   r  intermediate_outputlayer_outputs          r   r   BERTLayer.forward&  s@    >>-*35"//0@A{{#6I--r   rI  r   r   r   s   @r   rC  rC    s    ). .r   rC  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BERTWeightedLayeri.  c           	        > [         [        U ]  5         Xl        [	        U5      U l        U R
                  R                  U l        [        R                  " [        UR                  5       Vs/ s H.  n[        R                  " U R                  UR                  5      PM0     sn5      U l        [        R                  " UR                  5      U l        [        R"                  " U R                   U R                   R%                  5       -  5      U l        [        R                  " UR                  5      U l        [        R"                  " U R&                  U R&                  R%                  5       -  5      U l        [)        U5      U l        [        R                  " UR,                  UR                  5      U l        [1        U5      U l        [        R4                  " UR6                  5      U l        g s  snf r   )rl   rN  r1   r9   r   r0   r   rm   
ModuleListranger   r   r   w_or	   randw_kprn   r   w_ar!  rF  r   rN   ri   r   r   r    r   )r0   r9   _ru   s      r   r1   BERTWeightedLayer.__init__0  sG   /1%f-	#'99#@#@ ==6556"
6 IId..0B0BC6"
  JJv99:	LLTYY]]_!<=	::f889<<488<<> 9:,V4ii 8 8&:L:LM&v.zz&"<"<="
s   ,5G-c                    U R                  X5      nUR                  U R                   R                  SS9n[        [	        U5      5       Vs/ s H  oPR
                  U   " XE   5      PM     nn[        [	        U5      5       Vs/ s H  oPR                  XE   5      PM     nn[        U R                  U5       VVs/ s H	  u  pgXg-  PM     nnn[        [	        U5      5       Vs/ s H  nU R                  XE   5      PM     nn[        [	        U5      5       Vs/ s H  oPR                  XE   5      PM     nn[        [	        U5      5       Vs/ s H  oPR                  XE   5      PM     nn[        U R                  U5       VVs/ s H	  u  pX-  PM     nnn[        U5      nU R                  X-   5      $ s  snf s  snf s  snnf s  snf s  snf s  snf s  snnf )NrY   r   )r0   splitr   rQ  lenrR  r   ziprT  rF  rN   rU  r   r   )	r0   r   r   r  self_outputsr   kapparN   alphas	            r   r   BERTWeightedLayer.forwardD  s   ii>"(()F)FB(O27L8I2J
2JQHHQK(2J 	 
 49\9J3K
3KaLL)3K 	 
 14DII|0L
0L}uEN0L 	 

 3|,-
- lo.- 	 

 38L8I2J
2JQKK(2J 	 
 49\9J3K
3KaLL)3K 	 
 14DHHl0K
0K}uEN0K 	 
 \"~~m455/






s*    F5F:;F?$GG
GG)
r   r   r9   r   rF  rN   r0   rU  rT  rR  r   r   s   @r   rN  rN  .  s    >(6 6r   rN  c                   6   ^  \ rS rSrU 4S jr  SS jrSrU =r$ )BERTEncoderia  c           	      P  > [         [        U ]  5         [        R                  " 5       U l        [        UR                  5       H^  nUR                  (       a&  U R
                  R                  [        U5      5        M:  U R
                  R                  [        U5      5        M`     UR                  (       Gaj  [        U R
                  5       GHP  u  p4[        R                  " [        R                   " S5      R#                  S5      R%                  ['        U R)                  5       5      R*                  S95      UR,                  l        [        R                  " [        R                   " S5      R#                  S5      R%                  ['        U R)                  5       5      R*                  S95      UR,                  l        UR,                  R.                  UR2                  R,                  l        UR,                  R0                  UR2                  R,                  l        GMS     Xl        g )Nr   r   r   )rl   ra  r1   rm   rP  layerrQ  r   r   appendrN  rC  r&   r   rn   r	   r   r   r   r   r   r   rN   r   r   rE  r9   )r0   r9   rV  indexrc  ru   s        r   r1   BERTEncoder.__init__c  sm   k4)+]]_
v//0A**

!!"3F";<

!!)F"34	 1
 === )$** 5*,,,LLO))"-00"4??#45;; 1 =+>' ')llLLO))!,//"4??#45;; 0 ='># 5:LL4K4K&&1050C0C&&- !6 r   c                    U/nUS:w  a+  [        [        U R                  5      S-  5      SU-
  -  S-
  nOSn[        U R                  5       H  u  pxUcU  U R                  R
                  (       d  U" XS 5      u  pO<[        R                  R                  R                  XUS 5      u  pOU" UUXG   5      u  pXg:X  a  UR                  5         UR                  U	5        UR                  U5        M     U$ )NrY   r]   rR   r   )r   rZ  rc  r   r9   r/   r	   utils
checkpointdetach_rd  )
r0   r   r   epoch_id
head_masksall_encoder_layersdetach_indexre  layer_moduleself_outs
             r   r   BERTEncoder.forwardw  s    
 ,_r>s4::23q8|DqHLL#,TZZ#8E!{{22.:%t/=+Hm /4kk.D.D.O.O$^T/K+Hm +7}7E7A7H+J' $%%'%%h/%%m4 $9  "!r   r9   rc  rY   Nr   r   s   @r   ra  ra  a  s    . 	" "r   ra  c                   6   ^  \ rS rSrU 4S jr  SS jrSrU =r$ )BERTEncoderRolledi  c                    > [         [        U ]  5         [        U5      nXl        [
        R                  " [        UR                  5       Vs/ s H  n[        R                  " U5      PM     sn5      U l        g s  snf r   )rl   ru  r1   rC  r9   rm   rP  rQ  r   rK   rL   rc  )r0   r9   rc  rV  ru   s       r   r1   BERTEncoderRolled.__init__  s\    /1&!]]+01I1I+JK+JaT]]5!+JKM
Ks    A9c                    U/n[        U R                  R                  5       H  nU R                  R                  R	                  5       S:X  a,  U R
                  X`R                  R                  -     " X5      nOmU R                  R                  R	                  5       S:X  aE  UU R                  R                  U R                  R                  -  -  nU R
                  U   " X5      nUR                  U5        M     U$ )N	universalalbert)rQ  r9   r   r   r  rc  r   rd  )r0   r   r   rk  rl  rm  r   r   s           r   r   BERTEncoderRolled.forward  s    
 ,_t{{445A{{++113{B $

1{{/L/L+L M!!3--335AKK11{{4456 !%

1m L%%m4 6 "!r   rr  rs  r   r   s   @r   ru  ru    s    M 	" "r   ru  c                   4   ^  \ rS rSrU 4S jrS rS rSrU =r$ )BERTEncoderACTi  c                   > [         [        U ]  5         [        U5      U l        [
        R                  " UR                  S5      n[
        R                  " [        UR                  5       Vs/ s H  n[        R                  " U5      PM     sn5      U l        U R                   H(  nUR                  R                  R!                  S5        M*     Xl        UR                  U l        SU l        g s  snf )Nr   r   r   )rl   r}  r1   rC  rc  rm   r   r   rP  rQ  r   rK   rL   r   r  r   r   r9   act_max_steps	threshold)r0   r9   r   rV  moduleru   s        r   r1   BERTEncoderACT.__init__  s    nd,.v&
IIf((!,',V-E-E'FG'F!T]]1'FGI ffFKK""2& #55 Hs   , C/c                     UR                  U R                  5      R                  UR                  U R                  5      5      R	                  5       $ r   )ltr  __and__r  any)r0   halting_probability	n_updatess      r   should_continueBERTEncoderACT.should_continue  s=    #&&t~~6>>LL++,./2su	5r   c                    U/nUR                  5       u  pEn[        R                  " XE5      R                  5       n[        R                  " XE5      R                  5       n[        R                  " XE5      R                  5       n	[	        U R
                  5       GH;  n
[        R                  " U R                  U
   " U5      R                  S5      5      nUR                  S5      R                  5       nX{U-  -   R                  U R                  5      R                  5       U-  nX{U-  -   R                  U R                  5      R                  5       U-  nX{U-  -   nXSU-
  -  -   nX}U-  -   nX-   U-   n	X-  X-  -   R                  S5      nU R                  X5      nX-  USU-
  -  -   nUR!                  U5        U R#                  Xy5      (       a  GM<    O   U[        R$                  " X-   5      4$ )NrR   r   r   )r   r	   rq   cudarQ  r  r7  r   r8  r  r{   gtr  ler   rc  rd  r  r|   )r0   r   r   rm  
batch_sizeseq_lenhdimr  
remaindersr  r   r   still_running
new_haltedupdate_weightstransformed_statess                   r   r   BERTEncoderACT.forward  s   +_$1$6$6$8!
T#kk*>CCE[[5::<
KK
499;	t))*AdffQi6>>qABA/2237==?M-M0AAEE  %-8J0}3DDHH  %-8M"5M8I"I#A8K4K&LLJ"5Z8O"O!1J>I/ * 789B1 !%M!J.?-N"C$ $M%%m4''(;GG% +& "5::i.D#EEEr   )r  r9   rc  r   r  )	r`   ra   rb   rc   r1   r  r   rf   r   r   s   @r   r}  r}    s    5F Fr   r}  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )
BERTPooleri  c                    > [         [        U ]  5         [        R                  " UR
                  UR
                  5      U l        [        R                  " 5       U l        g r   )	rl   r  r1   rm   r   r   r   Tanh
activationr   s     r   r1   BERTPooler.__init__  s;    j$(*YYv1163E3EF
'')r   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   r  )r0   r   first_token_tensorpooled_outputs       r   r   BERTPooler.forward  s6     +1a40

#566r   )r  r   r   r   s   @r   r  r    s    $
 r   r  c                   H   ^  \ rS rSrSrS\4U 4S jjr     SS jrSrU =r	$ )	BertModeli  au  BERT model ("Bidirectional Embedding Representations from a Transformer").

Example:
    >>> # Already been converted into WordPiece token ids
    >>> input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    >>> input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    >>> token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])

    >>> config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
    >>>     num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)

    >>> model = modeling.BertModel(config=config)
    >>> all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
r9   c                   > [         [        U ]  5         Xl        [	        U5      U l        UR                  R                  5       S:X  a  [        U5      U l	        OUR                  R                  5       S:X  a  [        U5      U l	        OUR                  R                  5       S:X  a  [        U5      U l	        OUR                  R                  5       S:X  a  [        U5      U l	        OmUR                  R                  5       S:X  a  SSKJnJnJn  [!        XUU5      U l	        O2[#        SR%                  UR                  R                  5       5      5      e['        U5      U l        g	)
zEConstructor for BertModel.

Args:
    config: `BertConfig` instance.
r[   ry  rz  acttextnasr   )
input_dictop_dict	skip_dictz Not support transformer type: {}N)rl   r  r1   r9   r   r   r   r  ra  encoderru  r}  textnas_finalr  r  r  TextNASEncoderr   r  r  pooler)r0   r9   r  r  r  ru   s        r   r1   BertModel.__init__  s    	i')(0""((*j8&v.DL$$**,;,V4DL$$**,8,V4DL$$**,5)&1DL$$**,	9DD)&:*35DL ?FF''--/1 2 2 (r   c                    Uc  [         R                  " U5      nUc  [         R                  " U5      nUR                  S5      R                  S5      nUR	                  [        U R                  5       5      R                  S9nSU-
  S-  nU R                  XU5      u  pU R                  R                  R                  5       S:X  a  U R                  X5      u  pOPU R                  R                  R                  5       S:X  a  U R                  U5      nX/n
OU R                  UUXE5      n
U
R                  SU	5        U
S	   nU R                  R                  (       d  U R                  U5      nX4$ US S 2S4   nX4$ )
Nr   rR   r   r   g     r  reformerr   rY   )r	   	ones_liker   r   r   r   r   r   r   r9   r   r  r  insertr.   r  )r0   r   r   r   rk  rl  r   extended_attention_maskembedding_outputr   rm  act_losssequence_outputr  s                 r   r   BertModel.forward  sz    !"__Y7N!"--i8N #1":":1"="G"G"J #:"<"<t()// #= #1#&)@#@H"L,0OO}-6);;''--/58+/<< ,;([[))//1Z?"ll+;<O"1!C!%.>.E.6"D 	!!!_5,R0{{%% KK8M "00 ,AqD1M!00r   )r9   r   r  r  )NNrY   NN)
r`   ra   rb   rc   rd   r   r1   r   rf   r   r   s   @r   r  r    s/    )z )8  $#".1 .1r   r  c                   D   ^  \ rS rSrSrU 4S jr       SS jrSrU =r$ )&BertForSequenceClassificationMultiTaskiJ  a  BERT model for classification.
This module is composed of the BERT model with a linear layer on top of
the pooled output.

Example:
    >>> # Already been converted into WordPiece token ids
    >>> input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    >>> input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    >>> token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])

    >>> config = BertConfig(vocab_size=32000, hidden_size=512,
    >>>     num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)

    >>> num_labels = 2

    >>> model = BertForSequenceClassification(config, num_labels)
    >>> logits = model(input_ids, token_type_ids, input_mask)
c           	      D  >^ [         [        U ]  5         UR                  5       S:X  a  [	        T5      U l        O?UR                  5       S:X  a  [        T5      U l        O[        SR                  U5      5      e[        R                  " TR                  5      U l        [        R                  " 5       U l        U HF  nU R                  R                  [        R                   " TR"                  [%        U5      5      5        MH     X l        U4S jnU R)                  U5        g )Nbertlstmz%Only support lstm or bert, but got {}c                 ,  > [        U [        R                  [        R                  45      (       a/  U R                  R
                  R                  STR                  S9  Oq[        U [        5      (       a\  U R                  R
                  R                  STR                  S9  U R                  R
                  R                  STR                  S9  [        U [        R                  5      (       a%  U R                  R
                  R                  5         g g )Nr   )r|   std)
isinstancerm   r   r   weightr   normal_r$   ri   rr   rp   r  r   )r  r9   s    r   init_weightsEBertForSequenceClassificationMultiTask.__init__.<locals>.init_weightsm  s    &299bll";<< ""**&":": + <FM22  ((&":": ) <!!))&":": * <&")),,  &&( -r   )rl   r  r1   r  r  r  	LSTMModelr   r  rm   r   r    r   rP  
classifierrd  r   r   rZ  
label_listapply)r0   r9   r  core_encoderlabelr  ru   s    `    r   r1   /BertForSequenceClassificationMultiTask.__init__^  s    4dDF6)!&)DI!V+!&)DI7>>|LN Nzz&"<"<=--/EOO""299V-?-?U#LM  $	) 	

< r   c                 
   U R                  UUX6UU5      u  pU R                  U5      nU R                   Vs/ s H
  o" U5      PM     nnUGb0  [        SS9n[        R
                  " SS9n[        R                  " US5      n/ n[        [        UU5      5       H  u  nu  nn[        U R                  U   5      S:w  a  U" UUR                  5       5      nOU" UR                  S5      U5      nUU:H  R                  [        U R!                  5       5      R"                  S9nU
b  UU
U   -  n[        R$                  " UU-  5      nUR'                  U5        M     U	(       d  [)        U5      U4$ [)        U5      XS   4$ U$ s  snf )Nnone)	reductionr   rY   r   r   )r  r   r  r   rm   MSELossr	   unbindr   r[  rZ  r  r   r8  r   r   r   r   r|   rd  r   )r0   r   r   r   labelslabels_indexrk  rl  r   return_embeddingloss_weightrm  r  r  logitsloss_fctregression_loss_fct
labels_lstloss_lstre  r  logitlosslabels_masks                           r   r   .BertForSequenceClassificationMultiTask.forward}  sy    -1IIi6D6D6@6C	-E)
 ]3>BooNo
*]+oN'&9H"$**v">fa0JH)23z63J)K%~utu-.!3#E5::<8D.u}}R/@%HD+u488t0177 9 9*+e"44Dzz$"45% *L $8}f,,8}f.CCCM- Os   F )r  r  r   r  )NNrY   NNFN)	r`   ra   rb   rc   rd   r1   r   rf   r   r   s   @r   r  r  J  s/    &!F !"!& ' 'r   r  )r   rY   F)*rd   
__future__r   r   r   rK   r   rC   numpynpr4   r	   torch.nnrm   torch.nn.functional
functionalFtorch.utils.checkpointr   r   objectr   Moduleri   r   r   r*   r   r   r  r  r!  r1  r<  rC  rN  ra  ru  r}  r  r  r  rg   r   r   <module>r     s}    @ @     
      %;jK jKZ.BII .270RYY 70t/bii /d2G		 GTRYY 8 BII  (ryy < ryy  F*RYY * . .b.		 . 06		 06f0"")) 0"f"		 "8,FRYY ,F^  X1		 X1vZRYY Zr   