
    9i~                        S r SSKJr  SSKJrJrJr  SSKrSSKrSSKJ	r	  SSK
Jr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  SSKJr  SSKJr  SSK J!r!  \RD                  " 5       r " S S\	RF                  5      r$ " S S\RJ                  RL                  5      r' " S S\(5      r)S r* " S S\RJ                  RL                  5      r+ " S S\	RF                  5      r, " S S\	RF                  5      r- " S S \	RF                  5      r. " S! S"\	RF                  5      r/ " S# S$\	RF                  5      r0 " S% S&\	RF                  5      r1 " S' S(\	RF                  5      r2 " S) S*\	RF                  5      r3S+ r4  S9S, jr5\Rl                  Rn                  S- 5       r8\Rl                  Rn                  S. 5       r9\Rl                  Rn                  S/ 5       r: " S0 S1\	RF                  5      r; " S2 S3\	RF                  5      r< " S4 S5\\5      r=\R|                  " \R~                  \R                  S69 " S7 S8\=5      5       rAg):zPyTorch DeBERTa-v2 model.    )Sequence)OptionalTupleUnionN)nn)	LayerNorm)ACT2FN)PreTrainedModel)softmax_backward_data)Models)Model
TorchModel)MODELS)AttentionBackboneModelOutput)logger)Tasks   )DebertaV2Configc                   >   ^  \ rS rSrU 4S jrS r\S 5       rSrU =r	$ )ContextPooler(   c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  5      U l        Xl	        g N)
super__init__r   Linearpooler_hidden_sizedenseStableDropoutpooler_dropoutdropoutconfigselfr"   	__class__s     i/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/nlp/deberta_v2/backbone.pyr   ContextPooler.__init__*   sF    YYv88%88:
$V%:%:;    c                     US S 2S4   nU R                  U5      nU R                  U5      n[        U R                  R                     " U5      nU$ Nr   )r!   r   r	   r"   pooler_hidden_act)r$   hidden_statescontext_tokenpooled_outputs       r&   forwardContextPooler.forward1   sM     &ad+]3

=1t{{<<=mLr(   c                 .    U R                   R                  $ r   )r"   hidden_sizer$   s    r&   
output_dimContextPooler.output_dim;   s    {{&&&r(   )r"   r   r!   )
__name__
__module____qualname____firstlineno__r   r/   propertyr4   __static_attributes____classcell__r%   s   @r&   r   r   (   s!     ' 'r(   r   c                   H    \ rS rSrSr\S 5       r\S 5       r\S 5       rSr	g)XSoftmaxA   ae  
Masked Softmax which is optimized for saving memory

Args:
    input (`torch.tensor`): The input tensor that will apply softmax.
    mask (`torch.IntTensor`):
        The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
    dim (int): The dimension that will apply softmax

Example:

>>> import torch
>>> from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax

>>> # Make a tensor
>>> x = torch.randn([4, 20, 100])

>>> # Create a mask
>>> mask = (x > 0).int()

>>> # Specify the dimension to apply softmax
>>> dim = -1

>>> y = XSoftmax.apply(x, mask, dim)
c                 t   X0l         UR                  [        R                  5      ) nUR	                  U[        R
                  " [        R                  " UR                  5      R                  5      5      n[        R                  " XPR                   5      nUR                  US5        U R                  U5        U$ r*   )dimtotorchboolmasked_filltensorfinfodtypeminsoftmaxmasked_fill_save_for_backward)r$   inputmaskrB   rmaskoutputs         r&   r/   XSoftmax.forward\   s    ''%**%&""5#(<<EKK0H0L0L#MOvxx0E1%v&r(   c                 V    U R                   u  n[        XX R                  U5      nUS S 4$ r   )saved_tensorsr   rB   )r$   grad_outputrQ   	inputGrads       r&   backwardXSoftmax.backwardh   s1    ''
)$VXX*02	$$$r(   c                 j   SS K Js  Jn  SSKJnJn  U R                  SX$R                  S   S9nU R                  SU R                  SU R                  S[        R                  " S[        R                  S	9S
9U5      UR                  S   S9nU" XUU R                  S[        R                  " [        R                  " UR                  5       R                  5       5      R                  5      S
95      n	U" X	U5      n	U" X	UU R                  S[        R                  " S[        R                  S	9S
95      $ )Nr   )rF   rK   CastLong)to_iSubConstantr   rI   )value_tByte)torch.onnx.symbolic_helperonnxsymbolic_helpertorch.onnx.symbolic_opset9rF   rK   opcast_pytorch_to_onnxrD   rG   int64rH   typerI   rJ   uint8)
gr$   rO   rB   sym_helprF   rK   mask_cast_valuer_maskrQ   s
             r&   symbolicXSoftmax.symbolico   s   55C$$D<<VD  FDDj%,,q*LM " ..v6  
 VDDU[[1B1B1D%E%I%IJ  LM
 C(vDDU\\!5;;%GDHJ 	Jr(   )rB   N)
r6   r7   r8   r9   __doc__staticmethodr/   rW   ro   r;    r(   r&   r?   r?   A   sE    4 	 	 % % J Jr(   r?   c                       \ rS rSrS rSrg)DropoutContext   c                 <    SU l         S U l        SU l        SU l        g )Nr   r   T)r!   rO   scale
reuse_maskr3   s    r&   r   DropoutContext.__init__   s    	
r(   )r!   rO   ry   rx   N)r6   r7   r8   r9   r   r;   rs   r(   r&   ru   ru      s    r(   ru   c                    [        U[        5      (       d  UnS nO9UR                  nX!R                  -  nUR                  (       a  UR
                  OS nUS:  aK  UcH  S[        R                  " U 5      R                  SU-
  5      -
  R                  [        R                  5      n[        U[        5      (       a  UR
                  c  X1l        X24$ )Nr   r   )
isinstanceru   r!   rx   ry   rO   rD   
empty_like
bernoulli_rC   rE   )rN   local_contextr!   rO   s       r&   get_maskr      s    m^44''&&&%2%=%=}!!4{t|E$$U+66q7{CCGGJJ -00%!%=r(   c            	           \ rS rSrSr\S 5       r\S 5       r\S\R                  R                  S\R                  R                  S\\\4   S\R                  R                  4S	 j5       rS
rg)XDropout   zlOptimized dropout function to save computation and memory by using mask operation instead of multiplication.c                     [        X5      u  p4SSU-
  -  U l        US:  a0  U R                  U5        UR                  US5      U R                  -  $ U$ )Ng      ?r   r   )r   rx   rM   rF   )ctxrN   	local_ctxrO   r!   s        r&   r/   XDropout.forward   sT     21w;'	Q;!!$'$$T1-		99Lr(   c                     U R                   S:  a/  U R                  u  nUR                  US5      U R                   -  S 4$ US 4$ )Nr   r   )rx   rT   rF   )r   rU   rO   s      r&   rW   XDropout.backward   sF    99q=((HT**43cii?EE$$r(   rk   rN   r   returnc                 |    SSK Jn  Un[        U[        5      (       a  UR                  nSnUR	                  XXE5      $ )Nr   )symbolic_opset12T)
torch.onnxr   r|   ru   r!   )rk   rN   r   r   	dropout_ptrains         r&   ro   XDropout.symbolic   s>     	0	i00!))I  '')CCr(   rs   N)r6   r7   r8   r9   rq   rr   r/   rW   rD   _CGraphValuer   floatru   ro   r;   rs   r(   r&   r   r      s    v  % % DEHHNN D588>> D!%"78D=BXX^^D Dr(   r   c                   H   ^  \ rS rSrSrU 4S jrS rS rS	S jrS r	Sr
U =r$ )
r      zo
Optimized dropout module for stabilizing the training

Args:
    drop_prob (float): the dropout probabilities
c                 J   > [         TU ]  5         Xl        SU l        S U l        g r*   )r   r   	drop_probcountcontext_stack)r$   r   r%   s     r&   r   StableDropout.__init__   s"    "
!r(   c                     U R                   (       a3  U R                  S:  a#  [        R                  XR	                  5       5      $ U$ )zR
Call the module

Args:
    x (`torch.tensor`): The input tensor to apply dropout
r   )trainingr   r   applyget_context)r$   xs     r&   r/   StableDropout.forward   s3     ==T^^a/>>!%5%5%788r(   c                      SU l         S U l        g r*   )r   r   r3   s    r&   clear_contextStableDropout.clear_context   s    
!r(   c                 x    U R                   c  / U l         SU l        U R                    H  nXl        X#l        M     g r*   )r   r   ry   rx   )r$   ry   rx   cs       r&   init_contextStableDropout.init_context   s7    %!#D
##A%LG $r(   c                 B   U R                   b  U R                  [        U R                   5      :  a#  U R                   R                  [	        5       5        U R                   U R                     nU R
                  Ul        U =R                  S-  sl        U$ U R
                  $ )Nr   )r   r   lenappendru   r   r!   )r$   r   s     r&   r   StableDropout.get_context   sw    )zzS!3!344"")).*:;$$TZZ0C..CKJJ!OJJ>>!r(   )r   r   r   )Tr   )r6   r7   r8   r9   rq   r   r/   r   r   r   r;   r<   r=   s   @r&   r   r      s&    "	"	" 	"r(   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )DebertaV2SelfOutputi  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  UR                  5      U l        [        UR                  5      U l
        g r   )r   r   r   r   r2   r   r   layer_norm_epsr   hidden_dropout_probr!   r#   s     r&   r   DebertaV2SelfOutput.__init__  sX    YYv1163E3EF
"6#5#5v7L7LM$V%?%?@r(   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r!   r   r$   r,   input_tensors      r&   r/   DebertaV2SelfOutput.forward  5    

=1]3}'CDr(   )r   r   r!   r6   r7   r8   r9   r   r/   r;   r<   r=   s   @r&   r   r     s    A r(   r   c                   :   ^  \ rS rSrU 4S jr    SS jrSrU =r$ )DebertaV2Attentioni  c                 n   > [         TU ]  5         [        U5      U l        [	        U5      U l        Xl        g r   )r   r   DisentangledSelfAttentionr$   r   rQ   r"   r#   s     r&   r   DebertaV2Attention.__init__  s+    -f5	)&1r(   c           	          U R                  UUUUUUS9nU(       a  Uu  pxUc  UnU R                  Xt5      n	U(       a  U	W4$ U	$ )N)query_statesrelative_posrel_embeddings)r$   rQ   )
r$   r,   attention_maskoutput_attentionsr   r   r   self_output
att_matrixattention_outputs
             r&   r/   DebertaV2Attention.forward  sd     ii%%)   
 &1#K(L;;{A$j11##r(   )r"   rQ   r$   FNNNr   r=   s   @r&   r   r     s       $ $r(   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )DebertaV2Intermediatei9  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r   r   r   r   r2   intermediate_sizer   r|   
hidden_actstrr	   intermediate_act_fnr#   s     r&   r   DebertaV2Intermediate.__init__;  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r(   r,   r   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   )r$   r,   s     r&   r/   DebertaV2Intermediate.forwardC  s&    

=100?r(   r   )
r6   r7   r8   r9   r   rD   Tensorr/   r;   r<   r=   s   @r&   r   r   9  s(    9U\\ ell  r(   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )DebertaV2OutputiJ  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR
                  UR                  5      U l        [        UR                  5      U l        Xl        g r   )r   r   r   r   r   r2   r   r   r   r   r   r!   r"   r#   s     r&   r   DebertaV2Output.__init__L  s]    YYv779K9KL
"6#5#5v7L7LM$V%?%?@r(   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r   s      r&   r/   DebertaV2Output.forwardS  r   r(   )r   r"   r   r!   r   r=   s   @r&   r   r   J  s     r(   r   c                   :   ^  \ rS rSrU 4S jr    SS jrSrU =r$ )DebertaV2Layeri[  c                    > [         TU ]  5         [        U5      U l        [	        U5      U l        [        U5      U l        g r   )r   r   r   	attentionr   intermediater   rQ   r#   s     r&   r   DebertaV2Layer.__init__]  s3    +F31&9%f-r(   c           	          U R                  UUUUUUS9nU(       a  Uu  pxU R                  U5      n	U R                  X5      n
U(       a  U
W4$ U
$ )Nr   r   r   r   r   r   rQ   )r$   r,   r   r   r   r   r   r   r   intermediate_outputlayer_outputs              r&   r/   DebertaV2Layer.forwardc  sm      >>/%%) * 
 +;("//0@A{{#6I *--r(   r   )NNNFr   r=   s   @r&   r   r   [  s     .    r(   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )	ConvLayeri~  c                 r  > [         TU ]  5         [        USS5      n[        USS5      n[        USS5      U l        [        R
                  " UR                  UR                  UUS-
  S-  US9U l        [        UR                  UR                  5      U l        [        UR                  5      U l        Xl        g )	Nconv_kernel_size   conv_groupsr   conv_acttanh   )paddinggroups)r   r   getattrr   r   Conv1dr2   convr   r   r   r   r!   r"   )r$   r"   kernel_sizer   r%   s       r&   r   ConvLayer.__init__  s    f&8!<2
F;II 1_*	 #6#5#5v7L7LM$V%?%?@r(   c                    U R                  UR                  SSS5      R                  5       5      R                  SSS5      R                  5       nSU-
  R                  5       nUR	                  UR                  S5      R                  UR                  5       5      S5        [        U R                     " U R                  U5      5      nX$-   nU R                  U5      R                  U5      nUc  UnU$ UR                  5       UR                  5       :w  aE  UR                  5       S:X  a   UR                  S5      R                  S5      nUR                  S5      nUR                  UR                  5      nXs-  nU$ )Nr   r   r      )r   permute
contiguousrE   rL   	unsqueezeexpandsizer	   r   r!   r   rC   rB   squeezerI   )	r$   r,   residual_states
input_maskoutrP   layer_norm_inputrQ   output_statess	            r&   r/   ConvLayer.forward  sH   ii--aA6AACDLLq!Z\ 	Z%%',33CHHJ?CT]]#DLL$56*0 01445EF"M  ~~#3#7#7#99>>#q(!+!3!3A!6!>!>q!AJ'11!4
#v||4J"/Mr(   )r   r"   r   r   r!   r   r=   s   @r&   r   r   ~  s     r(   r   c                   V   ^  \ rS rSrSrU 4S jrS rS rS	S jr     S
S jr	Sr
U =r$ )DebertaV2Encoderi  z8Modified BertEncoder with relative position bias supportc                   > [         TU ]  5         [        R                  " [	        UR
                  5       Vs/ s H  n[        U5      PM     sn5      U l        [        USS5      U l	        U R                  (       a  [        USS5      U l
        U R                  S:  a  UR                  U l
        [        USS5      U l        U R                  S-  nU R                  S:  a  U R                  S-  n[        R                  " UUR                  5      U l        [        US	S
5      R!                  5       R#                  S5       Vs/ s H  nUR%                  5       PM     snU l        SU R&                  ;   a$  [)        UR                  UR*                  SS9U l        [        USS5      S:  a  [-        U5      OS U l        SU l        g s  snf s  snf )Nrelative_attentionFmax_relative_positionsr   r   position_bucketsr   r   norm_rel_ebdnone|
layer_normT)elementwise_affiner   )r   r   r   
ModuleListrangenum_hidden_layersr   layerr   r
  r  max_position_embeddingsr  	Embeddingr2   r   lowersplitstripr  r   r   r   r   gradient_checkpointing)r$   r"   _pos_ebd_sizer   r%   s        r&   r   DebertaV2Encoder.__init__  s   ]]-263K3K-LM-L^F#-LMO
")&2F"N""*1&2JB+PD'**Q..4.L.L+$+F4F$KD!66:L$$q(#44q8"$,,|/5/A/A#CD
 V^V<BBDJJ3O
O GGIO

 4,,,&""%%#')DN
 *19K12*467*8If%=A 		&+#? N$
s   G Gc                     U R                   (       a  U R                  R                  OS nUb!  SU R                  ;   a  U R	                  U5      nU$ )Nr  )r
  r   weightr  r   )r$   r   s     r&   get_rel_embedding"DebertaV2Encoder.get_rel_embedding  sE    7;7N7N,,33TX%<4;L;L+L!^^N;Nr(   c                     UR                  5       S::  aT  UR                  S5      R                  S5      nX"R                  S5      R                  S5      -  nUR                  5       nU$ UR                  5       S:X  a  UR                  S5      nU$ )Nr   r   r   r   )rB   r   r   byte)r$   r   extended_attention_masks      r&   get_attention_mask#DebertaV2Encoder.get_attention_mask  s    1$&4&>&>q&A&K&KA&N#47V7V8IbM"N+002N  !Q&+55a8Nr(   c                     U R                   (       aW  UcT  Ub  UR                  S5      OUR                  S5      n[        UUR                  S5      U R                  U R                  S9nU$ )Nr$  bucket_sizemax_position)r
  r   build_relative_positionr  r  )r$   r,   r   r   qs        r&   get_rel_posDebertaV2Encoder.get_rel_pos  sq    ""|';#/ !!5B5G5G5K 2""2& 11!88	:L
 r(   c           
        ^ UR                  5       S::  a  UnO"UR                  S5      S:  R                  5       nU R                  U5      nU R	                  XU5      nU(       a  SOS n	T(       a  SOS n
[        U[        5      (       a  US   nOUnU R                  5       nUn[        U R                  5       H  u  pU(       a  X4-   n	U R                  (       aL  U R                  (       a;  U4S jn[        R                  R                  R                  U" U5      UUUUU5      nOU" UUUUUTS9nT(       a  Uu  nnUS:X  a  U R                  b  U R                  XU5      nUb=  Un[        U[        5      (       a%  US-   [!        U R                  5      :  a  XS-      OS nOUnT(       d  M  U
W4-   n
M     U(       a  X4-   n	U(       d  [#        S XU
4 5       5      $ [%        UU	U
S	9$ )
Nr   r$  r   rs   c                    >^  U U4S jnU$ )Nc                     > T" / U QTP76 $ r   rs   )inputsmoduler   s    r&   custom_forwardODebertaV2Encoder.forward.<locals>.create_custom_forward.<locals>.custom_forward  s    %AvA/@AAr(   rs   )r5  r6  r   s   ` r&   create_custom_forward7DebertaV2Encoder.forward.<locals>.create_custom_forward  s    B *)r(   )r   r   r   r   r   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   rs   ).0vs     r&   	<genexpr>+DebertaV2Encoder.forward.<locals>.<genexpr><  s      "Ma Ms   	last_hidden_stater,   
attentions)rB   sumr%  r'  r/  r|   r   r!  	enumerater  r  r   rD   utils
checkpointr   r   tupler   )r$   r,   r   output_hidden_statesr   r   r   return_dictr  all_hidden_statesall_attentionsnext_kvr   r  ilayer_moduler8  att_ms       `             r&   r/   DebertaV2Encoder.forward  s	    1$'J(,,R014::<J00@''(46 #7BD0dmX..#A&G#G//1(4OA#$58I$I!**t}}* !& 6 6 A A),7"  "! !-"!-!-#1&7! !'4$uAv$))/ $		-*4!6 ',mX6667!ec

?$ 7$mE2)-  (  !/5)!;a  5d   14E E ")nM" " " ,++%' 	'r(   )	r   r   r  r  r  r  r  r   r
  )NN)TFNNT)r6   r7   r8   r9   rq   r   r!  r'  r/  r/   r;   r<   r=   s   @r&   r  r    s8    B#,J		 "W' W'r(   r  c           	         [         R                  " U 5      nUS-  n[         R                  " X:  X* :  -  [         R                  " US-
  5      R	                  U 5      [         R
                  " U 5      5      n[         R                  " [         R                  " XT-  5      [         R                  " [         R                  " US-
  U-  5      5      -  US-
  -  5      U-   n[         R                  " XT:*  U R	                  U5      Xc-  5      nU$ )Nr   r   )rD   signwhererG   type_asabsceillog)r   r+  r,  rQ  midabs_poslog_pos
bucket_poss           r&   make_log_bucket_positionr[  E  s    ::l#D

Ckk		t 34S1W%%l3		,G 	

IIgm$ii!S(* ++.1Ag7	8 ;>	> 
 W^\-A-A'-J$^-Jr(   c                 .   [         R                  " SU 5      n[         R                  " SU5      nUSS2S4   USSS24   -
  nUS:  a  US:  a  [        XbU5      nUR                  [         R                  5      nUSU 2SS24   nUR                  S5      nU$ )a6  
Build relative position according to the query and key

We assume the absolute position of query \(P_q\) is range from (0, query_size) and the absolute position of key
\(P_k\) is range from (0, key_size), The relative positions from query to key is \(R_{q \rightarrow k} = P_q -
P_k\)

Args:
    query_size (int): the length of query
    key_size (int): the length of key
    bucket_size (int): the size of position bucket
    max_position (int): the maximum allowed absolute position

Return:
    `torch.LongTensor`: A tensor with shape [1, query_size, key_size]

r   N)rD   aranger[  rC   longr   )
query_sizekey_sizer+  r,  q_idsk_idsrel_pos_idss          r&   r-  r-  W  s    * LLJ'ELLH%E4.5q>1KQ<!+.{/;=..,Kkzk1n-K''*Kr(   c                     U R                  UR                  S5      UR                  S5      UR                  S5      UR                  S5      /5      $ )Nr   r   r   r   r   r   )c2p_posquery_layerr   s      r&   c2p_dynamic_expandrh  x  sR     >>"	  r(   c                     U R                  UR                  S5      UR                  S5      UR                  S5      UR                  S5      /5      $ )Nr   r   r$  re  )rf  rg  	key_layers      r&   p2c_dynamic_expandrk    sN     >>rr	  r(   c                     U R                  UR                  5       S S U R                  S5      UR                  S5      4-   5      $ )Nr   r$  re  )	pos_indexp2c_attrj  s      r&   pos_dynamic_expandro    sH     GLLN2A.(~~b19>>"3EFG H Hr(   c                   J   ^  \ rS rSrSrU 4S jrS r    SS jrS rSr	U =r
$ )	r   i  a  
Disentangled self-attention module

Parameters:
    config (`DebertaV2Config`):
        A model config class instance with the configuration to build a new model. The schema is similar to
        *BertConfig*, for more details, please refer [`DebertaV2Config`]

c                   > [         TU ]  5         UR                  UR                  -  S:w  a&  [	        SUR                   SUR                   S35      eUR                  U l        UR                  UR                  -  n[        USU5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  SS9U l
        [        R                  " UR                  U R                  SS9U l        [        R                  " UR                  U R                  SS9U l        [        USS	5      U l        UR                  b  UR                  O/ U l        [        US
S	5      U l        U R                  (       Ga!  [        USS5      U l        [        USS5      U l        U R"                  S:  a  UR$                  U l        U R"                  U l        U R                   S:  a  U R                   U l        [)        UR*                  5      U l        U R                  (       d  SU R                  ;   a/  [        R                  " UR                  U R                  SS9U l        SU R                  ;   a0  [        R                  " UR                  U R                  5      U l        [)        UR2                  5      U l        g )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()attention_head_sizeTbiasshare_att_keyFr
  r  r   r  r   c2pp2c)r   r   r2   num_attention_heads
ValueErrorr   rs  all_head_sizer   r   
query_projkey_proj
value_projrv  pos_att_typer
  r  r  r  r  r   r   pos_dropoutpos_key_projpos_query_projattention_probs_dropout_probr!   )r$   r"   _attention_head_sizer%   s      r&   r   "DisentangledSelfAttention.__init__  sg    : ::a?#F$6$6#7 8 445Q89 9 $*#=#= %11V5O5OO#*63H+?$A !558P8PP)) 2 2?		 2 2?)) 2 2? %V_eD393F3F3RF//XZ")&2F"N"""$+F4F$KD!*1&2JB+PD'**Q..4.L.L+ $ ; ;D$$q($($9$9!,V-G-GHD%%D---(*		**D,>,>T)KD%D---*,))F4F4F484F4F+HD' %V%H%HIr(   c                     UR                  5       S S US4-   nUR                  U5      nUR                  SSSS5      R                  5       R                  SUR                  S5      UR                  S5      5      $ )Nr   r   r   r   r   )r   viewr   r   )r$   r   attention_headsnew_x_shapes       r&   transpose_for_scores.DisentangledSelfAttention.transpose_for_scores  so    ffhsm&;;FF;yyAq!$//166r166!978vvbzC 	Cr(   c           	      t   Uc  UnU R                  U R                  U5      U R                  5      nU R                  U R                  U5      U R                  5      nU R                  U R	                  U5      U R                  5      n	Sn
SnSU R
                  ;   a  US-  nSU R
                  ;   a  US-  n[        R                  " [        R                  " UR                  S5      [        R                  S9U-  5      n[        R                  " XxR                  SS5      5      [        R                  " XR                  S9-  nU R                  (       a%  U R                  U5      nU R!                  XxUUU5      n
U
b  X-   nUnUR#                  SU R                  UR                  S5      UR                  S5      5      n[$        R'                  XS5      nU R)                  U5      n[        R                  " UR#                  SUR                  S5      UR                  S5      5      U	5      nUR#                  SU R                  UR                  S5      UR                  S5      5      R+                  SS	SS
5      R-                  5       nUR                  5       SS S-   nUR#                  U5      nU(       a  X4$ U$ )a-  
Call the module

Args:
    hidden_states (`torch.FloatTensor`):
        Input states to the module usually the output from previous layer, it will be the Q,K and V in
        *Attention(Q,K,V)*

    attention_mask (`torch.ByteTensor`):
        An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
        sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
        th token.

    output_attentions (`bool`, optional):
        Whether return the attention matrix.

    query_states (`torch.FloatTensor`, optional):
        The *Q* state in *Attention(Q,K,V)*.

    relative_pos (`torch.LongTensor`):
        The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
        values ranging in [*-max_relative_positions*, *max_relative_positions*].

    rel_embeddings (`torch.FloatTensor`):
        The embedding of relative distances. It's a tensor of shape [\(2 \times
        \text{max_relative_positions}\), *hidden_size*].


Nr   rw  rx  r   r_   r$  r   r   r   )r   )r  r|  ry  r}  r~  r  rD   sqrtrG   r   r   bmm	transposerI   r
  r  disentangled_attention_biasr  r?   r   r!   r   r   )r$   r,   r   r   r   r   r   rg  rj  value_layerrel_attscale_factorrx   attention_scoresattention_probscontext_layernew_context_layer_shapes                    r&   r/   !DisentangledSelfAttention.forward  s   L (L//OOL)4+C+CE--MM-($*B*BD	//OOM*D,D,DF D%%%ALD%%%AL

LL))"-U[[A !99[2E2E3 ||..00 ""!--n=N66{7C7E7CEG
 /9++00T5M5M1A1F1Fr1J1A1F1Fr1JL
 #..)92N,,7		  _%9%9"%=!0!5!5b!9;<GI r4#;#;,11"5,11"577>wq!Q?@8BBL*,	 	
 #0"4"4"6s";f"D%**+BC!33  r(   c                 P   Uc@  UR                  S5      n[        UUR                  S5      U R                  U R                  S9nUR	                  5       S:X  a!  UR                  S5      R                  S5      nOVUR	                  5       S:X  a  UR                  S5      nO0UR	                  5       S:w  a  [        SUR	                  5        35      eU R                  nUR                  5       R                  UR                  5      nUSUS-  2S S 24   R                  S5      nU R                  (       a  U R                  U R                  U5      U R                  5      R                  UR                  S5      U R                  -  SS5      nU R                  U R!                  U5      U R                  5      R                  UR                  S5      U R                  -  SS5      n	OS	U R"                  ;   aX  U R                  U R%                  U5      U R                  5      R                  UR                  S5      U R                  -  SS5      n	S
U R"                  ;   aX  U R                  U R'                  U5      U R                  5      R                  UR                  S5      U R                  -  SS5      nSn
S	U R"                  ;   Ga  [(        R*                  " [(        R,                  " W	R                  S5      [(        R.                  S9U-  5      n[(        R0                  " XR3                  SS5      5      n[(        R4                  " X7-   SUS-  S-
  5      n[(        R6                  " USUR9                  S5      R;                  UR                  S5      UR                  S5      UR                  S5      /5      S9nX[(        R,                  " XR<                  S9-  -  n
S
U R"                  ;   Ga  [(        R*                  " [(        R,                  " WR                  S5      [(        R.                  S9U-  5      nUR                  S5      UR                  S5      :w  ai  [        UR                  S5      UR                  S5      U R                  U R                  S9R                  UR                  5      nUR                  S5      nOUn[(        R4                  " U* U-   SUS-  S-
  5      n[(        R0                  " X(R3                  SS5      5      n[(        R6                  " USUR9                  S5      R;                  UR                  S5      UR                  S5      UR                  S5      /5      S9R3                  SS5      nU
U[(        R,                  " UUR<                  S9-  -  n
U
$ )Nr$  r*  r   r   r   r   r   z2Relative position ids must be of dim 2 or 3 or 4. rw  rx  r   r_   )rB   index)r   r-  r  r  rB   r   rz  r  r^  rC   devicerv  r  r|  ry  repeatr}  r  r  r  rD   r  rG   r   r  r  clampgatherr   r   rI   )r$   rg  rj  r   r   r  r.  att_spanpos_query_layerpos_key_layerscorerx   c2p_attrf  r_posp2c_posrn  s                    r&   r  5DisentangledSelfAttention.disentangled_attention_bias.  s     $A2r" 11!88	:L
 "'11!4>>qAL1$'11!4L1$D\EUEUEWDXY  $$#((*--k.@.@A'(Q,(9:DDQG"77/((**0&$$Q'4+C+CCQ+K  !55n-((**0&$$Q'4+C+CCQ+K 
 ))) $ 9 9%%n5,,!..4f#((+t/G/GG/ 
 )))"&";";''7,,#..4f#((+t/G/GG/   D%%%JJ]//35;;G E ii-D-DR-LMGkk,"91hlQ>NOGllooa(//$$Q'$$Q' %%b)1 G u||EGGGE D%%%JJ_11"5U[[I E ~~b![%5%5b%99/NN2&NN2& $ 5 5!%!<!<	
 "[''(  *$kk5&8"3Q1q8HIGii	+D+DR+LMGllooa(//$$Q'NN2&NN2&1  iB  Wu||EGGGEr(   )r{  rs  r!   r}  r  ry  r  r  r  r  r  r  r|  r
  rv  r~  r   )r6   r7   r8   r9   rq   r   r  r/   r  r;   r<   r=   s   @r&   r   r     s6    *JXC  Z!x^ ^r(   r   c                   @   ^  \ rS rSrSrU 4S jr     SS jrSrU =r$ )DebertaV2Embeddingsi  zGConstruct the embeddings from word, position and token_type embeddings.c                   > [         TU ]  5         [        USS5      n[        USUR                  5      U l        [
        R                  " UR                  U R                  US9U l        [        USS5      U l	        U R                  (       d  S U l
        O0[
        R                  " UR                  U R                  5      U l
        UR                  S:  a0  [
        R                  " UR                  U R                  5      U l        U R                  UR                  :w  a/  [
        R                  " U R                  UR                  SS9U l        [!        UR                  UR"                  5      U l        [%        UR&                  5      U l        Xl        U R-                  S	[.        R0                  " UR                  5      R3                  S
5      5        g )Npad_token_idr   embedding_size)padding_idxposition_biased_inputTFrt  position_ids)r   r   )r   r   r   r2   r  r   r  
vocab_sizeword_embeddingsr  position_embeddingsr  type_vocab_sizetoken_type_embeddingsr   
embed_projr   r   r   r   r!   r"   register_bufferrD   r]  r   )r$   r"   r  r%   s      r&   r   DebertaV2Embeddings.__init__  sv   v~q9%f.>&,&8&8:!||t22 N &-V5L-1&3"))'+D$')||..0C0C(ED$ !!A%)+f6L6L6:6I6I*KD& &"4"44 ii##V%7%7eEDO"6#5#5v7L7LM$V%?%?@ 	LL778??H	Jr(   c                    Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2S U24   nUc8  [        R                  " U[        R                  U R                  R
                  S9nUc  U R                  U5      nU R                  b   U R                  UR	                  5       5      nO[        R                  " U5      nUn	U R                  (       a  X-  n	U R                  R                  S:  a  U R                  U5      n
X-  n	U R                  U R                  R                  :w  a  U R                  U	5      n	U R!                  U	5      n	Ub  UR#                  5       U	R#                  5       :w  aE  UR#                  5       S:X  a   UR%                  S5      R%                  S5      nUR'                  S5      nUR)                  U	R*                  5      nX-  n	U R-                  U	5      n	U	$ )Nr   r   rI   r  r   r   r   )r   r  rD   zerosr^  r  r  r  
zeros_liker  r"   r  r  r  r2   r  r   rB   r   r   rC   rI   r!   )r$   	input_idstoken_type_idsr  rO   inputs_embedsinput_shape
seq_lengthr  
embeddingsr  s              r&   r/   DebertaV2Embeddings.forward  s     #..*K',,.s3K ^
,,Q^<L!"[[5::d6G6G6N6NPN   00;M##/"&":":<;L;L;N"O"'"2"2="A"
%%-J;;&&*$($>$>~$N!/J$++"9"994J^^J/
xxzZ^^--88:?<<?2215D~~a(77:++,D#*J\\*-
r(   )	r   r"   r!   r  r  r  r  r  r  )NNNNN)	r6   r7   r8   r9   rq   r   r/   r;   r<   r=   s   @r&   r  r    s(    QJB #!"2 2r(   r  c                   j   ^  \ rS rSrSr\rSrS/rS/r	Sr
U 4S jrS rSS	 jr\U 4S
 j5       rSrU =r$ )DebertaV2PreTrainedModeli  zz
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
debertar  r  Tc                 b   > [         TU ]  " UR                  40 UD6  [         [        U ]  U5        g r   )r   r   name_or_pathr   r$   r"   kwargsr%   s      r&   r   !DebertaV2PreTrainedModel.__init__  s*    ,,77eT#F+r(   c                 F   [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR                  b2  UR                  R                  UR                     R                  5         ggg)zInitialize the weights.g        )meanstdN)r|   r   r   r   datanormal_r"   initializer_rangeru  zero_r  r  )r$   r5  s     r&   _init_weights&DebertaV2PreTrainedModel._init_weights  s    fbii(( MM&&dkk;; ' ={{&  &&( '--MM&&dkk;; ' =!!-""6#5#56<<> . .r(   c                 <    [        U[        5      (       a  X!l        g g r   )r|   r  r  )r$   r5  values      r&   _set_gradient_checkpointing4DebertaV2PreTrainedModel._set_gradient_checkpointing  s    f.//,1) 0r(   c                 ~   > UR                  SS 5      nUc  [        S0 UD6nU " U5      nU$ [        [        U ]  US9nU$ )N	model_dir)pretrained_model_name_or_pathrs   )popr   r   r   from_pretrained)clsr  r  ponet_configmodelr%   s        r&   _instantiate%DebertaV2PreTrainedModel._instantiate  s[    JJ{D1	*4V4L%E
  %CL_M  r(   rs   )F)r6   r7   r8   r9   rq   r   config_classbase_model_prefix_keys_to_ignore_on_load_missing"_keys_to_ignore_on_load_unexpectedsupports_gradient_checkpointingr   r  r  classmethodr  r;   r<   r=   s   @r&   r  r    sP    
 #L!'5&6#*?)@&&*#,?2 	 	r(   r  )module_namec                     ^  \ rS rSrSrU 4S jrS rS rS r        SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\\\4   4S jjrSrU =r$ )DebertaV2Modeli  a  The bare DeBERTa_v2 Model transformer outputting raw hidden-states without any specific head on top.

The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build
on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.

This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.

Parameters:
    config (`DebertaV2Config`): Model configuration class with all the parameters of the model.
        Initializing with a config file does not load the weights associated with the model, only the
        configuration.
c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        SU l        Xl        U R                  5         g r*   )	r   r   r  r  r  encoderz_stepsr"   	post_initr  s      r&   r   DebertaV2Model.__init__-  s>     -f5'/r(   c                 .    U R                   R                  $ r   r  r  r3   s    r&   get_input_embeddings#DebertaV2Model.get_input_embeddings7  s    ...r(   c                 $    XR                   l        g r   r  )r$   new_embeddingss     r&   set_input_embeddings#DebertaV2Model.set_input_embeddings:  s    *8'r(   c                     [        S5      e)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
z7The prune function is not implemented in DeBERTa model.)NotImplementedError)r$   heads_to_prunes     r&   _prune_headsDebertaV2Model._prune_heads=  s    
 "EG 	Gr(   r  r   r  r  r  r   rG  rH  r   c	           
      r   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  Ub  [	        S5      eUb  UR                  5       n	O"Ub  UR                  5       SS n	O[	        S5      eUb  UR                  OUR                  n
Uc  [        R                  " XS9nUc$  [        R                  " U	[        R                  U
S9nU R                  UUUUUS9nU R                  UUSUUS	9nUS
   nU R                  S
:  a  US   n[        U R                  5       Vs/ s H  oR                  R                  S   PM     nnUS   nU R                  R!                  5       nU R                  R#                  U5      nU R                  R%                  U5      nUS
S  H  nU" UUSUUUS9nUR'                  U5        M!     US   nU(       d  U4UU(       a  S
S -   $ SS -   $ [)        UU(       a  UR*                  OSUR,                  S9$ s  snf )u	  
Args:
    input_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`):
        Indices of input sequence tokens in the vocabulary.

    attention_mask (`torch.FloatTensor` of shape `('batch_size, sequence_length')`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

    token_type_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*):
        Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
        1]`:

        - 0 corresponds to a *sentence A* token,
        - 1 corresponds to a *sentence B* token.

    position_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*):
        Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
        `[0,config.max_position_embeddings - 1]`.

    inputs_embeds (`torch.FloatTensor` of shape `('batch_size, sequence_length', hidden_size)`, *optional*):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert *input_ids* indices into associated
        vectors than the model's internal embedding lookup matrix.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
        tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
        more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a dataclass instead of a plain tuple.

Returns:
    Returns `modelscope.outputs.AttentionBackboneModelOutput`

Examples:
    >>> from modelscope.models import Model
    >>> from modelscope.preprocessors import Preprocessor
    >>> model = Model.from_pretrained('damo/nlp_debertav2_fill-mask_chinese-lite', task='backbone')
    >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_debertav2_fill-mask_chinese-lite')
    >>> print(model(**preprocessor('这是个测试')))
NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embeds)r  r  )r  r  r  rO   r  T)rG  r   rH  r   r$  Fr   r   r?  )r"   r   rG  use_return_dictrz  r   r  rD   onesr  r^  r  r  r  r  r  r!  r'  r/  r   r   r,   rA  )r$   r  r   r  r  r  r   rG  rH  r  r  embedding_outputencoder_outputsencoded_layersr,   r  layersr   r   rel_posr  sequence_outputs                         r&   r/   DebertaV2Model.forwardE  s   r 2C1N-TXT_T_TqTq$8$D KK,, 	 &1%<k$++B]B] ]%>V  "#..*K&',,.s3KGI I &/%:!!@T@T!"ZZCN!"[[5::f>N  ??)%' + 
 ,,!%/# ' 
 )+<<!*2.M6;DLL6IJ6Ill((,6IFJ)"-L!\\;;=N!\\<<^LNll../?@G$!"&+!-!(#1  %%l3 $ ),#&*3*5 5 5013*5 5 5 ,-# *77)-&11	
 	
- Ks   	"H4)r"   r  r  r  )NNNNNNNN)r6   r7   r8   r9   rq   r   r  r  r  r   rD   r   rE   r   r   r   r/   r;   r<   r=   s   @r&   r  r    s    "/9G -11515/304,0/3&*A
ELL)A
 !.A
 !.	A

 u||,A
  -A
 $D>A
 'tnA
 d^A
 
u22	3A
 A
r(   r  )r   r   )Brq   collections.abcr   typingr   r   r   rD   torch.utils.checkpointr   torch.nnr   transformers.activationsr	   transformers.modeling_utilsr
   transformers.pytorch_utilsr   modelscope.metainfor   modelscope.modelsr   r   modelscope.models.builderr   modelscope.outputsr   modelscope.utilsr   loggingmodelscope.utils.constantr   configurationr   
get_loggerModuler   autogradFunctionr?   objectru   r   r   r   r   r   r   r   r   r   r  r[  r-  jitscriptrh  rk  ro  r   r  r  register_modulebackbone
deberta_v2r  rs   r(   r&   <module>r     s   ! $ ) )     + 7 < & / , ; . + *				'BII '2DJu~~&& DJPV *%Du~~&& %DR."BII ."d"))  "$ "$LBII "bii "  RYY   F&		 &R['ryy ['|( )+)+B     H H
w		 wvU")) Ur-z? -` F4E4EFk
- k
 Gk
r(   