
    9i                     p   S r SSKrSSKJr  SSKJrJr  SSKrSSKJ	r	  SSK
rSSKJrJr  SSKJrJr  SSKJr  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJrJrJ r   SSK!J"r"  SSK#J$r$  \RJ                  " 5       r/ SQr& " S S\	RN                  5      r( " S S\	RN                  5      r) " S S\	RN                  5      r* " S S\	RN                  5      r+ " S S\	RN                  5      r, " S S\	RN                  5      r- " S S\	RN                  5      r. " S S \	RN                  5      r/ " S! S"\	RN                  5      r0 " S# S$\	RN                  5      r1 " S% S&\\5      r2\ " S' S(\5      5       r3\ " S) S*\5      5       r4S+r5S,r6\" S-\55       " S. S/\25      5       r7 " S0 S1\25      r8 " S2 S3\	RN                  5      r9g)4zPyTorch PEER model.     N)	dataclass)OptionalTuple)ACT2FNget_activation)ModelOutputadd_start_docstrings))BaseModelOutputWithPastAndCrossAttentions)PreTrainedModel)Model
TorchModel)logger)parse_labels_in_order)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer   )
PeerConfig)SequenceSideInfo)zgoogle/peer-small-generatorzgoogle/peer-base-generatorzgoogle/peer-large-generatorzgoogle/peer-small-discriminatorzgoogle/peer-base-discriminatorzgoogle/peer-large-discriminatorc                   L   ^  \ rS rSrSrU 4S jrSSSSS\" 5       4S jrSrU =r	$ )PeerEmbeddings4   zGConstruct the embeddings from word, position and token_type embeddings.c                 .  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        U R#                  S[$        R&                  " UR                  5      R)                  S5      5        [+        USS/5      U l        SU R,                  ;   a8  SU l        [        R                  " U R.                  UR
                  5      U l        g g )	N)padding_idxepsposition_ids)r   position_embedding_typeabsolute#absolute_token_position_in_sentence   )super__init__nn	Embedding
vocab_sizeembedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandgetattrr   side_info_size/position_embeddings__token_position_in_sentenceselfconfig	__class__s     c/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/nlp/peer/backbone.pyr$   PeerEmbeddings.__init__7   sF   !||!!++ - $&<<0N0N060E0E$G %'\\&2H2H282G2G&I"
 !!v'<'<>zz&"<"<= 	LL778??H	J (/v/H0:|(=$ 1D4P4PP"$DCE<<##V%:%:D<D@ Q    Nr   c                    Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2XXU-   24   nUc8  [        R                  " U[        R                  U R                  R
                  S9nUc  U R                  U5      nU R                  U5      n	XI-   n
SU R                  ;   a  U R                  U5      nX-  n
SU R                  ;   a;  [        R                  " US   SU R                  S-
  S9nU R                  U5      nX-  n
S	U R                  ;   a*  S
U;  a  [        5       US
'   U R                  U5      US
   S'   U R                  U
5      n
U R                  U
5      n
U
$ )Nr   r   dtypedevicer    r!   ss_token_position_in_sentencer   minmaxabsolute_self_only
embeddingsss_token_position_in_sequence)sizer   r5   zeroslongrE   r*   r.   r   r,   clampr9   r:   dictr/   r3   )r<   	input_idstoken_type_idsr   inputs_embedspast_key_values_lengthside_info_setsinput_shape
seq_lengthr.   rK   r,   position_idxr:   s                 r?   forwardPeerEmbeddings.forwardU   s     #..*K',,.s3K ^
,,Q-C/EEF .F.F GL !"[[5::d6G6G6N6NPN   00;M $ : :> J":
555"&":":<"H-J0D4P4PP ;;>?''!+-L ?C>r>r?;IJ  4#?#??>1/3v|,373K3K 4" <(/1 ^^J/
\\*-
rA   )r/   r3   r   r,   r:   r9   r.   r*   )
__name__
__module____qualname____firstlineno____doc__r$   rQ   rZ   __static_attributes____classcell__r>   s   @r?   r   r   4   s,    Q<@ #$65 5rA   r   c                   P   ^  \ rS rSrU 4S jrS rSSSSSS\" 5       4S jrSrU =r	$ )PeerSelfAttention   c                   > [         TU ]  5         UR                  UR                  -  S:w  a5  [	        US5      (       d$  [        SUR                  UR                  4-  5      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  5      U l        [#        USS/5      U l        SU R$                  ;   aI  UR&                  S-  U l        [        R*                  " SU R(                  -  U R                  5      U l        GOS	U R$                  ;   a]  UR&                  S-  U l        S
U l        [        R*                  " SU R(                  -  U R.                  -  U R                  5      U l        GOSU R$                  ;   a  UR&                  S-  U l        U R(                  S-  U l        [        R*                  " SU R(                  -  U R                  5      U l        [        R*                  " SU R0                  -  U R                  5      U l        OSU R$                  ;   an  UR&                  S-  U l        U R(                  S-  U l        SU R(                  -  SU R0                  -  -  n[        R*                  " UU R                  5      U l        OgSU R$                  ;   d  SU R$                  ;   aG  UR&                  S-  U l        [        R*                  " SU R(                  -  U R                  5      U l        UR4                  U l        g )Nr   r(   zLThe hidden size (%d) is not a multiple of the number of attention heads (%d)r   r    relative_scalar_bias      )relative_scalar_bias_with_side_info_tokenr"   (relative_scalar_bias_token_plus_sentence,relative_scalar_bias_with_side_info_sentencerelative_keyrelative_key_query)r#   r$   hidden_sizenum_attention_headshasattr
ValueErrorintattention_head_sizeall_head_sizer%   Linearquerykeyvaluer1   attention_probs_dropout_probr3   r8   r   r+    max_relative_position_embeddingsr&   distance_embeddingr9   $max_sen_relative_position_embeddingsdistance_embedding_sentence
is_decoder)r<   r=   vocabr>   s      r?   r$   PeerSelfAttention.__init__   sY    : ::a?(I* I*##V%?%?@AB B
 $*#=#= #&v'9'9)/)C)C(D $E !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'.v/H0:|(=$ "T%A%AA4:4R4RVW4WD1&(llD999(('*D# 9D<X<XX4:4R4RVW4WD1"$D&(llT:::%%&'+'?'?'AD# 84;W;WW4:4R4RVW4WD18<8]8]ab8bD5&(llD999(('*D# 02||D===((0*D, <t?[?[[4:4R4RVW4WD18<8]8]ab8bD5>>>D===?E&(ll5373K3K'MD# t;;;?SW[WsWs?s4:4R4RVW4WD1&(llD999(('*D# !++rA   c                     UR                  5       S S U R                  U R                  4-   nUR                  " U6 nUR	                  SSSS5      $ )Nr   r   rj   r      )rM   rq   ru   viewpermute)r<   xnew_x_shapes      r?   transpose_for_scores&PeerSelfAttention.transpose_for_scores   sS    ffhsmt'?'?'+'?'?'A AFFK yyAq!$$rA   NFc	                 
   U R                  U5      n	US Ln
U
(       a  Ub  US   nUS   nUnGOU
(       aC  U R                  U R                  U5      5      nU R                  U R                  U5      5      nUnOUbu  U R                  U R                  U5      5      nU R                  U R                  U5      5      n[        R
                  " US   U/SS9n[        R
                  " US   U/SS9nO@U R                  U R                  U5      5      nU R                  U R                  U5      5      nU R                  U	5      nU R                  (       a  X4n[        R                  " XR                  SS5      5      [        R                  " U R                  5      -  nSnSU R                  ;   a  XS   -  nUS-  nS	U R                  ;   dQ  S
U R                  ;   dA  SU R                  ;   d1  SU R                  ;   d!  SU R                  ;   d  SU R                  ;   GaX  US   nU R                  U5      nUR                  UR                  S9nSU R                  ;   a@  UR!                  / SQ5      R#                  S5      nU[        R                  " U5      -  U-   nGOSU R                  ;   d  SU R                  ;   a1  UR!                  / SQ5      nU[        R                  " U5      -  U-   nGOSU R                  ;   a  UR!                  / SQ5      R#                  S5      nUS   nU R%                  U5      nUR                  UR                  S9nUR!                  / SQ5      nU[        R                  " U5      -  U-   U-   nGOS	U R                  ;   a[  [        R&                  " SUU5      [        R                  " U R                  5      -  nUS-  nUU-   [        R                  " U5      -  nOS
U R                  ;   aw  [        R&                  " SUU5      n[        R&                  " SUU5      nUU-   [        R                  " U R                  5      -  nUS-  nUU-   [        R                  " U5      -  nOU[        R                  " U5      -  nUb  X-   n[(        R*                  " SS9" U5      nU R-                  U5      nUb  UU-  n[        R                  " UU5      nUR!                  SSSS5      R/                  5       nUR1                  5       S S U R2                  4-   nUR4                  " U6 nU(       a  UU4OU4nU R                  (       a  UU4-   nU$ )Nr   r   rj   dimr   rJ   side_info_attention_scoresrn   ro   rh   rk   rl   rm   distance_idx)rD   )rj   r   r   )r   r   r   rj   distance_idx_sentencezbhld,lrd->bhlrzbhrd,lrd->bhlrr   )rx   r   ry   rz   r5   catr   matmul	transposemathsqrtru   r   r}   torD   r   	unsqueezer   einsumr%   Softmaxr3   
contiguousrM   rv   r   )r<   hidden_statesattention_mask	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsrV   mixed_query_layeris_cross_attention	key_layervalue_layerquery_layerattention_scoresattention_scores_termsr   positional_embeddingrh   r   positional_embedding_sentencerelative_scalar_bias_sentencerelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                                r?   rZ   PeerSelfAttention.forward   s    !JJ}5
 3$>."<&q)I(+K3N11./1I33

013K3N'11$((=2IJI33DJJ}4MNK		>!#4i"@aHI))^A%6$D!LK11$((=2IJI33DJJ}4MNK//0AB?? (5N !<<5H5H6 yy!9!9:;!"4#?#??,!. ."a'"T999=QUYUqUq=q)T-I-II>$B^B^^=A]A]]ATEaEaa).9L#'#:#:<#H #7#:#:!'' $; $)  &)E)EE';'C'C((y| %#3dii*7, $,.B$C  >112C112';'C'C ("$#3dii*7, $,.B$C  <t?[?[[';'C'C((y| % )77N(O%040P0P)1+-0M0P0P%++ 1Q 1--0M0U0U 1"- $4dii*7 $($)+H$I   4#?#??+0<<$k(,*,0IId6N6N,O,P( '!+&$47O$O(,		2H(I$J %)E)EE16$k3G2I./4||$i1E0G, 3236:ii0072,2( '!+&$47O$O(,		2H(I$J   0$))&3(  ( %/@ **,-=> ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";?" #"%**,CD (9 !"$?L>O 	 ?? 22GrA   )rv   ru   r}   r   r3   r   ry   r|   r~   rq   r   rx   r9   rz   )
r\   r]   r^   r_   r$   r   rQ   rZ   ra   rb   rc   s   @r?   re   re      s3    >,@%  "&#'#6Z ZrA   re   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )PeerSelfOutputir  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr   )r#   r$   r%   rw   rp   denser/   r0   r1   r2   r3   r;   s     r?   r$   PeerSelfOutput.__init__t  sc    YYv1163E3EF
F$9$9;zz&"<"<=rA   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ Nr   r3   r/   r<   r   input_tensors      r?   rZ   PeerSelfOutput.forward{  5    

=1]3}'CDrA   r/   r   r3   r\   r]   r^   r_   r$   rZ   ra   rb   rc   s   @r?   r   r   r      > rA   r   c                   P   ^  \ rS rSrU 4S jrS rSSSSSS\" 5       4S jrSrU =r	$ )PeerAttentioni  c                    > [         TU ]  5         [        U5      U l        [	        U5      U l        [        5       U l        g r   )r#   r$   re   r<   r   outputsetpruned_headsr;   s     r?   r$   PeerAttention.__init__  s0    %f-	$V,ErA   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   r   )lenr   r<   rq   ru   r   r   rx   ry   rz   r   r   rv   union)r<   headsindexs      r?   prune_headsPeerAttention.prune_heads  s   u:?79900II))4+<+<>
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EI )		%"&))"?"?$))B_B_"_		 --33E:rA   NFc	           
      r    U R                  UUUUUUUU5      n	U R                  U	S   U5      n
U
4U	SS  -   nU$ )Nr   r   )r<   r   )r<   r   r   r   r   r   r   r   rV   self_outputsattention_outputr   s               r?   rZ   PeerAttention.forward  s_     yy!"	
  ;;|AF# #AB'(rA   )r   r   r<   )
r\   r]   r^   r_   r$   r   rQ   rZ   ra   rb   rc   s   @r?   r   r     s0    ";,  "&#'#6 rA   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )PeerIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r#   r$   r%   rw   rp   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr;   s     r?   r$   PeerIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$rA   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   )r<   r   s     r?   rZ   PeerIntermediate.forward  s&    

=100?rA   r   r   rc   s   @r?   r   r     s    9 rA   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )
PeerOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )r#   r$   r%   rw   r   rp   r   r/   r0   r1   r2   r3   r;   s     r?   r$   PeerOutput.__init__  sc    YYv779K9KL
F$9$9;zz&"<"<=rA   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r   s      r?   rZ   PeerOutput.forward  r   rA   r   r   rc   s   @r?   r   r     r   rA   r   c                   P   ^  \ rS rSrU 4S jrSSSSSS\" 5       4S jrS rSrU =r	$ )	PeerLayeri  c                 n  > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        UR                  U l        UR                  U l        U R                  (       a+  U R                  (       d
   U  S35       e[	        U5      U l        [        U5      U l
        [        U5      U l        g )Nr   z> should be used as a decoder model if cross attention is added)r#   r$   chunk_size_feed_forwardseq_len_dimr   	attentionr   add_cross_attentioncrossattentionr   intermediater   r   r;   s     r?   r$   PeerLayer.__init__  s    '-'E'E$&v. ++#)#=#= ##??ktf,j$kk?"/"7D,V4 (rA   NFc	           	         Ub  US S OS n	U R                  UUUUU	US9n
U
S   nU R                  (       a  U
SS nU
S   nOU
SS  nS nU R                  (       aW  UbT  [        U S5      (       d   SU  S35       eUb  US	S  OS nU R                  UUUUUUU5      nUS   nUUSS -   nUS   nWU-   n[	        U R
                  U R                  U R                  U5      nU4U-   nU R                  (       a  UW4-   nU$ )
Nrj   )r   r   rV   r   r   r   r   z'If `encoder_hidden_states` are passed, zp has to be instantiated                 with cross-attention layers by setting `config.add_cross_attention=True`r   )r   r   rr   r   r   feed_forward_chunkr   r   )r<   r   r   r   r   r   r   r   rV   self_attn_past_key_valueself_attention_outputsr   r   present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuecross_attention_outputslayer_outputs                     r?   rZ   PeerLayer.forward  s    9G8R $2 3423$5X\ 	!!%/3) "0 "
 2!4 ??,Qr2G 6r :,G (,$??4@&  Z8 ?Y ZZ  '2 )7)8< &&*&9&9 %&)!'#  7q9 7"! G ,C2+F( 14P P01H1H151M1M151A1A1AC  "W, ??!2 55GrA   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r   r   )r<   r   intermediate_outputr   s       r?   r   PeerLayer.feed_forward_chunk.  s)    "//0@A{{#6IrA   )r   r   r   r   r   r   r   r   )
r\   r]   r^   r_   r$   rQ   rZ   r   ra   rb   rc   s   @r?   r   r     s2    )   "&#'#6EN rA   r   c            
       V   ^  \ rS rSrU 4S jrS rSSSSSSSS\" 5       S4
S jrSrU =r	$ )	PeerEncoderi4  c                 d  > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        [        USS/5      U l
        SU R                  ;   Ga*  U R                  S   R                  R                  U l        U R                  R                  U l        U R                  R                  U l        U R                  R                   U l        [        R"                  " U R                  R$                  R&                  U R                  R$                  R(                  5      U l        [        R"                  " U R                  R,                  R&                  U R                  R,                  R(                  5      U l        g g s  snf )Nr   r    rJ   r   )r#   r$   r=   r%   
ModuleListrangenum_hidden_layersr   layerr8   r   r   r<   self_attentionrq   ru   rv   rw   rx   in_featuresout_features	pos_queryry   pos_key)r<   r=   _r>   s      r?   r$   PeerEncoder.__init__6  sI   ]](-f.F.F(GH(G1Yv(GHJ
 (/v/H0:|(=$  4#?#??"&**Q-"9"9">">D'+':':'N'ND$'+':':'N'ND$!%!4!4!B!BDYYt':':'@'@'L'L'+':':'@'@'M'MODN99T%8%8%<%<%H%H%)%8%8%<%<%I%IKDL @ Is   F-c                 B   U R                   R                  U R                  U5      5      nU R                   R                  U R                  U5      5      n[        R
                  " UUR                  SS5      5      nU[        R                  " U R                  5      -  nU$ )Nr   r   )
r  r   r
  r  r5   r   r   r   r   ru   )r<   r   r   r   r   s        r?   get_position_attention_score(PeerEncoder.get_position_attention_scoreL  s    ))>>NN=)+''<<LL')	 !<<(1(;(;B(CE+dii$$/& &rA   NFTc                 
  ^^" SU R                   ;   a  U R                  U
S   S   S9nUU
S'   SU R                   ;   dQ  SU R                   ;   dA  SU R                   ;   d1  S	U R                   ;   d!  S
U R                   ;   d  SU R                   ;   Ga  UR                  S   nUR                  S   n[        R                  " U[        R
                  UR                  S9R                  SS5      n[        R                  " U[        R
                  UR                  S9R                  SS5      nU R                  S   R                  R                  R                  n[        R                  " UU-
  U-   S-
  SSU-  S-
  S9nSU-  S-
  USS S 24'   SU-  S-
  US S 2S4'   SU-  S-
  US'   SU-  nS	U R                   ;   a  [        R                  " U
S   SU R                  S   R                  R                  R                  S-
  S9R                  S5      R                  SSU5      nUR                  S5      R                  USS5      nUU-  U-   nGOFSU R                   ;   a  U
S   R                  USS5      nU
S   R                  USS5      nU R                  S   R                  R                  R                   n[        R                  " UU-
  U-   SSU-  S-
  S9nUR                  S5      R                  USS5      nUU-  U-   nOS
U R                   ;   a  U
S   R                  USS5      nU
S   R                  USS5      nU R                  S   R                  R                  R                   n[        R                  " UU-
  U-   SSU-  S-
  S9nUU
S'   UU
S'   U	(       a  SOS nT(       a  SOS nT(       a  U R"                  R$                  (       a  SOS nU(       a  SOS n['        U R                  5       H  u  nnU	(       a  UU4-   nUb  UU   OS nUb  UU   OS m"[)        U R"                  SS5      (       a=  UU"4S jn [        R*                  R,                  R-                  U " U5      UUUUUU
5      n!OU" UUUUUT"TU
5      n!U!S   nU(       a	  UU!S   4-  nT(       d  M  UU!S   4-   nU R"                  R$                  (       d  M  UU!S   4-   nM     U	(       a  UU4-   nU(       d  [/        S UUUUU4 5       5      $ [1        UUUUUS9$ )NrJ   rK   rL   )r   r   rn   ro   rh   rk   rl   rm   r   r   rC   r   rj   ri   rG   r   )r   r   rF    ss_sentence_position_in_sequencer   r    gradient_checkpointingFc                    >^  U UU4S jnU$ )Nc                     > T" / U QTPTP76 $ r   r  )inputsmoduler   r   s    r?   custom_forwardJPeerEncoder.forward.<locals>.create_custom_forward.<locals>.custom_forward  s&    %  9v  9~  9&7 9 9rA   r  )r  r  r   r   s   ` r?   create_custom_forward2PeerEncoder.forward.<locals>.create_custom_forward  s    9 *)rA   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r  ).0vs     r?   	<genexpr>&PeerEncoder.forward.<locals>.<genexpr>  s"        %q   %s   	)last_hidden_statepast_key_valuesr   
attentionscross_attentions)r   r  shaper5   r6   rO   rE   r   r  r   r<   r|   rP   r9   r   repeatr~   r=   r   	enumerater8   utils
checkpointtupler
   )#r<   r   r   r   r   r   r#  	use_cacher   output_hidden_statesrV   return_dictr   rX   
batch_sizeposition_ids_lposition_ids_rr|   r   distance_idx_maxidx1idx2sen_position_ids_lsen_position_ids_rr~   all_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacheilayer_modulelayer_head_maskr  layer_outputsr   s#           `                         @r?   rZ   PeerEncoder.forwardY  s     4#?#??)-)J)J,\:02 *K *3& 1K ,. T999=QUYUqUq=q)T-I-II>$B^B^^=A]A]]ATEaEaa&,,Q/J&,,Q/J"\\%**$++--1T"a[  #\\%**$++--1T!R[  04zz09TT"B"B - ;;/23567881<	>L <<q@ 1  "BBQF   <<q@  #CC ;d>Z>ZZ{{"#BC

1//44CC #1ffQ:&>	 
 $--a077
AqI#&66=?4C_C__%36&88<"B9+ # &46&88<"Ar9+ # 8<zz8 y&J&J 5{{&);;:;@@1D	F
 $--a077
AqI#&66=;t?[?[[%36&88<"B9+ # &46&88<"Ar9+ # 8<zz8 y&J&J 5{{&);;:;@@1D	F
 ;?67-9N>*"6BD$5b44;;#B#B 
HL 	 $-R$(4OA|#$58I$I!.7.CilO%1 -7; t{{$<eDD* !& 6 6 A A),7!"#)*"! !-!"#)*"%"	! *!,M"}R'8&;;"  &9!!$=( '(#;;222+?%a(C, ,,([  5`   1]4E E  "!#$%       9+.+*1
 	
rA   )	rv   ru   r=   r  rq   r  r
  r   r  )
r\   r]   r^   r_   r$   r  rQ   rZ   ra   rb   rc   s   @r?   r  r  4  s<    K,    "&#' #!&6f
 f
rA   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )PeerDiscriminatorPredictionsi  zEPrediction module for the discriminator, made up of two dense layers.c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  S5      U l        Xl        g )Nr   )r#   r$   r%   rw   rp   r   dense_predictionr=   r;   s     r?   r$   %PeerDiscriminatorPredictions.__init__  sJ    YYv1163E3EF
 "		&*<*<a @rA   c                     U R                  U5      n[        U R                  R                  5      " U5      nU R	                  U5      R                  S5      nU$ )Nr   )r   r   r=   r   rC  squeeze)r<   discriminator_hidden_statesr   logitss       r?   rZ   $PeerDiscriminatorPredictions.forward  sJ    

#>?&t{{'='=>}M&&}5==bArA   )r=   r   rC  	r\   r]   r^   r_   r`   r$   rZ   ra   rb   rc   s   @r?   rA  rA    s    O rA   rA  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )PeerGeneratorPredictionsi  zAPrediction module for the generator, made up of two dense layers.c                    > [         TU ]  5         [        R                  " UR                  5      U l        [        R
                  " UR                  UR                  5      U l        g r   )r#   r$   r%   r/   r(   rw   rp   r   r;   s     r?   r$   !PeerGeneratorPredictions.__init__  sB    f&;&;<YYv1163H3HI
rA   c                 l    U R                  U5      n[        S5      " U5      nU R                  U5      nU$ )Ngelu)r   r   r/   )r<   generator_hidden_statesr   s      r?   rZ    PeerGeneratorPredictions.forward  s4    

#:;&v.}=}5rA   )r/   r   rJ  rc   s   @r?   rL  rL    s    KJ rA   rL  c                   R   ^  \ rS rSrSr\rSrS/rSS/r	S r
\U 4S j5       rS	rU =r$ )
PeerPreTrainedModeli%  zz
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
teams1_shared_bottomr   z peer\.embeddings_project\.weightzpeer\.embeddings_project\.biasc                 H   [        U[        R                  [        R                  45      (       a9  UR                  R
                  R                  SU R                  R                  S9  Oh[        U[        R                  5      (       aI  UR                  R
                  R                  5         UR                  R
                  R                  S5        [        U[        R                  5      (       a3  UR                  b%  UR                  R
                  R                  5         ggg)zInitialize the weights g        )meanstdg      ?N)r   r%   rw   r&   weightdatanormal_r=   initializer_ranger/   biaszero_fill_)r<   r  s     r?   _init_weights!PeerPreTrainedModel._init_weights2  s    fryy",,788 MM&&dkk;; ' =--KK""$MM$$S)fbii((V[[-DKK""$ .E(rA   c                    > UR                  SS5      nUR                  SS5      n[        X#40 UD6nUc  [        S0 UD6nU " U5      nU$ [        [        U ]  " SSU0UD6nU$ )a$  Instantiate the model.

Args:
    kwargs: Input args.
            model_dir: The model dir used to load the checkpoint and the label information.
            num_labels: An optional arg to tell the model how many classes to initialize.
                            Method will call utils.parse_label_mapping if num_labels is not input.
            label2id: An optional label2id mapping, which will cover the label2id in configuration (if exists).

Returns:
    The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained
	model_dirNcfgpretrained_model_name_or_pathr  )popr   r   r#   r   from_pretrained)clskwargsrc  rd  
model_argsr=   modelr>   s          r?   _instantiate PeerPreTrainedModel._instantiate?  s     JJ{D1	jj%*9DVD
-*-FKE  %5 G.7G;EGErA   r  )r\   r]   r^   r_   r`   r   config_classbase_model_prefix_keys_to_ignore_on_load_missing"_keys_to_ignore_on_load_unexpectedr`  classmethodrl  ra   rb   rc   s   @r?   rT  rT  %  sE    
 L.'6&7#+-N*&%  rA   rT  c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\R                  \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Srg)	PeerForRTDOutputi[  a  
Output type of :class:`~transformers.PeerForRTD`.

Args:
    loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
        Total loss of the PEER objective.
    logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
        Prediction scores of the head (scores for each token before SoftMax).
    hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`,
        returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
        Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
        of shape :obj:`(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (:obj:`tuple(torch.FloatTensor)`, `optional`,
        returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
        Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
        sequence_length, sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
NlossrH  r   r$  r  )r\   r]   r^   r_   r`   ru  r   r5   FloatTensor__annotations__rH  r   r   r$  ra   r  rA   r?   rt  rt  [  sb    . )-D(5$$
%, $FE$8<M8E%"3"345<59Ju00129rA   rt  c                   2   \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\R                  \	S'   Sr\R                  \	S'   Sr\\\R                        \	S	'   Sr\\\R                        \	S
'   Srg)PeerForPreTrainingOutputiz  a  
Output type of :class:`~transformers.PeerForPreTraining`.

Args:
    loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
        Total loss of the PEER objective.
    logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
        Prediction scores of the head (scores for each token before SoftMax).
    hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`,
        returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
        Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
        of shape :obj:`(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (:obj:`tuple(torch.FloatTensor)`, `optional`,
        returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
        Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
        sequence_length, sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
Nru  mlm_lossrtd_loss
mlm_logits
rtd_logitsr   r$  r  )r\   r]   r^   r_   r`   ru  r   r5   rv  rw  rz  r{  r|  r}  r   r   r$  ra   r  rA   r?   ry  ry  z  s    . )-D(5$$
%,,0Hhu(()0,0Hhu(()0$(J!!($(J!!(8<M8E%"3"345<59Ju00129rA   ry  a  

    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
    methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
    pruning heads etc.)

    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
    general usage and behavior.

    Parameters:
        config (:class:`~transformers.PeerConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
            weights.
a  
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`~transformers.PeerTokenizer`. See
            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
            details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
            1]``:

            - 0 corresponds to a `sentence A` token,
            - 1 corresponds to a `sentence B` token.

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
            config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
            vectors than the model's internal embedding lookup matrix.
        encoder_hidden_states  (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (:obj:`bool`, `optional`):
            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
            tensors for more detail.
        output_hidden_states (:obj:`bool`, `optional`):
            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
            more detail.
        return_dict (:obj:`bool`, `optional`):
            Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
aY  The bare Peer Model transformer outputting raw hidden-states without any specific head on top. Identical to the BERT model except that it uses an additional linear layer between the embedding layer and the encoder if the hidden size and embedding size are different.Both the generator and discriminator checkpoints may be loaded into this model.c            
       h   ^  \ rS rSrU 4S jrS rS rS rS rSSSSSSSS\	" 5       S4
S jr
S	rU =r$ )
	PeerModeli  c                   > [         TU ]  U5        [        U5      U l        UR                  UR
                  :w  a0  [        R                  " UR                  UR
                  5      U l        [        U5      U l
        Xl        U R                  5         U R                  R                  (       a  [        5       U l        [!        5       U l        g g r   )r#   r$   r   rK   r(   rp   r%   rw   embeddings_projectr  encoderr=   init_weightsseq_side_info_embeddingsrQ   input_sequence_side_infor   sequence_side_infor;   s     r?   r$   PeerModel.__init__  s     (0  F$6$66&(ii0E0E060B0B'DD# #6*;;//,0FD)&6&8D# 0rA   c                 .    U R                   R                  $ r   rK   r*   )r<   s    r?   get_input_embeddingsPeerModel.get_input_embeddings  s    ...rA   c                 $    XR                   l        g r   r  )r<   rz   s     r?   set_input_embeddingsPeerModel.set_input_embeddings  s    */'rA   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     gz
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
Nitemsr  r  r   r   r<   heads_to_pruner  r   s       r?   _prune_headsPeerModel._prune_heads
  <    
 +002LELLu%//;;EB 3rA   c                 6   UR                   nSU;  d  [        US   5      S:X  a1  U R                  R                  U R                  R
                  U5      nOUS   nUR                  5        H$  nXE   R                  US9R                  5       XE'   M&     0 UEUEnU$ Nr  r   rE   	rE   r   r  generate_seq_side_infor=   r  keysr   rO   r<   rV   rR   rE   r  sss         r?   update_seq_side_infoPeerModel.update_seq_side_info      !!%^;s9:@<?@@A'+'>'>'U'U44i(A$ (6*(,$ +//1B+C+G+J+J ,K ,#tv %( 2 HNG.FGrA   Nc           
      F   U R                   R                  (       a  U R                  U	U5      n	Ub  UOU R                   R                  nUb  UOU R                   R                  nU
b  U
OU R                   R
                  n
Ub  Ub  [        S5      eUb  UR                  5       nO"Ub  UR                  5       S S nO[        S5      eUb  UR                  OUR                  nUc  [        R                  " XS9nUc$  [        R                  " U[        R                  US9nU R                  X+U5      nU R                  UU R                   R                  5      nU R!                  UUUUU	S9n[#        U S5      (       a  U R%                  U5      nU R'                  UUUUUU	U
S9nU$ )	NDYou cannot specify both input_ids and inputs_embeds at the same timer   5You have to specify either input_ids or inputs_embedsr  rC   )rR   r   rS   rT   rV   r  r   r   r   r-  rV   r.  )r=   r  r  r   r-  use_return_dictrs   rM   rE   r5   onesrN   rO   get_extended_attention_maskget_head_maskr  rK   rr   r  r  )r<   rR   r   rS   r   r   rT   r   r-  rV   r.  rW   rE   extended_attention_maskr   s                  r?   rZ   PeerModel.forward&  s    ;;//!66~7@BN 2C1N-TXT_T_TqTq$8$D KK,, 	 &1%<k$++B]B] ]%>V  "#..*K&',,.s3KGI I &/%:!!@T@T!"ZZCN!"[[5::f>N #'"B"B#1&&y'+{{'D'DF	 %)') ( 
 4-.. 33MBM2/!5)# % 
 rA   )r=   rK   r  r  r  r  )r\   r]   r^   r_   r$   r  r  r  r  rQ   rZ   ra   rb   rc   s   @r?   r  r    sK    9 /0C, "!%6E ErA   r  c            
       \   ^  \ rS rSrU 4S jrS rS rSSSSSSSS\" 5       S4
S jrSr	U =r
$ )PeerTopModelin  c                    > [         TU ]  U5        [        U5      U l        Xl        U R                  5         U R                  R                  (       a  [        5       U l        [        5       U l
        g g r   )r#   r$   r  r  r=   r  r  rQ   r  r   r  r;   s     r?   r$   PeerTopModel.__init__p  sT     "6*;;//,0FD)&6&8D# 0rA   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     gr  r  r  s       r?   r  PeerTopModel._prune_heads{  r  rA   c                 6   UR                   nSU;  d  [        US   5      S:X  a1  U R                  R                  U R                  R
                  U5      nOUS   nUR                  5        H$  nXE   R                  US9R                  5       XE'   M&     0 UEUEnU$ r  r  r  s         r?   r  !PeerTopModel.update_seq_side_info  r  rA   Nc           
         U R                   R                  (       a  U R                  U
U5      n
Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  UOU R                   R
                  nUb  Ub  [        S5      eUb  UR                  5       nO"Ub  UR                  5       S S nO[        S5      eUb  UR                  OUR                  nUc  [        R                  " XS9nUc$  [        R                  " U[        R                  US9nU R                  X<U5      nU R                  UU R                   R                  5      nU R!                  UUUUU	U
US9nU$ )Nr  r   r  r  rC   r  )r=   r  r  r   r-  r  rs   rM   rE   r5   r  rN   rO   r  r  r  r  )r<   r   rR   r   rS   r   r   rT   r   r-  rV   r.  rW   rE   r  s                  r?   rZ   PeerTopModel.forward  s    ;;//!66~7@BN 2C1N-TXT_T_TqTq$8$D KK,, 	 &1%<k$++B]B] ]%>V  "#..*K&',,.s3KGI I &/%:!!@T@T!"ZZCN!"[[5::f>N #'"B"B#1&&y'+{{'D'DF	 2/!5)# % 
 rA   )r=   r  r  r  )r\   r]   r^   r_   r$   r  r  rQ   rZ   ra   rb   rc   s   @r?   r  r  n  s?    	9C. "!%6< <rA   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )PeerClassificationHeadi  z-Head for sentence-level classification tasks.c                 ,  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  UR                  5      U l
        g r   )r#   r$   r%   rw   rp   r   r1   r2   r3   
num_labelsout_projr;   s     r?   r$   PeerClassificationHead.__init__  s`    YYv1163E3EF
zz&"<"<=		&"4"4f6G6GHrA   c                     US S 2SS S 24   nU R                  U5      nU R                  U5      n[        S5      " U5      nU R                  U5      nU R                  U5      nU$ )Nr   rP  )r3   r   r   r  )r<   featuresri  r   s       r?   rZ   PeerClassificationHead.forward  s`    Q1WLLOJJqM6"
 LLOMM!rA   )r   r3   r  rJ  rc   s   @r?   r  r    s    7I	 	rA   r  ):r`   r   dataclassesr   typingr   r   r5   torch.nnr%   torch.utils.checkpointtransformers.activationsr   r   transformers.file_utilsr   r	   transformers.modeling_outputsr
   transformers.modeling_utilsr   modelscope.modelsr   r   modelscope.utilsr   loggingmodelscope.utils.nlp.utilsr   modelscope.utils.torch_utilsr   r   r   configurationr   	sas_utilsr   
get_logger"PEER_PRETRAINED_MODEL_ARCHIVE_LISTModuler   re   r   r   r   r   r   r  rA  rL  rT  rt  ry  PEER_START_DOCSTRINGPEER_INPUTS_DOCSTRINGr  r  r  r  rA   r?   <module>r     s     ! "    ; E. 7 / . <> > & '				& "VRYY Vrb		 bJRYY  3BII 3lryy    Y		 YxK
")) K
\299 $ryy "3*o 3l :{ : :< :{ : :B ": z V
 y# yyxe& ePRYY rA   