
"""PyTorch T5 model."""

import copy
import math
import os
import warnings
from typing import Optional, Tuple, Union

import torch
from torch import nn
from torch.utils.checkpoint import checkpoint
from transformers.activations import ACT2FN
from transformers.modeling_outputs import \
    BaseModelOutputWithPastAndCrossAttentions
from transformers.modeling_utils import (PreTrainedModel,
                                         find_pruneable_heads_and_indices,
                                         prune_linear_layer)
from transformers.utils import (DUMMY_INPUTS, DUMMY_MASK,
                                add_start_docstrings,
                                add_start_docstrings_to_model_forward,
                                is_torch_fx_proxy, replace_return_docstrings)
from transformers.utils.model_parallel_utils import (assert_device_map,
                                                     get_device_map)

from modelscope.metainfo import Models
from modelscope.models.base import Model, TorchModel
from modelscope.models.builder import MODELS
from modelscope.outputs import (AttentionBackboneModelOutput,
                                Seq2SeqModelOutput)
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
from .configuration import T5Config

logger = get_logger()


def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model."""
    try:
        import re

        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            'Loading a TensorFlow model in PyTorch, requires TensorFlow to be '
            'installed. Please see https://www.tensorflow.org/install/ for '
            'installation instructions.')
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f'Converting TensorFlow checkpoint from {tf_path}')
    # Load weights from the TF checkpoint.
    init_vars = tf.train.list_variables(tf_path)
    names = []
    tf_weights = {}
    for name, shape in init_vars:
        logger.info(f'Loading TF weight {name} with shape {shape}')
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        tf_weights[name] = array

    for txt_name in names:
        name = txt_name.split('/')
        # adam_v and adam_m are optimizer slot variables and are not needed
        # for inference with the pretrained model.
        if any(n in [
                'adam_v', 'adam_m', 'AdamWeightDecayOptimizer',
                'AdamWeightDecayOptimizer_1', 'global_step'
        ] for n in name):
            logger.info(f"Skipping {'/'.join(name)}")
            tf_weights.pop(txt_name, None)
            continue
        if '_slot_' in name[-1]:
            logger.info(f"Skipping {'/'.join(name)}")
            tf_weights.pop(txt_name, None)
            continue
        pointer = model
        array = tf_weights[txt_name]

        for m_name in name:
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                scope_names = re.split(r'_(\d+)', m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] in ['kernel', 'scale', 'embedding']:
                pointer = getattr(pointer, 'weight')
            elif scope_names[0] == 'self_attention':
                pointer = getattr(pointer, 'layer')
                pointer = pointer[0]
            elif scope_names[0] == 'enc_dec_attention':
                pointer = getattr(pointer, 'layer')
                pointer = pointer[1]
            elif scope_names[0] == 'dense_relu_dense':
                pointer = getattr(pointer, 'layer')
                pointer = pointer[2]
            elif scope_names[0] == 'rms_norm':
                if hasattr(pointer, 'layer_norm'):
                    pointer = getattr(pointer, 'layer_norm')
                elif hasattr(pointer, 'final_layer_norm'):
                    pointer = getattr(pointer, 'final_layer_norm')
            elif scope_names[0] == 'scale':
                pointer = getattr(pointer, 'weight')
            elif scope_names[0] == 'output_bias' or scope_names[0] == 'beta':
                pointer = getattr(pointer, 'bias')
            elif scope_names[0] == 'squad':
                pointer = getattr(pointer, 'classifier')
            elif scope_names[0] == 'decoder' and name[1] == 'logits':
                continue
            elif scope_names[0] == 'logits':
                pointer = getattr(pointer, 'lm_head')
            elif scope_names[0] == 'wi' and len(scope_names) > 1 \
                    and scope_names[1].isdigit():
                pointer = getattr(pointer, f'wi_{scope_names[1]}')
                continue
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info(f"Skipping {'/'.join(name)}")
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        if scope_names[0] not in ['kernel', 'scale', 'embedding']:
            pointer = getattr(pointer, 'weight')
        if scope_names[0] != 'embedding':
            logger.info(
                f'Transposing numpy weight of shape {array.shape} for {name}')
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape, \
                f'Pointer shape {pointer.shape} and array shape {array.shape} mismatched'
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        logger.info(f'Initialize PyTorch weight {name}')
        pointer.data = torch.from_numpy(array.astype(np.float32))
        tf_weights.pop(txt_name, None)

    logger.info('Weights not copied to PyTorch model: '
                f"{', '.join(tf_weights.keys())}.")
    return model


class T5LayerNorm(nn.Module):

    def __init__(self, hidden_size, eps=1e-6):
        """
        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # T5 uses a layer norm that only rescales (no bias, no mean
        # subtraction), also known as RMSNorm.  The variance is accumulated
        # in float32 and the result is cast back to the weight dtype.
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance
                                                    + self.variance_epsilon)

        if self.weight.dtype in [torch.float16, torch.bfloat16]:
            hidden_states = hidden_states.to(self.weight.dtype)

        return self.weight * hidden_states


class T5DenseReluDense(nn.Module):

    def __init__(self, config: T5Config):
        super().__init__()
        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        hidden_states = self.wi(hidden_states)
        hidden_states = nn.functional.relu(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.wo(hidden_states)
        return hidden_states


class T5DenseGatedGeluDense(nn.Module):

    def __init__(self, config: T5Config):
        super().__init__()
        self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.gelu_act = ACT2FN['gelu_new']

    def forward(self, hidden_states):
        hidden_gelu = self.gelu_act(self.wi_0(hidden_states))
        hidden_linear = self.wi_1(hidden_states)
        hidden_states = hidden_gelu * hidden_linear
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.wo(hidden_states)
        return hidden_states


class T5LayerFF(nn.Module):

    def __init__(self, config: T5Config):
        super().__init__()
        if config.feed_forward_proj == 'relu':
            self.DenseReluDense = T5DenseReluDense(config)
        elif config.feed_forward_proj == 'gated-gelu':
            self.DenseReluDense = T5DenseGatedGeluDense(config)
        else:
            raise ValueError(
                f'{config.feed_forward_proj} is not supported. Choose between `relu` and `gated-gelu`'
            )

        self.layer_norm = T5LayerNorm(
            config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        forwarded_states = self.layer_norm(hidden_states)
        forwarded_states = self.DenseReluDense(forwarded_states)
        hidden_states = hidden_states + self.dropout(forwarded_states)
        return hidden_states


class T5Attention(nn.Module):

    def __init__(self, config: T5Config, has_relative_attention_bias=False):
        super().__init__()
        self.is_decoder = config.is_decoder
        self.has_relative_attention_bias = has_relative_attention_bias
        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        self.relative_attention_max_distance = config.relative_attention_max_distance
        self.d_model = config.d_model
        self.key_value_proj_dim = config.d_kv
        self.n_heads = config.num_heads
        self.dropout = config.dropout_rate
        self.inner_dim = self.n_heads * self.key_value_proj_dim

        # Mesh TensorFlow initialization to avoid scaling before softmax
        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)

        if self.has_relative_attention_bias:
            self.relative_attention_bias = nn.Embedding(
                self.relative_attention_num_buckets, self.n_heads)
        self.pruned_heads = set()
        self.gradient_checkpointing = False

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads)
        # Prune linear layers
        self.q = prune_linear_layer(self.q, index)
        self.k = prune_linear_layer(self.k, index)
        self.v = prune_linear_layer(self.v, index)
        self.o = prune_linear_layer(self.o, index, dim=1)
        # Update hyper params
        self.n_heads = self.n_heads - len(heads)
        self.inner_dim = self.key_value_proj_dim * self.n_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    @staticmethod
    def _relative_position_bucket(relative_position,
                                  bidirectional=True,
                                  num_buckets=32,
                                  max_distance=128):
        """
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on

Args:
    relative_position: an int32 Tensor
    bidirectional: a boolean - whether the attention is bidirectional
    num_buckets: an integer
    max_distance: an integer
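
    Example (an illustrative sketch added for clarity): with the defaults,
    offsets close to zero each get their own bucket, while offsets at or
    beyond max_distance all share the last bucket in their direction:

        >>> rel = torch.arange(-3, 4).unsqueeze(0)
        >>> T5Attention._relative_position_bucket(
        ...     rel, bidirectional=True, num_buckets=32, max_distance=128)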

Returns:
    a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        """
        relative_buckets = 0
        if bidirectional:
            num_buckets //= 2
            relative_buckets += (relative_position
                                 > 0).to(torch.long) * num_buckets
            relative_position = torch.abs(relative_position)
        else:
            relative_position = -torch.min(
                relative_position, torch.zeros_like(relative_position))
        # now relative_position is in the range [0, inf)

        # half of the buckets are for exact increments in position
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # the other half of the buckets are logarithmically bigger bins
        # covering positions up to max_distance
        relative_pos_log = torch.log(relative_position.float() / max_exact)
        max_dis_log = math.log(max_distance / max_exact)
        origin_relative_position = relative_pos_log / max_dis_log * (
            num_buckets - max_exact)
        relative_position_if_large = max_exact + origin_relative_position.to(
            torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large,
            torch.full_like(relative_position_if_large, num_buckets - 1))

        relative_buckets += torch.where(is_small, relative_position,
                                        relative_position_if_large)
        return relative_buckets

    def compute_bias(self, query_length, key_length):
        """Compute binned relative position bias"""
        context_position = torch.arange(
            query_length,
            dtype=torch.long,
            device=self.relative_attention_bias.weight.device)[:, None]
        memory_position = torch.arange(
            key_length,
            dtype=torch.long,
            device=self.relative_attention_bias.weight.device)[None, :]
        # shape (query_length, key_length)
        relative_position = memory_position - context_position
        relative_position_bucket = self._relative_position_bucket(
            relative_position,
            bidirectional=(not self.is_decoder),
            num_buckets=self.relative_attention_num_buckets,
            max_distance=self.relative_attention_max_distance)
        # shape (query_length, key_length, num_heads)
        values = self.relative_attention_bias(relative_position_bucket)
        # shape (1, num_heads, query_length, key_length)
        values = values.permute([2, 0, 1]).unsqueeze(0)
        return values

    def forward(self,
                hidden_states,
                mask=None,
                key_value_states=None,
                position_bias=None,
                past_key_value=None,
                layer_head_mask=None,
                query_length=None,
                use_cache=False,
                output_attentions=False):
        """
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        """
        # Input is (batch_size, seq_length, dim)
        # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length)
        # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head)
        batch_size, seq_length = hidden_states.shape[:2]

        real_seq_length = seq_length

        if past_key_value is not None:
            assert len(past_key_value) == 2, \
                f'past_key_value should have 2 past states: keys and values. Got {len(past_key_value)} past states'
            real_seq_length += past_key_value[0].shape[
                2] if query_length is None else query_length

        key_length = real_seq_length if key_value_states is None \
            else key_value_states.shape[1]

        def shape(states):
            """projection"""
            return states.view(batch_size, -1, self.n_heads,
                               self.key_value_proj_dim).transpose(1, 2)

        def unshape(states):
            """reshape"""
            return states.transpose(1, 2).contiguous().view(
                batch_size, -1, self.inner_dim)

        def project(hidden_states, proj_layer, key_value_states,
                    past_key_value):
            """projects hidden states correctly to key/query states"""
            if key_value_states is None:
                # self-attention
                hidden_states = shape(proj_layer(hidden_states))
            elif past_key_value is None:
                # cross-attention
                hidden_states = shape(proj_layer(key_value_states))

            if past_key_value is not None:
                if key_value_states is None:
                    # self-attention: append the cached keys/values
                    hidden_states = torch.cat([past_key_value, hidden_states],
                                              dim=2)
                else:
                    # cross-attention: reuse the cached keys/values
                    hidden_states = past_key_value
            return hidden_states

        # get query states: (batch_size, n_heads, seq_length, dim_per_head)
        query_states = shape(self.q(hidden_states))

        # get key/value states
        key_states = project(
            hidden_states, self.k, key_value_states,
            past_key_value[0] if past_key_value is not None else None)
        value_states = project(
            hidden_states, self.v, key_value_states,
            past_key_value[1] if past_key_value is not None else None)

        # compute scores
        scores = torch.matmul(query_states, key_states.transpose(3, 2))

        if position_bias is None:
            if not self.has_relative_attention_bias:
                position_bias = torch.zeros(
                    (1, self.n_heads, real_seq_length, key_length),
                    device=scores.device,
                    dtype=scores.dtype)
                if self.gradient_checkpointing and self.training:
                    position_bias.requires_grad = True
            else:
                position_bias = self.compute_bias(real_seq_length, key_length)

            # if key and values are already calculated, we only want the last
            # query position bias
            if past_key_value is not None:
                position_bias = position_bias[:, :, -hidden_states.size(1):, :]

            if mask is not None:
                position_bias = position_bias + mask

        scores += position_bias
        attn_weights = nn.functional.softmax(
            scores.float(), dim=-1).type_as(scores)
        attn_weights = nn.functional.dropout(
            attn_weights, p=self.dropout, training=self.training)

        # Mask heads if we want to
        if layer_head_mask is not None:
            attn_weights = attn_weights * layer_head_mask

        attn_output = unshape(torch.matmul(attn_weights, value_states))
        attn_output = self.o(attn_output)

        present_key_value_state = (key_states, value_states) if (
            self.is_decoder and use_cache) else None
        outputs = (attn_output, ) + (present_key_value_state, ) + (
            position_bias, )

        if output_attentions:
            outputs = outputs + (attn_weights, )
        return outputs


class T5LayerSelfAttention(nn.Module):

    def __init__(self, config, has_relative_attention_bias=False):
        super().__init__()
        self.SelfAttention = T5Attention(
            config, has_relative_attention_bias=has_relative_attention_bias)
        self.layer_norm = T5LayerNorm(
            config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self,
                hidden_states,
                attention_mask=None,
                position_bias=None,
                layer_head_mask=None,
                past_key_value=None,
                use_cache=False,
                output_attentions=False):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.SelfAttention(
            normed_hidden_states,
            mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions)
        hidden_states = hidden_states + self.dropout(attention_output[0])
        outputs = (hidden_states, ) + attention_output[1:]
        return outputs


class T5LayerCrossAttention(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.EncDecAttention = T5Attention(
            config, has_relative_attention_bias=False)
        self.layer_norm = T5LayerNorm(
            config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self,
                hidden_states,
                key_value_states,
                attention_mask=None,
                position_bias=None,
                layer_head_mask=None,
                past_key_value=None,
                use_cache=False,
                query_length=None,
                output_attentions=False):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.EncDecAttention(
            normed_hidden_states,
            mask=attention_mask,
            key_value_states=key_value_states,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            query_length=query_length,
            output_attentions=output_attentions)
        layer_output = hidden_states + self.dropout(attention_output[0])
        outputs = (layer_output, ) + attention_output[1:]
        return outputs


class T5Block(nn.Module):

    def __init__(self, config, has_relative_attention_bias=False):
        super().__init__()
        self.is_decoder = config.is_decoder
        self.layer = nn.ModuleList()
        self.layer.append(
            T5LayerSelfAttention(
                config,
                has_relative_attention_bias=has_relative_attention_bias))
        if self.is_decoder:
            self.layer.append(T5LayerCrossAttention(config))

        self.layer.append(T5LayerFF(config))

    def forward(self,
                hidden_states,
                attention_mask=None,
                position_bias=None,
                encoder_hidden_states=None,
                encoder_attention_mask=None,
                encoder_decoder_position_bias=None,
                layer_head_mask=None,
                cross_attn_layer_head_mask=None,
                past_key_value=None,
                use_cache=False,
                output_attentions=False,
                return_dict=True):

        if past_key_value is not None:
            if not self.is_decoder:
                logger.warning(
                    '`past_key_values` is passed to the encoder. Please make sure this is intended.'
                )
            expected_num_past_key_values = 2 if encoder_hidden_states is None else 4

            if len(past_key_value) != expected_num_past_key_values:
                raise ValueError(
                    f'There should be {expected_num_past_key_values} past states. '
                    f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}"
                    f'Got {len(past_key_value)} past key / value states')

            self_attn_past_key_value = past_key_value[:2]
            cross_attn_past_key_value = past_key_value[2:]
        else:
            self_attn_past_key_value, cross_attn_past_key_value = None, None

        self_attention_outputs = self.layer[0](
            hidden_states,
            attention_mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=self_attn_past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions)
        hidden_states, present_key_value_state = self_attention_outputs[:2]
        # Keep self-attention outputs and relative position weights
        attention_outputs = self_attention_outputs[2:]

        # clamp inf values to enable fp16 training
        if hidden_states.dtype == torch.float16 and torch.isinf(
                hidden_states).any():
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(
                hidden_states, min=-clamp_value, max=clamp_value)

        do_cross_attention = self.is_decoder and encoder_hidden_states is not None
        if do_cross_attention:
            # the actual query length is unknown for cross attention
            # if using past key value states; it needs to be injected here
            if present_key_value_state is not None:
                query_length = present_key_value_state[0].shape[2]
            else:
                query_length = None

            cross_attention_outputs = self.layer[1](
                hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                position_bias=encoder_decoder_position_bias,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=cross_attn_past_key_value,
                query_length=query_length,
                use_cache=use_cache,
                output_attentions=output_attentions)
            hidden_states = cross_attention_outputs[0]

            # clamp inf values to enable fp16 training
            if hidden_states.dtype == torch.float16 and torch.isinf(
                    hidden_states).any():
                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
                hidden_states = torch.clamp(
                    hidden_states, min=-clamp_value, max=clamp_value)

            # Combine self-attention and cross-attention key value states
            if present_key_value_state is not None:
                present_key_value_state = present_key_value_state + \
                    cross_attention_outputs[1]

            # Keep cross-attention outputs and relative position weights
            attention_outputs = attention_outputs + cross_attention_outputs[2:]

        # Apply Feed Forward layer
        hidden_states = self.layer[-1](hidden_states)

        # clamp inf values to enable fp16 training
        if hidden_states.dtype == torch.float16 and torch.isinf(
                hidden_states).any():
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(
                hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (hidden_states, )

        if use_cache:
            outputs = outputs + (present_key_value_state, ) + attention_outputs
        else:
            outputs = outputs + attention_outputs

        # hidden-states, present_key_value_states, (self-attention position
        # bias), (self-attention weights), (cross-attention position bias),
        # (cross-attention weights)
        return outputs


class T5PreTrainedModel(TorchModel, PreTrainedModel):
    """
An abstract class to handle weights initialization and a simple interface
for downloading and loading pretrained models.
    """

    config_class = T5Config
    load_tf_weights = load_tf_weights_in_t5
    base_model_prefix = 'transformer'
    is_parallelizable = True
    supports_gradient_checkpointing = True

    def __init__(self, config, **kwargs):
        super().__init__(config.name_or_path, **kwargs)
        super(Model, self).__init__(config)

    @property
    def dummy_inputs(self):
        input_ids = torch.tensor(DUMMY_INPUTS)
        input_mask = torch.tensor(DUMMY_MASK)
        dummy_inputs = {
            'decoder_input_ids': input_ids,
            'input_ids': input_ids,
            'decoder_attention_mask': input_mask,
        }
        return dummy_inputs

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, T5LayerNorm):
            module.weight.data.fill_(factor * 1.0)
        elif isinstance(module, T5Model):
            # Mesh TensorFlow embeddings initialization
            module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0)
        elif isinstance(module, T5DenseReluDense):
            # Mesh TensorFlow FF initialization
            module.wi.weight.data.normal_(
                mean=0.0, std=factor * ((self.config.d_model)**-0.5))
            if hasattr(module.wi, 'bias') and module.wi.bias is not None:
                module.wi.bias.data.zero_()
            module.wo.weight.data.normal_(
                mean=0.0, std=factor * ((self.config.d_ff)**-0.5))
            if hasattr(module.wo, 'bias') and module.wo.bias is not None:
                module.wo.bias.data.zero_()
        elif isinstance(module, T5DenseGatedGeluDense):
            module.wi_0.weight.data.normal_(
                mean=0.0, std=factor * ((self.config.d_model)**-0.5))
            if hasattr(module.wi_0, 'bias') and module.wi_0.bias is not None:
                module.wi_0.bias.data.zero_()
            module.wi_1.weight.data.normal_(
                mean=0.0, std=factor * ((self.config.d_model)**-0.5))
            if hasattr(module.wi_1, 'bias') and module.wi_1.bias is not None:
                module.wi_1.bias.data.zero_()
            module.wo.weight.data.normal_(
                mean=0.0, std=factor * ((self.config.d_ff)**-0.5))
            if hasattr(module.wo, 'bias') and module.wo.bias is not None:
                module.wo.bias.data.zero_()
        elif isinstance(module, T5Attention):
            # Mesh TensorFlow attention initialization to avoid scaling
            # before softmax
            d_model = self.config.d_model
            key_value_proj_dim = self.config.d_kv
            n_heads = self.config.num_heads
            module.q.weight.data.normal_(
                mean=0.0, std=factor * ((d_model * key_value_proj_dim)**-0.5))
            module.k.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
            module.v.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
            module.o.weight.data.normal_(
                mean=0.0, std=factor * ((n_heads * key_value_proj_dim)**-0.5))
            if module.has_relative_attention_bias:
                module.relative_attention_bias.weight.data.normal_(
                    mean=0.0, std=factor * (d_model**-0.5))

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, (T5Attention, T5Stack)):
            module.gradient_checkpointing = value

    def _shift_right(self, input_ids):
        decoder_start_token_id = self.config.decoder_start_token_id
        pad_token_id = self.config.pad_token_id

        assert decoder_start_token_id is not None, \
            'self.model.config.decoder_start_token_id has to be defined.'

        # shift inputs to the right
        if is_torch_fx_proxy(input_ids):
            # Item assignment is not supported natively for proxies.
            shifted_input_ids = torch.full(input_ids.shape[:-1] + (1, ),
                                           decoder_start_token_id)
            shifted_input_ids = torch.cat(
                [shifted_input_ids, input_ids[..., :-1]], dim=-1)
        else:
            shifted_input_ids = input_ids.new_zeros(input_ids.shape)
            shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
            shifted_input_ids[..., 0] = decoder_start_token_id

        assert pad_token_id is not None, \
            'self.model.config.pad_token_id has to be defined.'
        # replace possible -100 values in labels by `pad_token_id`
        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

        assert torch.all(shifted_input_ids >= 0).item(), \
            'Verify that `shifted_input_ids` has only positive values'

        return shifted_input_ids

    @classmethod
    def _instantiate(cls, **kwargs):
        """Instantiate the model.

Args:
    kwargs: Input args.
            model_dir: The model dir used to load the checkpoint and the
            label information. num_labels: An optional arg to tell the
            model how many classes to initialize.
                            Method will call utils.parse_label_mapping
                            if num_labels not supplied. If num_labels is
                            not found, the model will use the default
                            setting (2 classes).

Returns:
    The loaded model, which is initialized by
    transformers.PreTrainedModel.from_pretrained
        """
        model_dir = kwargs.pop('model_dir', None)
        if model_dir is None:
            config = T5Config(**kwargs)
            model = cls(config)
        else:
            model_kwargs = {}
            model = super(Model, cls).from_pretrained(
                pretrained_model_name_or_path=model_dir, **model_kwargs)
        model.model_dir = model_dir
        return model


class T5Stack(T5PreTrainedModel):

    def __init__(self, config, embed_tokens=None):
        super().__init__(config)

        self.embed_tokens = embed_tokens
        self.is_decoder = config.is_decoder

        self.block = nn.ModuleList([
            T5Block(config, has_relative_attention_bias=bool(i == 0))
            for i in range(config.num_layers)
        ])
        self.final_layer_norm = T5LayerNorm(
            config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

        # Initialize weights and apply final processing
        self.post_init()

        # Model parallel
        self.model_parallel = False
        self.device_map = None
        self.gradient_checkpointing = False

    def parallelize(self, device_map=None):
        """
This is an experimental feature and is subject to change at a
moment's notice.

Uses a device map to distribute attention modules of the model
across several devices. If no device map is given, it will evenly
distribute blocks across all devices.

Args:
    device_map (`Dict[int, list]`, optional, defaults to None):
        A dictionary that maps attention modules to devices. Note
        that the embedding module and LMHead are always
        automatically mapped to the first device (for esoteric
        reasons). That means that the first device should have fewer
        attention modules mapped to it than other devices. For
        reference, the t5 models have the following number of
        attention modules:

            - t5-small: 6
            - t5-base: 12
            - t5-large: 24
            - t5-3b: 24
            - t5-11b: 24

Example:

>>> # Here is an example of a device map on a machine with 4 GPUs
>>> # using t5-3b, which has a total of 24 attention modules:
>>> model = T5ForConditionalGeneration.from_pretrained("t5-3b")
>>> device_map = {
>>>     0: [0, 1, 2], 1: [3, 4, 5, 6, 7, 8, 9], 2: [10, 11, 12, 13, 14,
>>>     15, 16], 3: [17, 18, 19, 20, 21, 22, 23],
>>> }
>>> model.parallelize(device_map)
>>> # all of the parallelize methods in this file are the same
        """
        # Check validity of device_map
        self.device_map = (
            get_device_map(len(self.block), range(torch.cuda.device_count()))
            if device_map is None else device_map)
        assert_device_map(self.device_map, len(self.block))
        self.model_parallel = True
        self.first_device = 'cpu' if 'cpu' in self.device_map.keys() \
            else 'cuda:' + str(min(self.device_map.keys()))
        self.last_device = 'cuda:' + str(max(self.device_map.keys()))
        # Load the attention blocks onto their devices
        for k, v in self.device_map.items():
            for layer in v:
                cuda_device = 'cuda:' + str(k)
                self.block[layer] = self.block[layer].to(cuda_device)

        # embed_tokens lives on the first device, the final layer norm on the
        # last one
        self.embed_tokens = self.embed_tokens.to(self.first_device)
        self.final_layer_norm = self.final_layer_norm.to(self.last_device)

    def deparallelize(self):
        """
Moves the model to cpu from a model parallel state.

Example:

>>> # On a 4 GPU machine with t5-3b:
>>> model = T5ForConditionalGeneration.from_pretrained("t5-3b")
>>> device_map = {
>>>     0: [0, 1, 2], 1: [3, 4, 5, 6, 7, 8, 9], 2: [10, 11, 12, 13, 14,
>>>     15, 16], 3: [17, 18, 19, 20, 21, 22, 23],
>>> }
>>> model.parallelize(device_map)  # Splits the model across several devices
>>> model.deparallelize()  # Put the model back on cpu and cleans memory
>>> # by calling torch.cuda.empty_cache()
>>> # all of the deparallelize methods in this file are the same
        """
        self.model_parallel = False
        self.device_map = None
        self.first_device = 'cpu'
        self.last_device = 'cpu'
        for i in range(len(self.block)):
            self.block[i] = self.block[i].to('cpu')
        self.embed_tokens = self.embed_tokens.to('cpu')
        self.final_layer_norm = self.final_layer_norm.to('cpu')
        torch.cuda.empty_cache()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, new_embeddings):
        self.embed_tokens = new_embeddings

    def forward(self,
                input_ids=None,
                attention_mask=None,
                encoder_hidden_states=None,
                encoder_attention_mask=None,
                inputs_embeds=None,
                head_mask=None,
                cross_attn_head_mask=None,
                past_key_values=None,
                use_cache=None,
                output_attentions=None,
                output_hidden_states=None,
                return_dict=None):
        # Model parallel
        if self.model_parallel:
            torch.cuda.set_device(self.first_device)
            self.embed_tokens = self.embed_tokens.to(self.first_device)
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None \
            else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else
            self.config.output_hidden_states)
        return_dict = return_dict if return_dict is not None \
            else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            err_msg_prefix = 'decoder_' if self.is_decoder else ''
            raise ValueError(
                f'You cannot specify both {err_msg_prefix}input_ids and '
                f'{err_msg_prefix}inputs_embeds at the same time')
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            err_msg_prefix = 'decoder_' if self.is_decoder else ''
            raise ValueError(
                f'You have to specify either {err_msg_prefix}input_ids or '
                f'{err_msg_prefix}inputs_embeds')

        if inputs_embeds is None:
            assert self.embed_tokens is not None, \
                'You have to initialize the model with valid token embeddings'
            inputs_embeds = self.embed_tokens(input_ids)

        batch_size, seq_length = input_shape

        # required mask seq length can be calculated via the length of past
        mask_seq_length = past_key_values[0][0].shape[2] + seq_length \
            if past_key_values is not None else seq_length

        if use_cache is True:
            assert self.is_decoder, \
                f'`use_cache` can only be set to `True` if {self} is used as a decoder'

        if attention_mask is None:
            attention_mask = torch.ones(batch_size, mask_seq_length).to(
                inputs_embeds.device)
        if self.is_decoder and encoder_attention_mask is None \
                and encoder_hidden_states is not None:
            encoder_seq_length = encoder_hidden_states.shape[1]
            encoder_attention_mask = torch.ones(
                batch_size,
                encoder_seq_length,
                device=inputs_embeds.device,
                dtype=torch.long)

        # initialize past_key_values with `None` if past does not exist
        if past_key_values is None:
            past_key_values = [None] * len(self.block)

        # Make the self-attention mask broadcastable to all heads.
        extended_attention_mask = self.get_extended_attention_mask(
            attention_mask, input_shape, inputs_embeds.device)

        # If a 2D or 3D attention mask is provided for the cross-attention,
        # make it broadcastable to
        # [batch_size, num_heads, seq_length, seq_length]
        if self.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = \
                encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size,
                                    encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(
                    encoder_hidden_shape, device=inputs_embeds.device)
            encoder_extended_attention_mask = self.invert_attention_mask(
                encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head masks if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
        cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask,
                                                  self.config.num_layers)
        present_key_value_states = () if use_cache else None
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        all_cross_attentions = () if (output_attentions
                                      and self.is_decoder) else None
        position_bias = None
        encoder_decoder_position_bias = None

        hidden_states = self.dropout(inputs_embeds)

        for i, (layer_module, past_key_value) in enumerate(
                zip(self.block, past_key_values)):
            layer_head_mask = head_mask[i]
            cross_attn_layer_head_mask = cross_attn_head_mask[i]
            # Model parallel: keep every input on the same device as
            # hidden_states
            if self.model_parallel:
                torch.cuda.set_device(hidden_states.device)
                if attention_mask is not None:
                    attention_mask = attention_mask.to(hidden_states.device)
                if position_bias is not None:
                    position_bias = position_bias.to(hidden_states.device)
                if encoder_hidden_states is not None:
                    encoder_hidden_states = encoder_hidden_states.to(
                        hidden_states.device)
                if encoder_extended_attention_mask is not None:
                    encoder_extended_attention_mask = \
                        encoder_extended_attention_mask.to(hidden_states.device)
                if encoder_decoder_position_bias is not None:
                    encoder_decoder_position_bias = \
                        encoder_decoder_position_bias.to(hidden_states.device)
                if layer_head_mask is not None:
                    layer_head_mask = layer_head_mask.to(hidden_states.device)
                if cross_attn_layer_head_mask is not None:
                    cross_attn_layer_head_mask = cross_attn_layer_head_mask.to(
                        hidden_states.device)
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states, )

            if self.gradient_checkpointing and self.training:
                if use_cache:
                    logger.warning(
                        '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
                    )
                    use_cache = False

                def create_custom_forward(module):

                    def custom_forward(*inputs):
                        return tuple(
                            module(*inputs, use_cache, output_attentions))

                    return custom_forward

                layer_outputs = checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    extended_attention_mask,
                    position_bias,
                    encoder_hidden_states,
                    encoder_extended_attention_mask,
                    encoder_decoder_position_bias,
                    layer_head_mask,
                    cross_attn_layer_head_mask,
                    None,  # past_key_value is always None with checkpointing
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask=extended_attention_mask,
                    position_bias=position_bias,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_extended_attention_mask,
                    encoder_decoder_position_bias=encoder_decoder_position_bias,
                    layer_head_mask=layer_head_mask,
                    cross_attn_layer_head_mask=cross_attn_layer_head_mask,
                    past_key_value=past_key_value,
                    use_cache=use_cache,
                    output_attentions=output_attentions)

            # layer_outputs is a tuple of: hidden-states, key-value-states,
            # (self-attention position bias), (self-attention weights),
            # (cross-attention position bias), (cross-attention weights)
            if use_cache is False:
                layer_outputs = layer_outputs[:1] + (None, ) + layer_outputs[1:]

            hidden_states, present_key_value_state = layer_outputs[:2]

            # The position biases are shared between the layers; the first
            # layer stores them.
            position_bias = layer_outputs[2]
            if self.is_decoder and encoder_hidden_states is not None:
                encoder_decoder_position_bias = layer_outputs[
                    4 if output_attentions else 3]
            # append next layer key value states
            if use_cache:
                present_key_value_states = present_key_value_states + (
                    present_key_value_state, )

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[3], )
                if self.is_decoder:
                    all_cross_attentions = all_cross_attentions + (
                        layer_outputs[5], )

            # Model parallel: if it's the last layer for that device, move
            # hidden_states to the next device
            if self.model_parallel:
                for k, v in self.device_map.items():
                    if i == v[-1] and 'cuda:' + str(k) != self.last_device:
                        hidden_states = hidden_states.to('cuda:' + str(k + 1))

        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        # Add last layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states, )

        if not return_dict:
            return tuple(v for v in [
                hidden_states,
                present_key_value_states,
                all_hidden_states,
                all_attentions,
                all_cross_attentions,
            ] if v is not None)
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=present_key_value_states,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
            cross_attentions=all_cross_attentions,
        )


# A single leading underscore so the name is not mangled when it is
# referenced from inside the T5Model class body below.
_HEAD_MASK_WARNING_MSG = """
`decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`,
but this feature is deprecated and will be removed in future versions. If you do
not want to use any `decoder_head_mask` now, please set `decoder_head_mask =
torch.ones(num_layers, num_heads)`.
)	group_keymodule_namec            #       N  ^  \ rS rSrSrSS/rS/rS\4U 4S jjrS!S jr	S	 r
S
 rS rS rS rS r               S"S\\R$                     S\\R&                     S\\R$                     S\\R(                     S\\R&                     S\\R&                     S\\R*                     S\\\\R&                           S\\\\R&                           S\\R*                     S\\R*                     S\\   S\\   S\\   S\\   S\\\R&                     \4   4 S jjrS rU =r$ )#r  i  a  The bare T5 Model transformer outputting raw hidden-states without any
specific head on top.

The T5 model was proposed in [Exploring the Limits of Transfer Learning with
a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by
Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang,
Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder
transformer pre-trained in a text-to-text denoising generative setting.

This model inherits from [`PreTrainedModel`]. Check the superclass
documentation for the generic methods the library implements for all its
model (such as downloading or saving, resizing the input embeddings, pruning
heads etc.)

This model is also a PyTorch
[torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch
documentation for all matter related to general usage and behavior.

Parameters:
    config ([`T5Config`]): Model configuration class with all the parameters
    of the model.
        Initializing with a config file does not load the weights associated
        with the model, only the configuration. Check out the
        [`~PreTrainedModel.from_pretrained`] method to load the model
        weights.
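
    A minimal construction sketch (the hyperparameter values below are
    placeholders, not recommendations):

        >>> config = T5Config(d_model=512, num_layers=6, num_heads=8)
        >>> model = T5Model(config)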
zencoder\.embed_tokens\.weightzdecoder\.embed_tokens\.weightzMdecoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weightrk   c                   > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " U5      nSUl	        SUl
        SUl        [        X R                  5      U l        [        R                  " U5      nSUl	        SUl        UR                  Ul        [        X0R                  5      U l        U R#                  5         SU l        S U l        g )NFT)r   r   r   r   
vocab_sizer   r  copydeepcopyr   r4  is_encoder_decoderr  encodernum_decoder_layersr  rA   r  r  r  )r   rk   encoder_configdecoder_configr   s       r+   r   T5Model.__init__  s     ll6#4#4fnnEv.$)!#( ,1)~{{;v.$(!,1)$*$=$=!~{{; 	 $r   c                    UcN  [        [        U R                  R                  5      [	        [
        R                  R                  5       5      5      OUU l        [        U R                  [        U R                  R                  5      5        U R                  R                  U R                  5        U R                  R                  U R                  5        SU l        g )NT)r   r\   r  r  r  rd   r  r  r  r   r  rA   r  )r   r  s     r+   r  T5Model.parallelize  s     ! DLL&&'uzz/F/F/H)IK'1 	 	$//3t||/A/A+BC  1  1"r   c                 D   U R                   R                  5         U R                  R                  5         U R                   R                  S5      U l         U R                  R                  S5      U l        SU l        S U l        [        R                  R                  5         g )Nr  F)	r  r  rA   r   r  r  rd   r  r  r  s    r+   r  T5Model.deparallelize  si    ""$""$||u-||u-#

 r   c                     U R                   $ r   )r  r  s    r+   r  T5Model.get_input_embeddings  s    {{r   c                 |    Xl         U R                  R                  U5        U R                  R                  U5        g r   )r  r  r  rA   r  s     r+   r  T5Model.set_input_embeddings
  s+    $)).9)).9r   c                     U R                   $ r   )r  r  s    r+   get_encoderT5Model.get_encoder      ||r   c                     U R                   $ r   )rA   r  s    r+   get_decoderT5Model.get_decoder  r  r   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of
heads to prune in this layer} See base class PreTrainedModel
N)r  r  r5   	attentionr   )r   heads_to_pruner5   r   s       r+   _prune_headsT5Model._prune_heads  s<    
 +002LELLu%//;;EB 3r   r|  rL  r{  r}  r  decoder_head_maskr  encoder_outputsr  r  decoder_inputs_embedsr4  r5  r  ri  returnc                 N   Ub  UOU R                   R                  nUb  UOU R                   R                  nUbR  UcO  U R                   R                  U R                   R                  :X  a!  [
        R                  " [        [        5        UnUc  U R                  UUU
UUUUS9nORU(       aK  [        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nUS   nU R                  (       a3  [        R                  R!                  U R"                  R$                  5        U R                  (       a  [        R                  R!                  U R"                  R$                  5        UR'                  U R"                  R$                  5      nUb%  UR'                  U R"                  R$                  5      nUb%  UR'                  U R"                  R$                  5      nUb%  UR'                  U R"                  R$                  5      nU R#                  UUUU	UUUUUUUUS9nU(       d  UU-   $ [)        UR*                  UR,                  UR.                  UR0                  UR2                  UR*                  UR.                  UR0                  S9$ )	a  
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. T5 is a model
    with relative position embeddings so you should be able to pad the
    inputs on both the right and the left.

    Indices can be obtained using [`T5Tokenizer`]. See
    [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
    for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a
    look at [T5 Training](./t5#training).
attention_mask (`torch.FloatTensor` of shape `(batch_size,
sequence_length)`, *optional*):
    Mask to avoid performing attention on padding token indices. Mask
    values selected in `[0, 1]`:

    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.

    [What are attention masks?](../glossary#attention-mask)
decoder_input_ids (`torch.LongTensor` of shape `(batch_size,
target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`T5Tokenizer`]. See
    [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
    for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    T5 uses the `pad_token_id` as the starting token for
    `decoder_input_ids` generation. If `past_key_values` is used,
    optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining
    take a look at [T5 Training](./t5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size,
target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in
    `decoder_input_ids`. Causal mask will also be used by default.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers,
num_heads)`, *optional*):
    Mask to nullify selected heads of the self-attention modules in the
    encoder. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.

decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or
`(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the self-attention modules in the
    decoder. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.

cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or
`(num_layers, num_heads)`, *optional*):
        Mask to nullify selected heads of the cross-attention modules in
        the decoder. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
    Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*,
    `optional`: *attentions*) `last_hidden_state` of shape `(batch_size,
    sequence_length, hidden_size)` is a sequence of hidden states at the
    output of the last layer of the encoder. Used in the cross-attention
    of the decoder.
past_key_values (`tuple(tuple(torch.FloatTensor))` of length
`config.n_layers` with each tuple having 4 tensors of shape
`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
    Contains precomputed key and value hidden states of the attention
    blocks. Can be used to speed up decoding.

    If `past_key_values` are used, the user can optionally input only
    the last `decoder_input_ids` (those that don't have their past key
    value states given to this model) of shape `(batch_size, 1)` instead
    of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size,
sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to
    directly pass an embedded representation. This is useful if you want
    more control over how to convert `input_ids` indices into associated
    vectors than the model's internal embedding lookup matrix.
decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size,
target_sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `decoder_input_ids` you can choose to
    directly pass an embedded representation. If `past_key_values` is
    used, optionally only the last `decoder_inputs_embeds` have to be
    input (see `past_key_values`). This is useful if you want more
    control over how to convert `decoder_input_ids` indices into
    associated vectors than the model's internal embedding lookup
    matrix.

    If `decoder_input_ids` and `decoder_inputs_embeds` are both unset,
    `decoder_inputs_embeds` takes the value of `inputs_embeds`.

use_cache (`bool`, *optional*):
    If set to `True`, `past_key_values` key value states are returned
    and can be used to speed up decoding (see `past_key_values`).

output_attentions (`bool`, *optional*):
    Whether or not to return the attentions tensors of all attention
    layers. See `attentions` under returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
    Whether or not to return the hidden states of all layers. See
    `hidden_states` under returned tensors for more detail.
return_dict (`bool`, *optional*):
    Whether or not to return a [`~utils.ModelOutput`] instead of a plain
    tuple.
Returns:

Example:

>>> from transformers import T5Tokenizer, T5Model

>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
>>> model = T5Model.from_pretrained("t5-small")

>>> input_ids = tokenizer(
...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

>>> # forward pass
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
>>> last_hidden_states = outputs.last_hidden_state
        """
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None \
            else self.config.use_return_dict

        # FutureWarning: head_mask was separated into two input args,
        # head_mask and decoder_head_mask
        if head_mask is not None and decoder_head_mask is None:
            if self.config.num_layers == self.config.num_decoder_layers:
                warnings.warn(_HEAD_MASK_WARNING_MSG, FutureWarning)
                decoder_head_mask = head_mask

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict)
        elif return_dict and not isinstance(encoder_outputs,
                                            AttentionBackboneModelOutput):
            encoder_outputs = AttentionBackboneModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1]
                if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2]
                if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)
            hidden_states = hidden_states.to(self.decoder.first_device)
            if decoder_input_ids is not None:
                decoder_input_ids = decoder_input_ids.to(
                    self.decoder.first_device)
            if attention_mask is not None:
                attention_mask = attention_mask.to(self.decoder.first_device)
            if decoder_attention_mask is not None:
                decoder_attention_mask = decoder_attention_mask.to(
                    self.decoder.first_device)

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict)

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )
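

# ---------------------------------------------------------------------------
# Minimal smoke-test sketch: builds a tiny randomly initialised T5 and runs a
# single forward pass.  The hyperparameter values are arbitrary, and the
# relative import of T5Config only resolves when this file is executed with
# its package context, e.g. `python -m modelscope.models.nlp.T5.backbone`
# (an assumed module path matching this file's location).
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    _tiny_config = T5Config(
        vocab_size=128,
        d_model=32,
        d_kv=8,
        d_ff=64,
        num_layers=2,
        num_heads=4,
        decoder_start_token_id=0)
    _tiny_model = T5Model(_tiny_config)
    _input_ids = torch.randint(0, 128, (1, 6))
    _decoder_input_ids = torch.randint(0, 128, (1, 4))
    _outputs = _tiny_model(
        input_ids=_input_ids, decoder_input_ids=_decoder_input_ids)
    # last_hidden_state has shape (batch_size, target_length, d_model)
    print(_outputs.last_hidden_state.shape)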