import copy
import warnings
from typing import Optional, Tuple, Union

import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers.utils.model_parallel_utils import (assert_device_map,
                                                     get_device_map)

from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.outputs import (AttentionBackboneModelOutput,
                                Seq2SeqLMOutput, TokenGeneratorOutput)
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
from .backbone import T5PreTrainedModel, T5Stack
from .configuration import T5Config

logger = get_logger()

# NOTE: a single leading underscore is used so the name is not mangled when
# referenced from inside the class body below (a double-underscore module
# constant would compile to `_T5ForConditionalGeneration__HEAD_MASK_WARNING_MSG`
# and raise NameError at runtime).
_HEAD_MASK_WARNING_MSG = """
The input argument `head_mask` was split into two arguments `head_mask` and
`decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`,
but this feature is deprecated and will be removed in future versions. If you do
not want to use any `decoder_head_mask` now, please set `decoder_head_mask =
torch.ones(num_layers, num_heads)`.
"""


@MODELS.register_module(
    Tasks.text2text_generation, module_name=Models.T5)
class T5ForConditionalGeneration(T5PreTrainedModel):
    _keys_to_ignore_on_load_missing = [
        r'encoder\.embed_tokens\.weight',
        r'decoder\.embed_tokens\.weight',
        r'lm_head\.weight',
    ]
    _keys_to_ignore_on_load_unexpected = [
        r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight',  # noqa
    ]

    def __init__(self, config: T5Config, device_map=None, **kwargs):
        super().__init__(config)
        self.model_dim = config.d_model

        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = T5Stack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = T5Stack(decoder_config, self.shared)

        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

        # Model parallel
        self.model_parallel = False
        if device_map == 'auto':
            self.parallelize()

    def parallelize(self, device_map=None):
        self.device_map = (
            get_device_map(
                len(self.encoder.block), range(torch.cuda.device_count()))
            if device_map is None else device_map)
        assert_device_map(self.device_map, len(self.encoder.block))
        self.encoder.parallelize(self.device_map)
        self.decoder.parallelize(self.device_map)
        self.lm_head = self.lm_head.to(self.decoder.first_device)
        self.model_parallel = True

    def deparallelize(self):
        self.encoder.deparallelize()
        self.decoder.deparallelize()
        self.encoder = self.encoder.to('cpu')
        self.decoder = self.decoder.to('cpu')
        self.lm_head = self.lm_head.to('cpu')
        self.model_parallel = False
        self.device_map = None
        torch.cuda.empty_cache()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def get_output_embeddings(self):
        return self.lm_head

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def forward(
            self,
            input_ids: Optional[torch.LongTensor] = None,
            attention_mask: Optional[torch.FloatTensor] = None,
            decoder_input_ids: Optional[torch.LongTensor] = None,
            decoder_attention_mask: Optional[torch.BoolTensor] = None,
            head_mask: Optional[torch.FloatTensor] = None,
            decoder_head_mask: Optional[torch.FloatTensor] = None,
            cross_attn_head_mask: Optional[torch.Tensor] = None,
            encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
            past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
            inputs_embeds: Optional[torch.FloatTensor] = None,
            decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
            labels: Optional[torch.LongTensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
            **kwargs) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
        r"""
Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary. T5 is a model
        with relative position embeddings so you should be able to pad the
        inputs on both the right and the left.

        Indices can be obtained using [`T5Tokenizer`]. See
        [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
        for details.

        [What are input IDs?](../glossary#input-ids)

        To learn more about how to prepare `input_ids` for pretraining, take
        a look at [T5 Training](./t5#training).
    attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask
        values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
        Indices of decoder input sequence tokens in the vocabulary.

        Indices can be obtained using [`T5Tokenizer`]. See
        [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
        for details.

        [What are decoder input IDs?](../glossary#decoder-input-ids)

        T5 uses the `pad_token_id` as the starting token for
        `decoder_input_ids` generation. If `past_key_values` is used,
        optionally only the last `decoder_input_ids` have to be input (see
        `past_key_values`).

        To learn more about how to prepare `decoder_input_ids` for
        pretraining, take a look at [T5 Training](./t5#training).
    decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
        Default behavior: generate a tensor that ignores pad tokens in
        `decoder_input_ids`. Causal mask will also be used by default.
    head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
        Mask to nullify selected heads of the self-attention modules in the
        encoder. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

    decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
        Mask to nullify selected heads of the self-attention modules in the
        decoder. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

    cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
        Mask to nullify selected heads of the cross-attention modules in
        the decoder. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

    encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
        Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*,
        `optional`: *attentions*) `last_hidden_state` of shape `(batch_size,
        sequence_length, hidden_size)` is a sequence of hidden states at the
        output of the last layer of the encoder. Used in the cross-attention
        of the decoder.
    past_key_values (`tuple(tuple(torch.FloatTensor))` of length
        `config.n_layers` with each tuple having 4 tensors of shape
        `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):

        Contains precomputed key and value hidden states of the attention
        blocks. Can be used to speed up decoding.

        If `past_key_values` is used, the user can optionally input only
        the last `decoder_input_ids` (those that don't have their past key
        value states given to this model) of shape `(batch_size, 1)` instead
        of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Optionally, instead of passing `input_ids` you can choose to
        directly pass an embedded representation. This is useful if you want
        more control over how to convert `input_ids` indices into associated
        vectors than the model's internal embedding lookup matrix.
    decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`,
        *optional*):
        Optionally, instead of passing `decoder_input_ids` you can choose to
        directly pass an embedded representation. If `past_key_values` is
        used, optionally only the last `decoder_inputs_embeds` have to be
        input (see `past_key_values`). This is useful if you want more
        control over how to convert `decoder_input_ids` indices into
        associated vectors than the model's internal embedding lookup
        matrix.

        If `decoder_input_ids` and `decoder_inputs_embeds` are both unset,
        `decoder_inputs_embeds` takes the value of `inputs_embeds`.

    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned
        and can be used to speed up decoding (see `past_key_values`).

    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention
        layers. See `attentions` under returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See
        `hidden_states` under returned tensors for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain
        tuple.
    labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Labels for computing the sequence-to-sequence language modeling
        loss. Indices should be in `[-100, 0, ..., config.vocab_size - 1]`.
        All labels set to `-100` are ignored (masked); the loss is only
        computed for labels in `[0, ..., config.vocab_size - 1]`.

Returns:
    [`Seq2SeqLMOutput`] if `return_dict=True`, otherwise a plain tuple with
    the same fields.

Examples:

>>> from transformers import T5Tokenizer, T5ForConditionalGeneration

>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")

>>> # training
>>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
>>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
>>> outputs = model(input_ids=input_ids, labels=labels)
>>> loss = outputs.loss
>>> logits = outputs.logits

>>> # inference
>>> input_ids = tokenizer(
...     "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> outputs = model.generate(input_ids)
>>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
>>> # studies have shown that owning a dog is good for you.
N)rc   rd   rl   rg   ro   rp   rq   r   r      )last_hidden_statehidden_states
attentions)rc   rd   rl   rk   encoder_hidden_statesencoder_attention_maskrg   ri   r(   ro   rp   rq   g      i)ignore_index)	losslogitsrk   decoder_hidden_statesdecoder_attentionscross_attentionsencoder_last_hidden_staterx   encoder_attentions)#r   r(   use_return_dictr,   r+   warningswarn2_T5ForConditionalGeneration__HEAD_MASK_WARNING_MSGFutureWarningr*   
isinstancer   r=   r1   r@   rA   
set_devicer-   rD   _shift_rightrC   r/   weightdevicetie_word_embeddingsr!   r   viewsizer   rk   rv   rw   r   ru   )r3   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   r(   ro   rp   rq   r5   rv   decoder_outputssequence_output	lm_logitsr|   loss_fctoutputs                            r9   forward"T5ForConditionalGeneration.forward   s   @ "+!6IDKK<Q<Q	%0%<k$++B]B]  %6%>{{%%)G)GG5}E$-! ""ll#-+#"3%9' + O O,H"J "J:"1!"4'!+ .a015'!+ +1-15O (*JJ!!$,,";";<"3";@U@] $ 1 1& 9 JJ!!$,,";";<),,T\\-F-FGM ,$5$8$8LL--%/!)!/!2!24<<3L3L!M%1)?)B)BLL--*/& ,,'1/+"/#1'!5/!5# ' 
 *!, JJ!!$,,";";<<<??4<<+D+DEDL-001D1D1K1KLO;;** .1EFOLL1	'T:Hr9>>"#56BID
 ]_QR%88?JF*.*:THv%FF+;;"1"?"?.99,==&5&G&G"1"?"?.99

 
	
r;   c	           	      4    Ub  US S 2SS 24   nUUUUUUUUS.$ )Nr{   )re   rk   rj   rd   rg   rh   ri   r(    )
r3   rc   pastrd   rg   rh   ri   r(   rj   r5   s
             r9   prepare_inputs_for_generation8T5ForConditionalGeneration.prepare_inputs_for_generation  s>     !!RS&)I "+#.,"!2$8"	
 		
r;   c                 $    U R                  U5      $ rM   )r   )r3   rn   s     r9   %prepare_decoder_input_ids_from_labels@T5ForConditionalGeneration.prepare_decoder_input_ids_from_labels  s      ((r;   c                    > [         TU ]  " U0 UD6n[        [        U[        R
                  5      (       a  US9$ US   S9$ )Nr   )	sequences)r   generater   r   r@   Tensor)r3   argsr5   r   r8   s       r9   r   #T5ForConditionalGeneration.generate  sJ    
 !4262# *65<< @ @f
 	
FLQi
 	
r;   c           	      H   Uc  [         R                  S5        U$ SnU H  nSnU H2  nUUR                  SUR                  UR                  5      5      4-   nM4     US   R
                  US   R
                  :X  d   e[        U5      [        U5      :X  d   eUU4-   nM     U$ )NzHYou might want to consider setting `use_cache=True` to speed up decodingr   r   )loggerwarningindex_selectrC   r   shaper=   )r3   r   beam_idxreordered_decoder_pastlayer_past_statesreordered_layer_past_stateslayer_past_states          r9   _reorder_cache)T5ForConditionalGeneration._reorder_cache  s     <NNZ K!#!% +-'$5 .I$118;;'7'>'>?AMD /D+ %6 /q177;L<5  23s;L7MMMM%;+?/ &/" "&  &%r;   )r-   r4   r*   r/   r!   r1   r$   rM   )NNNNNNNNNNNNNNNN)NNNNNNN)!__name__
__module____qualname____firstlineno___keys_to_ignore_on_load_missing"_keys_to_ignore_on_load_unexpectedr   r   r2   rH   rN   rQ   rW   rZ   r^   ra   r   r@   
LongTensorFloatTensor
BoolTensorr   r   boolr   r   r   r   r   r   r   __static_attributes____classcell__)r8   s   @r9   r   r   .   s4   
'# 	Y*&x  8	#!:
& 9=>B@DEI9=AE?CHLHL=AEI59,0487;.2!J
#E$4$45J
 ():): ;J
 $,E,<,<#=J
 )11A1A(B	J

 $E$5$56J
 $,E,=,=#>J
 '/u||&<J
 "*%ell0C*D!EJ
 "*%ell0C*D!EJ
  ((9(9:J
 (00A0A'BJ
 !!1!12J
 $D>J
 $,D>J
 '/tnJ
  &d^!J
" #5):):#;_#LM#J
\ ,059048<;?046:
4)ELL )
& &r;   r   )$r%   r   typingr   r   r   r@   r   torch.nnr   'transformers.utils.model_parallel_utilsr   r	   modelscope.metainfor
   modelscope.models.builderr   modelscope.outputsr   r   r   modelscope.utils.constantr   modelscope.utils.loggerr   backboner   r   configurationr   r   __HEAD_MASK_WARNING_MSGregister_moduletext2text_generationT5r   r   r;   r9   <module>r      s      ) )   %E ' ,6 6 + . 0 #	  ((		\&!2 \&	\&r;   
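
# A minimal usage sketch: because the class is registered above under
# Tasks.text2text_generation, it is typically reached through the ModelScope
# pipeline API rather than instantiated directly. The model id below is a
# hypothetical placeholder; substitute any hub checkpoint whose configuration
# resolves to this class.
#
# >>> from modelscope.pipelines import pipeline
# >>> from modelscope.utils.constant import Tasks
# >>> t2t = pipeline(Tasks.text2text_generation, model='<your-t5-checkpoint>')
# >>> t2t('translate English to German: The house is wonderful.')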