
    9i8                        S SK r S SKrS SKrS SKJr  S SKJrJrJrJ	r	J
r
  S SKrS SKrS SKJs  Jr  S SKJrJr  S SKJr  S SKJrJrJrJrJrJr  S SKJr  S SKJr  S S	K J!r"  S
SK#J$r$  Sr%Sr& " S S\RN                  5      r( " S S\RN                  5      r) " S S\RN                  5      r* " S S\RN                  5      r+ " S S5      r, " S S\RN                  5      r- " S S\RN                  5      r. " S S\5      r/ " S S\/5      r0 " S  S!\RN                  5      r1 " S" S#\RN                  5      r2 " S$ S%\/5      r3g)&    N)	dataclass)AnyDictListOptionalUnion)Tensornn)xavier_uniform_)
BertConfig	BertModelBertTokenizerRobertaConfigRobertaModelRobertaTokenizer)ACT2FN)PreTrainedModel)logger   )
PlugConfigzconfig.jsonzpytorch_model.binc                   H   ^  \ rS rSrSr  SU 4S jjr     SS jrSrU =r$ )MultiHeadedAttention(   a  
Multi-Head Attention module from
"Attention is All You Need"
:cite:`DBLP:journals/corr/VaswaniSPUJGKP17`.

Similar to standard `dot` attention but uses
multiple attention distributions simultaneously
to select relevant items.

.. mermaid::

   graph BT
      A[key]
      B[value]
      C[query]
      O[output]
      subgraph Attn
        D[Attn 1]
        E[Attn 2]
        F[Attn N]
      end
      A --> D
      C --> D
      A --> E
      C --> E
      A --> F
      C --> F
      D --> O
      E --> O
      F --> O
      B --> O

Also includes several additional tricks.

Args:
   head_count (int): number of parallel heads
   model_dim (int): the dimension of keys/values/queries,
       must be divisible by head_count
   dropout (float): dropout parameter
c                   > X!-  S:X  d   eX!-  U l         X l        [        TU ]  5         Xl        [
        R                  " X!U R                   -  5      U l        [
        R                  " UXR                   -  5      U l        [
        R                  " UXR                   -  5      U l	        [
        R                  " SS9U l        [
        R                  " U5      U l        X@l        U R                  (       a  [
        R                  " X"5      U l        g g )Nr   dim)dim_per_head	model_dimsuper__init__
head_countr
   Linearlinear_keyslinear_valueslinear_querySoftmaxsoftmaxDropoutdropoutuse_final_linearfinal_linear)selfr"   r   r*   r+   	__class__s        g/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/nlp/fid_plug/backbone.pyr!   MultiHeadedAttention.__init__R   s    
 %***%3"$99YT=N=N0NOYYy'14E4E'EGIIi&03D3D&DFzzb)zz'* 0!! "		) ?D "    c	                 B  ^^^ UR                  S5      mU R                  mU R                  mUUU4S jn	UUU4S jn
UGb  US:X  a  U R                  U5      U R	                  U5      U R                  U5      p!nU	" U5      nU	" U5      nUR                  nUS   b)  [        R                  " US   R                  U5      U4SS9nUS	   b)  [        R                  " US	   R                  U5      U4SS9nXS'   X%S	'   OUS
:X  aZ  U R                  U5      nUS   c2  U R	                  U5      U R                  U5      p!U	" U5      nU	" U5      nO	US   US   p!XS'   X%S'   OCU R	                  U5      nU R                  U5      nU R                  U5      nU	" U5      nU	" U5      nU	" U5      nU[        R                  " T5      -  n[        R                  " X1R                  SS5      5      nUb;  UR                  S5      R                  U5      nUR!                  U[#        S5      5      nU R%                  U5      nUbi  USS2S4   U-  nU[        R&                  " US5      R                  S5      S-   -  n[        R                  " USS2SS24   UR                  S5      /S5      nU R)                  U5      nU R*                  (       a:  U
" [        R                  " X5      5      nU R-                  U5      nU(       a  UU4$ U$ [        R                  " X5      nU(       a  UU4$ U$ )aY  
Compute the context vector and the attention vectors.

Args:
   key (`FloatTensor`): set of `key_len`
        key vectors `[batch, key_len, dim]`
   value (`FloatTensor`): set of `key_len`
        value vectors `[batch, key_len, dim]`
   query (`FloatTensor`): set of `query_len`
         query vectors  `[batch, query_len, dim]`
   mask: binary mask indicating which keys have
         non-zero attention `[batch, query_len, key_len]`
Returns:
   (`FloatTensor`, `FloatTensor`) :

   * output context vectors `[batch, query_len, dim]`
   * one of the attention vectors `[batch, query_len, key_len]`
r   c                 L   > U R                  TSTT5      R                  SS5      $ )zprojection r   r      )view	transposex
batch_sizer   r"   s    r/   shape+MultiHeadedAttention.forward.<locals>.shape   s%    66*b*lC1a!r1   c                 l   > U R                  SS5      R                  5       R                  TSTT-  5      $ )zcompute context r   r4   r   )r6   
contiguousr5   r7   s    r/   unshape-MultiHeadedAttention.forward.<locals>.unshape   s3    ;;q!$//1j"j<&?@Ar1   Nr-   	self_keysr4   r   self_valuescontextmemory_keysmemory_values   r   -infr   g&.>)sizer   r"   r&   r$   r%   devicetorchcattomathsqrtmatmulr6   	unsqueeze	expand_asmasked_fillfloatr(   sumr*   r+   r,   )r-   keyvaluequerymasklayer_cachetypepredefined_graph_1return_attnr:   r>   rH   scoresattnattn_masked	drop_attnrB   outputr9   r   r"   s                     @@@r/   forwardMultiHeadedAttention.forwardi   s   8 XXa[
((__
	!
	A "v~$($5$5e$<d>N>N? ..u5 " Cje{+7))[%=%@%@%H#$N()+C}-9!II$]366v>FAOE+.K(-2M*"))%0}-5!%!1!1#!68J8J9*C!%LE!,]!;['>)-0M*/4O,""3'C&&u-E%%e,E*C%LEe 		,//e]]1a%89>>!$..v6D''eFm<F ||F#*q"u+(::K%		+q)33A6=?K 99d1crc6lK,A,A!,DEqIDLL&	!!ell9<=G&&w/Ft|#ll94G}$r1   )
r   r*   r,   r"   r$   r&   r%   r   r(   r+   )皙?T)NNNNF	__name__
__module____qualname____firstlineno____doc__r!   ra   __static_attributes____classcell__r.   s   @r/   r   r   (   s4    'X "&	@6  #'!q qr1   r   c                   6   ^  \ rS rSrSrSU 4S jjrS rSrU =r$ )PositionwiseFeedForward   a  A two-layer Feed-Forward-Network with residual layer norm.

Args:
    d_model (int): the size of input for the first-layer of the FFN.
    d_ff (int): the hidden layer size of the second-layer
        of the FNN.
    dropout (float): dropout probability in :math:`[0, 1)`.
c                 J  > [         TU ]  5         [        R                  " USS9U l        [        R
                  " X5      U l        [        S   U l        [        R                  " U5      U l
        [        R
                  " X!5      U l        [        R                  " U5      U l        g )Nư>epsgelu_new)r    r!   r
   	LayerNorm
layer_normr#   w_1r   actvr)   	dropout_1w_2	dropout_2)r-   d_modeld_ffr*   r.   s       r/   r!    PositionwiseFeedForward.__init__   sl    ,,wD999W+:&	G,99T+G,r1   c           	          U R                  U R                  U R                  U R                  U5      5      5      5      nU R	                  U R                  U5      5      nX1-   $ N)ry   rx   rw   rv   r{   rz   )r-   r8   interr`   s       r/   ra   PositionwiseFeedForward.forward   sI    tyy$//!2D)EFG0zr1   )rx   ry   r{   rv   rw   rz   )rc   rd   rl   s   @r/   rn   rn      s    - r1   rn   c                   F   ^  \ rS rSrSrSrU 4S jr   SS jrS rSr	U =r
$ )	TransformerDecoderLayer   a  
Args:
  d_model (int): the dimension of keys/values/queries in
                   MultiHeadedAttention, also the input size of
                   the first-layer of the PositionwiseFeedForward.
  heads (int): the number of heads for MultiHeadedAttention.
  d_ff (int): the second-layer of the PositionwiseFeedForward.
  dropout (float): dropout probability(0-1.0).
  self_attn_type (string): type of self-attention scaled-dot, average
  c                 x  > [         TU ]  5         [        X!US9U l        [        X!US9U l        [        XU5      U l        [        R                  " USS9U l	        [        R                  " USS9U l
        [        R                  " U5      U l        U R                  U R                  5      nU R                  SU5        g )N)r*   rq   rr   rW   )r    r!   r   	self_attncontext_attnrn   feed_forwardr
   ru   layer_norm_1layer_norm_2r)   drop_get_attn_subsequent_maskMAX_SIZEregister_buffer)r-   r|   headsr}   r*   rW   r.   s         r/   r!    TransformerDecoderLayer.__init__  s    -egN0G-3G7KLLd;LLd;JJw'	--dmm< 	VT*r1   c           
      \   [         R                  " UR                  [         R                  5      U R                  SS2SUR                  S5      2SUR                  S5      24   R                  [         R                  5      -   S5      nU R                  U5      n	U	n
Ub  [         R                  " XY4SS9n
SnU R                  U
U
U	UUSS9nU R                  U5      U-   nU R                  U5      nU R                  UUUUUSSS	9u  pU R                  U R                  U5      U-   5      nXU
4$ )
a  
Args:
    inputs (`FloatTensor`): `[batch_size x 1 x model_dim]`
    memory_bank (`FloatTensor`): `[batch_size x src_len x model_dim]`
    src_pad_mask (`LongTensor`): `[batch_size x 1 x src_len]`
    tgt_pad_mask (`LongTensor`): `[batch_size x 1 x 1]`

Returns:
    (`FloatTensor`, `FloatTensor`, `FloatTensor`):

    * output `[batch_size x 1 x model_dim]`
    * attn `[batch_size x 1 x src_len]`
    * all_input `[batch_size x current_step x model_dim]`

Nr   r   r   r-   )rW   rX   rY   rB   T)rW   rX   rY   r[   )rI   gtrY   uint8rW   rG   r   rJ   r   r   r   r   r   )r-   inputsmemory_banksrc_pad_masktgt_pad_maskprevious_inputrX   stepdec_mask
input_norm	all_inputrV   
query_normmidr]   r`   s                   r/   ra   TransformerDecoderLayer.forward  sH   . 88ekk*ii0L--a002G<3D3DQ3G2GGHMM! &&v.
	%		>">AFIH#   		% 6)&&u-
%%# & 	 ""499S>E#9:Y&&r1   c                     SX4n[         R                  " [         R                  " U5      SS9R                  S5      n[        R
                  " U5      nU$ )z
Get an attention mask to avoid using the subsequent info.

Args:
    size: int

Returns:
    (`LongTensor`):

    * subsequent_mask `[1 x size x size]`
r   )kr   )nptriuonesastyperI   
from_numpy)r-   rG   
attn_shapesubsequent_masks       r/   r   1TransformerDecoderLayer._get_attn_subsequent_maskK  sG     _
''"''*"5;BB7K**?;r1   )r   r   r   r   r   r   )NNN)re   rf   rg   rh   ri   r   r!   ra   r   rj   rk   rl   s   @r/   r   r      s/    	 H+*  $ 6'p r1   r   c                   <   ^  \ rS rSrSU 4S jjrSS jrS rSrU =r$ )PositionalEncodingi]  c           	        > [         TU ]  5         [        R                  " X25      n[        R                  " SU5      R                  S5      n[        R                  " [        R                  " SUS[        R                  S9[        R                  " S5      U-  * -  5      n[        R                  " UR                  5       U-  5      US S 2SS S24'   [        R                  " UR                  5       U-  5      US S 2SS S24'   UR                  S5      nU R                  SU5        [        R                  " U5      U l        X l        g )Nr   r   r4   )dtypeg     @pe)r    r!   rI   zerosarangerO   exprR   rL   logsincosr   r
   r)   r*   r   )r-   r*   r   max_lenr   positiondiv_termr.   s          r/   r!   PositionalEncoding.__init___  s    [[&<<7+55a899ell1c1EKKH"&((7"3c"9 :; =ii 08 ;<1add7ii 08 ;<1add7\\!_T2&zz'*r1   c                    U[         R                  " U R                  5      -  nU(       a   XR                  S S 2U4   S S 2S S S 24   -   nO&XR                  S S 2S UR	                  S5      24   -   nU R                  U5      nU$ Nr   )rL   rM   r   r   rG   r*   )r-   embr   s      r/   ra   PositionalEncoding.forwardl  sr    DIIdhh''4(D!44C <CHHQK<00Cll3
r1   c                 J    U R                   S S 2S UR                  S5      24   $ r   )r   rG   )r-   r   s     r/   get_embPositionalEncoding.get_embv  s!    wwq,388A;,''r1   )r   r*   )r   r   )	re   rf   rg   rh   r!   ra   r   rj   rk   rl   s   @r/   r   r   ]  s    ( (r1   r   c                   <    \ rS rSrS
S\S\4S jjrS rS rS r	Sr
g	)TransformerDecoderStateiz  srccache_num_layersc                 j    Xl         S U l        S U l        S U l        US:w  a  U R	                  U5        g g Nr   )r   r   previous_layer_inputscache_init_cache)r-   r   r   s      r/   r!    TransformerDecoderState.__init__|  s:    &*-1"/3
r!-. "r1   c                 *    Xl         X l        S U l        g r   )r   r   r   )r-   	new_inputr   s      r/   update_state$TransformerDecoderState.update_state  s    '%:"
r1   c                     0 U l         [        U5       H/  nS S S.nS US'   S US'   X0R                   SR                  U5      '   M1     g )N)rC   rD   r@   rA   layer_{})r   rangeformat)r-   
num_layerslayerrX   s       r/   r   #TransformerDecoderState._init_cache  sM    
:&E*.FK'+K$)-K&3>JJz((/0	 'r1   c                    ^^ SUU4S jjmT" U R                   S5      U l         U R                  b  T" U R                  5        g g )Nr   c                    > U R                  5        H3  u  p#Uc  M
  [        U[        5      (       a
  T" U5        M)  T" X15      X'   M5     g r   )items
isinstancedict)struct	batch_dimr   v_recursive_mapfns       r/   r   <TransformerDecoderState.map_batch_fn.<locals>._recursive_map  s;    =!!T**&q)$&q$4	 'r1   r   )r   r   )r-   r   r   s    `@r/   map_batch_fn$TransformerDecoderState.map_batch_fn  s;    	5 	5 dhh?::!4::& "r1   )r   r   r   r   N)r   )re   rf   rg   rh   r	   intr!   r   r   r   rj    r1   r/   r   r   z  s$    /F /c /
?'r1   r   c                   V   ^  \ rS rSrSrSrU 4S jr  SS\S\S\S\	S	\4
S
 jjr
SrU =r$ )TransformerDecoderi  ao  
The Transformer decoder from "Attention is All You Need".


.. mermaid::

   graph BT
      A[input]
      B[multi-head self-attn]
      BB[multi-head src-attn]
      C[feed forward]
      O[output]
      A --> B
      B --> BB
      BB --> C
      C --> O


Args:
   num_layers (int): number of encoder layers.
   d_model (int): size of the model
   heads (int): number of heads
   d_ff (int): size of the inner FF layer
   dropout (float): dropout parameters
   embeddings (:obj:`onmt.modules.Embeddings`):
      embeddings to use, should have positional encodings
   attn_type (str): if using a separate copy attention
transformerc                 N  > [         TU ]  5         Xl        X`l        [	        UU R                  R
                  5      U l        [        R                  " [        U5       Vs/ s H  n[        X#XE5      PM     sn5      U l        [        R                  " USS9U l        S U l        g s  snf )Nrq   rr   )r    r!   r   
embeddingsr   embedding_dimpos_embr
   
ModuleListr   r   transformer_layersru   rv   state)	r-   r   r|   r   r}   r*   r   _r.   s	           r/   r!   TransformerDecoder.__init__  s     %$)'*.//*G*GI #%--:&1
& $GDB&1
 # ,,wD9
1
s   B"r   tgtr   r   memory_masksc                    UR                   nUnUR                  5       u  pUR                  5       u  pU R                  U5      nUR                  5       S:X  d   eU R	                  X5      nUnU R                  R
                  nUR                  R                  U5      R                  S5      R                  XU5      nUb$  UR                  S5      n	UR                  XU	5      nO:UR                  R                  U5      R                  S5      R                  XU	5      nUR                  c  / n/ n[        U R                  5       H  nS nUR                  c  UR                  b  UR                  U   nU R                  U   " XUUUUR                  b  UR                  SR!                  U5         OS US9u  nnnUR                  c  WR#                  U5        UR#                  U5        M     UR                  c  [$        R&                  " W5      nU R)                  U5      nUR                  c  UR+                  UW5        UUU4$ )NrE   r   r   r   )r   rX   r   )r   rG   r   r   r   padding_idxdataeqrO   expandr   r   r   r   r   r   r   appendrI   stackrv   r   )r-   r   r   r   r   r   	src_words	tgt_words	src_batchsrc_len	tgt_batchtgt_lenr   r`   src_memory_bankr   r   r   saved_inputsattnsiprev_layer_inputr]   r   s                           r/   ra   TransformerDecoder.forward  s    II		&^^-	&^^-	 ooc"wwyA~~c(%oo11 ~~((5??BVI0 	 $"''+G'..y7KL$>>,,[9CCAF	G4  ;;Lt'A#{{"''3','B'B1'E$))!, ,#3{{. !&J,=,=a,@ A48 $FD) {{"##I.LL (" ;; ;;|4L( ;;sL1ue##r1   )r   rv   r   r   r   r   NN)re   rf   rg   rh   ri   decoder_typer!   r   r	   r   ra   rj   rk   rl   s   @r/   r   r     sT    8 !L* !'+:$.:$:$ $:$ 	:$
 %:$ :$r1   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )PlugPointerGeneratori  c                    > [         TU ]  5         [        R                  " X5      U l        [        R
                  " S5      U l        g r   )r    r!   r
   r#   dense
LogSoftmaxgen_func)r-   hidden_size
vocab_sizer.   s      r/   r!   PlugPointerGenerator.__init__  s.    YY{7
b)r1   c                 J    U R                  U5      nU R                  U5      nU$ r   r  r  )r-   r8   s     r/   ra   PlugPointerGenerator.forward  s"    JJqMMM!r1   r  re   rf   rg   rh   r!   ra   rj   rk   rl   s   @r/   r  r    s    *
 r1   r  c                   \    \ rS rSrSr\rSr\S\	\
\\R                  4      4S j5       rSrg)PlugPreTrainedModeli  zz
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
plugpretrained_model_name_or_pathc                     [         R                  R                  U[        5      n[         R                  R	                  U5      (       a  [
        R                  " U5      O	[        5       n[         R                  R                  UUR                  5      Ul        [         R                  R                  U[        5      n[         R                  R	                  U5      (       a  [        R                  " U5      OS nU " X55      $ r   )ospathjoinCONFIG_NAMEisfiler   from_json_fileencoder_pthWEIGHTS_NAMErI   load)clsr  config_fileconfigcheckpoint_file
checkpoints         r/   from_pretrained#PlugPreTrainedModel.from_pretrained#  s     ggll#@+N;=77>>< <**;7(l 	WW\\*G*0*<*<>'',,'D'3546GGNN5 5UZZ0"& 	6&&r1   r   N)re   rf   rg   rh   ri   r   config_classbase_model_prefixclassmethodr   r   strr  PathLiker*  rj   r   r1   r/   r  r    sL    
 L'08s?A{{@K :L 1M' 'r1   r  c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )	PlugModeli3  c           	        > [         T	U ]  U5        Xl        UR                  S:X  d  UR                  S:X  a/  [	        [
        R                  " UR                  5      5      U l        O>UR                  S:X  a.  [        [        R                  " UR                  5      5      U l        UR                  S:  Ga*  [        R                  " UR                  U R                  R                  R                  R                  5      nU R                  R                   R"                  R$                  R&                  UR$                  R&                  S S& U R                  R                   R"                  R$                  R&                  S   S S S 24   R)                  UR                  S-
  S5      UR$                  R&                  SS & X0R                  R                  R                   l        U R                  R                  R*                  U l        [        R                  " U R*                  U R                  R                  R                  UR                  S:X  a  SOSS9nUR,                  (       aM  [.        R0                  " U R                  R                  R                   R2                  R$                  5      Ul        [5        UR6                  UR8                  UR:                  UR<                  UR>                  US	9U l         [C        UR8                  U R*                  5      U l"        U R@                  R                   R$                  U RD                  RF                  l        Ub  [I        US
   RK                  5       5       H  nURM                  S5      (       a2  US
   U   US
   URO                  SS5      '   US
   RQ                  U5        URM                  S5      (       d  Mc  US
   U   US
   URO                  SS5      '   US
   RQ                  U5        M     U RS                  US
   SS9n[U        U5        g U R@                  RW                  5        GH  n[Y        U[        RZ                  [        R                  45      (       a%  UR$                  R&                  R]                  SSS9  Oh[Y        U[        R^                  5      (       aI  UR`                  R&                  Rc                  5         UR$                  R&                  Re                  S5        [Y        U[        RZ                  5      (       d  M  UR`                  c  M  UR`                  R&                  Rc                  5         GM     U RD                  Rg                  5        H>  nURi                  5       S:  a  [k        U5        M$  UR&                  Rc                  5         M@     URl                  (       a  UR                  S:X  a?  [        R                  " U R*                  U R                  R                  R                  SS9nO>[        R                  " U R*                  U R                  R                  R                  SS9n[.        R0                  " U R                  R                   R2                  R$                  5      Ul        X@R@                  l        U R@                  R                   R$                  U RD                  RF                  l        g )Nbertzh_bertrobertai   r   r   r   )r   )r   r}   r*   r   modelzmodule. zplug.F)strict        g{Gz?)meanstd      ?)7r    r!   r'  encoderr   r   r*  r"  r4  r   r   max_posr
   	Embeddingr7  r  r   position_embeddingsweightr   repeatr  	share_embcopydeepcopyword_embeddingsr   
dec_layersdec_hidden_size	dec_headsdec_ff_sizedec_dropoutdecoderr  	generatorr  listkeys
startswithreplacepopload_state_dictprintmodulesr   r#   normal_ru   biaszero_fill_
parametersr   r   use_bert_emb)
r-   r'  r)  my_pos_embeddingstgt_embeddingsrT   msgmodulepr.   s
            r/   r!   PlugModel.__init__5  s    >>V#v~~'B!**6+=+=>@DI^^y($--f.@.@ACDI NNS  "		 6 6 B B!D 261E1E1Y1Y1`1`1e1e $$)) +.*-/ 		,,@@GGLLa!!'(<a!@ $$)) >OIIOO&&:))**55OOII((#^^y8aA
 $(MM		**::AA%CN!)""""##&&%' .f.D.D.2oo?&*ll&=&=&D&D#!Jw/4467>>),,*4W*=c*B w'!2)' (w'++C0>>'**(27(;C(@ w')% &w'++C0 8 &&z'':5&IC#J,,..0fryy",,&?@@MM&&..CT.B55KK$$**,MM&&,,S1fbii00V[[5LKK$$**, 1 ^^..0557Q;#A&FFLLN	 1
 "">>Y.%'\\		((44$%&'N
 &(\\		((44$%&'N )-II((88??)A%*8'*.,,*A*A*H*HDNN  'r1   c                     U R                  XUSS9u  pV[        U5      nU R                  XrS S 2S S24   U5      u  pnXS   U4$ )NFtoken_type_idsreturn_dictr   )r4  r   rM  )
r-   r   r   mask_srcre  top_vecr   r   decoder_outputsr  s
             r/   ra   PlugModel.forward  sY    YY.e  M
',$(LLAssFW$M!b	722r1   )r4  r'  rM  rN  r  r   r  rl   s   @r/   r2  r2  3  s    OIb3 3r1   r2  c                   6   ^  \ rS rSrSrSU 4S jjrS rSrU =r$ )LabelSmoothingLossi  z
With label smoothing,
KL-divergence between q_{smoothed ground truth prob.}(w)
and p_{prob. computed by model}(w) is minimized.
c                 
  > SUs=:  a  S::  d   e   eX0l         [        [        U ]  5         XS-
  -  n[        R
                  " U4U5      nSXPR                   '   U R                  SUR                  S5      5        SU-
  U l        g )Nr:  r=  r4   r   one_hot)	r   r    rl  r!   rI   fullr   rO   
confidence)r-   label_smoothingtgt_vocab_sizeignore_indexsmoothing_valuern  r.   s         r/   r!   LabelSmoothingLoss.__init__  s    _++++++' $02)a-?@**n/A$%  !Y(9(9!(<=/r1   c                 4   U R                   R                  UR                  S5      S5      nUR                  SUR	                  S5      U R
                  5        UR                  X R                  :H  R	                  S5      S5        [        R                  " XSS9$ )zN
output (FloatTensor): batch_size x n_classes
target (LongTensor): batch_size
r   r   rS   )	reduction)
rn  rC  rG   scatter_rO   rp  masked_fill_r   Fkl_div)r-   r`   target
model_probs       r/   ra   LabelSmoothingLoss.forward  sx    
 \\((Q;
Av//2DOOD+;+;!; F Fq I1Mxxe<<r1   )rp  r   )ird   rl   s   @r/   rl  rl    s    	0	= 	=r1   rl  c                   B   ^  \ rS rSrSrSU 4S jjrS rS rS rSr	U =r
$ )	NMTLossComputei  z 
Standard NMT Loss Computation.
c                    > [         TU ]  5         Xl        US   U l        US:  a  [	        XCU R                  S9U l        g [        R                  " U R                  SS9U l        g )NPADr   )rs  rS   )rs  rw  )r    r!   rN  r   rl  	criterionr
   NLLLoss)r-   rN  symbolsr  rq  r.   s        r/   r!   NMTLossCompute.__init__  s]    ""5>Q/$:J:JLDN  ZZ!--@DNr1   c                 D    UR                  SUR                  S5      5      $ )Nr   r4   r5   rG   )r-   _vs     r/   _bottleNMTLossCompute._bottle  s    wwr2771:&&r1   c                 D    UR                  SX!R                  S5      5      $ )Nr   r   r  )r-   r  r9   s      r/   	_unbottleNMTLossCompute._unbottle  s    wwr:wwqz22r1   c                    US S 2SS 24   nUR                  S5      UR                  S5      pTUR                  U R                  5      R                  5       nU R	                  U5      nU R                  U5      nUR                  5       R                  S5      n	U R                  X5      n
U
R                  [        U5      5      n
XR                  XES5      4$ )Nr   r   r   )rG   ner   rS   r  rN  r=   r5   r  divrR   )r-   r   r`   r|  r9   decoder_lengthnormalizationbottled_outputr\   gtruthlosss              r/   ra   NMTLossCompute.forward  s    QU%+[[^V[[^N		$"2"23779f-/""$))"-~~f-xxm,-[[R@@@r1   )r  rN  r   )r:  )re   rf   rg   rh   ri   r!   r  r  ra   rj   rk   rl   s   @r/   r  r    s$    	@'3	A 	Ar1   r  c            	       :  ^  \ rS rSr\ " S S5      5       rSS\4U 4S jjjrSS jr SSSS\	4S	 jjr
SS
 jrSS\" S5      * S4S jr            SSSS\S\4S jjrS r  SS\R$                  S\R$                  S\\\R$                  4   4S jjrSrU =r$ )PlugForConditionalGenerationi  c                       \ rS rSr% \\S'   \R                  \S'   \R                  \S'   \R                  \S'   \R                  \S'   Sr\	S   \S'   Sr
\	\	\      \S	'   Sr\	\   \S
'   Srg)"PlugForConditionalGeneration.Batchi  r9   r   r   rg  re  Nquery_idsrc_strtgt_strr   )re   rf   rg   rh   r   __annotations__rI   r	   r  r   r  r/  r  rj   r   r1   r/   Batchr    s]    \\\\,,$#$t*##'d3i'!c!r1   r  datasetc                 L  > [         TU ]  U5        [        R                  " 5       U l        Xl        UR                  S:X  aO  [        R                  " UR                  SS9nUR                  UR                  UR                  UR                  S.nOzUR                  S:X  d  UR                  S:X  aZ  [        R                  " UR                  SS9nUR                  S   UR                  S	   UR                  S
   UR                  S   S.nWU l        WU l        [%        X5      U l        [)        U R&                  R*                  UU R&                  R,                  UR.                  5      U l        X0R
                  l        U R"                  S   U l        U R"                  S   U l        g )Nr6  F)do_lower_case)BOSEOSr  EOQr4  r5  Tz[CLS]z[SEP]z[PAD]z	[unused2]r  r  )r    r!   logging
get_loggerr   r'  r>  r   r*  r"  cls_token_idsep_token_idpad_token_idunk_token_idr   vocab	tokenizerr  r2  r  r  rN  r  rq  r  r  start_token	end_token)r-   r'  r)  r  r  r  r.   s         r/   r!   %PlugForConditionalGeneration.__init__  s_    ((*>>Y&(88""%9I !-- -- -- --	G ^^v%9)D%55""$8I !w/ w/ w/ {3	G #f1	"499#6#6#'99#7#7#)#9#9;	 &<<.e,r1   c                     Uc,  UR                  U R                  S   5      R                  5       nU R                  XX45      S   nU R	                  X%5      nU$ )Nr  r   )r  r  longr  r  )r-   r   r   rg  re  r`   r  s          r/   ra   $PlugForConditionalGeneration.forward  sP    vvdll512779H3X>qAyy%r1   batchfastc                     U R                   R                  5         [        R                  " 5          U R                  " U/UQ70 UD6sSSS5        $ ! , (       d  f       g= f)a)  
Translate a batch of sentences.

Mostly a wrapper around :obj:`Beam`.

Args:
   batch (:obj:`Batch`): a batch from a dataset object
   data (:obj:`Dataset`): the dataset object
   fast (bool): enables fast beam search (may not support all features)

Todo:
   Shouldn't need the original dataset.
N)r  evalrI   no_grad_fast_translate_batch)r-   r  r  argskwargss        r/   translate_batch,PlugForConditionalGeneration.translate_batch  s=    $ 			]]_--eEdEfE __s   A
Ac                 6   [        [        [        UR                  5       5      5      5      nUS:w  a.  XC   US   sUS'   XC'   UR	                  U5      R                  5       n[        UR                  5       5      nUS==   U-  ss'   UR                  S5      nUR                  US5      R                  SS5      R                  US5      R                  SS5      R                  5       R                  " U6 nUS:w  a  UR	                  U5      R                  5       nU$ )Nr   r   r   )	rO  r   lenrG   permuter=   r5   r6   rC  )r-   r8   countr   permout_sizer  s          r/   _tile"PlugForConditionalGeneration._tile  s    E#affh-()!8!%DGDGTY		$**,A>uq	FF5"Yq!_VE1Yq!_Z\T
  !8		$**,Ar1   
   r=  Infr   c                    US:  aG  [        [        X%5      UR                  S5      5      nU[        R                  " X5      S   S   :  nXAU'   US:  a  [        R
                  " USS9u  px[        R                  " [        R                  " USS9SS9n	X:  n
US:  a	  SU
S	S U24'   U
S	S S24   R                  5       U
S	SS 24'   SU
S
'   U
R                  SX5      nXAU'   U$ )Nr   r   ).r   Nr=  T)
descendingr   r   .).r   )minmaxrG   rI   topksortcumsumrz  r(   clonescatter)r-   logitstop_ktop_pfilter_valuemin_tokens_to_keepindices_to_removesorted_logitssorted_indicescumulative_probssorted_indices_to_removes              r/   _top_k_top_p_filtering3PlugForConditionalGeneration._top_k_top_p_filtering+  s    19E6B)E !'F)B1)E GK *L !L(4$%3;,1JJv$,O)M$||		-R0b : (8'?$!A%EF(.A/A.A)AB0HSbS1%' %S!"W-/0$V, !9 @ @>!=(4$%r1   
max_length
min_lengthc                   ^ ^^J TmUR                   nUR                  nUR                  nUR                  nT R                  R                  UUUSS9u  nn[        UT R                  R                  R                  5      nUR                  nUR                  UU 4S j5        T R                  UTSS9n[        R                  " U[        R                  US9n[        R                  " SUT-  T[        R                  US9n[        R                  " UT-  S/T R                   [        R                  US9n0 n[#        / 5      nUbO  U HI  n[%        US S	 5      nUS	   nUR'                  U/ 5      U/-   UU'   UR)                  [+        U5      5        MK     [        R,                  " S
/[/        S5      /TS-
  -  -   US9R1                  U5      n [3        U5       Vs/ s H  n/ PM     n!n0 n"[3        U5       Vs/ s H  n/ PM     snU"S'   [3        U5       Vs/ s H  n/ PM     snU"S'   S/U-  U"S'   UU"S'   [3        U5       GH}  n#US S 2S	4   R5                  SS	5      n$U$R7                  SS5      n$T R                  R                  UU$UU#S9u  n%n&nT R                  R8                  R;                  U%R7                  SS5      R=                  S5      5      n'U'R?                  S	5      n(U#U:  a  SU'S S 2T R@                  4'   [+        U5      S:  a  UR?                  S5      n)/ n*[3        U)5       H  n+/ n,U H]  n-[%        UU+U#S-   U--
  U#S-   24   RC                  5       RE                  5       RG                  5       5      n.U,UR'                  U./ 5      -  n,M_     U*RI                  [#        U,5      5        M     U'R?                  S5      U):X  d   e[3        U)5       H  n+U*U+    H
  n/SU'U+U/4'   M     M     US:  a   T RK                  UUR?                  S5      U	U#S-   5      n0[3        U'R?                  S5      5       HC  n+[#        U0U+   5       H.  n1U'U+U14   S:  a  U'U+U14==   U-  ss'   M  U'U+U14==   U-  ss'   M0     ME     U#S-   U-  n2U
(       a  U'U-  n3T RM                  U3XSS9n3[        RN                  " [P        RR                  " U3S	S9SS9n4[P        RT                  " U3SS9n3U3U R5                  S	5      RW                  S5      -  n3U3U2-  n3[        RX                  " U3S	U45      n5U4R5                  S	T5      n4U5R5                  S	T5      n5OPU'U R5                  S	5      RW                  S5      -  n'U'U2-  n6U6R[                  S	TU(-  5      n6U6R]                  TS	S9u  n5n4T R^                  R`                  (       Gax  UR?                  S5      n7U7S:  Ga`  [3        UR?                  S5      5       GHA  n+Sn8UU+    V9s/ s H  n9[c        U95      PM     n:n9T R^                  Rd                  S:X  a8  T Rf                  Ri                  U:5      Rk                  5       Rm                  5       n:OXU: V9s/ s H  n9T Rf                  Rn                  U9   PM     n:n9SRq                  U:5      Rs                  SS5      Rm                  5       n:[+        U:5      S::  a  M  [3        S[+        U:5      S-
  5       V+s/ s H  n+U:U+S-
     U:U+   U:U+S-      4PM     n;n+[%        U;S	   5      n<U<U;S S	 ;   a  Sn8U8(       d  GM<  SW6W+'   GMD     U5U2-  n U4U(-  n=U4Ru                  U(5      n4U=US U=R?                  S5       RW                  S5      -   n>U>R5                  S	5      mJ[        Rv                  " URy                  STJ5      U4R5                  S	S5      /S	5      nU4R{                  T R@                  5      n?U#S-   U:X  a  U?R}                  T R@                  5        U?S S 2S4   R{                  S5      n@U?R                  5       (       Gah  UR5                  S	TUR?                  S	5      5      nA[3        U?R?                  S5      5       GH  n+UU+   nBW@U+   (       a  U?U+   R}                  T R@                  5        U?U+   R                  5       R5                  S	5      nCUC HE  nDU!WB   RI                  U5U+UD4   WAU+UDSS 24   45        U(       d  M/  [+        U!5      T:X  d  M@  SW@U+'   MG     W@U+   (       d  M  [        U!WB   S SS9nET R^                  R                  S:X  d5  T R^                  R                  S :X  a]  T R^                  R                  (       dB  WES T  H6  nFUFu  nGnHU"S   WB   RI                  UG5        U"S   UB   RI                  UH5        M8     GMJ  WES   u  nGnHU"S   WB   RI                  UG5        U"S   UB   RI                  UH5        GM     W@R{                  S5      R                  5       R5                  S	5      nI[+        UI5      S:X  a    U"$ U Ry                  SWI5      n U>Ry                  SUI5      n>URy                  SUI5      nWARy                  SUI5      R5                  S	UR?                  S	5      5      nU>R5                  S	5      mJURy                  STJ5      nUR                  UJ4S! j5        GM     U"$ s  snf s  snf s  snf s  sn9f s  sn9f s  sn+f )"NFrd  c                 &   > TR                  U TUS9$ )Nr   )r  )r   r   	num_beamsr-   s     r/   <lambda>DPlugForConditionalGeneration._fast_translate_batch.<locals>.<lambda>n  s    tzz%zDr1   r   r   )r   rH   )r   r   rH   r   r   r:  rF   )rH   predictionsr\   
gold_scorer  )r   g@xr=  )r  r  r  )num_samplesrE   r6   z ##r8  TgPKc                     U S   $ )Nr   r   )r8   s    r/   r  r  0  s    1r1   )rT   reverseqg_ranking_test
paraphrasec                 (   > U R                  UT5      $ r   )index_select)r   r   select_indicess     r/   r  r  K  s    5#5#5c>#Jr1   )Dr9   r   rg  re  r  r4  r   rM  r   rH   r   r  rI   r   r  ro  r  settuplegetaddr  tensorrR   rC  r   r5   r6   rN  ra   squeezerG   r  cpunumpytolistr   calc_banned_tokensr  multinomialrz  r(   log_softmaxrO   gatherreshaper  r'  block_trigramr   r>  r  decodestripsplitids_to_tokensr  rR  fmodrJ   r  r   rZ  anynonzerosortedr  sample_topk)Kr-   r  r  r  bad_words_idsearly_stoppingr  length_penaltyrepetition_penaltyno_repeat_ngram_size	do_sampletemperaturer  r  r  r  r9   r   rg  re  src_featuresr   r   rH   batch_offsetbeam_offset	alive_seqbad_words_prefix_dictbad_words_prefix_lenbw_idrT   rU   topk_log_probs
hypothesesresultsr   decoder_inputdec_outr  	log_probsr  	num_hyposbad_word_banned_tokenr  curr_banned_tokenpre_lenpre_keybanned_tokenprev_output_tokensprevious_tokencurr_length_penalty_scorestopk_idstopk_scorescurr_scorescur_lenfailwwordstrigramstrigramtopk_beam_indexbatch_indexis_finishedend_conditionr  bfinished_hypjbest_hypeachscoreprednon_finishedr  sK   `     `                                                                   @r/   r  2PlugForConditionalGeneration._fast_translate_batchN  s   & 	%%
ii>>--))...e ) Ma'TYY->->-I-IJ$$ 	D	Fzz,	qzA||ejj9ll"** JJ
Y 6:#//%*ZZ&,.	 !#"2w$&E#2J'b	-B-F-F. %w.'%c*$((S2 ' LLv9q=99%vj1 	 #(
"34"3Qb"3
4.3J.?!@.?".?!@).z):;):AR):;!"j 0 *%D%ae,11!R8M *33Aq9M$(II$5$5}l %6 %?!GUE 		++33!!!Q'//24I"+Jj /4	!T^^+, ()A-%NN1-	(*%y)A(*%#7"'	!TAX5G45I6 66 36 )77:suUUWVVX#O)-B-F-F#R.) )) $8
 *005F1GH * !~~a(I555y)A(=a(@5:	!\/2 )A *
 "C']%)%<%<y~~a02F1H&" y~~a01A*-.@.C*D$Q%67!;%a&78<NN8%a&78<NN8	 +E 2 $(!8n"<#k1555! 6 
 !,,IIg2. !# --$ >..r2<<Q??!$77#llR+ $==	#)..	# ^004>>qAA	'*==)11"i*6LM(3(8(8(8(K%X)))#..+aK"9>>!#45$1:1 >AQ >;;..);$(NN$9$9 %%'',uwuuw " JO%INA < <Q ? " % %(HHUO$;$;E2$F$L$L$NEJ!O$-2136u:>.C$D.C &+1q5\58U1q5\$J.C ! $D #("5"hsm3#'D4-3KN+ 6. )+>>N '*4O}}Z0H  633A67AA!DE  )--b1N 		&&q.9b!$# I
 #++dnn5Kax:%!!$..1'1-003M  'nnRINN2<NO{//23A$QA$Q'#A,,T^^<#.q>#9#9#;#@#@#DL)"1,,(A.Aq!"H0EFH)>c*o.J/3M!,	 * %Q''#)&qM~t$M;;..2CC $ 3 3| C(,(?(?(0)(<.2t ' 1! 4 ; ;E B ' 6q 9 @ @ F )=
 +31+KE4#H-a077>#M215<<TB3 44  -//2::<AA"E|$)  "0!<!<Q!M)66q,G+88LI'44QET"innR01  )--b1N'44QGLJLg &l } 5 "A;~ !?%$Ds$   k4kk*k#kkc           	      >  ^^^^ TS-   T:  a  [        U5       Vs/ s H  n/ PM     sn$ [        U5       Vs/ s H  n0 PM     snm[        U5       H  nTU   R                  5       R                  5       R                  5       nTU   n[	        [        T5       V	s/ s H  oU	S  PM	     sn	6  H,  n
[        U
S S 5      nUR                  U/ 5      U
S   /-   X'   M.     M     UUUU4S jn[        U5       Vs/ s H
  o" U5      PM     nnU$ s  snf s  snf s  sn	f s  snf )Nr   r   c                    > TS-   T-
  n[        TU UT24   R                  5       R                  5       R                  5       5      nTU    R	                  U/ 5      $ r   )r  r  r  r  r  )hypo_idx	start_idx	ngram_idxr)  generated_ngramsr  prev_input_idss      r/   _get_generated_ngramsNPlugForConditionalGeneration.calc_banned_tokens.<locals>._get_generated_ngrams_  sa    !&::Ix(0 1 225#%JI $H-11)R@@r1   )r   r  r  r  zipr  r  )r-   rB  r  r  r)  r   idx
gen_tokensgenerated_ngramr  ngramprev_ngram_tuplerC  r>  banned_tokensrA  s    ` ``          @r/   r  /PlugForConditionalGeneration.calc_banned_tokensO  s@    Q;-- %i 01 01B 011(-i(89(81B(89#C',00288:AACJ.s3O.34H.IJ.In.IJL#(s#4 4C4G4G$b5*-22YK581L $	A 	A =B)<L
<L!(+<L 	 
 - 29
 K
s   DDD
6D	input_idsattention_maskreturnc                     Uc,  UR                  U R                  S   5      R                  5       nU R                  UR	                  5       S   US UUS9nU R
                  " U/UQ70 UD6nUS   nSU0$ )Nr  r   )r9   r   r   re  rg  r  )r  r  r  r  rG   r  )	r-   rM  rN  re  r  r  r  translation_batchpredss	            r/   	translate&PlugForConditionalGeneration.translatel  s     !&\\$,,u*=>CCEN

 ~~'*)#  % !00HHH!-0u%%r1   )r'  r  r   r  r  r  r  r  )Ndefaultr  )Fr   )P   r  NTrE   333333?rW     Fr=  r   r=  )re   rf   rg   rh   r   r  r/  r!   ra   boolr  r  rR   r  r   r  r  rI   r	   r   rS  rj   rk   rl   s   @r/   r  r    s    " " "- - -B &+F&F"F,( &(%(-25\M23!J 1302,0-1()-01434(-*-$%$'%,*- +.B> 26!%&"\\&"',,&
  $C$56& &r1   r  )4rE  rL   r  dataclassesr   typingr   r   r   r   r   r  r   rI   torch.nn.functionalr
   
functionalrz  r	   torch.nn.initr   transformersr   r   r   r   r   r   transformers.activationsr   transformers.modeling_utilsr   modelscope.utilsr   r  configurationr   r  r#  Moduler   rn   r   r   r   r   r  r  r2  rl  r  r  r   r1   r/   <module>re     s  "   	 ! 3 3      ): : + 7 . %"r299 rjbii 2dbii dN( (:#' #'Lj$ j$Z
299 
'/ '2X3# X3v= =<ARYY ADo&#6 o&r1   