
import math
from typing import Any, Dict, List, Optional, Tuple

import numpy
import torch
import torch.nn as nn
from fairseq import utils
from fairseq.distributed import fsdp_wrap
from fairseq.models import (FairseqEncoder, FairseqEncoderDecoderModel,
                            FairseqIncrementalDecoder, register_model,
                            register_model_architecture)
from fairseq.modules import (AdaptiveSoftmax, BaseLayer, FairseqDropout,
                             LayerDropModuleList, LayerNorm,
                             PositionalEmbedding,
                             SinusoidalPositionalEmbedding,
                             TransformerDecoderLayer, TransformerEncoderLayer)
from fairseq.modules.checkpoint_activations import checkpoint_wrapper
from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_
from torch import Tensor

DEFAULT_MAX_SOURCE_POSITIONS = 1024
DEFAULT_MAX_TARGET_POSITIONS = 1024
DEFAULT_MIN_PARAMS_TO_WRAP = int(1e8)
class CanmtModel(FairseqEncoderDecoderModel):
    """
    Args:
        encoder (TransformerEncoder): the encoder
        decoder (TransformerDecoder): the decoder

    The CanmtModel provides the following named architectures and
    command-line arguments:

    .. argparse::
        :ref: fairseq.models.transformer_parser
        :prog:
    """

    def __init__(self, args, encoder, decoder, second_decoder):
        super().__init__(encoder, decoder)
        self.args = args
        self.supports_align_args = True
        self.encoder = encoder
        self.decoder = decoder
        self.second_decoder = second_decoder

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        parser.add_argument('--activation-fn',
                            choices=utils.get_available_activation_fns(),
                            help='activation function to use')
        parser.add_argument('--dropout', type=float, metavar='D',
                            help='dropout probability')
        parser.add_argument('--attention-dropout', type=float, metavar='D',
                            help='dropout probability for attention weights')
        parser.add_argument('--activation-dropout', '--relu-dropout',
                            type=float, metavar='D',
                            help='dropout probability after activation in FFN.')
        parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
                            help='path to pre-trained encoder embedding')
        parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                            help='encoder embedding dimension')
        parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
                            help='encoder embedding dimension for FFN')
        parser.add_argument('--encoder-layers', type=int, metavar='N',
                            help='num encoder layers')
        parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
                            help='num encoder attention heads')
        parser.add_argument('--encoder-normalize-before', action='store_true',
                            help='apply layernorm before each encoder block')
        parser.add_argument('--encoder-learned-pos', action='store_true',
                            help='use learned positional embeddings in the encoder')
        parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                            help='path to pre-trained decoder embedding')
        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                            help='decoder embedding dimension')
        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                            help='decoder embedding dimension for FFN')
        parser.add_argument('--decoder-layers', type=int, metavar='N',
                            help='num decoder layers')
        parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                            help='num decoder attention heads')
        parser.add_argument('--decoder-learned-pos', action='store_true',
                            help='use learned positional embeddings in the decoder')
        parser.add_argument('--decoder-normalize-before', action='store_true',
                            help='apply layernorm before each decoder block')
        parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                            help='decoder output dimension (extra linear layer '
                            'if different from decoder embed dim')
        parser.add_argument('--share-decoder-input-output-embed',
                            action='store_true',
                            help='share decoder input and output embeddings')
        parser.add_argument('--share-all-embeddings', action='store_true',
                            help='share encoder, decoder and output embeddings '
                            '(requires shared dictionary and embed dim)')
        parser.add_argument('--no-token-positional-embeddings', default=False,
                            action='store_true',
                            help='if set, disables positional embeddings '
                            '(outside self attention)')
        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                            help='comma separated list of adaptive softmax '
                            'cutoff points. Must be used with adaptive_loss '
                            'criterion')
        parser.add_argument('--adaptive-softmax-dropout', type=float,
                            metavar='D',
                            help='sets adaptive softmax dropout for the tail '
                            'projections')
        parser.add_argument('--layernorm-embedding', action='store_true',
                            help='add layernorm to embedding')
        parser.add_argument('--no-scale-embedding', action='store_true',
                            help='if True, dont scale embeddings')
        parser.add_argument('--checkpoint-activations', action='store_true',
                            help='checkpoint activations at each layer, which '
                            'saves GPU memory usage at the cost of some '
                            'additional compute')
        parser.add_argument('--offload-activations', action='store_true',
                            help='checkpoint activations at each layer, then '
                            'save to gpu. Sets --checkpoint-activations.')
        parser.add_argument('--no-cross-attention', action='store_true',
                            help='do not perform cross-attention')
        parser.add_argument('--cross-self-attention', action='store_true',
                            help='perform cross+self-attention')
        parser.add_argument('--encoder-layerdrop', type=float, metavar='D',
                            default=0, help='LayerDrop probability for encoder')
        parser.add_argument('--decoder-layerdrop', type=float, metavar='D',
                            default=0, help='LayerDrop probability for decoder')
        parser.add_argument('--encoder-layers-to-keep', default=None,
                            help='which layers to *keep* when pruning as a '
                            'comma-separated list')
        parser.add_argument('--decoder-layers-to-keep', default=None,
                            help='which layers to *keep* when pruning as a '
                            'comma-separated list')
        parser.add_argument('--quant-noise-pq', type=float, metavar='D',
                            default=0,
                            help='iterative PQ quantization noise at training time')
        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D',
                            default=8,
                            help='block size of quantization noise at training time')
        parser.add_argument('--quant-noise-scalar', type=float, metavar='D',
                            default=0,
                            help='scalar quantization noise and scalar '
                            'quantization at training time')
        parser.add_argument('--min-params-to-wrap', type=int, metavar='D',
                            default=DEFAULT_MIN_PARAMS_TO_WRAP,
                            help='minimum number of params for a layer to be '
                            'wrapped with FSDP() when training with '
                            '--ddp-backend=fully_sharded. Smaller values will '
                            'improve memory efficiency, but may make '
                            'torch.distributed communication less efficient '
                            'due to smaller input sizes. This option is set to '
                            '0 (i.e., always wrap) when '
                            '--checkpoint-activations or --offload-activations '
                            'are passed.')

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""
        # make sure all arguments are present in older models
        base_architecture(args)

        if args.encoder_layers_to_keep:
            args.encoder_layers = len(args.encoder_layers_to_keep.split(','))
        if args.decoder_layers_to_keep:
            args.decoder_layers = len(args.decoder_layers_to_keep.split(','))

        if getattr(args, 'max_source_positions', None) is None:
            args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS
        if getattr(args, 'max_target_positions', None) is None:
            args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS

        src_dict, tgt_dict = task.vocab_src, task.vocab_tgt

        if args.share_all_embeddings:
            if src_dict != tgt_dict:
                raise ValueError(
                    '--share-all-embeddings requires a joined dictionary')
            if args.encoder_embed_dim != args.decoder_embed_dim:
                raise ValueError(
                    '--share-all-embeddings requires --encoder-embed-dim '
                    'to match --decoder-embed-dim')
            if args.decoder_embed_path and (args.decoder_embed_path
                                            != args.encoder_embed_path):
                raise ValueError(
                    '--share-all-embeddings not compatible with '
                    '--decoder-embed-path')
            encoder_embed_tokens = cls.build_embedding(
                args, src_dict, args.encoder_embed_dim, args.encoder_embed_path)
            decoder_embed_tokens = encoder_embed_tokens
            args.share_decoder_input_output_embed = True
        else:
            encoder_embed_tokens = cls.build_embedding(
                args, src_dict, args.encoder_embed_dim, args.encoder_embed_path)
            decoder_embed_tokens = cls.build_embedding(
                args, tgt_dict, args.decoder_embed_dim, args.decoder_embed_path)
        if getattr(args, 'offload_activations', False):
            args.checkpoint_activations = True  # offloading implies checkpointing

        encoder = cls.build_encoder(args, src_dict, encoder_embed_tokens)
        decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens)
        # the second decoder re-generates source tokens from the translation
        # decoder's hidden states, so it is built over the source dictionary
        second_decoder = cls.build_decoder(args, src_dict, encoder_embed_tokens)

        if not args.share_all_embeddings:
            min_params_to_wrap = getattr(args, 'min_params_to_wrap',
                                         DEFAULT_MIN_PARAMS_TO_WRAP)
            # fsdp_wrap is a no-op right now except when
            # --ddp-backend=fully_sharded
            encoder = fsdp_wrap(encoder, min_num_params=min_params_to_wrap)
            decoder = fsdp_wrap(decoder, min_num_params=min_params_to_wrap)
        return cls(args, encoder, decoder, second_decoder)

    @classmethod
    def build_embedding(cls, args, dictionary, embed_dim, path=None):
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        emb = Embedding(num_embeddings, embed_dim, padding_idx)
        # if provided, load from preloaded dictionaries
        if path:
            embed_dict = utils.parse_embedding(path)
            utils.load_embedding(embed_dict, dictionary, emb)
        return emb

    @classmethod
    def build_encoder(cls, args, src_dict, embed_tokens):
        return TransformerEncoder(args, src_dict, embed_tokens)

    @classmethod
    def build_decoder(cls, args, tgt_dict, embed_tokens):
        return TransformerDecoder(
            args,
            tgt_dict,
            embed_tokens,
            no_encoder_attn=getattr(args, 'no_cross_attention', False))

    def forward(self,
                src_tokens,
                src_lengths,
                prev_output_tokens,
                prev_src_tokens,
                return_all_hiddens: bool = True,
                features_only: bool = False,
                alignment_layer: Optional[int] = None,
                alignment_heads: Optional[int] = None):
        """
        Run the forward pass for an encoder-decoder model.

        Copied from the base class, but without ``**kwargs``,
        which are not supported by TorchScript.
        """
        encoder_out = self.encoder(
            src_tokens,
            src_lengths=src_lengths,
            return_all_hiddens=return_all_hiddens)
        decoder_out = self.decoder(
            prev_output_tokens,
            encoder_out=encoder_out,
            features_only=features_only,
            alignment_layer=alignment_layer,
            alignment_heads=alignment_heads,
            src_lengths=src_lengths,
            return_all_hiddens=return_all_hiddens)
        # re-run the decoder over the full target prefix, without the causal
        # mask and without the encoder, to obtain target-side states
        decoder_out_re = self.decoder(
            prev_output_tokens,
            encoder_out=None,
            features_only=True,
            full_context_alignment=True,
            alignment_layer=alignment_layer,
            alignment_heads=alignment_heads,
            src_lengths=src_lengths,
            return_all_hiddens=return_all_hiddens)
        decoder_out_tensor = decoder_out_re[1]['last_layer']
        decoder_padding = decoder_out_re[1]['self_attn_padding_mask']
        # expose the re-run decoder states as a pseudo encoder output for the
        # second (source-reconstruction) decoder
        decoder_kvs = {
            'encoder_out': [decoder_out_tensor],
            'encoder_padding_mask': [decoder_padding],
        }
        src_out = self.second_decoder(
            prev_src_tokens,
            encoder_out=decoder_kvs,
            features_only=features_only,
            alignment_layer=alignment_layer,
            alignment_heads=alignment_heads,
            src_lengths=src_lengths,
            return_all_hiddens=return_all_hiddens)
        return decoder_out, src_out

    # Since get_normalized_probs is in the Fairseq Model which is not
    # scriptable, it is re-exported here to work with TorchScript.
    @torch.jit.export
    def get_normalized_probs(
            self,
            net_output: Tuple[Tensor, Optional[Dict[str,
                                                    List[Optional[Tensor]]]]],
            log_probs: bool,
            sample: Optional[Dict[str, Tensor]] = None):
        """Get normalized probabilities (or log probs) from a net's output."""
        return self.get_normalized_probs_scriptable(net_output, log_probs,
                                                    sample)

    def forward_decoder(self,
                        tokens,
                        encoder_outs: Dict[str, List[Tensor]],
                        incremental_states: Dict[str, Dict[str,
                                                           Optional[Tensor]]],
                        temperature: float = 1.0):
        attn: Optional[Tensor] = None
        decoder_out = self.decoder.forward(
            tokens,
            encoder_out=encoder_outs,
            incremental_state=incremental_states)
        decoder_len = len(decoder_out)
        if decoder_len > 1 and decoder_out[1] is not None:
            if isinstance(decoder_out[1], Tensor):
                attn = decoder_out[1]
            else:
                attn_holder = decoder_out[1]['attn']
                if isinstance(attn_holder, Tensor):
                    attn = attn_holder
                elif attn_holder is not None:
                    attn = attn_holder[0]
            if attn is not None:
                attn = attn[:, -1, :]
        decoder_out_tuple = (
            decoder_out[0][:, -1:, :].div_(temperature),
            None if decoder_len <= 1 else decoder_out[1],
        )
        probs = self.get_normalized_probs(
            decoder_out_tuple, log_probs=True, sample=None)
        probs = probs[:, -1, :]
        last_layer = decoder_out[1]['last_layer']
        return probs, attn, last_layer

    def forward_decoder_src(self,
                            tokens,
                            encoder_outs: Dict[str, List[Tensor]],
                            incremental_states: Dict[str,
                                                     Dict[str,
                                                          Optional[Tensor]]],
                            temperature: float = 1.0):
        attn: Optional[Tensor] = None
        decoder_out = self.second_decoder.forward(
            tokens,
            encoder_out=encoder_outs,
            incremental_state=incremental_states)
        decoder_len = len(decoder_out)
        if decoder_len > 1 and decoder_out[1] is not None:
            if isinstance(decoder_out[1], Tensor):
                attn = decoder_out[1]
            else:
                attn_holder = decoder_out[1]['attn']
                if isinstance(attn_holder, Tensor):
                    attn = attn_holder
                elif attn_holder is not None:
                    attn = attn_holder[0]
            if attn is not None:
                attn = attn[:, -1, :]
        decoder_out_tuple = (
            decoder_out[0][:, -1:, :].div_(temperature),
            None if decoder_len <= 1 else decoder_out[1],
        )
        probs = self.get_normalized_probs(
            decoder_out_tuple, log_probs=True, sample=None)
        probs = probs[:, -1, :]
        last_layer = decoder_out[1]['last_layer']
        return probs, attn, last_layer

    def forward_encoder(self, net_input: Dict[str, Tensor]):
        encoder_input = {
            k: v
            for k, v in net_input.items()
            if k != 'prev_output_tokens' and k != 'prev_src_tokens'
            and k != 'sources'
        }
        return self.encoder.forward_torchscript(encoder_input)

    def reorder_encoder_out(self, encoder_out: Dict[str, List[Tensor]],
                            new_order):
        """
        Reorder encoder output according to *new_order*.

        Args:
            encoder_out: output from the ``forward()`` method
            new_order (LongTensor): desired order

        Returns:
            *encoder_out* rearranged according to *new_order*
        """
        assert encoder_out is not None
        return self.encoder.reorder_encoder_out(encoder_out, new_order)

    def reorder_incremental_state(
            self, incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
            new_order):
        self.decoder.reorder_incremental_state_scripting(
            incremental_state, new_order)
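
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original model code: how a training
# step would typically drive CanmtModel. The batch keys and shapes below are
# assumptions for illustration; only the forward() signature above is real.
# ---------------------------------------------------------------------------
def _example_canmt_training_forward(model: CanmtModel,
                                    batch: Dict[str, Tensor]):
    # Assumed batch layout: src_tokens (B x S), src_lengths (B),
    # prev_output_tokens (B x T, shifted target) and prev_src_tokens
    # (B x S, shifted source fed to the reconstruction decoder).
    decoder_out, src_out = model(
        batch['src_tokens'],
        batch['src_lengths'],
        batch['prev_output_tokens'],
        batch['prev_src_tokens'],
    )
    # decoder_out[0] holds B x T x V_tgt translation logits; src_out[0] holds
    # B x S x V_src logits from the second decoder, which reconstructs the
    # source from the re-run target decoder states ('last_layer') and thereby
    # provides the competence/quality signal.
    return decoder_out[0], src_out[0]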
 #<J?c6k): ?Ic>B6l?K :L 1M I + d30@+@&A!AB+ +r*   r   c                   d  ^  \ rS rSrSrU 4S jrS r SS\\R                     4S jjr
   SS\\R                     S\S	\\R                     4S
 jjr   SS\\R                     S\S	\\R                     4S jjr\R                  R                  S\\\\	   4   4S j5       rS rS rSrU =r$ )rw   i  a-  
Transformer encoder consisting of *args.encoder_layers* layers. Each layer
is a :class:`TransformerEncoderLayer`.

Args:
    args (argparse.Namespace): parsed command-line arguments
    dictionary (~fairseq.data.Dictionary): encoding dictionary
    embed_tokens (torch.nn.Embedding): input embedding
c                   > Xl         [        TU ]	  U5        U R                  S[        R
                  " S/5      5        [        UR                  U R                  R                  S9U l
        UR                  U l        UR                  nUR                  U l        UR                  U l        X0l        UR                   (       a  SO["        R$                  " U5      U l        UR(                  (       d*  [+        UR                  UU R                  UR,                  S9OS U l        [1        USS5      n[1        USS5      (       a  [3        XES	9U l        OS U l        UR6                  (       dJ  UR8                  S
:  a:  [;        [<        R>                  " XDSS9UR8                  UR@                  5      U l!        OS U l!        U R                  S:  a  [E        U R                  S9U l#        O[<        RH                  " / 5      U l#        U RF                  RK                  [M        URN                  5       Vs/ s H  o`RQ                  U5      PM     sn5        [S        U RF                  5      U l*        URV                  (       a  [3        XES	9U l,        g S U l,        g s  snf )Nversion   module_namer   learnedr   Flayernorm_embeddingr   r   bias        p)-r!   r   r    register_bufferr   r   r   dropoutr'   r   dropout_moduleencoder_layerdropembedding_dimrr   rC   rx   no_scale_embeddingmathsqrtembed_scaleno_token_positional_embeddingsr   encoder_learned_posembed_positionsrP   r   r   adaptive_inputquant_noise_pqapply_quant_noise_nnLinearquant_noise_pq_block_sizer   r   layers
ModuleListextendrangerM   build_encoder_layerrK   
num_layersencoder_normalize_before
layer_norm)r&   r!   rn   rx   ro   r   ir'   s          r(   r    TransformerEncoder.__init__  s   	$YaS(9:,LLdnn&=&=?!%!7!7 ..	'33$($=$=!("&"9"93tyy@ 88  ))  00	
 ?C 	 x/4.66'0'JD$'+D$""t':':Q'>1		)U;##.. D  $D!!C'-0F0FGDK--+DK49$:M:M4N
4Nq$$T*4N
 	 dkk*(('	ADO"DO
s   'J
c                     [        U5      n[        USS5      nU(       a  [        USS5      n[        X$S9nU(       d  [        US[        5      OSn[	        X%S9nU$ Nr]   FrE   )offload_to_cpurF   r   rG   )r   rP   r   r=   r	   )r&   r!   layer
checkpointr  rF   s         r(   r   &TransformerEncoder.build_encoder_layerG  se    '-T#;UC
$T+@%HN&uLE  D.0JK#$ 	 %Cr*   token_embeddingc                 *   Uc  U R                  U5      nU R                  U-  =p4U R                  b  X@R                  U5      -   nU R                  b  U R                  U5      nU R	                  U5      nU R
                  b  U R                  U5      nX44$ ri   )rx   r   r   r   r   r   )r&   r   r  xembeds        r(   forward_embedding$TransformerEncoder.forward_embeddingS  s     ""//
;O$$66+,,Z88A##/((+A"'  #Axr*   r   r   token_embeddingsc                 &    U R                  XX45      $ )  
Args:
    src_tokens (LongTensor): tokens in the source language of shape
        `(batch, src_len)`
    src_lengths (torch.LongTensor): lengths of each source sentence of
        shape `(batch)`
    return_all_hiddens (bool, optional): also return all of the
        intermediate hidden states (default: False).
    token_embeddings (torch.Tensor, optional): precomputed embeddings
        default `None` will recompute embeddings

Returns:
    dict:
        - **encoder_out** (Tensor): the last encoder layer's output of
          shape `(src_len, batch, embed_dim)`
        - **encoder_padding_mask** (ByteTensor): the positions of
          padding elements of shape `(batch, src_len)`
        - **encoder_embedding** (Tensor): the (scaled) embedding lookup
          of shape `(batch, src_len, embed_dim)`
        - **encoder_states** (List[Tensor]): all intermediate
          hidden states of shape `(src_len, batch, embed_dim)`.
          Only populated if *return_all_hiddens* is True.
)forward_scriptable)r&   r   r   r   r  s        r(   r   TransformerEncoder.forwardc  s    < &&z'9M 	Mr*   c                 J   UR                  U R                  5      nUR                  R                  S:H  =(       d    UR	                  5       nU R                  UU5      u  pxU(       a&  USUR                  S5      R                  U5      -
  -  nUR                  SS5      n/ n	U(       a  U	R                  U5        U R                   H2  n
U
" UU(       a  UOSS9nU(       d  M  U	c   eU	R                  U5        M4     U R                  b  U R                  U5      nU/U/U/U	/ / S.$ )r  xlar   r   r   N)r   r   r   encoder_embeddingencoder_statesr   r   )eqrr   devicer/   anyr	  	unsqueezetype_as	transposeappendr   r   )r&   r   r   r   r  r   has_padsr  r  r  r  s              r(   r  %TransformerEncoder.forward_scriptable  s8   >  *}}T-=-=>$$))U2 
6J6N6N 7
#55j6F H Q-77;CCAFFGA KK1!!!$ [[E &:!%'A "!%111%%a( ! ??&"A 3%9$:"3!4,
 	
r*   r   c                 X   [        US   5      S:X  a  / nOUS   S   R                  SU5      /n[        US   5      S:X  a  / nOUS   S   R                  SU5      /n[        US   5      S:X  a  / nOUS   S   R                  SU5      /n[        US   5      S:X  a  / nOUS   S   R                  SU5      /n[        US   5      S:X  a  / nOUS   S   R                  SU5      /nUS   n[        U5      S:  a(  [        U5       H  u  pU
R                  SU5      X'   M     UUUUUUS	.$ )
r   r   r   r   r   r  r   r   r  r  )rK   index_select	enumerate)r&   r   r   new_encoder_outnew_encoder_padding_masknew_encoder_embeddingr   r   r  idxstates              r(   r   &TransformerEncoder.reorder_encoder_out  s    {=)*a/ O M*1-::1iHO {123q8')$ 23A6CCy"($ {./0A5$&! /03@@IN%! {<()Q.J \*1-;;AyIJ {=)*a/K ]+A.<<Q	JK %%56~"'7
&+&8&8I&F# 8 +$<!6,$&
 	
r*   c                     U R                   c  U R                  $ [        U R                  U R                   R                  5      $ )z.Maximum input length supported by the encoder.)r   rC   minmax_positionsr&   s    r(   r)   TransformerEncoder.max_positions  >    ',,,4,,''557 	7r*   c                    [        U R                  [        5      (       aZ  SR                  U5      nX1;   a  [	        SR                  U5      5        X	 [
        R                  " S5      USR                  U5      '   [        U R                  5       H1  nU R                  U   R                  USR                  X$5      5        M3     SR                  U5      n[        R                  " UR                  U[
        R                  " S/5      5      S   5      S:  a'  S	U l        S
U l        [
        R                  " S/5      X'   U$ )@Upgrade a (possibly old) state dict for new versions of fairseq.{}.embed_positions.weightszdeleting {0}r    {}.embed_positions._float_tensorz{}.layers.{}
{}.versionr      NF)r   r   r   formatprintr   FloatTensorr   r   r   upgrade_state_dict_namedr   itemgetr   r   	normalize)r&   
state_dictnameweights_keyr   version_keys         r(   r6  +TransformerEncoder.upgrade_state_dict_named  s   d**,IJJ6==dCK(n++K89+**1- 9@@ t'AKKN33N11$:< (
 #))$/::jnn[%,,s2CDQGH1L"DO"DN&+llA3&7J#r*   )r!   r   r   r   rx   r   r   r   r   rC   r9  r   rr   r   ri   )NFN)r   r   r   r   r   r    r   r   r   r   r	  r   r   r  r   r   r   r;   r   r   r)  r6  r   r   r   s   @r(   rw   rw     s    3#j
 EI+3ELL+A& /3#(37M ell+M !	M
 #5<<0MH /3#(37E
 ell+E
 !	E

 #5<<0E
N YY:
tCf4E/F :
 :
x7 r*   rw   c                     ^  \ rS rSrSr  SU 4S jjrS rSS jr        SS\\	\
\\   4      S\\	\
\	\
\\   4   4      S\S	\S
\\   S\\   S\\   S\4S jjr    SS\\	\
\\   4      S\\	\
\	\
\\   4   4      S	\S
\\   S\\   4
S jjr     SS\\	\
\\   4      S\\	\
\	\
\\   4   4      S	\S
\\   S\\   4
S jjrS rS rS rS rSrU =r$ )r}   i&  a  
Transformer decoder consisting of *args.decoder_layers* layers. Each layer
is a :class:`TransformerDecoderLayer`.

Args:
    args (argparse.Namespace): parsed command-line arguments
    dictionary (~fairseq.data.Dictionary): decoding dictionary
    embed_tokens (torch.nn.Embedding): output embedding
    no_encoder_attn (bool, optional): whether to attend to encoder outputs
        (default: False).
c           	        > Xl         [        T
U ]	  U5        U R                  S[        R
                  " S/5      5        [        R                  " S5      U l        [        UR                  U R                  R                  S9U l        UR                  U l        UR                  U l        UR                   nUR"                  nXpl        UR&                  U l        UR*                  U l        UR,                  U l        X0l        UR0                  (       a  SO[2        R4                  " U5      U l        UR8                  (       dJ  UR:                  S:  a:  [=        [>        R@                  " XwSS9UR:                  URB                  5      U l"        OS U l"        Xv:w  a
  [A        XgSS9OS U l#        URH                  (       d*  [K        U R,                  UU R*                  URL                  S9OS U l'        [Q        US	S5      n[Q        US
S5      (       a  [S        XxS9U l*        OS U l*        [Q        USS5      U l+        U R                  S:  a  [Y        U R                  S9U l-        O[>        R\                  " / 5      U l-        U RZ                  R_                  [a        URb                  5       V	s/ s H  n	U Re                  X5      PM     sn	5        [g        U RZ                  5      U l4        URj                  (       a!  [Q        USS5      (       d  [S        XxS9U l6        OS U l6        XpR(                  :w  a%  URn                  (       d  [A        XpR(                  SS9OS U l8        S U l9        XPl:        U Rt                  c  U Rw                  XU5        g g s  sn	f )Nr   r   r   r   r   Fr   r   r   r   r   cross_self_attentionr   r   no_decoder_final_norm)<r!   r   r    r   r   r   empty_future_maskr   r   r'   r   r   decoder_layerdropr\   share_input_output_embedr   rX   ro   decoder_output_dimoutput_embed_dimrr   rD   rx   r   r   r   r   r   r   r   r   r   r   r   project_in_dimr   r   decoder_learned_posr   rP   r   r   rA  r   r   r   r   r   rO   build_decoder_layerrK   r   decoder_normalize_beforer   tie_adaptive_weightsproject_out_dimadaptive_softmaxoutput_projectionbuild_output_projection)r&   r!   rn   rx   r|   rP  input_embed_dimro   r   _r'   s             r(   r    TransformerDecoder.__init__3  s    	$YaS(9:!KKN,LLdnn&=&=?!%!7!7(,(M(M%&44**	" $ 7 7'33$($=$=!("&"9"93tyy@ ""t':':Q'>1		)U;##.. D  $D + ?E:15 	 88  ))  00	
 ?C 	 x/4.66'0'JD$'+D$$+D2H,1%3! !!C'-0F0FGDK--+DK4../
/ $$T;/
 	 dkk*((-u26 26'	ADO"DO 111-- 933%@37 	
 !%!2!!)((<H *'
s   M<c           
         UR                   b{  [        [        U5      U R                  [        R
                  " UR                   [        S9UR                  UR                  (       a  UOS UR                  UR                  S9U l        GOU R                  (       a  [        R                  " U R                  R                   R"                  S   U R                  R                   R"                  S   SS9U l        U R                  R                   U R$                  l        On[        R                  " U R                  [        U5      SS9U l        [        R&                  R)                  U R$                  R                   SU R                  S-  S9  [+        US	S5      n[-        U5       H>  nU R.                  R1                  US-   UR2                  -  US-   -  [5        U5      5        M@     g )
N)r/   )r   adaptive_inputsfactortie_projr   r   Fr         ࿩meanstdbase_layers)adaptive_softmax_cutoffr   rK   rH  r   eval_str_listr<   adaptive_softmax_dropoutrM  adaptive_softmax_factortie_adaptive_projrO  rF  r   r   rx   weightshaperP  initnormal_rP   r   r   insertrO   r   )r&   r!   rn   rx   num_base_layersr   s         r(   rQ  *TransformerDecoder.build_output_projection  s   ''3$3J%%##D$@$@sK55,, !-2633//	%D! **%'YY!!((..q1!!((..q1&D"
 -1,=,=,D,DD"")%'YY%%s:U&DD"GGOO&&--))4/  1 "$q9'AKKa%4...Oa4GH$ (r*   c                     [        X5      n[        USS5      nU(       a  [        USS5      n[        X5S9nU(       d  [        US[        5      OSn[	        X6S9nU$ r   )r   rP   r   r=   r	   )r&   r!   r|   r  r  r  rF   s          r(   rK  &TransformerDecoder.build_decoder_layer  se    '>T#;UC
$T+@%HN&uLE  D.0JK#$ 	 %Cr*   r   r   r   r   r   r   r   r   c
           	      d    U R                  UUUUUUS9u  pU(       d  U R                  U
5      n
X4$ )a  
Args:
    prev_output_tokens (LongTensor): previous decoder outputs of shape
        `(batch, tgt_len)`, for teacher forcing
    encoder_out (optional): output from the encoder, used for
        encoder-side attention, should be of size T x B x C
    incremental_state (dict): dictionary used for storing state during
        :ref:`Incremental decoding`
    features_only (bool, optional): only return features without
        applying output layer (default: False).
    full_context_alignment (bool, optional): don't apply
        auto-regressive mask to self-attention (default: False).

Returns:
    tuple:
        - the decoder's output of shape `(batch, tgt_len, vocab)`
        - a dictionary with any model-specific outputs
)r   r   r   r   r   )extract_featuresoutput_layer)r&   r   r   r   r   r   r   r   r   r   r  extras               r(   r   TransformerDecoder.forward  sJ    @ ((#/#9++ ) 
 !!!$Axr*   c                 .    U R                  UUUUUU5      $ ri   )extract_features_scriptable)r&   r   r   r   r   r   r   s          r(   rm  #TransformerDecoder.extract_features  s)     //"
 	
r*   c                    UR                  5       u  pxUc  U R                  S-
  nSn	Sn
UbH  [        US   5      S:  a6  US   S   n	U	R                  5       S   U:X  d   SU SU	R                   35       eUb  [        US   5      S:  a  US   S   n
SnU R                  b  U R	                  XS9nUb  USS2S	S24   nUb  USS2S	S24   nU R
                  U R                  U5      -  nU R                  b  U R                  U5      nU R                  b  U R                  U5      nUb  X-  nU R                  b  U R                  U5      nU R                  U5      nUR                  SS5      nSnU R                  (       d.  UR                  U R                  5      R                  5       (       a  UR                  U R                  5      nSnU/n[!        U R"                  5       H  u  nnUc  U(       d  U R%                  U5      nOSnU" UU	U
UUU['        UU:H  5      ['        UU:H  5      S
9u  nnnUR)                  U5        Uc  Mc  UU:X  d  Mk  UR+                  5       R-                  U5      nM     Ub  Ub  USU nUR/                  SS9nU R0                  b  U R1                  U5      nUnUR                  SS5      nU R2                  b  U R3                  U5      nUU/UUUS.4$ )a  
Similar to *forward* but only return features.

Includes several features from "Jointly Learning to Align and
Translate with Transformer Models" (Garg et al., EMNLP 2019).

Args:
    full_context_alignment (bool, optional): don't apply
        auto-regressive mask to self-attention (default: False).
    alignment_layer (int, optional): return mean alignment over
        heads at this layer (default: last layer).
    alignment_heads (int, optional): only average alignment over
        this many heads (default: all heads).

Returns:
    tuple:
        - the decoder's features of shape `(batch, tgt_len, embed_dim)`
        - a dictionary with any model-specific outputs
Nr   r   r   zExpected enc.shape == (t, z	, c) got r   )r   r   )self_attn_maskr   	need_attnneed_head_weights)dim)r   inner_statesr   r   )sizer   rK   rd  r   r   rx   r   rI  r   r   r  rA  r  rr   r  r   r   buffered_future_maskr   r  r:   tor[  r   rN  )r&   r   r   r   r   r   r   bsslenencpadding_mask	positionsr  r   r   ry  r$  r  ru  
layer_attnself_attn_hiddenr   s                         r(   rr  .TransformerDecoder.extract_features_scriptable  sI   : &**,""oo1O $)-"s;}+E'F'Jm,Q/CHHJqMR' M3B4yLM'"s23(578(9&'=>qAL 	+,," - II(!3ArsF!;$%af-	 t001CDD'  #A*##A&A NA##/((+A" KK137$$(:(=(=  )""%#%)(%7%:%:4;K;K%L" "&01s#DKK0JC (1G!%!:!:1!=!%.3!-'= 68"&(>"@	/+Az+ "%#*@!''),,Q/% 1( *,_-999#D??&"A
KK1+$$Q'AF($&<	
 
 	
r*   c                 B    U R                   c  U R                  U5      $ U$ )z(Project features to the vocabulary size.)rO  rP  )r&   featuress     r(   rn  TransformerDecoder.output_layert  s$      ())(33Or*   c                     U R                   c  U R                  $ [        U R                  U R                   R                  5      $ )z/Maximum output length supported by the decoder.)r   rD   r(  r)  r*  s    r(   r)   TransformerDecoder.max_positions|  r,  r*   c                    UR                  S5      nU R                  R                  S5      S:X  dC  U R                  R                  UR                  :X  a  U R                  R                  S5      U:  aE  [        R                  " [
        R                  " [        R                  " X"/5      5      S5      U l        U R                  R                  U5      U l        U R                  S U2S U24   $ )Nr   r   )	rz  rD  r  r   triur   fill_with_neg_infzerosr|  )r&   tensorrx  s      r(   r{  'TransformerDecoder.buffered_future_mask  s    kk!n""1%*))00FMMA$$))!,s2 %

''SJ(?@!!ED --008  #tt,,r*   c                 2   [        U R                  [        5      (       a@  SR                  U5      nX1;   a  X	 [        R
                  " S5      USR                  U5      '   U S3U;  a>  U R                  (       a  U S3nOU S3nXA;   a  UU   X S3'   U R                  (       d  X	 [        U R                  5       H]  nSSS	S
.nUR                  5        H@  u  pxS H5  n	SR                  X%Xy5      n
X;   d  M  X   USR                  X%X5      '   X	 M7     MB     M_     SR                  U5      n[        R                  " UR                  U[        R                  " S/5      5      S   5      S::  a'  SU l        SU l        [        R                  " S/5      X'   U$ )r.  r/  r   r0  z.output_projection.weightz.embed_tokens.weightz
.embed_outself_attn_layer_normencoder_attn_layer_normfinal_layer_norm)012)rc  r   z{}.layers.{}.layer_norms.{}.{}z{}.layers.{}.{}.{}r1  r   r2  NF)r   r   r   r3  r   r5  rF  r   r   r   r   r7  r8  r   r   r9  )r&   r:  r;  r<  embed_out_keyr   layer_norm_mapoldnewmr   r=  s               r(   r6  +TransformerDecoder.upgrade_state_dict_named  s   d**,IJJ6==dCK(+**1- 9@@  V,-Z?,,#'&(< =#'&
 3*AK!B#
V#<=>44"1t'A ,.'N
 +002+A8??)A0: ##7#>#> S$- .&M , 3 (  #))$/::jnn[%,,s2CDQGHAM"DO"DN&+llA3&7J#r*   )rD  rO  r!   rA  rE  r   ro   r   r   rx   r   r   r   rD   r9  r   rH  rP  rr   rI  rN  r   rF  )FN)F)NNFFNNNF)NFNN)r   r   r   r   r   r    rQ  rK  r   r   r;   r   r   r   r<   r   r   rm  rr  rn  r)  r{  r6  r   r   r   s   @r(   r}   r}   &  s%   
" SIjB
 :>IM#',)-)-%)#(+ d3V#456+ $Dd33;F3C4D /E *E %F G	+ + !%+ "#+ "#+ c]+ !+d JN',)-)-
 d3V#456
 $Dd33;F3C4D /E *E %F G	
 !%
 "#
 "#
& JN',)-)-w
 d3V#456w
 $Dd33;F3C4D /E *E %F G	w
 !%w
 "#w
 "#w
r7-+ +r*   r}   c                     [         R                  " XUS9n[         R                  R                  UR                  SUS-  S9  [         R                  R                  UR                  U   S5        U$ )N)rr   r   rY  rZ  )r   rk   re  rf  rc  	constant_)rq   r   rr   r  s       r(   rk   rk     sS    
^LAGGOOAHH1-*=O>GGahh{+Q/Hr*   c                     [         R                  " XU5      n[         R                  R                  UR                  5        U(       a*  [         R                  R                  UR                  S5        U$ )Nr   )r   r   re  xavier_uniform_rc  r  r   )in_featuresout_featuresr   r  s       r(   r   r     sH    
		+T2AGGAHH%
!&&#&Hr*   c                 $   [        U SS 5      U l        [        U SS5      U l        [        U SS5      U l        [        U SS5      U l        [        U SS	5      U l        [        U S
S5      U l        [        U SS5      U l        [        U SS 5      U l        [        U SU R                  5      U l	        [        U SU R                  5      U l
        [        U SS5      U l        [        U SS	5      U l        [        U SS5      U l        [        U SS5      U l        [        U SS5      U l        [        U SS5      U l        [        U SS5      U l        [        U SS5      U l        [        U SS 5      U l        [        U SS5      U l        [        U SS5      U l        [        U SS5      U l        [        U S S5      U l        [        U S!S5      U l        [        U S"S5      U l        [        U S#S5      U l        [        U S$U R                  5      U l        [        U S%U R                  5      U l        [        U S&S5      U l        [        U S'S5      U l        [        U S(S5      U l        [        U S)S5      U l         [        U S*S5      U l!        U RB                  (       a  S+U l         [        U S,S 5      U l"        [        U S-S 5      U l#        [        U S.S5      U l$        [        U S/S5      U l%        [        U S0S5      U l&        [        U S1S	5      U l'        [        U S2S5      U l(        g )3NrZ   rW   i   encoder_ffn_embed_dimi   rM      encoder_attention_headsr7   r   Fr   rY   rX   decoder_ffn_embed_dimrO   decoder_attention_headsrL  rJ  attention_dropoutr   activation_dropoutactivation_fnrelur   g?r^  r`  r   r\   rU   r   r   r{   rA  rG  decoder_input_dimr   r   rM  r]   rE   TrJ   rN   r   rE  r   r   quant_noise_scalar))rP   rZ   rW   r  rM   r  r   r   rY   rX   r  rO   r  rL  rJ  r  r  r  r   r^  r`  r\   rU   r   r   r{   rA  rG  r  r   r   rM  r]   rE   rJ   rN   r   rE  r   r   r  r!   s    r(   rI   rI     s   %d,@$GD$T+>DD!(/F!MD!$(8!<D#*41JA#ND $+D2L,1%3D!&t-BEJD%d,@$GD$T+>%)%;%;=D!(/F)-)C)C"ED!$(8!<D#*41JA#ND $+D2L,1%3D!&t-BEJD$T+>DD%d,@#FD ?D4C0DL#*41J+/$1D $+D2L,-%/D!,30%-9D) '.De LD*1.+7D'!$(8%@D%d,@%HD '.De LD%d,@&*&<&<>D$T+>%)%;%;=D &d,@%HD&t-BEJD '.De LD")$0H*/#1D&t-BEJD&*#")$0H$"OD")$0H$"OD$T+>BD$T+>BD!$(8!<D%,T3N-.&0D"%d,@!DDr*   c                    [        U SS5      U l        [        U SS5      U l        [        U SS5      U l        [        U SS5      U l        [        U S	S
5      U l        [        U SS
5      U l        [        U SS5      U l        [        U SS5      U l        [        U SS5      U l	        [        U SS5      U l
        [        U SS5      U l        [        U SS5      U l        [        U SS5      U l        [        U 5        g )NrW   i   r  i   r     rM      r   TrL  rO   r   rX   r  r  r  g{Gz?r  r   )rP   rW   r  r  rM   r   rL  rO   rX   r  r  r  r  r   rI   r  s    r(   transformer_deepr    s    $T+>DD!(/F!MD#*41JB#OD !$(8"=D$+D2L,0%2D!$+D2L,0%2D!!$(8!<D$T+>DD!(/F!MD#*41JB#OD $T+>ED%d,@$GD4D1DLdr*   )T)0r   typingr   r   r   r   r   numpyr   torch.nnr   fairseqr   fairseq.distributedr	   fairseq.modelsr
   r   r   r   r   fairseq.modulesr   r   r   r   r   r   r   r   r   &fairseq.modules.checkpoint_activationsr   fairseq.modules.quant_noiser   r   r   rQ   rR   r<   r=   r   rw   r}   rk   r   rI   r   r*   r(   <module>r     s     3 3     )9 9O O O
 F I # #  X f++ f+R\ \~R2 Rj8Evr*   