
    9i                     R   S SK r S SKJr  S SKJr  S SKrS SKJr  S SK	J
r
Jr  S SKJr  S SKJr  S/r\R"                  " \R$                  \R$                  S	9 " S
 S\
5      5       r " S S\" SS5      5      rS rS rS rS rS-S jrS.S jrS/S jrS rS-S jrS-S jrS-S jr0 S4S jr 0 S4S jr!SSS0 S4S jr"S0S jr#S1S  jr$S! r%   S2S" jr&S# r' S3S$ jr(     S4S% jr)S& r*S' r+S( r,Sr-S) r. " S* S+\R^                  R`                  Rb                  Rd                  5      r3S, r4g)5    N)
namedtuple)Dict)Models)ModelTensor)MODELS)TasksCsanmtForTranslation)module_namec                     ^  \ rS rSrU 4S jr   SS\\\4   S\\\4   S\\\4   S\\\4   S\\\4   4
S	 jjr	S\\\4   S\\\4   4S
 jr
S rSS jrS rSS jr0 S4S jr  SS jrS rS r0 S4S jrS rSrU =r$ )r
      c                 d   > [         TU ]  " U/UQ70 UD6  X0l        [        U R                  5        g)z3
Args:
    params (dict): the model configuration.
N)super__init__paramsprint)self	model_dirargskwargs	__class__s       h/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/nlp/csanmt/translation.pyr   CsanmtForTranslation.__init__   s-    
 	4T4V4dkk    Ninputlabelprefix
prefix_hitreturnc                    UcY  [         R                  R                  R                  S5         U R	                  UUUS.U R
                  5      u  pVSSS5        WWS.$ U R                  X5      u  pxUUS.$ ! , (       d  f       N+= f)a|  return the result by the model

Args:
    input: the preprocessed input source sequence
    label: the ground truth target data for model training
    prefix: the preprocessed input target prefix sequence for interactive translation
    prefix_hit: the preprocessed target prefix subword vector for interactive translation

Returns:
    output_seqs: output sequence of target ids
NNmtModel)
input_widsprefix_widsr   )output_seqsoutput_scores)train_oploss)tfcompatv1variable_scopebeam_searchr   transformer_model_train_fn)	r   r   r   r   r   r$   r%   r&   r'   s	            r   __call__CsanmtForTranslation.__call__   s      =,,Z8-1-=-=&+'-&0 {{.$* 9  +!. 
 "<<UJNH$  98s   #A55
Bc                     g)z
Run the forward pass for a model.

Args:
    input (Dict[str, Tensor]): the dict of the model inputs for the forward method

Returns:
    Dict[str, Tensor]: output from the model forward pass
N )r   r   s     r   forwardCsanmtForTranslation.forwardA   s     	r   c                     US   nUS   n[         R                  R                  R                  SUS-  [         R                  S9nUS   (       a~  [         R                  R                  R                  S[         R                  R                  R                  S9   [         R                  R                  R                  S	X4/US
9nS S S 5        O\[         R                  R                  R                  S5         [         R                  R                  R                  S	X4/US
9nS S S 5        [         R                  R                  R                  SU/5      n[         R                  " U[         R                  S9S S 2S S24   n[         R                  " X/S5      n	[         R                  " [         R                  " U	S5      [         R                  S9n
U
S S 2S S24   n[         R                  " USS/SS//SS9n[         R                  " W[         R                  " U	[         R                  5      5      nXS-  -  nUS   S:X  a  [!        U5      n[         R"                  " U[         R$                  " US5      5      n[         R&                  R)                  X5      n[+        US5      nUS   S:  a   [         R&                  R-                  XS   S9n[/        UUX5      nX4$ ! , (       d  f       GN= f! , (       d  f       GN= f)Nsrc_vocab_sizehidden_size              ࿩dtypeshared_source_target_embeddingShared_EmbeddingreuseWeightsinitializerSource_Embeddingencoder_input_bias   r   tensorpaddingsconstant_values      ?position_info_typeabsolute   maskingresidual_dropoutrate)r(   r)   r*   random_normal_initializerfloat32r+   
AUTO_REUSEget_variable
zeros_likeint64concatcast	not_equalpadgatherint32add_timing_signalmultiplyexpand_dimsnnbias_addattention_biasdropouttransformer_encoder)r   featuresr   r5   r6   rA   src_embeddingsrc_biaseos_paddingsrc_seqsrc_maskshift_src_maskencoder_inputencoder_self_attention_biasencoder_outputs                  r   encoding_graph#CsanmtForTranslation.encoding_graphM   s    01]+iill<<d""** = 6 23,,&biill.E.E - G "		 9 9< + !: !-G G ,,-?@ "		 9 9< + !: !- A 99<<,,-A.9]< mmHBHH=a!eD))X3Q7772<<32::F!!SbS&)!!fq!f%
 		-"((1KL%c)9:&':5-m<MM$&NN>1$EG }?&4^Y&O#$%+EEMM+=$> * @M -]-H-;E ::QG G A@s   +K6+K.
K+.
K=c                    US   n[         R                  R                  R                  SUS-  [         R                  S9nS nUS   (       a  US   nSnO'US:X  a  US   nS	nOUS
:X  a  US   nSnO[        S5      e[         R                  R                  R                  U[         R                  R                  R                  S9   [         R                  R                  R                  SXt/US9nS S S 5        [         R                  " U[         R                  S9S S 2S S24   n	[         R                  " X/S5      n
[         R                  " [         R                  " U
S5      [         R                  S9nUS S 2S S24   n[         R                  " USS/SS//SS9n[         R                  " W[         R                  " U
[         R                   5      5      nXS-  -  n[         R"                  " U[         R$                  " US5      5      n['        US5      nUS   S:  a   [         R(                  R+                  XS   S9n[-        XUU5      nU$ ! , (       d  f       GNc= f)Nr6   r7   r8   r9   r;   r5   Shared_Semantic_EmbeddingsourceSource_Semantic_Embeddingtargettrg_vocab_sizeTarget_Semantic_Embeddingzerror: no right name specified.r=   r?   r@   rD   r   rE   rF   rJ   rM   rN   rO   rP   )r(   r)   r*   rR   rS   
ValueErrorr+   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r_   r`   rc   ra   rd   transformer_semantic_encoder)r   rf   r   namer6   rA   scope
vocab_sizeembedding_matri   	input_seq
input_maskshift_input_maskrm   rn   ro   s                   r   semantic_encoding_graph,CsanmtForTranslation.semantic_encoding_graph   s1   ]+iill<<d""** = 623 01J/EX 01J/EX 01J/E>??YY\\((biill6M6M(NIILL55J4+ 6 OM O mmHBHH=a!eDIIx5q9	WWR\\)Q7rzzJ
%a"f-66#!fq!f%
 		-BHH1MN%c)9:M$&NN3CQ$GI '55E5>'@# $%+EEMM+=$> * @M 68H ; ONs   +I
Ic                 l    SnSnUS   (       a  S nS nU R                  XUS9nU R                  X#US9nXg4$ )Nrt   rv   r;   r{   )r   )r   rf   labelsr   source_nametarget_namefeature_outputlabel_outputs           r    build_contrastive_training_graph5CsanmtForTranslation.build_contrastive_training_graph   s^    23KK55; 6 033 4 . ++r   c                    ^^^ US   mUS   mTS-  S:X  d   eUUU4S jn/ nU" X5      nUR                  U" X!5      5        [        U5      T:X  d   e[        R                  " USS9$ )Nnum_of_samplesetarM   r   c           
        > X-
  n[         R                  R                  [         R                  " U5      [         R                  " [         R                  " U5      SSS9-
  T	-   [         R
                  " [         R                  " U5      SSS9[         R                  " [         R                  " U5      SSS9-
  ST	-  -   5      n/ n[        TS-  5       H  nT
[         R                  R                  [         R                  " US9SU5      -  ST
-
  [         R                  R                  [         R                  " US9SS5      -  -   nXU-  -   nUR                  U5        M     U$ )NrM   T)input_tensoraxiskeepdimsr   r7         ?)r(   mathdivideabs
reduce_min
reduce_maxrangerandomnormalshapeappend)x_vectory_vectorbias_vectorw_rRiomegasampleKepsilonr   s           r   get_samples7CsanmtForTranslation.MGMC_sampling.<locals>.get_samples   s3   "-K''..{#bmm!#!41t'M M !#!41tM--!#!41tMM g+		C A16]bii..rxxk/JCQTUU3Y"))"2"2288+3NPSUX"YYZ!K$77 	 #
 Hr   r   )extendlenr(   rX   )	r   x_embeddingy_embeddingr   r   r   ALL_SAMPLESr   r   s	       `  @@r   MGMC_sampling"CsanmtForTranslation.MGMC_sampling   st    #$Um1uzz	( !+;;{@A;1$$$yy1--r   c                    US   nUS   n[         R                  R                  R                  SUS-  [         R                  S9nUS   (       a~  [         R                  R                  R                  S[         R                  R                  R                  S9   [         R                  R                  R                  S	Xg/US
9n	S S S 5        O\[         R                  R                  R                  S5         [         R                  R                  R                  S	Xg/US
9n	S S S 5        [         R                  " U[         R                  S9S S 2S S24   n
[         R                  " X:/S5      n[         R                  " [         R                  " US5      [         R                  S9nUS S 2S S24   n[         R                  " USS/SS//SS9n[         R                  " W	[         R                  " U[         R                  5      5      nXS-  -  n[!        [         R"                  " US9S   S5      n[         R                  " USS/SS/SS//S9S S 2S S2S S 24   nUS   S:X  a  [%        U5      n[         R&                  R)                  USSUS   -
  -
  S9n[+        UUUUS S UUS9u  nnU R-                  UU5      nUS   nSUS   -
  [         R                  " US-
  [         R                  S9-  n[         R.                  " [         R                  " U[         R                  5      UUUS9n[         R                  " UUR0                  5      n[         R&                  R3                  U[         R4                  " U5      S9U-  n[         R6                  " US9[         R6                  " US9-  nU$ ! , (       d  f       GN= f! , (       d  f       GN= f)Nrw   r6   r7   r8   r9   r;   r<   r=   r?   r@   Target_EmbeddingrD   r   rE   rF   rJ   r   causalrG   rH   rK   rL   r   rO   rP   
states_key
states_valembedding_augmentationr   
confidence)depthon_value	off_value)logitsr   r   )r(   r)   r*   rR   rS   r+   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   rc   r   r^   ra   rd   transformer_decoder
predictionone_hotr:   !softmax_cross_entropy_with_logitsstop_gradient
reduce_sum)r   ro   rn   r   r   r   rw   r6   rA   trg_embeddingri   trg_seqtrg_maskshift_trg_maskdecoder_inputdecoder_self_attention_biasdecoder_outputattention_weightsr   r   r   soft_targetsmaskxentropyr'   s                            r   decoding_graph#CsanmtForTranslation.decoding_graph   sw      01]+iill<<d""** = 6 23,,&biill.E.E - G "		 9 9< + !: !-G G ,,-?@ "		 9 9< + !: !- A
 mmF"((;ArrEB))V115772<<32::F!!SbS&)!!fq!f%
 		-"((1KLc))&4HH=)!,h'8# QFQFQF+CEEFQYP&':5-m<MS62D+E%E F & H -@''#9-)) 8,'6,//277Qbjj42 2	zzGGGRXX& 	!
 ww~v||455::""2"2<"@ ; BDHI}}(3bmm7  yG G A@s   +N'6+N9'
N69
Oc                 2   U R                  X5      u  pgS nUbk  Ubh  U R                  XEU5      n[        R                  " UUS   SS/5      n[        R                  " UUS   SSS/5      n[        R                  " X#S   S/5      nU R	                  UUUUUS9n	U	$ )Nr   rD   )r   )rp   r   r(   tiler   )
r   rf   r   r   feature_embeddinglabel_embeddingro   rn   r   r'   s
             r   build_training_graph)CsanmtForTranslation.build_training_graph)  s     7;6I6I73!%(_-H%)%7%7!F&<"  WW^&,-=&>1%EGN*,''+()1a3+5' WWV-=&>%BCF ""'#9 # ; r   c                 4  ^^^^ [        U R                  5      n[        R                  R                  R                  SUS9   U R                  S   mU R                  S   n[        R                  R                  R                  R                  5       n[        U5        [        U R                  S   XPR                  5      n[        R                  " U[        R                  S9nU R                  S   S:X  a4  [        R                  R                  R                  R                  U5      nOU R                  S   S	:X  a\  [        R                  R                  R                  R                  UU R                  S
   U R                  S   U R                  S   S9nOG[        R                  R                  R                  R                  S5        [         R#                  5         [%        WU R                  S   5      nS m[        R&                  " [        R(                  " TS9S   T:  UUU4S jU4S jS9m[        R&                  " [        R(                  " TS9S   T:  UUU4S jU4S jS9mTS:  a  [+        TT5      n	[+        TT5      n
OT/n	T/n
TS:  a  [-        T5       Vs/ s H  nSU-  PM
     nnOS/n/ n/ n[/        U5       GH  u  nn[        R0                  " U5         [        R                  R                  R                  [        R                  R                  R3                  5       US:  a  SOS S9   [        R4                  " SSU4-  5         U R7                  X   X   U R                  5      u  nnU R9                  X   X   U R                  UU5      nUR;                  U5        [        R                  R                  R<                  R?                  SRA                  U5      U5        [        R                  R                  RC                  5        Vs/ s H)  nSURD                  ;  d  M  S URD                  ;  d  M'  UPM+     nnURG                  UUSS!9nUR;                  U5        S S S 5        S S S 5        S S S 5        GM     [        RH                  " U5      [K        U5      -  n[M        U5      nUS":  a:  [O        [Q        U6 5      u  nn[        RR                  " UU5      u  nn[Q        UU5      nURU                  U[        R                  R                  R                  R                  5       S#9nUU4sS S S 5        $ s  snf s  snf ! , (       d  f       N= f! , (       d  f       N= f! , (       d  f       GM  = f! , (       d  f       g = f)$Nr!   r@   num_gpusgradient_clip_normlearning_rate)valuer:   	optimizersgdadam
adam_beta1
adam_beta2adam_epsilon)r   beta1beta2r   zoptimizer not supportedupdate_cyclec                 j    U n[        U5       H  n[        R                  " X /SS9nM     US U24   nU$ )Nr   r   )r   r(   rX   )inputsr   outputsr   s       r   	fill_gpusBCsanmtForTranslation.transformer_model_train_fn.<locals>.fill_gpuse  s=     xA ii(9BG )!)8)*.r   r   r   c                     > T" T T5      $ Nr1   )rf   r   r   s   r   <lambda>ACsanmtForTranslation.transformer_model_train_fn.<locals>.<lambda>n  s    	(H =r   c                     > T $ r   r1   )rf   s   r   r   r   o  s    r   )predtrue_fnfalse_fnc                     > T " TT5      $ r   r1   )r   r   r   s   r   r   r   r  s    	&( ;r   c                     > T $ r   r1   )r   s   r   r   r   s  s    r   zgpu:%dzcpu:0Tr=   z%s_%dGPUzmle_loss_{}Semantic_Embeddingmini_xlm_encoder)var_listcolocate_gradients_with_opsr7   )global_step)+get_initializerr   r(   r)   r*   r+   trainget_global_stepr   get_learning_rate_decayconvert_to_tensorrS   GradientDescentOptimizerAdamOptimizerlogginginfosysexitMultiStepOptimizercondr   shard_featuresr   	enumeratedeviceget_variable_scope
name_scoper   r   r   summaryscalarformattrainable_variablesr{   compute_gradientsadd_nr   average_gradientslistzipclip_by_global_normapply_gradients)r   rf   r   rA   r   r   r   r   optfeature_shardslabel_shardsddevicesmulti_gradssharded_lossesr   r  r   r   mle_lossvtrainable_vars_listgrads_and_vars
total_lossgradsr   _r&   r   r   s    ``                         @@r   r-   /CsanmtForTranslation.transformer_model_train_fnH  s   %dkk2YY\\(((M{{:.H!%-A!B)),,,,<<>K+ 4O,k;;HM00#2::7M {{;'50IILL..GG!#	[)V3IILL..<<"/++l3++l3 KK7	 = 9	 		$$))*CD
$YN0KLC wwXXH-a08;=)+H WWXXF+A.9;')F
 !|!/(!C-fh?"* &x!|16xAA8a<A")KN&w/	6YYv&		(C(C		779&'!ed )D )7 w%';<7;7\7\*-|8M4#'#<#<*-|*L$: '--h7		,,33M4H4H4K4<>
 (*yy||'G'G'I/'I!3166A  2!&& @ 'I , /
 *-)>)>$%88< *? *> $**>:) =)7&& 04 .1C4GGJ /{;N!C'"&sN';"<x11%9KLq!$UH!5**IILL..>>@ + BH Z'C NMf B*/ =<)7 )7&&u NMs   I#V	%U
42V	&AU6=U%	CUU7U	U$U3U%	;U6B=V	
V	U
U"U%	%
U3/U66
V 	V		
Vc                 j   US   nUS   nUS   (       ai  US   (       a  SOSn[         R                  R                  R                  USS9   [         R                  R                  R	                  S	5      nS S S 5        O0[         R                  R                  R	                  S
[
        U/5      n[         R                  " US9S S n[         R                  " USU/5      n[         R                  " UWSS9n[         R                  " U[         R                  " Xt//S5      5      nU$ ! , (       d  f       N= f)Nr6   rw   $shared_embedding_and_softmax_weightsr;   r<   r   Tr=   r?   Softmaxr   rE   transpose_br   )
r(   r)   r*   r+   rU   tgt_vocab_sizer   reshapematmulrX   )	r   r   r   r6   rw   embedding_scopeweightsr   r   s	            r   r   CsanmtForTranslation.prediction  s   ]+ 01894:05207I ,,_D,I)),,33I> JI iill//	1?0MOG~.s3NR4EF>7EFBIIu6F.G$KL JIs   *D$$
D2Fc	                 f   US   n	US   n
[         R                  R                  R                  SU
S-  [         R                  S9nUS   (       a~  [         R                  R                  R                  S[         R                  R                  R                  S9   [         R                  R                  R                  S	X/US
9nS S S 5        O\[         R                  R                  R                  S5         [         R                  R                  R                  S	X/US
9nS S S 5        [         R                  " W[         R                  " U[         R                  5      5      nXS-  -  n[        [         R                  " US9S   S5      n[         R                  " USS/SS/SS//S9S S 2S S2S S 24   nUS   S:X  a  [        U5      nU(       d  US S 2SS 2S S 24   nUS S 2S S 2SS 2S S 24   n[        UUUUUUUUS9u  nnU(       d  US S 2SS S 24   nUS S 2SS S 24   nOUnUnUS   (       ai  US   (       a  SOSn[         R                  R                  R                  USS9   [         R                  R                  R                  S	5      nS S S 5        O+[         R                  R                  R                  SX/5      n[         R                   " UWSS9n[         R"                  R%                  U5      nUUXV4$ ! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       Nl= f)Nrw   r6   r7   r8   r9   r;   r<   r=   r?   r@   r   rJ   r   rD   r   r   r   rE   rK   rL   r   r+  Tr,  r-  )r(   r)   r*   rR   rS   r+   rT   rU   r\   rY   r]   rc   r   r[   r^   r   r1  ra   log_softmax)r   ro   r   rn   r   r   r   r   	is_prefixrw   r6   rA   r   r   r   r   r   decoder_output_lastattention_weights_lastr2  r3  r   log_probs                          r   inference_func#CsanmtForTranslation.inference_func  s      01]+iill<<d""** = 6 23,,&biill.E.E - G "		 9 9< + !: !-G G ,,-?@ "		 9 9< + !: !- A
 		-"((1KLc))&4HH=)!,h'8# QFQFQF+CEEFQYP&':5-m<M)!RS!)4M*EaFHc1GM +N',?''!!#1-)) "0B":%6q"ax%@""0%6"89&,-M&N"Tf ,,_D,I)),,33I> JI iill//	1?0MOG.TJ55$$V,/GGcG G A@H JIs$   +K>6+L&*L">
L
L"
L0c                   ^ ^^&^'^(^)^*^+^,^-^.^/ TS   m(TS   m/TS   nTS   m.TS   m,TS   m-US   nSU;   a  US   nUS	   nOS nS n[         R                  " U5      S
   m'[        UT(5      n[        U5      nUb;  [         R                  " [        UT(5      [         R
                  5      n[        UT(5      nT R                  UT5      u  m)m*SnTS   (       a  S nT R                  UTUS9m+[        T.5       Vs/ s H  n[         R                  " T'S
U/S5      PM     n	n[        T.5       Vs/ s H  n[         R                  " T'S
U/S5      PM     n
n[        T.5       HW  nX   R                  [         R                  " S S U/5      5        X   R                  [         R                  " S S U/5      5        MY     [        T.5       Vs/ s H  n[        X   T(5      PM     n	n[        T.5       Vs/ s H  n[        X   T(5      PM     n
nSnUGb  [         R                  " U[         R                  " T'T(S/S
5      /SS9n[         R                  " U5      S   n[        U5      n[        T.5       Vs/ s H  n[        X   5      PM     nn[        T.5       Vs/ s H  n[        X   5      PM     nnT R                  T)T+T*UUUTSS9u  nnnn[        T.5       Vs/ s H  n[        UU   T'T(5      PM     n	n[        T.5       Vs/ s H  n[        UU   T'T(5      PM     n
n[        U5      n[         R                  " UUS S 2SS S 24   [         R                   " US S 2SS S 24   5      [         R"                  R$                  -  5      n[         R                  " US S 2S S24   [         R&                  " [         R                  " [         R(                  " US5      [         R
                  5      S5      /S5      n[        UT'T(5      n[         R                  " U[         R                  " T'T(S/S
5      /SS9nO[         R                  " T'T(S/S
5      n[         R*                  " S/[         R"                  R$                  /T(S-
  -  -   /5      n[         R,                  " UT'S/5      n[         R.                  " U5      nUn[         R                  " T'T(/[         R"                  R$                  5      n[         R                  " [         R                  " T'T(/S
5      [         R0                  5      n[3        UUU4X4UUU4S9nU'U(U)U*U+U,U.UU U/4
S jm&U,U-4S jnU&4S jn[         R*                  " S
SS9n[3        [         R                  " / SQ5      [         R                  " S S /5      [         R                  " S S /5      4[        T.5       Vs/ s H  n[         R                  " S S S U/5      PM     sn[        T.5       Vs/ s H  n[         R                  " S S S U/5      PM     sn4[         R                  " S S /5      [         R                  " / SQ5      [         R                  " S S /5      4S9n[         R4                  " UUUU/[         R                  " / 5      U/SSS9nUS   n U R6                  S
   n!U R6                  S   n"U R8                  S
   n#U R8                  S   n$U R8                  S   n%U!R                  S T(S /5        U$R                  S T(S /5        [         R:                  R<                  R                  [         R>                  " U#SS9U$U!5      n$[         R:                  R<                  R                  [         R>                  " U#SS9U%U"5      n%U$S S 2S S 2US-
  S24   n$U$U%4$ s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf )N	beam_sizerw   r6   num_decoder_layerslp_ratemax_decoded_trg_lenr"   r#   r   r   rt   r;   r   r7   rD   rM   r   rE   Tr   r7  r   statefinishc                 $
  >
 UR                   S S u  p#UR                  u  pE[        U5      n[        T05       Vs/ s H  n[        XG   5      PM     nn[        T05       Vs/ s H  n[        XW   5      PM     n	nT2R	                  T,T.T-UUU	T1SS9u  pp[        U
T*T+5      n
[        R                  " US5      U
-   n[        T05       Vs/ s H  n[        X   T*T+5      PM     nn[        T05       Vs/ s H  n[        X   T*T+5      PM     nn[        R                  " S[        R                  " U S-   [        R                  S9-   S-  T/5      nUU-  n[        R                  " UST+T3-  /5      n[        R                  R                  UST+-  S	9u  nnUT3-  nUT3-  n[        UU5      n[        R                  " US S 2S S 2S S24   [        R                  " US5      /SS
9n[        R                   " T*ST+-  S/[        R"                  " S[        R$                  5      5      n[        R                  " UU/SS
9n[        R&                  " US5      nU[        R                  " U[        R                  S9[        R                  R(                  -  -   n[        R                  R                  UT+5      u  nn[        UU5      n[        UU5      n[        UU5      n[        R                  " US S 2S S 2S S24   [        R                  " US5      /SS
9n[        R                   " T*T+S/[        R"                  " S[        R$                  5      5      n[        R                  " UU/SS
9n[        T05       Vs/ s H  n[        X   U5      PM     nn[        T05       Vs/ s H  n[        UU   U5      PM     nnUU-  n UR*                  u  n!n"n#US[        R                  " U[        R                  S9-
  [        R                  R(                  -  -   n$[        R                  " U!U/SS
9n%[        R                  " U#U$/SS
9n&[        R                  R                  U&T+5      u  n&n'[        U%U'5      n%[        R                   " T*T+S/[        R"                  " S[        R$                  5      5      n[        R                  " U"U/SS
9n"[        R                  " U"U/SS
9n([        U(U'5      n([-        UU U4UU4U%U(U&4S9n)U S-   U)4$ s  snf s  snf s  snf s  snf s  snf s  snf )NrM   FrB        @rD   r9         @rE   )kr   r   r   rC  )r   rD  merge_first_two_dimsr   r;  split_first_two_dimsr(   r`   powrY   rS   r0  ra   top_k	gather_2drX   fillconstantr]   equalminrE  BeamSearchState)4timerD  seqs	log_probsr   r   	flat_seqslayerflat_states_keyflat_states_valstep_log_probsstep_attn_weightsstep_states_keystep_states_valcurr_log_probsnext_states_keynext_states_vallength_penaltycurr_scores
top_scorestop_indicesbeam_indicessymbol_indicescandidate_seqspad_seqsflagsalive_scoresalive_indicesalive_symbols
alive_seqsalive_states_keyalive_states_valalive_log_probsprev_fin_flagsprev_fin_seqsprev_fin_scoresstep_fin_scores	fin_flags
fin_scoresfin_indicesfin_seqs	new_state
batch_sizer>  ro   rn   r   r@  r?  r   r   rw   s4                                             r   _beam_search_step;CsanmtForTranslation.beam_search.<locals>._beam_search_stepx  s,   #ll2A.OD%*[["J,T2I ##566E %Z%676   ##566E %Z%676  
 SWReRe+ Sf S!ON 2.*2;=N^^Iq9NJN
 ##56 7E %_%;Z%.06   ##56 7E %_%;Z%.06    VVrwwtaxrzz::cA7LN(>9K **[&()n*D%EGK ')eekk+Yk&O#J&.8L(>9N 't\:NYY1crc	*24N ww
A	M1=!{{1bhh79HYY'AJN HH^Q/E%RZZ))+-::>>): :L +-%%++lI*N'L-%nmDM%lMBM"47JAq#2#I&q13J ww
Iq9!{{1bhh79HJ#9BJ ##56 6E /0-@6    ##56 6E /%0-@6    +^;O >C\\:NM?(bgge2::66"**..,I IO 		>5"9BIO_#EANJ&(eekk*i&H#J!)[9Iww
Iq9!{{1bhh79HII}h&?aHMyy-!@qIH ;7H'"O\B')9:!8Z8I !8Y&&]&
b  s#   S4S9S>-T+TTc                   > UR                   S   nUR                  S   nUR                  S   n[        R                  " S[        R                  " T[        R
                  S9-   S-  T5      nUS S 2S4   U-  n[        R                  " U[        R                  " U[        R
                  S9-  SS9nS[        R                  " [        R                  " USS9[        R
                  S9-
  nU[        R
                  R                  U-  -  n[        R                  " [        R                  " UU5      S	9n	[        R                  " [        R                  " U T5      [        R                  " U	5      5      n
U
$ )
NrD   r   rM   rG  r9   rH  r   r   r   r   )r   rE  r(   rL  rY   rS   r   
reduce_anyrR  
reduce_allgreaterlogical_andlesslogical_not)tsrV  finished_flagsfinished_scoresmax_lpbest_alive_scoreworst_finished_scoreadd_maskbound_is_metr
  r@  rA  s              r   _is_finished6CsanmtForTranslation.beam_search.<locals>._is_finished  s1   IXXa[NhhqkOVV 32::FF#MF  )A7#%==,''.

;<$  RWW>Bjj" "H !BJJNNX$== ==ZZ(<(8:;L >>./1MOD Kr   c                    > T" X5      nU$ r   r1   )r  r  outsr|  s      r   _loop_fn2CsanmtForTranslation.beam_search.<locals>._loop_fn  s    $Q*DKr   rT  NNNF)r
  body	loop_varsshape_invariantsparallel_iterations	back_propr  ) r(   r   tile_to_beam_sizerJ  rY   r]   rp   r   r   rO  	set_shapeTensorShaperX   r;  rK  where	ones_likerS   rR  r`   argmaxrP  r   rV   boolrS  
while_loopr   rE  r)   r*   r  )0r   rf   r   r6   	src_inputr   r   r   rX  r   r   fixed_length	init_seqsrW  rY  rZ  r[  r\  r]  r^  rV  init_log_probsinit_scoresry  rw  rv  rD  r  r  rT  r  r   final_statern  rk  final_flags
final_seqsfinal_scoresr|  r{  r>  ro   rn   r   r@  rA  r?  rw   s0   ` `                                   @@@@@@@@@@r   r,    CsanmtForTranslation.beam_search  s   ;'	 01]+#$89#$%:;\*	H$m,F!,/JFJXXi(+
%i;	(3	WW.vyA288LF*:yAJ6:6I6Iv73323K55vK 6 1
 12
2 GGZK0#62 	 
 12
2 GGZK0#62 	 
 -.E''dK89;''dK89; / 12
2 j/;2 	 
 12
2 j/;2 	 
 		*i!;Q?@qJI88I.r2L,Y7I ##566E %Z%676   ##566E %Z%676  
 SWReRe+ Sf S ON- ##56 7E %_U%;Z%.06   ##56 7E %_U%;Z%.06   .j9JN1b!84^Ar1H56GII 		!SbS&!GGBIIi4bhh?E# 	I -Y
INI		BGGZA$>BC!MI Y :A>I KK" 0IM BBCD 	*aAmmN3WWj)4bjjnnE
GGBGGZ$;Q?I	~{;*x4
s	' s	'j	2	 {{16**NN#56NND$<0"..$BFBH 3IJ
 ##566E dD+>?6
 ##566E dD+>?6 NND$($* +,.NN;M,NNND$<023 --Um nnR02BC ! aj ''*
"))!,!((+ ''*
"))!,dIt45dIt45YY\\''MM{;Z
 yy||))MM{;\  1lQ&6r&9 9:
<''g




xs<   4$^'$^ ^'^'^#^(^-7^22$^7
%$^<)r   r  r   )g-q=NN)__name__
__module____qualname____firstlineno__r   r   strr   r  r.   r2   rp   r   r   r   r   r   r-   r   r;  r,   __static_attributes____classcell__r   s   @r   r
   r
      s     -1-126	!S&[)!S&[)! c6k*! "$,/	! <@V;L	!F
T#v+. 
4V3D 
0;d.`,.J !.2I^ 04-1>c(J2 !!&AHFq( q(r   c                       \ rS rSrSrg)rS  i6  r1   N)r  r  r  r  r  r1   r   r   rS  rS  6  s    r   rS  rC  c                     [         R                  " U SS9n S/U R                  R                  -  nXS'   [         R                  " X5      $ )z#Tiles a given tensor by beam_size. rD   r   )r(   r`   r   ndimsr   )rG   r>  	tile_dimss      r   r  r  ;  s>    ^^F+Ffll(((IaL776%%r   c                 ^   [         R                  " U 5      n U R                  R                  c  [         R                  " U 5      $ U R                  R	                  5       n[         R                  " U 5      n/ n[        [        U5      5       H  nX   nUc  X$   nUR                  U5        M!     U$ r   )r(   r  r   dimsas_listr   r   r   )xstatic_shapedynamic_shaperetr   dims         r   infer_shaper  D  s    
QAww||xx{77??$LHHQKM
C3|$%o;"C

3	 & Jr   c                 ^    [        U 5      nU/U/-   USS  -   n[        R                  " X5      $ NrD   )r  r(   r0  )rG   dim_0dim_1r   	new_shapes        r   rK  rK  W  s4    E5'!E!"I-I::f((r   c                     [        U 5      nUS==   US   -  ss'   UR                  S5        [        R                  " X5      $ )Nr   rD   )r  popr(   r0  )rG   r   s     r   rJ  rJ  ]  s8    E	!HaH	IIaL::f$$r   c                 &   [         R                  " U 5      S   n[         R                  " U5      S   n[         R                  " X4-  5      U-  n[         R                  " XSU/5      n[         R                  " XQ/SS9n[         R
                  " XUS9nU$ )zGather the 2nd dimension given indices
:param params: A tensor with shape [batch_size, M, ...]
:param indices: A tensor with shape [batch_size, N]
:param name: An optional string
:return: A tensor with shape [batch_size, N, ...]
r   rD   rE   r   r   )r(   r   r   r0  stack	gather_nd)r   indicesr{   r{  
range_size	batch_posoutputs          r   rN  rN  d  s{     &!!$J'"1%J01Z?I

9:&>?Ihh	+"5G\\&5FMr   c           
      $   [         R                  R                  R                  USU /US9   [	        U [
        [        45      (       d  U /n U  Vs/ s H  ofR                  5       S   PM     nn[        U 5      [        U5      :w  a  [        S5      e[         R                  " [         R                  " U S   5      S S U//SS9nU  V	s/ s H)  n	[         R                  " U	SU	R                  S   /5      PM+     n n	/ n
U(       au  [        U5      n[         R                  " U S5      n Xq/n[         R                  R                  R                  SU5      nU
R                  [         R                   " X5      5        Ow[#        [        U5      5       H_  nX}   U/nS	U-  n[         R                  R                  R                  X5      nU
R                  [         R                   " X   U5      5        Ma     [         R$                  " U
5      nU(       aL  U/n[         R                  R                  R                  S
U5      n[         R&                  R)                  X5      n[         R                  " X5      nUsS S S 5        $ s  snf s  sn	f ! , (       d  f       g = f)Nlinear)default_namevaluesr:   rE   z inputs and input_size unmatched!r   r   rD   matrixz	matrix_%dbias)r(   r)   r*   r+   
isinstancer  tuple	get_shaper   RuntimeErrorrX   r   r0  sumrU   r   r1  r   r  ra   rb   )r   output_sizer  rX   r:   r|   item
input_sizeoutput_shapeinpresultsr   r  r   r{   r  s                   r   r  r  u  s   		$	$& 
% 
H&4-00XF7=>vtnn&r*v
>v;#j/)ABByy"((6!9"5cr":[M!J&') CII&3"**S2syy}"56&IZJYYvq)F-EYY\\..x?FNN299V453z?+#4"Q224?ryyF;<	 , '" ME99<<,,VU;DUU^^F1FF1O
H 
H
 ? J
H 
Hs+   #JI7+AJ0I<6E7J7
J
Jc                    [         R                  R                  R                  USU /US9   U R	                  5       R                  5       S   n[         R                  R                  R                  SU/[         R                  " 5       S9n[         R                  R                  R                  SU/[         R                  " 5       S9n[         R                  " U SS5      n[         R                  " [         R                  " X-
  5      SS5      nX-
  [         R                  R                  R                  X-   5      -  n	X-  U-   sS S S 5        $ ! , (       d  f       g = f)N
layer_norm)r  r  r>   rE   layer_norm_scaler@   layer_norm_offsetT)r(   r)   r*   r+   r  r  rU   ones_initializerzeros_initializerreduce_meansquarersqrt)
r   r   r{   r>   channel_sizescaleoffsetmeanvariancenorm_inputss
             r   r  r    s   		$	$|VHE 
% 
K'')113B7		))++- * / **,,,. + 0 ~~fb$/>>"))FM":BE}		(:(:8;M(NN"V+#
K 
K 
Ks   DE
Ec                 ^    U(       a  US:X  a  U $ US:X  a  [        U 5      $ [        SU-  5      e)Nnoner  Unknown mode %s)r  ry   )r  modes     r   _layer_processr    s3    46>		!}*T122r   c                 f    U(       a'  US:  a!  [         R                  R                  USU-
  S9nX-   $ )Nr   rD   rP   )r(   ra   rd   )r  y	keep_probs      r   _residual_fnr    s,    Y_EEMM!!y/M25Lr   c                    US   nSUS   -
  n[         R                  R                  R                  USX/S9   [         R                  R                  R                  S5         [	        XSS5      n[         R
                  R                  U5      nS S S 5        U(       a'  US:  a!  [         R
                  R                  WSU-
  S	9n[         R                  R                  R                  S
5         [	        WUSS5      nS S S 5        U W-   sS S S 5        $ ! , (       d  f       N= f! , (       d  f       N-= f! , (       d  f       g = f)Nr6   r   relu_dropoutembedding_augmentation_layerr  r  input_layerTrD   rP   output_layerr(   r)   r*   r+   r  ra   relurd   )r  r   r   r{   r6   r  hiddenr  s           r   r  r    s    'Kf^,,I		$	$7. 
% 
0 YY\\((72tLFUUZZ'F 8 SUU]]6Y]@FYY\\((8FKt<F 9 6z
0 
0 87 98
0 
0s<   *D5"-DAD5.D$=D5
D!	D5$
D2	.D55
Ec                    US   nUS   nSUS   -
  n[         R                  R                  R                  USU /S9   [         R                  R                  R                  S5         [	        XSS5      n[         R
                  R                  U5      nS S S 5        U(       a'  US:  a!  [         R
                  R                  WS	U-
  S
9n[         R                  R                  R                  S5         [	        WUSS5      nS S S 5        WsS S S 5        $ ! , (       d  f       N= f! , (       d  f       N*= f! , (       d  f       g = f)Nfilter_sizer6   r   r  	ffn_layerr  r  TrD   rP   r  r   )r  r   r{   r  r6   r  r  r  s           r   transformer_ffn_layerr    s    'K'Kf^,,I		$	${A3 
% 
8YY\\((7AD$7FUUZZ'F 8 SUU]]6Y]@FYY\\((8FKt<F 9 
8 
877 98
8 
8s<   *D7'-DAD73D&	D7
D#	D7&
D4	0D77
Eencoderc                 &   US   nUS   nUS   nUS   nUS   n	US   n
US   nU n[         R                  " US5      n[         R                  R                  R	                  U[         R                  R                  R
                  S	9   [        U5       H  n[         R                  R                  R	                  S
U-  5         US   S:X  a  US   OS n[        [        X5      S UUUUUU	USS9
u  nn[        XSU-
  5      n[        X5      n[        [        X5      U5      n[        XSU-
  5      n[        X5      n[         R                  " X5      nS S S 5        M     [        X5      sS S S 5        $ ! , (       d  f       M  = f! , (       d  f       g = f)Nnum_encoder_layersr6   	num_headsrO   attention_dropoutlayer_preproclayer_postprocrM   r=   layer_%drK   relativemax_relative_disencoder_self_attentionr  r{   r   )r(   r`   r)   r*   r+   rT   r   multihead_attentionr  r  r  r_   )rm   rn   r   r   r{   r	  r6   r
  rO   r  r  r  r  rX  r  ows                    r   re   re     s   
   45'K{#I0123?+M,-NA>>$"D		$	$T1H1H	$	I-.E,,Z%-?@23zA $**<#=GK !*"14/%%51
31 !s-='=>"15)"14f> s-='=>"15KK(- A@ /2 a/5 
J	I@@ 
J	Is%   ;FBE0F0
E?:F
Fr   c                 t   US   nUS   nUS   nUS   nUS   n	US   n
US   nU n[         R                  " US5      n[         R                  R                  R	                  U[         R                  R                  R
                  S	9   [        U5       H  n[         R                  R                  R	                  S
U-  5         US   n[        [        X5      S UUUUUU	USS9
u  nn[        XSU-
  5      n[        X5      n[        [        X5      U5      n[        XSU-
  5      n[        X5      n[         R                  " X5      nS S S 5        M     [         R                  R                  R	                  S[         R                  R                  R
                  S	9   [         R                  " USS9[         R                  " USS9-  n[        [         R                  " USS9USS5      nS S S 5        [        WU
5      sS S S 5        $ ! , (       d  f       GM  = f! , (       d  f       N6= f! , (       d  f       g = f)Nnum_semantic_encoder_layersr6   r
  rO   r  r  r  rM   r=   r  r  r  r  r   pooling_layerrD   r  r   T)r(   r`   r)   r*   r+   rT   r   r  r  r  r  r_   r   r  )rm   rn   r   r   r{   r	  r6   r
  rO   r  r  r  r  rX  r  r  r  r  s                     r   rz   rz     s   
   =>'K{#I0123?+M,-NA>>$"D		$	$T1H1H	$	I-.E,,Z%-?@#)*<#= *"14/%%51
31 !s-='=>"15)"14f> s-='=>"15KK(+ A@ /0 YY\\((ryy||'>'> ) @]]Q(*,--!%A+//F vA.T4IF@ fm4C 
J	I@@.@ @3 
J	Is?   ;H)BHAH)AH(H)
H	H)
H&	"H))
H7decoderc	                    US   n	US   n
US   nUS   nUS   nUS   nUS   nU n[         R                  R                  R                  U[         R                  R                  R                  S9   [        U	5       GH  n[         R                  R                  R                  S	U-  5         US
   S:X  a  US   OS nUb#  [        U[        UU5      U5      n[        UU5      n[        [        UU5      S UU
U
U
UUUUUUSS9u  nn[        UUSU-
  5      n[        UU5      n[        [        UU5      UUU
U
U
UUUSS9
u  nn[        UUSU-
  5      n[        UU5      n[        [        UU5      U5      n[        UUSU-
  5      n[        UU5      nS S S 5        GM     [        UU5      W4sS S S 5        $ ! , (       d  f       GMF  = f! , (       d  f       g = f)Nr?  r6   r
  rO   r  r  r  r=   r  rK   r  r  decoder_self_attention)r   r   rX  r  r{   r   encdec_attentionr  )r(   r)   r*   r+   rT   r   r  r  r  r  r  )r   ro   r   encoder_decoder_attention_biasr   r   r   r   r{   r?  r6   r
  rO   r  r  r  r  rX  r  r  r  s                        r   r   r   H  s      45'K{#I0123?+M,-NA		$	$T1H1H	$	I-.E,,Z%-?@23zA $**<#=GK ! *54>*@*79:@BA 'q.9A*"1m4/%))%5131 !As-='=>"1n5*"1m4"2%%5+
-1 !As-='=>"1n5)"1m4f> As-='=>"1n5W A@ /\ a/2_ 
J	I@@ 
J	Is%   0<G,C"F2G2
G<	G
Gc           	         [         R                  " U 5      S   n[         R                  " U 5      S   n[         R                  " [         R                  " U5      [         R                  5      nUS-  n[
        R                  " [        U5      [        U5      -  5      [         R                  " U[         R                  5      S-
  -  nU[         R                  " [         R                  " [         R                  " U5      [         R                  5      U* -  5      -  n[         R                  " US5      [         R                  " US5      -  n	[         R                  " [         R                  " U	5      [         R                  " U	5      /SS9n
[         R                  " U
SS/S[         R                  R                  R!                  US5      //5      n
[         R"                  " U
SX4/5      n
U [         R                  " XR$                  5      -   $ )NrD   rM   r   r   )r(   r   rY   r   rS   r   logfloatexpr`   rX   sincosr[   r)   r*   modr0  r:   )r  min_timescalemax_timescalelengthchannelspositionnum_timescaleslog_timescale_incrementinv_timescalesscaled_timesignals              r   r^   r^     sp   XXa[^Fxx{1~Hwwrxx'4H]N 
%&})==	>"''.Z\ZdZdBehiBi	j "RVV
("**5"
"	#&$ $N
 	x#bnn^Q&GG YY{+RVVK-@AJFVVFaVa)9)9(A)F%GHIFZZF 56Frwwvww'''r   c                    Uc  [         R                  nU[         R                  :w  a  UR                  nUS:X  a7  U nSU-
  U-  n[         R                  " [         R                  " US5      S5      nOpUS:X  a\  U n[         R                  R                  [         R                  " Xf/S5      SS5      nUSU-
  -  n[         R                  " USSXf/5      nO[        SU-  5      e[         R                  " XS5      $ )NrN   r   rD   r   rE   r   r  )
r(   rS   rR  r`   linalg	band_partrO  r0  ry   rY   )r   r  infr:   r   r  r'  lower_triangles           r   rc   rc     s    }



iiyTzS nnR^^C3Q7		,,GGV$c*B3S>)*jjq!V45*T122773r   c                    UnU R                  5       R                  nU R                  R                  nUS   nUS S U/-   U(       a  XR-  OS /-   n[        R
                  " U [        R                  " [        R                  " U 5      S S US//S5      5      nUR                  U5        SUS-
  /[        SUS-
  5       Vs/ s H  oPM     sn-   U/-   n	[        R                  " Xy5      $ s  snf )NrE   r   rD   )
r  r  r   r  r(   r0  rX   r  r   	transpose)
r  r
  n	old_shaper  lastr  r  r   perms
             r   split_headsr:    s    A""IGGMMER=D#2!$T	t'DDI
**Q		288A;s#3aW"=qA
BCMM)uqy>a(;<(;1Q(;<<wFD<<"" =s   ?C)c           	      d   [         R                  R                  R                  USXU/S9   [         R                  " U 5      nUS   US   US   US   4u  pp[         R                  " U5      S   n[         R                  " U5      S   nUb	  US   US   pUc  [         R
                  " XS	S
9nO[         R
                  " XS	S
9n[         R                  " [         R                  " U / SQ5      XU	-  U/5      n [         R
                  " U [         R                  " W/ SQ5      5      n[         R                  " [         R                  " U/ SQ5      XX/5      nUU-   nUb  UU-  n[         R                  R                  USS9nUS:  a#  [         R                  R                  USU-
  5      nUc"  [         R
                  " UU5      U4sS S S 5        $ [         R
                  " UU5      n[         R                  " [         R                  " U/ SQ5      XU	-  U/5      n[         R
                  " UW5      n[         R                  " [         R                  " U/ SQ5      XX/5      nUU-   n[         R                  " [         R                  " U/ SQ5      XX/5      nUU4sS S S 5        $ ! , (       d  f       g = f)Ndot_product_attentionr  r   rD   rM      rpr_krpr_vTr-  )rM   r   rD   r=  )r   rM   rD   )rD   r   rM   r   r   r7   r   )r(   r)   r*   r+   r   r1  r0  r5  ra   softmaxrd   )qrI  r#  r  dropout_rater{   rprq_shapebshdlqdklkdvr>  r?  r   logits_part1logits_part2r3  outputs_part1outputs_part2r   s                          r   r<  r<    sa    
	$	$6ay 
% 
J((1+ WQZWQZGXXa[^XXa[^?w<*5 ;YYq6F99Qt<L

2<<<8R,.A99Q%'\\%2;&=>L ::\957GIL "L0FdNF%%---@-A#eemmGS<-?@G;99Wa('1G
J 
JJ IIgq1MjjWl3"Wb!#G IIgu5MJJ]I68HJM $m3GjjWi0 "G G#g
J 
J 
Js   FJ!C	J!!
J/c                 ^   [         R                  " U / SQ5      n U R                  5       R                  nUSS  u  p#US S U(       a  U(       a  X#-  OS /-   n[         R                  " U [         R
                  " [         R                  " U 5      S S S//S5      5      n U R                  U5        U $ )N)r   rM   rD   r=  rE   r   )r(   r5  r  r  r0  rX   r   r  )r  r7  abr  s        r   combine_headsrS    s    
Q%A""IRS>DA#21!%!==I


1bii!Sb!1B4 8!<=AKK	Hr   c                    [         R                  " U5         [         R                  " [         R                  " U5      SS/5      n[         R                  " [         R                  " U5      SS/5      nXV-
  nXs-   n[         R                  " US5      n[         R
                  " USU-  5      nXq* S 2S S 24   n[         R                  " X5      nUsS S S 5        $ ! , (       d  f       g = f)NrE   rD   r   rM   )r(   r  r0  r   maximumminimumr\   )	orginal_varlength_q	length_kvr  r{   idxsidysidsrC  s	            r   
create_rprr]    s    
 
t	zz"((9-Aw7zz"((9-2w7k$jja jja"223)*a- ii) 
		s   B6C
C%c           
         X6-  S:w  a  [        S[        U4-  5      eXF-  S:w  a  [        S[        U4-  5      e[        R                  R
                  R                  USX/S9   Uc.  [        U US-  U-   SSSS	9n[        R                  " XX4/SS
9u  pnO5[        XSSSS	9n[        UX4-   SSSS	9n[        R                  " XU/SS
9u  nnUb  [        R                  " X   U/SS
9=oU
'   U	b  [        R                  " X   U/SS
9=nX'   [        X5      n[        X5      n[        UU5      nX6-  nUUS-  -  n[        R                  " U5      S   n[        R                  " U5      S   nUc  Ub  [        R                  R
                  R                  SSU-  S-   X6-  /5      n[        R                  R
                  R                  SSU-  S-   XF-  /5      n[        UUUU5      n[        UUUU5      nUUS.n[        XUX'US9u  nnO[        XUX'5      u  nn[        U5      n[        R                   " US5      n[        UUSSSS	9nUU4sS S S 5        $ ! , (       d  f       g = f)Nr   zFKey size (%d) must be divisible by the number of attention heads (%d).zHValue size (%d) must be divisible by the number of attention heads (%d).r  r  rM   Tqkv_transform)r|   r   q_transformkv_transformrD   r8   r>  r?  )r>  r?  )rC  output_transform)ry   key_size
value_sizer(   r)   r*   r+   r  splitrX   r:  r   rU   r]  r<  rS  r  )queriesmemoriesr  	key_depthvalue_depthoutput_depthr
  rB  r   r   rX  r  r{   combinedrA  rI  r#  key_depth_per_headrX  rY  r>  r?  rC  r  r  s                            r   r  r    s    !T#$% 	% !#V9%&' 	' 
	$	$4& 
% 
( A+%'H hhi=AGGA! w4]KA'$&H 88H+&>QGDAq!$&IIz/@!.D1$MMA5!!$&IIz/@!.D1$MMA
!%%9%&3	%%88A;q>HHQKN	  0 <IILL--!..2I4JKMEIILL--!..2K4LMOEuh	;KLEuh	;KLE!E2C(q$#NDAq(q$EDAq!NN1a 1lD$6HI!tk
( 
( 
(s   #G(I
I#c                    U S   S:X  a0  U S   n[         R                  R                  R                  U* U5      $ U S   S:X  a-  [         R                  R                  R	                  SU S   5      $ U S   S:X  a,  [         R                  R                  R                  U S   SSS9$ U S   S	:X  a,  [         R                  R                  R                  U S   SSS9$ [        S
U S   -  5      e)NrA   uniforminitializer_scaler   r7   normal_unit_scalingfan_avg)r  distributionuniform_unit_scalingzUnrecognized initializer: %s)r(   r)   r*   random_uniform_initializerrR   variance_scaling_initializerry   )r   max_vals     r   r   r   m  s   m	),-yy||66xII		(	*yy||55+,. 	.		"7	7yy||88&'ih 9 P 	P		"8	8yy||88&'" 9 $ 	$
 7!-01 2 	2r   c                    US   S;   a~  [         R                  " U[         R                  S9n[         R                  " US   [         R                  S9nUS   S-  nU[         R                  " US-   US-  -  US-   S-  5      -  nX-  $ US   S	:X  a\  [         R                  R
                  R                  R                  [         R                  " U[         R                  S9US
   US   5      $ US   S:X  a  U $ [        S5      e)Nlearning_rate_decay)linear_warmup_rsqrt_decaynoamr9   warmup_stepsr6   r8   rD   g      piecewise_constantlearning_rate_boundarieslearning_rate_valuesr  zUnknown learning_rate_decay)
r(   rY   rS   rV  r)   r*   r   r|  r]   ry   )r   r   r   stepr{  
multiplierdecays          r   r  r    s    #$(MMww{"**5wwvn5RZZHM*D0
RZZlD6H(I)-D(8: :$$	%	&*>	>yy||!!44GGKrxx0-.7M0NP 	P 
%	&&	0677r   c                 "   / n[        U 6  H  n/ nU H-  u  pE[        R                  " US5      nUR                  U5        M/     [        R                  " SUS9n[        R
                  " US5      nUS   S   nXx4n	UR                  U	5        M     U$ )Nr   )r   r  rD   )r  r(   r`   r   rX   r  )
tower_gradsaverage_gradsgrad_and_varsr'  gr(  
expanded_ggradr#  grad_and_vars
             r   r  r    s    Mk*!DA1-JLL$ " yya.~~dA&!Qy\* + r   c                 l    [         c  U $ [         R                  U [         R                  R                  S9$ )N)compression)_ENGINE	allreduceCompressionfp16)rG   s    r   
all_reducer    s.    V1D1D1I1IJJr   c                      ^  \ rS rSr   S	U 4S jjrS rS\R                  R                  R                  R                  R                  SSS4S jrS
S jrSrU =r$ )r	  i  Fc                 x   > [         [        U ]  X45        Xl        X l        [
        R                  " USS9U l        g )Nr  r   )r   r	  r   
_optimizer_stepr(   r  _step_t)r   r   r  use_lockingr{   r   s        r   r   MultiStepOptimizer.__init__  s3    
 	 $0C#
++Dv>r   c                    [         R                  " U R                  S-   5         Uc  UsS S S 5        $ [        U[         R                  5      (       a  [         R
                  " U5      n[        U5      sS S S 5        $ ! , (       d  f       g = f)N
_Allreduce)r(   r  _namer  IndexedSlicesr  r  )r   rG   s     r   _all_reduceMultiStepOptimizer._all_reduce  s`    ]]4::45~ 65 &""2"233--f5f% 655s   A<?A<<
B
Nc                 d  ^ ^ T R                   R                  XX4XV5      n[        [        U6 5      u  pT R                  S:X  a5  U V	s/ s H  n	T R                  U	5      PM     nn	[        [        X5      5      $ [        US S9n
T R                  T R                  S:X  a  SOSSU
S9n/ n[        X5       H  u  pT R                  UST R                  5      m[        U[        R                  5      (       a6  [        R                  " TUR                  UR                  T R                   S9mO [        R"                  " TUT R                   S9mU4S	 jnUU 4S
 jn[        R$                  " [        R&                  " US5      UU5      nUR)                  U5        M     [        [        X5      5      $ s  sn	f )NrD   c                     U R                   $ r   r   )r  s    r   r   6MultiStepOptimizer.compute_gradients.<locals>.<lambda>  s    r   )keyr   iter)initial_valuer{   colocate_withgrad_accr  c                     > T $ r   r1   )r  s   r   	_acc_grad7MultiStepOptimizer.compute_gradients.<locals>._acc_grad  s    r   c                  @   > TR                  T TR                  -  5      $ r   )r  r  )r  r   s   r   	_avg_grad7MultiStepOptimizer.compute_gradients.<locals>._avg_grad  s    ''4::(=>>r   )r  r  r  r  r  r  rR  _create_non_slot_variable_zeros_slotr  r  r(   r  scatter_addr  r  _use_locking
assign_addr
  rQ  r   )r   r'   r   gate_gradientsaggregation_methodr   	grad_lossr%  r'  r  	first_variter_var	new_gradsr  varr  r  r  s   `                @r   r  $MultiStepOptimizer.compute_gradients  s    ::N'4 sN34 ::?278%QT%%a(%E8E,--&67	11#zzQ!A# 2 %
 	U-ID''ZDH$ 0 011>>LLKK $ 1 1	3 ==d0A0AC ? 77288Ha0)YGDT") ., C	,--C 9s   F-c                 P  ^ ^^^	^
 T R                   S:X  a  T R                  R                  UTTS9$ [        [	        U6 5      u  m	m
U	4S jnUU	UU U
4S jnT R                  S[        R                  " 5       5      n[        R                  " [        R                  " US5      XT5      n[        R                  " U/5         UR                  [        R                  " US-   T R                  5      T R                  S9nS S S 5        [        R                  " UW/6 $ ! , (       d  f       N#= f)NrD   r   c                  *   > [         R                  " T 6 $ r   )r(   group)r'  s   r   _pass_gradients;MultiStepOptimizer.apply_gradients.<locals>._pass_gradients  s    88U##r   c            	        > TR                   R                  [        TT	5      TT5      n [        R                  " U /5         / nT	 HR  nTR                  US5      nUR                  UR                  [        R                  " U5      TR                  S95        MT     [        R                  " U6 nS S S 5        [        R                  " U W/6 $ ! , (       d  f       N#= f)Nr  r  )r  r  r  r(   control_dependenciesget_slotr   assignrV   r  r  )
opzero_opsr  r  zero_opr   r'  r{   r   r   s
        r   _apply_gradients<MultiStepOptimizer.apply_gradients.<locals>._apply_gradients  s    00E8$k49B((".#C#}}S*=HOO MM(3(,(9(9 ( ;< $ ((H- / 88b']++ /.s    A.C
Cr  r   r  )r  r  r  r  r  _get_non_slot_variabler(   get_default_graphr
  rQ  r  r  r$  r  r  r  )r   r%  r   r{   r  r  r  	update_opiter_opr'  r   s   ` ``     @@r   r  "MultiStepOptimizer.apply_gradients  s    ::???22$ 3 8 8 sN34x	$	, 	, ..vr7K7K7MNGGHHXq!#3F	 $$i[1oox!|T\\2 -- & /G 2
 xx)W-.. 21s   ==D
D%)r  r  r  )rD   Fr	  r  )r  r  r  r  r   r  r(   r)   r*   r   	OptimizerGATE_OPr  r  r  r  r  s   @r   r	  r	    sS     "*	?& $()+););)E)E)M)M-16;$(0.d!/ !/r   r	  c                   ^^ [         R                  " U 5      n [         R                  " U 5      S   m/ n[         R                  " S5         [	        T5       Hq  nUR                  [         R                  " [         R                  " [         R                  R                  R                  TT5      U5      UU4S jUU4S j5      5        Ms     S S S 5        [         R                  " XSS9$ ! , (       d  f       N#= f)Nr   z/cpu:0c                     > T T-  S-   $ r  r1   r{  num_datashardss   r   r    shard_features.<locals>.<lambda>#  s    J.$@1$Dr   c                     > T T-  $ r   r1   r  s   r   r   r  $  s
    J.8r   r   )r(   r  r   r  r   r   r
  r  r)   r*   r$  re  )r  r  size_splitsr   r{  s    `  @r   r  r    s    
QA!QJK	8	~&AJJ		((^DD8	:; ' 
 88A++ 
	s   
BC((
C6r   )TNN)gư>NN)r   g     @)g    eN)r7   NN)r]  )NNr   NN)5r   collectionsr   typingr   
tensorflowr(   modelscope.metainfor   modelscope.models.baser   r   modelscope.models.builderr   modelscope.utils.constantr	   __all__register_moduletranslationr
   rS  r  r  rK  rJ  rN  r  r  r  r  r  r  re   rz   r   r^   rc   r:  r<  rS  r]  r  r   r  r  r  r  r)   r*   r   r  r	  r  r1   r   r   <module>r     sm    "   & 0 , +!
" ))v7I7IJ`(5 `( K`(F	$&CD	
&&)%"(V,*3(,  "&	(0\ )+&8	/5l $(#'/3!&@3F(*0
#" (+#":$z !	2 $(#')-!L^2(8$ Ki/++55 i/X,r   