
    im              	          % S SK JrJr  S SKJr  S SKJrJrJrJ	r	J
r
Jr  S SKrS SKJr  S SKJs  Jr  S SKJr  S SKJr  S SKJrJr  S SKJr   " S	 S
\SS9r " S S\R8                  \5      r0 r\\\
\   4   \ S'   S\4S jr! " S S\R8                  5      r" " S S\R8                  5      r#S r$ " S S\5      r% " S S\R8                  5      r& " S S\"5      r'\!" S5       " S S\5      5       r(S+S \RR                  S!\*S"\+S#\RR                  4S$ jjr,\!" S%5       " S& S'\5      5       r-\!" S(5       " S) S*\5      5       r.g),    )ABCabstractmethod)Enum)AnyDictOptionalTupleType	TypedDictN)
LoRALinear)	ModelArgs)RMSNormRMSNormGated)Ropec                       \ rS rSr% Sr\\R                     \S'   \\R                     \S'   \\R                     \S'   \\R                     \S'   \\	   \S'   \\	   \S'   \\R                     \S	'   S
rg)ForwardOptions   zROptional parameters for `Attention.forward` (compative with Python 3.10 and plus).mask	input_posfreqs_cos_overridefreqs_sin_overridein_cache_stateout_cache_statelast_valid_token_pos N)__name__
__module____qualname____firstlineno____doc__r   torchTensor__annotations__r   
LongTensor__static_attributes__r       i/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/executorch/examples/models/llama/attention.pyr   r      sf    \
5<<
  %% .. ..SM!c]""5#3#344r&   r   F)totalc                       \ rS rSrSr\S\R                  S\R                  S\R                  S\S\	\R                  \
\   4   4
S j5       rS	rg
)	Attention   zDAbstract base class for attention mechanisms with unified interface.x	freqs_cos	freqs_sinkwargsreturnc                     g)a  Forward pass for attention mechanism.

Args:
    x: Input tensor of shape (batch_size, seq_len, dim)
    freqs_cos, freqs_sin: Rotary position embedding frequencies
    ForwardOptions: grouped optional args

Returns:
    Tuple of (output tensor, updated cache state)
Nr   selfr,   r-   r.   r/   s        r'   forwardAttention.forward   s    $ 	r&   r   N)r   r   r   r   r    r   r!   r"   r   r	   r   r   r4   r%   r   r&   r'   r*   r*      se    N<< << <<	
 ! 
u||Xc]*	+ r&   r*   ATTENTION_REGISTRYnamec                 2   ^  S[         [           4U 4S jjnU$ )z'Decorator to register attention classesclsc                 6   > U [         TR                  5       '   U $ N)r6   lower)r9   r7   s    r'   	decorator%register_attention.<locals>.decorator8   s    +.4::<(
r&   )r
   r*   )r7   r=   s   ` r'   register_attentionr?   5   s    tI  r&   c                      ^  \ rS rSr\R
                  4S\S\S\S\S\4
U 4S jjjrS\R                  S	\R                  S
\R                  S\
\R                  \R                  4   4S jrSrU =r$ )KVCache?   max_batch_sizemax_context_lengthn_headshead_dimenable_dynamic_shapec           	         > [         TU ]  5         X l        XX$4nXl        X0l        X@l        XPl        U R                  S[        R                  " XvSS95        U R                  S[        R                  " XvSS95        g )Nk_cachecpudtypedevicev_cache)
super__init__rD   rC   rE   rF   rG   register_bufferr!   zeros)	r3   rC   rD   rE   rF   rG   rL   cache_shape	__class__s	           r'   rP   KVCache.__init__@   su     	"4%0BM, $8!u{{;EJ	
 	u{{;EJ	
r&   r   k_valv_valr0   c                    U R                   (       a  US   R                  5       n[        R                  " U5        [        R                  " X@R
                  :  5        SnUR                  U5      n[        R                  " U5      U-   nU R                  R                  XWU5        U R                  R                  XWU5        U R                  U R                  4$ U R                  nU R                  n	X(S S 2S S 2U4'   X9S S 2S S 2U4'   X4$ )Nr      )rG   itemr!   _check_is_size_checkrD   sizearangerI   index_copy_rN   )
r3   r   rV   rW   	start_posdim_to_slice
seq_lengthindicesk_outv_outs
             r'   updateKVCache.updateX   s     $$!!))+I  +LL%<%<<=LL1Jll:.:GLL$$\EBLL$$\EB<<--LLELLE%*!Q	/"%*!Q	/"<r&   )rG   rF   rC   rD   rE   )r   r   r   r   r!   float32intboolrP   r"   r	   rf   r%   __classcell__rT   s   @r'   rA   rA   ?   s     mm

  
 	

 
 #
 
0  .3ll CH<< 	u||U\\)	*   r&   rA   c                      ^  \ rS rSrS\S\S\S\4U 4S jjrS\R                  S\R                  S	\R                  S
\R                  S\R                  S\R                  4S jrSr	U =r
$ )SDPAo   dimrF   n_repmax_context_lenc                 R   > [         TU ]  5         Xl        X l        X0l        X@l        g r;   )rO   rP   rp   rF   rq   rr   )r3   rp   rF   rq   rr   rT   s        r'   rP   SDPA.__init__p   s%     	 
.r&   r   qkvr   r0   c                     UR                  U R                  SS9nUR                  U R                  SS9n[        R                  " X#XGSS9nUR	                  SS5      R                  XVU R                  5      $ )N   rp           	attn_mask	dropout_prY   )repeat_interleaverq   Fscaled_dot_product_attention	transposereshaperp   )	r3   r   ru   rv   rw   bszseqlenr   ys	            r'   r4   SDPA.forward}   sl     

2

2**1cR{{1a ((dhh??r&   )rp   rF   rr   rq   )r   r   r   r   ri   rP   r!   r"   r4   r%   rk   rl   s   @r'   rn   rn   o   s    // / 	/
 /@<<@ <<@ <<	@
 <<@ ll@ 
@ @r&   rn   c                     U[         R                  " U[         R                  S9R                  SS5      -   nX@-
  nU S:  US:  -  XQ:  -  n[         R                  " US:H  S[        S5      5      nU$ )NrL   ry   r   Tz-inf)r!   r^   longviewwherefloat)cache_positionswindow_sizer`   seq_lenpos_qdeltar}   s          r'   #_create_causal_mask_for_ring_bufferr      sn     WEJJ?DDRKKE#E A%%1*59LMII-q%-@Ir&   c                       \ rS rSrSrSrSrg)CacheUpdateStrategy   
RingBufferInvalidr   N)r   r   r   r   RING_BUFFERINVALIDr%   r   r&   r'   r   r      s    KGr&   r   c                   p   ^  \ rS rSr\R
                  4S\S\4U 4S jjjrS\R                  4S jr
SrU =r$ )CachePositionsManager   rD   cache_update_strategyc           	         > [         TU ]  5         U[        R                  :X  d   S5       eXl        U R                  S[        R                  " U R                  [        R                  SS95        g )NzOnly RingBuffer is supportedr   rJ   rK   )	rO   rP   r   r   rD   rQ   r!   rR   r   )r3   rD   r   rT   s      r'   rP   CachePositionsManager.__init__   s`    
 	!%8%D%DD	*)	*D"4KK00ER	
r&   r   c                 .   US   R                  5       n[        R                  " U5        [        R                  " U[        R                  S9U-   nX@R
                  -  n[        R                  " U R
                  4S[        R                  S9n[        R                  " U R
                  [        R                  S9n[        R                  " Xs:  U R                  U5      nU R                  R                  U5        U R                  R                  SXT5        U$ )a  
Calculate indices, into k_cache, v_cache, where to put k_val tensor.
Given the input_pos and length of k_val at sequence dim, the input pos may
have to wrap around if it is smaller than the cache capacity.
If it is larger than the cache capacity then just pick the last
self.max_context_length entries.

Additionally:
Update the cache positions buffer with the new indices.
Given the cache positions in sequence dim, indicated by indices,
we can just update cache_positions buffer using orig_indices.
For example
Given cache capacity of 4 and update of length 3 with start_pos = 2
will have following values
indices = [2, 3, 0]
orig_indices = [2, 3, 4]
So cache_positions after the update will be [4, 1, 2, 3]
Note cache_positions[1] = 1 that is from previous write to the cache.
The corner case here is cache positions before cache rolls over.
For example when start_pos = 0 and update is of length 2, then we have
filled positions 0 and 1 in the buffer, while the rest are invalid. In this case
we have
indices = [0, 1]
orig_indices = [0, 1]
But if we have cache_positins = [0, 1, 0, 0] that is not valid. Hence we have
to make sure that invalid positions have a sentinel value of - 1.
r   r   r   )rZ   r!   r[   r^   r   rD   fullr   r   copy_r_   )	r3   r   r   r`   orig_indicesrc   full_tarange_tensorr   s	            r'   &calculate_positions_and_update_indices<CachePositionsManager.calculate_positions_and_update_indices   s    8 aL%%'	Y'||G5::>J!8!88T446%**MT%<%<EJJO++%t';';V
 	""?3((GBr&   )rD   )r   r   r   r   r   r   ri   rP   r!   r"   r   r%   rk   rl   s   @r'   r   r      s@     6I5T5T

  3
 
) ) )r&   r   c                      ^  \ rS rSr\R
                  4S\S\S\S\S\4
U 4S jjjrS r	S	\R                  S
\R                  S\R                  S\\R                  \R                  4   4S jrSrU =r$ )RingKVCache   rC   rD   rE   rF   rG   c                    > X l          [        TU ]	  UUS-  UUUU5        [        U R                  5      U l        SU l        g )NrY   T)r   rO   rP   r   rD   cache_positions_manageris_ring_buffer)r3   rC   rD   rE   rF   rG   rL   rT   s          r'   rP   RingKVCache.__init__   sT     .,	Z 	" 	
 (=T=T=T'U$"r&   c                 Z    U R                   R                  n[        X0R                  X5      $ r;   )r   r   r   r   )r3   r`   r   r   s       r'   "create_causal_mask_for_ring_buffer.RingKVCache.create_causal_mask_for_ring_buffer  s+    66FF2--y
 	
r&   r   rV   rW   r0   c                 N   UR                  S5      nX@R                  R                  S5      ::  d'   SU SU R                  R                  S5       S35       eU R                  R                  X5      nU R                  (       ab  US   R                  5       n[        R                  " U5        U R                  R                  SXR5        U R                  R                  SXS5        O*X R                  S S 2S S 2U4'   X0R                  S S 2S S 2U4'   U R                  U R                  4$ )NrY   zUpdate sequence length(z3) for kv cache must be smaller than the cache size()r   )
r]   rI   r   r   rG   rZ   r!   r[   r_   rN   )r3   r   rV   rW   r   rc   r`   s          r'   rf   RingKVCache.update%  s	    **Q-,,++
 
 	y$WI-`aeamamararstau`vvwx	y 
 ..UU
 $$!!))+I  +LL$$Q7LL$$Q7*/LLAw'*/LLAw'||T\\))r&   )r   r   r   )r   r   r   r   r!   rh   ri   rj   rP   r   r"   r	   rf   r%   rk   rl   s   @r'   r   r      s     mm@#@#  @# 	@#
 @# #@# @#D
**.3ll*CH<<*	u||U\\)	** *r&   r   mhac                      ^  \ rS rSrS\S\S\S\4U 4S jjrS\	R                  S\	R                  S	\	R                  S
\S\\	R                  \\   4   4
S jrSrU =r$ )AttentionMHAi=  argslayer_idrope_kwargsc           	      8  > [         T
U ]  5         UR                  U l        UR                  U l        UR                  c  U R                  OUR                  U l        U R                  U R                  -  S:X  d   eSnU R                  U-  U l        U R                  U-  U l        U R
                  U R                  -  U l        UR                  U l        UR                  U l	        UR                  U l
        UR                  U l        UR                  U l        UR                  U l        UR                  U l        UR                  U l        UR                   U l        U R                  U R                  -  U R                  (       a  SOS-  nU R                  (       a`  U R                  nU R                  n[#        UUR$                  UR&                  S9U l        [#        UUR$                  UR&                  S9U l        UR,                  bF  SUR,                  ;   a6  [/        UR                  UUR0                  UR2                  SUR                  S9O([4        R6                  " U R                  X`R                  S	9U l        UR,                  b]  S
UR,                  ;   aM  [/        UR                  UR                  UR                  -  UR0                  UR2                  SUR                  S9O@[4        R6                  " U R                  U R                  U R                  -  U R                  S	9U l        UR,                  b]  SUR,                  ;   aM  [/        UR                  UR                  UR                  -  UR0                  UR2                  SUR                  S9O@[4        R6                  " U R                  U R                  U R                  -  U R                  S	9U l        UR,                  bm  SUR,                  ;   d  SUR,                  ;   aM  [/        UR                  UR                  -  UR                  UR0                  UR2                  SUR                  S9O6[4        R6                  " U R                  U R                  -  U R                  SS	9U l        X l         X0l!        [D        RF                  " [D        RH                  " U R                  U R                  [D        RJ                  SS95      n	U RM                  SU	SS9  U R                  (       a  [O        UR                  UR                  U R                  U R                  UR                   5      U l(        [S        U R
                  U R                  -  U R                  U R                  U R                  S9U l)        gg)z
Multi-head attention layer.

Args:
    args (ModelArgs): Model configuration parameters.
    layer_id (int): Layer index.
    rope (Rope): Rotary position embedding module.
Nr   ry   rY   )epsadd_unit_offsetq_projr{   )in_dimout_dimrankalphadropoutuse_biasbiask_projv_projoutput_projo_projFrJ   rK   r   )
persistent)rp   rF   rq   rr   )*rO   rP   use_kv_cacherE   
n_kv_headsn_local_headsn_local_kv_headsrq   rF   rC   rr   rp   attention_qkv_biasuse_qk_normqk_norm_before_rope
use_q_gaterG   r   norm_epsrms_norm_add_unit_offset	q_norm_fn	k_norm_fntarget_modulesr   r
lora_alphannLinearwqwkwvwor   r   r!   trilonesrj   rQ   rA   kv_cachern   )r3   r   r   r   r   model_parallel_size	q_out_dim
q_norm_dim
k_norm_dimcausal_maskrT   s             r'   rP   AttentionMHA.__init__?  sl    	 --||*.//*A$,,t||doo-222!\\-@@ $3F F''4+@+@@
"11#3388"&"9"9++#'#;#; //$($=$=!LL4==0AaP	JJ$MM $ = =DN
 %MM $ = =DN "".8t?R?R3R xx!VVoo00 488Y5L5LM 	* "".8t?R?R3R xx$--7VVoo00 $//DMM9@W@W 	. "".8t?R?R3R xx$--7VVoo00 $//DMM9@W@W 	. "".!4!44DDWDW8W ||dmm3VVoo00 4<<$--7N 	  !	jjJJ$$$$jj	
 	V[UC###$$))DM &&6jj $ 4 4	DI r&   r,   r-   r.   r/   r0   c           	      r   UR                  S5      nUR                  u  pgnU R                  (       ad  U R                  U5      R	                  XgU R
                  U R                  S-  5      n	[        R                  " U	SSS9u  pUR                  XgS5      nO8U R                  U5      R	                  XgU R
                  U R                  5      n
S nU R                  U5      U R                  U5      pUR	                  XgU R                  U R                  5      nUR	                  XgU R                  U R                  5      nU R                  (       a3  U R                  (       a"  U R                  U
5      n
U R!                  U5      nU R"                  R%                  XX#5      u  pU
R'                  SS5      n
UR'                  SS5      nUR'                  SS5      nU R                  (       a3  U R                  (       d"  U R                  U
5      n
U R!                  U5      nU R(                  (       GaH  Uc   eU R*                  (       ay  US   R-                  5       n[        R.                  " U5        [        R0                  " XR2                  :  5        U
R5                  S5      nU R6                  R9                  SX5      nOU R6                  U   nU R:                  R=                  X\U5      u  p[?        U R:                  SS5      (       a-  U R:                  RA                  US   R-                  5       U5      nU RC                  XZXXgU5      nUb  U[        RD                  " U5      -  nU RG                  U5      S 4$ URI                  U RJ                  SS9nURI                  U RJ                  SS9n[M        U S	5      (       d   eU R6                  S U2S U24   n[N        RP                  " XUUS
S9nUR'                  SS5      R                  XgS5      nUb  U[        RD                  " U5      -  nU RG                  U5      nUS 4$ )Nr   rY   r   rz   ry   r   r   Fr   r{   r|   ))getshaper   r   r   r   rF   r!   chunkr   r   r   r   r   r   r   r   r   r4   r   r   rG   rZ   r[   r\   rr   r]   r   narrowr   rf   getattrr   rn   sigmoidr   r   rq   hasattrr   r   )r3   r,   r-   r.   r/   r   r   r   _
q_and_gateru   gaterv   rw   r`   rb   r}   outputr   s                      r'   r4   AttentionMHA.forward  sy    JJ{+	Q??T//1BJ kk*aR8GA<<R0D
T-?-?OADwwqz4771:1FF3 5 5t}}EFF3 5 5t}}E 8 8q!Aq!A yy  y<KK1KK1KK1D$<$<q!Aq!A(((((%bM..0	$$Y/Y)=)==>VVAY
 II,,Q	F	 !IIi0	==''	a8DAt}}&6>> MMLLaL%%'	 YYyQ3	JF%--"55776?D(( 

2

2tV$$$$yy&'6')*//a4SVW!!!Q'//R@emmD11Ft|r&   )rn   r   rp   rG   rF   r   r   r   rC   rr   rE   r   r   r   rq   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   ri   r   r   rP   r!   r"   r   r	   r   r4   r%   rk   rl   s   @r'   r   r   =  s    DD D 	D
 DLM<<M <<M <<	M
 !M 
u||Xc]*	+M Mr&   r   r,   rp   r   r0   c                 \    [         R                  " X -  R                  USS9U-   5      nX-  $ )NT)rp   keepdim)r!   rsqrtsum)r,   rp   r   inv_norms       r'   _l2normr     s,    {{AE;;3;=CDH<r&   gated_deltanetc                     ^  \ rS rSrSrS\S\S\S\4U 4S jjr	S\
\R                     S	\S
S4S jrS\R                  S
\R                  4S jrS\R                  S\R                  S\R                  S\R                  S\R                  S
\R                  4S jrS\R                  S\R                  S\R                  S\S
\\R                  \
\   4   4
S jrSrU =r$ )AttentionGatedDeltaNeti  zDQwen3.5 linear-attention (Gated DeltaNet) block with internal state.r   r   r   r   c                   > [         TU ]  5         AUR                  U l        UR                  U l        X l        UR                  c   eUR                  c   eUR                  c   eUR                  c   eUR                  U l
        UR                  U l        UR                  U l        UR                  U l        U R                  U R                  -  U l        U R                  U R                  -  U l        UR                   U l        U R                  U R                  -  S:X  d   S5       eU R                  U R                  -  U l        U R                  S-  U R                  -   U l        [(        R*                  " U R                  U R&                  SS9U l        [(        R*                  " U R                  U R                  SS9U l        [(        R*                  " U R                  U R                  SS9U l        [(        R*                  " U R                  U R                  SS9U l        [(        R4                  " U R&                  U R&                  U R"                  U R&                  SSS9U l        [(        R8                  " [:        R<                  " U R                  5      5      U l        [:        R@                  " U R                  5      RC                  SS5      n[(        R8                  " [:        RD                  " U5      5      U l#        [I        U R                  URJ                  S9U l&        [(        R*                  " U R                  U R                  SS9U l'        U RQ                  S	[:        RR                  " U R                  U R&                  U R"                  [:        RT                  S
S95        U RQ                  S[:        RR                  " U R                  U R                  U R                  U R                  [:        RT                  S
S95        g )Nr   zAlinear_num_value_heads must be divisible by linear_num_key_heads.rY   Fr   )in_channelsout_channelskernel_sizegroupsr   padding   )r   
conv_staterJ   rK   recurrent_state)+rO   rP   rp   hidden_sizerC   r   linear_num_key_headslinear_num_value_headslinear_key_head_dimlinear_value_head_dimnum_k_headsnum_v_heads
head_k_dim
head_v_dimkey_dim	value_dimlinear_conv_kernel_dimconv_kernel_sizehead_repeatconv_dimr   r   in_proj_qkv	in_proj_z	in_proj_b	in_proj_aConv1dconv1d	Parameterr!   r   dt_biasemptyuniform_logA_logr   r   normout_projrQ   rR   rh   )r3   r   r   r   r   ArT   s         r'   rP   AttentionGatedDeltaNet.__init__  s    	88"11 ((444**666''333))55544662244)9)994+;+;; $ ; ; t///14	ON	O4++t/?/??q(4>>999T%5%5t}}5Q4#3#3T^^%P4#3#3T5E5EER4#3#3T5E5EERii--==
 ||EJJt/?/?$@AKK(()221b9\\%))A,/
 dmmD			$..$2B2BOKK##%%mm		
 	KK##  mm
	
r&   r   
batch_sizer0   Nc                 X   Uc;  U R                   S U R                  5         U R                  S U R                  5         g US   S:H  R                  U R                   R                  5      nSU-
  nU R                   S U R                  U5        U R                  S U R                  U5        g )Nr         ?)r
  zero_r  torL   mul_)r3   r   r+  resetkeeps        r'   _maybe_reset_state)AttentionGatedDeltaNet._maybe_reset_statei  s     OOKZ(..0  *-3351"&&t'<'<=U{$))$/[j)..t4r&   	mixed_qkvc                    UR                   u  p#nUR                  SS5      nU R                  R                   S   n[        R                  " U R                  S U U/SS9nUS S 2S S 2U* S 24   n[        R
                  " 5          U R                  S U R                  UR                  U R                  R                  5      5        S S S 5        [        R                  " UU R                  R                  U R                  R                  SU R                  S9n[        R                  " US S 2S S 2U* S 24   5      R                  UR                  5      nUR                  SS5      R                  5       $ ! , (       d  f       N= f)Nry   rY   r   rz   r   )r  r  )r   r   r
  r!   catno_gradr   r/  rL   r   r   weightr   r  silu
contiguous)	r3   r5  r+  r   r   	state_lenhidden_states_newnew_conv_stateouts	            r'   _apply_causal_conv)AttentionGatedDeltaNet._apply_causal_convu  s.   !*
Q''1-	OO))"-	!IIt{
'CY&OUWX*1a)+<=]]_OOKZ(..~/@/@AVAV/WX hhKKKK==
 ffSAxy)*--ioo>}}Q"--// _s   AE33
Fquerykeyvaluegbetac           	      `   UR                   n[        USSS9n[        USSS9nXX5U4 Vs/ s H@  nUR                  SS5      R                  5       R	                  [
        R                  5      PMB     snu  pp5nUR                  u  ppUR                  S   nSUR                  S   S-  -  nX-  n[
        R                  " UU	U
UUR                  UR                   S9nU R                  S U R	                  UR                   5      n[        U
5       H  nUS S 2S S 2U4   nUS S 2S S 2U4   nUS S 2S S 2U4   nUS S 2S S 2U4   R                  5       R                  S5      R                  S5      nUS S 2S S 2U4   R                  S5      nUU-  nUUR                  S5      -  R                  S	S
9nUU-
  U-  nUUR                  S5      UR                  S	5      -  -   nUUR                  S5      -  R                  S	S
9US S 2S S 2U4'   M     [
        R                  " 5          U R                  S U R!                  UR	                  U R                  R                   5      5        S S S 5        UR                  SS5      R                  5       R	                  U5      $ s  snf ! , (       d  f       NB= f)Nr   ư>)rp   r   ry   rY   r-  g      ?)rM   rL   rz   )rL   r   r   r;  r/  r!   rh   r   rR   rM   r  rangeexp	unsqueezer   r8  r   )r3   rB  rC  rD  rE  rF  initial_dtyper,   r+  	num_headssequence_length
k_head_dim
v_head_dimscalecore_attn_outlast_recurrent_stateiq_tk_tv_tg_tbeta_tkv_memr   s                           r'   _recurrent_gated_delta_rule2AttentionGatedDeltaNet._recurrent_gated_delta_rule  s    240crt, %q1&
1 KK1((*--emm<1&
"E
 >AYY:
[[_
u{{2#-.<<++
  $33KZ@CCEKKP'A1a.CaAg,C1a.CAq!G*..",,R0::2>C!Q'],,R0F#7##= *S]]2->>CCCKF6\V+E#7#--;#;$ $$  ';S]]2=N&N%S%S &T &M!Q'" (" ]]_  *-33$''(<(<(B(BC 
 &&q!,779<<]KKU&
J _s   AJ!AJ
J-r,   r-   r.   r/   c                    AAUR                  S5      nUR                  u  pgnX`R                  ::  d   SU SU R                   S35       eU R                  XV5        U R	                  U5      n	U R                  U5      R                  XgSU R                  5      n
U R                  U5      nU R                  U5      nU R                  U	5      n	[        R                  " U	U R                  U R                  U R                  /SS9u  pnUR                  XgSU R                  5      nUR                  XgSU R                  5      nUR                  XgSU R                  5      nU R                   S:  a4  UR#                  U R                   SS9nUR#                  U R                   SS9nUR%                  5       nU R&                  R)                  5       R+                  5       * [,        R.                  " UR)                  5       U R0                  -   5      -  nU R3                  XUUU5      nUR                  SU R                  5      nU
R                  SU R                  5      n
U R5                  UU
5      nUR                  XgS5      nU R7                  U5      S 4$ )	Nr   zbatch_size (z) exceeds max_batch_size (r   r   rz   ry   rY   )r   r   rC   r3  r  r  r   r  r  r  r@  r!   splitr  r  r  r  r   r   r&  r   rK  r   softplusr"  r\  r'  r(  )r3   r,   r-   r.   r/   r   r+  r   r   r5  zbarB  rC  rD  rF  rE  rS  s                      r'   r4   AttentionGatedDeltaNet.forward  s<    JJ{+	!"
Q---	W*%?@S@S?TTUV	W- 		6$$Q'	NN1%%j2tONN1NN1++I6	!KK\\4<<8
E
 j2tGkk*r4??Cj2tGa++D,<,<!+DE''(8(8a'@Cyy{ZZ##%%

1779t||3K(LL88UAtT%--b$//BIIb$//*		-3%--j2F}}]+T11r&   )r&  r   r  r  r"  r  r  r  r  r  r  r  r  r  r   rC   r'  r  r  r(  r  )r   r   r   r   r    r   ri   r   r   rP   r   r!   r"   r3  r@  r\  r   r	   r4   r%   rk   rl   s   @r'   r  r    s/   NI
I
 I
 	I

 I
V
5!%,,/
5=@
5	
50ELL 0U\\ 0&7L||7L \\7L ||	7L
 <<7L ll7L 
7Lr-2<<-2 <<-2 <<	-2
 !-2 
u||Xc]*	+-2 -2r&   r  skipc                      ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  S\S\\R                  \	\
   4   4
S jrS	rU =r$ )
AttentionSkipi  c                 "   > [         TU ]  5         g r;   )rO   rP   )r3   r   r/   rT   s      r'   rP   AttentionSkip.__init__  s    r&   r,   r-   r.   r/   r0   c                 
    US 4$ r;   r   r2   s        r'   r4   AttentionSkip.forward  s     $wr&   r   )r   r   r   r   rP   r!   r"   r   r	   r   r   r4   r%   rk   rl   s   @r'   rg  rg    s`    << << <<	
 ! 
u||Xc]*	+ r&   rg  )r   rH  )/abcr   r   enumr   typingr   r   r   r	   r
   r   r!   torch.nnr   torch.nn.functional
functionalr   %executorch.examples.models.llama.lorar   +executorch.examples.models.llama.model_argsr   %executorch.examples.models.llama.normr   r   %executorch.examples.models.llama.roper   r   Moduler*   r6   strr#   r?   rA   rn   r   r   r   r   r   r"   ri   r   r   r  rg  r   r&   r'   <module>rx     sZ   # #  > >     < A G 6	5Ye 	5		3 0 24 Dd9o-. 3S - bii - `@299 @D$ 
9BII 9x^*' ^*B ET9 T Tnu|| #  %,, 
 $%S2Y S2 &S2l FI  r&   