
    iD                     z   S SK r S SKJr  S SKrS SKJs  Jr  S SKJ	r
  S SKJr  S SKJrJr  S SKJ	r	  \ R                   " \5      r " S S\R&                  5      r " S S	\R&                  5      rS
\R                  R&                  SS4S jrS
\R                  R&                  S\R                  R&                  4S jrg)    N)Optional)KVCache)nn)	_MaskType_sdpa_or_flex_attentionc            !         ^  \ rS rSrSrSSSSSSSS.S\S	\S
\S\S\R                  S\R                  S\R                  S\R                  S\\R                     S\\R                     S\\R                     S\\	   S\S\
S\SS4 U 4S jjjrS\S\R                  S\SS4S jrS r S$SSS.S\R"                  S\\R"                     S \\   S!\\R"                     S\R"                  4
S" jjjrS#rU =r$ )%MultiHeadAttention   u  
NOTE: copied from Torchtune's mha.py. Should be mostly 1:1 except
that SDPA is factored out so that it can be swapped for more
efficient ExecuTorch-defined SDPA ops.

Multi-headed attention layer with support for grouped query
attention (GQA) introduced in https://arxiv.org/abs/2305.13245v1.

GQA is a version of multiheaded attention (MHA) which uses fewer
key/value heads than query heads by grouping n query heads for each
key and value head. Multi-Query Attention is an extreme
version where we have a single key and value head shared by all
query heads.

Following is an example of MHA, GQA and MQA with num_heads = 4

(credit for the documentation:
`litgpt.Config <https://github.com/Lightning-AI/litgpt/blob/eda1aaaf391fd689664f95487ab03dc137e213fd/litgpt/config.py>`_).


::

    ┌───┐┌───┐┌───┐┌───┐     ┌───┐    ┌───┐             ┌───┐
    │ v ││ v ││ v ││ v │     │ v │    │ v │             │ v │
    └───┘└───┘└───┘└───┘     └───┘    └───┘             └───┘
    │    │    │    │         │        │                 │
    ┌───┐┌───┐┌───┐┌───┐     ┌───┐    ┌───┐             ┌───┐
    │ k ││ k ││ k ││ k │     │ k │    │ k │             │ k │
    └───┘└───┘└───┘└───┘     └───┘    └───┘             └───┘
    │    │    │    │      ┌──┴──┐  ┌──┴──┐      ┌────┬──┴─┬────┐
    ┌───┐┌───┐┌───┐┌───┐  ┌───┐┌───┐┌───┐┌───┐  ┌───┐┌───┐┌───┐┌───┐
    │ q ││ q ││ q ││ q │  │ q ││ q ││ q ││ q │  │ q ││ q ││ q ││ q │
    └───┘└───┘└───┘└───┘  └───┘└───┘└───┘└───┘  └───┘└───┘└───┘└───┘
    ◀──────────────────▶  ◀──────────────────▶  ◀──────────────────▶
            MHA                    GQA                   MQA
    n_kv_heads =4          n_kv_heads=2           n_kv_heads=1

Args:
    embed_dim (int): embedding dimension for the model
    num_heads (int): number of query heads. For MHA this is also the
        number of heads for key and value
    num_kv_heads (int): number of key and value heads. User should ensure
        ``num_heads % num_kv_heads == 0``. For standard MHA set ``num_kv_heads == num_heads``,
        for GQA ``num_kv_heads < num_heads``, and for MQA set ``num_kv_heads == 1``.
    head_dim (int): dimension of each head, calculated by ``embed_dim // num_heads``.
    q_proj (nn.Module): projection layer for query.
    k_proj (nn.Module): projection layer for key.
    v_proj (nn.Module): projection layer for value.
    output_proj (nn.Module): projection layer for output.
    pos_embeddings (Optional[nn.Module]): positional embeddings layer, e.g. RotaryPositionalEmbeddings.
    q_norm (Optional[nn.Module]): normalization layer for query, e.g. RMSNorm. For decoding, this is applied
        before updating from kv_cache. This means it will only support token wide normalization and not
        batch or sequence wide normalization.
    k_norm (Optional[nn.Module]): normalization layer for key, must be set if q_norm is.
    kv_cache (Optional[KVCache]): KVCache object used to cache key and value
    max_seq_len (int): maximum sequence length supported by the model.
        This is needed to compute the RoPE Cache. Default: 4096.
    is_causal (bool): sets the default mask to causal when no mask is provided
    attn_dropout (float): dropout value passed onto the scaled_dot_product_attention function.
        Default value is 0.0.

Raises:
    ValueError: If ``num_heads % num_kv_heads != 0``
    ValueError: If ``embed_dim % num_heads != 0``
    ValueError: If ``attn_dropout < 0`` or ``attn_dropout > 1``
    ValueError: if q_norm is defined without k_norm or vice versa
Ni   T        )pos_embeddingsq_normk_normkv_cachemax_seq_len	is_causalattn_dropout	embed_dim	num_headsnum_kv_headshead_dimq_projk_projv_projoutput_projr   r   r   r   r   r   r   returnc          
        > [         TU ]  5         X#-  S:w  a  [        SU SU S35      eX-  S:w  a  [        SU SU S35      eUS:  d  US:  a  [        SU S	35      e[        U
5      [        U5      -  (       a  [        S
5      eX l        X0l        Xl        Xl        X@l        Xl	        Xl
        Xl        XPl        X`l        Xpl        Xl        Xl        Xl        Xl        ['        5       U l        [+        U R
                  U R                  U R                  U R,                  (       a  U R                  OSU R                  U R(                  U R                  S9U l        SU l        g )Nr   znum_heads (z%) must be divisible by num_kv_heads ()zembed_dim (z") must be divisible by num_heads (   zattn_dropout (z) must be between 0.0 and 1.0z!q and k norm must be set togetherr   )r   r   r   r   r   attention_fnr   F)super__init__
ValueErrorboolr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   _attention_callSDPAtraining_sdpacache_enabled)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   	__class__s                   i/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/executorch/extension/llm/modules/attention.pyr!   MultiHeadAttention.__init__[   sj   & 	#q(i[ )!!-a1 
  A%i[ )'[+ 
 !|a/~i[8UVWW<$v,&@AA #("( &" !&,  78**nn]].2mm**nn--]]

 #    
batch_sizedtypec           	          U R                   b  [        R                  S5        g[        UUU R                  U R
                  USS9U l         U R                   U R                  l         SU l        g)a!  Setup key value caches for attention calculation. If called
after kv_cache is already setup, this will be skipped.

Args:
    batch_size (int): batch size for the caches.
    dtype (torch.dtype): dtype for the caches.
    max_seq_len (int): maximum sequence length model will be run with.
NzWKey value caches are already setup. You cannot call ``setup_caches()`` twice. Skipping.F)r.   r   r   r   r/   transpose_cacheT)r   loggerwarningInferenceKVCacher   r   r'   r(   )r)   r.   r/   r   s       r+   setup_cacheMultiHeadAttention.setup_cache   sa     ==$NNi -%'!.. %DM #'--DJJ!%Dr-   c                 h    U R                   c  [        S5      eU R                   R                  5         g)zReset the key value caches.Nz>Key value caches are not setup. Call ``setup_caches()`` first.)r   RuntimeErrorreset)r)   s    r+   reset_cacheMultiHeadAttention.reset_cache   s.    == P  	r-   )mask	input_posxyr<   r=   c          	      r  ^ ^^^ UR                   u  mpVT R                  U5      nT R                  T R                  -  nUR	                  TUT R                  U-  T R
                  5      nT R                  b  T R                  UTS9nT R                  b  T R                  U5      nUUU 4S jmU 4S jn	UU 4S jn
T R                  c  Uc   S5       eT" U5      u  pO[        R                  " [        R                  " U5      R                  5       R                  5       XU45      u  pnT R                  R                  R                  U5        T R                  R                   R                  U5        T R                  R"                  R                  U5        T R%                  X{UTXSS9nT R'                  U5      $ )a  
Args:
    x (torch.Tensor): input tensor with shape [b x s_x x d] for the query
    y (Optional[torch.Tensor]): second input tensor with shape [b x s_y x d], is the input
        for k and v. For self attention, x=y. Optional only with kv_cache enabled.
    mask (Optional[_MaskType]): Used to mask the scores after the query-key multiplication
        and before the softmax. Either:

        A boolean tensor with shape ``[b x s x s]``, ``[b x s x self.encoder_max_cache_seq_len]``,
        or ``[b x s x self.encoder_max_cache_seq_len]`` if using KV-cacheing with encoder/decoder layers.
        A value of True in row ``i`` and column ``j`` means token ``i`` attends to token ``j``. A value of False means
        token ``i`` does not attend to token ``j``. If no mask is specified, a causal mask
        is used by default.

        A :class:`~torch.nn.attention.flex_attention.BlockMask` for document masking in a packed sequence
        created via `create_block_mask <https://pytorch.org/blog/flexattention/#mask-mods>`_. We  use
        :func:`~torch.nn.attention.flex_attention.flex_attention` when computing attention with block masks.
        Default is None.
    input_pos (Optional[torch.Tensor]): Optional tensor which contains the position ids
        of each token. During training, this is used to indicate the positions
        of each token relative to its sample when packed, shape [b x s].
        During inference, this indicates the position of the current token.
        If none, assume the index of the token is its position id. Default is None.

Raises:
    ValueError: If no ``y`` input and ``kv_cache`` is not enabled.

Returns:
    torch.Tensor: output tensor with attention applied

Notation used for tensor shapes:
    - b: batch size
    - s_x: sequence length for x
    - s_y: sequence length for y
    - n_h: num heads
    - n_kv: num kv heads
    - d: embed dim
    - h_d: head dim
r=   c                 Z  > U R                   S   nTR                  U 5      nTR                  U 5      nUR                  TUSTR                  5      nUR                  TUSTR                  5      nTR
                  b  TR                  UTS9nTR                  b  TR                  U5      nX#4$ )Nr   rA   )shaper   r   viewr   r   r   )r?   s_ykvbr=   r)   s       r+   calculate_kv0MultiHeadAttention.forward.<locals>.calculate_kv  s    ''!*C AAAA q#r4==1Aq#r4==1A"".''Y'? {{&KKN4Kr-   c                 ~   > TR                   R                  5       nUR                  UR                  UR                  4$ N)r   clonek_cachev_cachekv_cache_pos)r?   r   r)   s     r+   true_fn+MultiHeadAttention.forward.<locals>.true_fn  s4    }}**,H##X%5%5x7L7LLLr-   c                    > T" U 5      u  pTR                   R                  5       nUR                  X5        UR                  UR                  UR
                  4$ rM   )r   rN   updaterO   rP   rQ   )r?   rG   rH   r   rJ   r)   s       r+   false_fn,MultiHeadAttention.forward.<locals>.false_fn#  sK    ?DA}}**,HOOA!##X%5%5x7L7LLLr-   zAMust provide y input or use kv_cache to enable streaming decoding)r<   )rD   r   r   r   rE   r   r   r   r   torchcondisnanallitemrO   copy_rP   rQ   r'   r   )r)   r>   r?   r<   r=   s_x_qq_per_kvrR   rV   rG   rH   	cache_posoutputrI   rJ   s   `   `          @@r+   forwardMultiHeadAttention.forward   s   b GG	3 KKN >>T%6%66FF1c4,,x7G *##A#;A ;;"AA	(	M	M == SRS?DAq
 $jjA""$))+WOA) MM!!''*MM!!''*MM&&,,Y7A!Q7''r-   )r$   r'   r   r(   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rM   )__name__
__module____qualname____firstlineno____doc__intr   Moduler   r   r#   floatr!   rX   r/   r5   r:   Tensorr   rd   __static_attributes____classcell__r*   s   @r+   r	   r	      s   B^ /3&*&*&*!#H# H# 	H#
 H# H# 		H# 		H# 		H# YYH# !+H# #H# #H# 7#H# H#  !H#" #H#$ 
%H# H#T&&&+kk&@C&	&8 %)s(
 %),0s(<<s( ELL!s(
 y!s( ELL)s( 
s( s(r-   r	   c                      ^  \ rS rSrSrS\S\S\S\S\SS	4U 4S
 jjr SS\	R                  S\	R                  S\	R                  S\S\S\\   S\	R                  4S jjrSrU =r$ )r%   i?  zf
TorchTune's SDPA which can be optimized and can be swapped
out for a more efficient implementations.
r   r   r   r   r   r   Nc                    > [         TU ]  5         Xl        X l        X0l        U R                  U R                  -  U l        X@l        XPl        X`l        Xpl	        g rM   )
r    r!   r   r   r   ra   r   r   _attention_fnr   )	r)   r   r   r   r   r   r   r   r*   s	           r+   r!   SDPA.__init__E  sL     	(" $*;*;;(") r-   r`   rG   rH   bszseq_lenr<   c           	      X   UR                  SS5      nUR                  SS5      nUR                  SS5      nU R                  U R                  :w  aq  SSU R                  SS4nUR	                  S5      R                  U5      R                  SS5      nUR	                  S5      R                  U5      R                  SS5      nU R                  UUUUU R                  U R                  S L =(       a    US L =(       a    U R                  S9nUR                  SS5      R                  XES5      $ )Nr      rC   )r<   	dropout_pr   )	transposer   r   ra   	unsqueezeexpandflattenrt   r   r   r   reshape)	r)   r`   rG   rH   rv   rw   r<   expand_shaperc   s	            r+   rd   SDPA.forwardY  s    KK1KK1KK1 >>T...DMM2r:LA%%l3;;AqAAA%%l3;;AqAA##''mmt+OO $ 
 1%--cB??r-   )rt   r   r   r   r   r   r   ra   rM   )rf   rg   rh   ri   rj   rk   rm   r#   r!   rX   rn   r   r   rd   ro   rp   rq   s   @r+   r%   r%   ?  s    
!! ! 	!
 ! ! 
!6 %)!@<<!@ <<!@ <<	!@
 !@ !@ y!!@ 
!@ !@r-   r%   moduler   c                    U R                  5        H  u  p[        U[        R                  5      (       a  [	        U U[        UR
                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                   UR"                  UR$                  UR&                  S95        M  [)        U5        M     g )N)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )named_children
isinstanceTorchTuneAttentionr	   setattrr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   replace_mha_with_inference_mha)r   namechilds      r+   _replace_mha_with_inference_mhar   }  s    ,,.e/BBCC"#oo#oo!&!3!3"^^ << << << % 1 1#(#7#7 << <<"^^ % 1 1#oo!&!3!3, +511 /r-   c                     [        U 5        U $ )z
Replace TorchTune's MHA with an inference friendly version of MHA that
separates out the inference-related parts for further optimization.
)r   )r   s    r+   r   r     s    
 $F+Mr-   )loggingtypingr   rX   torchtune.modules.attentionmodules	attentionr   )executorch.extension.llm.modules.kv_cacher   r4   r   !torchtune.modules.attention_utilsr   r   torchtune.modules.kv_cache	getLoggerrf   r2   rl   r	   r%   r   r    r-   r+   <module>r      s       8 8 Q  P .			8	$f( f(R	;@299 ;@|2EHHOO 2 28588?? uxx r-   