
    3Ki 5                        S r SSKJrJr  SSKJr  SSKrSSKJs  J	r
  SSKJrJrJrJr  SSKJr  SSKJrJrJrJr  / SQr\R0                  R3                  \5        \R0                  R3                  \5        \R0                  R3                  \5        \R0                  R3                  \5         " S	 S
\5      r " S S\R6                  5      rS\4S jrS\4S jrg)zCDefines bias subclasses that work with scaled_dot_product_attention    )autoIntEnum)warnN)can_use_efficient_attentioncan_use_flash_attentionis_flash_attention_available
SDPAParams)_raise_kernel_warnings)_calculate_scale_input_requires_grad_postprocess_flash_output_validate_sdpa_input)causal_upper_leftcausal_lower_rightCausalVariant
CausalBiasc                   4    \ rS rSrSr\" 5       r\" 5       rSrg)r   !   a  
Enum for causal variants used in attention mechanisms.

Defines two types of causal biases:

``UPPER_LEFT``: Represents upper-left triangular bias for standard causal attention.
The equivalent pytorch code for constructing this bias is:

.. code-block:: python

    torch.tril(torch.ones(size, dtype=torch.bool))

For instance, with ``shape=(3,4)``, the materialized bias tensor will be:

.. code-block:: text

    [[1, 0, 0, 0],
     [1, 1, 0, 0],
     [1, 1, 1, 0]]


``LOWER_RIGHT``: Represents lower-right triangular bias, the include values are aligned to the lower
right corner of the matrix.

The equivalent pytorch code for constructing this bias is:

.. code-block:: python

    diagonal_offset = size[1] - size[0]
    torch.tril(
        torch.ones(size, dtype=torch.bool),
        diagonal=diagonal_offset,
    )

For instance, with ``shape=(3,4)``, the materialized bias tensor will be:

.. code-block:: text

    [[1, 1, 0, 0],
     [1, 1, 1, 0],
     [1, 1, 1, 1]]

Note that these variants are equivalent to each other when the sequence lengths of the query and key/value
tensors are equal since the triangular matrix is square.

.. warning:: This enum is a prototype and subject to change.
 N)	__name__
__module____qualname____firstlineno____doc__r   
UPPER_LEFTLOWER_RIGHT__static_attributes__r       Y/var/www/html/dynamic-report/venv/lib/python3.13/site-packages/torch/nn/attention/bias.pyr   r   !   s    .` J&Kr   r   c                     ^  \ rS rSrSrS\S\S\SS4U 4S jjrS	\R                  S\R                  4S
 jrS	\R                  S\R                  4S jrSS	\R                  S-  S\R                  4S jjr\    SS\R                  S\R                  S\R                  SS S\S\S\S-  S\S\R                  4S jj5       r\SU 4S jj5       rS\4S jrSrU =r$ )r   V   a  
A bias representing causal attention patterns. For an overview of the bias structure, see the :class:`CausalVariant` enum.

This class is used for defining causal (triangular) attention biases. For construing the bias, there exist
two factory functions: :func:`causal_upper_left` and :func:`causal_lower_right`.

Example:

.. code-block:: python

    from torch.nn.attention.bias import causal_lower_right

    bsz, num_heads, seqlen_q, seqlen_kv, head_dim = 32, 8, 4, 12, 8

    # Create a lower-right causal bias
    attn_bias = causal_lower_right(seqlen_q, seqlen_kv)

    q = torch.randn(
        bsz, num_heads, seqlen_q, head_dim, device="cuda", dtype=torch.float16
    )
    k = torch.randn(
        bsz, num_heads, seqlen_kv, head_dim, device="cuda", dtype=torch.float16
    )
    v = torch.randn(
        bsz, num_heads, seqlen_kv, head_dim, device="cuda", dtype=torch.float16
    )

    out = F.scaled_dot_product_attention(q, k, v, attn_bias)

.. warning:: This class is a prototype and subject to change.
variant	seq_len_q
seq_len_kvreturnNc                    > [        U[        5      (       d   e[        TU ]  5         Xl        X l        X0l        X#:  a   U[        R                  :X  a  [        SSS9  ggg)a  
Initializes the CausalBias instance with a specified variant and sequence lengths.

Args:
    variant (CausalVariant): The type of causal bias to use (either UPPER_LEFT or LOWER_RIGHT).
    seq_len_q (int): The sequence length of the query tensor.
    seq_len_kv (int): The sequence length of the key/value tensor.

Raises a warning if the LOWER_RIGHT variant is used with seq_len_q > seq_len_kv, as it may produce NaNs.
zTLower right causal bias will produce NaNs in the output when seq_len_q > seq_len_kv!   )
stacklevelN)	
isinstancer   super__init__r"   r#   r$   r   r   )selfr"   r#   r$   	__class__s       r   r+   CausalBias.__init__w   s\     '=1111"$!g1J1J&Jf 'K!r   devicec           	          [         R                  " [         R                  " U R                  U R                  U[         R
                  S95      $ )zUpper left causal biasr/   dtype)torchtrilonesr#   r$   boolr,   r/   s     r   _upper_leftCausalBias._upper_left   s1    zzJJt~~tvUZZX
 	
r   c           	          U R                   U R                  -
  n[        R                  " [        R                  " U R                  U R                   U[        R
                  S9US9$ )zLower right causal biasr1   )diagonal)r$   r#   r3   r4   r5   r6   )r,   r/   diagonal_offsets      r   _lower_rightCausalBias._lower_right   sK    //DNN:zzJJejj %	
 	
r   c                     Uc  [         R                  " S5      nU R                  [        R                  :X  a  U R                  U5      $ U R                  [        R                  :X  a  U R                  U5      $ g)aX  
Materializes the causal bias into a tensor form.

Depending on the variant, this method generates either an upper-left or lower-right
triangular matrix to represent the causal bias.

Args:
    device (Optional[torch.device]): The device on which to create the tensor. Defaults to CPU.

Returns:
    torch.Tensor: The materialized bias tensor.
Ncpu)r3   r/   r"   r   r   r8   r   r=   r7   s     r   _materializeCausalBias._materialize   sb     >\\%(F<<=333##F++\\]666$$V,, 7r   querykeyvalue	attn_mask	dropout_p	is_causalscale
enable_gqac                    U(       a  [        S5      eUR                  UR                  :X  d  UR                  [        R
                  :X  a  [        R                  " U UUSUSUUS9$ UR                  [        R                  :X  Ga*  [        XUSXEU5        [        XUSXEU5      n[        U5      (       Ga  U R                  R                  S:X  a  SOSn	U R                  S5      n
[        X5      nX-  S	:g  nU(       a  XU	-  -
  n[         R"                  R$                  R'                  U S	U45      n [         R"                  R$                  R'                  US	U45      n[         R"                  R$                  R'                  US	U45      n[         R(                  R*                  R-                  U UUUSS
US9S	   n[/        X5      $ [1        U5      (       a  S
n[3        XU5      (       a  Sn[         R(                  R*                  R5                  U R7                  SS5      UR7                  SS5      UR7                  SS5      SSSSSU[9        UR                  5      UUSS9S	   R7                  SS5      $ [;        U5        [        R                  " U UUUR=                  U R                  5      US
UUS9$ [        SUR                   35      e)a  
Handles the logic for computing attention with the specified causal bias.

Args:
    query (Tensor): Query tensor; shape :math:`(N, ..., L, E)`.
    key (Tensor): Key tensor; shape :math:`(N, ..., S, E)`.
    value (Tensor): Value tensor; shape :math:`(N, ..., S, Ev)`.
    attn_mask (CausalBias): The type of causal attention to apply.
        A boolean mask where a value of True indicates that the element *should* take part in attention.
        A float mask of the same type as query, key, value that is added to the attention score.
    dropout_p (float): Dropout probability; if greater than 0.0, dropout is applied
    is_causal (bool): If true, assumes upper left causal attention masking and errors if both attn_mask and is_causal
        are set.
    scale (optional float): Scaling factor applied prior to softmax. If None, the default value is set
        to :math:`\frac{1}{\sqrt{E}}`.
    enable_gqa (optional bool): If set to True, Grouped Query Attention (GQA) is enabled, by default it is set to False.

Returns:
    output (Tensor): Attention output; shape :math:`(N, ..., L, Ev)`.

Raises:
    ValueError: If the causal bias variant is not a CausalVariant type.

z.CausalBias should not be used with causal=TrueNT)rF   rG   rH   rI   rJ   xpu@      r   F)rH   return_debug_maskrI      r'   )
biascu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_krG   custom_mask_typecompute_log_sumexprI   seqlen_kz<CausalBias.variant must be a CausalVariant type, but found: )
ValueErrorr#   r$   r"   r   r   Fscaled_dot_product_attentionr   r   r	   r   r/   typesizer   r3   nn
functionalpadopsaten#_scaled_dot_product_flash_attentionr   r   r   _efficient_attention_forward	transposeintr
   rA   )rC   rD   rE   rF   rG   rH   rI   rJ   sdpa_params	alignmentog_head_sizeog_scaleneeds_paddingpad_lenoutrX   s                   r   	_dispatchCausalBias._dispatch   s   F MNN 9#7#77  M$<$<<11#%	 	 -";";; UD)PUV$E4zK '{33"',,"3"3u"<B!	$zz"~+L@ , 8A = ')+CDG!HH//33EAw<HE((--11#7|DC!HH//33EAw<HEiinnHH"&+" I   1CC*;77%*"'E::)-&yy~~BBOOAq)MM!Q'OOAq)!%!%!%!%'%():):%;'9! C   Yq!_%  '{355'44U\\B'#)	 	 NyO`O`Nab r   c                    > Uc  0 nU[         R                  R                  R                  L a  U R                  " U0 UD6$ [
        TU ]  XX45      $ )zjDefines the behavior of torch.nn.functional.scaled_dot_product_attention when the attn_bias is an AttnBias)r3   r_   r`   r\   ro   r*   __torch_function__)clsfunctypesargskwargsr-   s        r   rr   CausalBias.__torch_function__$  sM     >F588&&CCC==$1&11w)$tDDr   c                 >    U R                  5       R                  5       $ N)rA   __repr__)r,   s    r   r{   CausalBias.__repr__-  s      "++--r   )r$   r#   r"   rz   )g        FNF)r   N)r   r   r   r   r   r   rg   r+   r3   r/   Tensorr8   r=   rA   staticmethodfloatr6   ro   classmethodrr   strr{   r   __classcell__)r-   s   @r   r   r   V   s@   @ # 3 SW ,
%,, 
5<< 

5<< 
ELL 
-5<<$#6 -%,, -(  " o||o\\o ||o  	o
 o o t|o o 
o ob E E.# . .r   r   r%   c                  j    [        U 5      S:X  d   S5       eU u  p[        [        R                  X5      $ )a  
Creates an upper-left triangular causal bias.

This function generates a upper-left triangular matrix to represent causal attention bias with a
diagonal offset set so that the inclusive values are aligned to the upper left corner of the matrix.
This equivalent to the `is_causal=True` argument in `scaled_dot_product_attention`.

The equivalent pytorch code for constructing this bias is:

.. code-block:: python

    torch.tril(torch.ones(size, dtype=torch.bool))

For instance, with `shape=(3,4)`, the materialized bias tensor will be:

.. code-block:: text

    [[1, 0, 0, 0],
     [1, 1, 0, 0],
     [1, 1, 1, 0]]

Args:
    size: The size of the bias matrix.

Returns:
    CausalBias: The UPPER_LEFT triangular causal bias variant.
r'   z*causal_upper_left only supports 2D tensors)lenr   r   r   r^   r#   r$   s      r   r   r   1  s5    8 t9>GGG> Im..	FFr   c                  j    [        U 5      S:X  d   S5       eU u  p[        [        R                  X5      $ )a  
Creates a lower-right triangular causal bias.

This function generates a lower-right triangular matrix to represent causal attention bias with a
diagonal offset set so that the inclusive values are aligned to the lower right corner of the matrix.

The equivalent pytorch code for constructing this bias is:

.. code-block:: python

    diagonal_offset = size[1] - size[0]
    torch.tril(
        torch.ones(size, dtype=torch.bool),
        diagonal=diagonal_offset,
    )

For instance, with `shape=(3,4)`, the materialized bias tensor will be:

.. code-block:: text

    [[1, 1, 0, 0],
     [1, 1, 1, 0],
     [1, 1, 1, 1]]

Args:
    size: The size of the bias matrix.

Returns:
    CausalBias: The LOWER_RIGHT triangular causal bias variant.
r'   z+causal_lower_right only supports 2D tensors)r   r   r   r   r   s      r   r   r   R  s5    > t9>HHH> Im//GGr   )r   enumr   r   warningsr   r3   torch.nn.functionalr_   r`   r[   torch.backends.cudar   r   r   r	   torch.nn.attentionr
   torch.nn.attention._utilsr   r   r   r   __all___dynamoallow_in_graphr   r}   r   r   r   r   r   r   <module>r      s    I       6  U   9 :   4 5   8 9   Z (2G 2jX. X.vG
 GB!H !Hr   