
    3Kiu'                     0   S r SSKrSSKJr  SSKJrJr  SSKr\R                  " \	5      r
SS/r\" SS9S	\S
\4S j5       r " S S\5      r\R                   R#                  S0 S9 S+S\R$                  S\R$                  S\R$                  S\R$                  S\R$                  S\S\S\S
\\R$                  \R$                  \R$                  4   4S jj5       r\R*                   S+S\R$                  S\R$                  S\R$                  S\R$                  S\R$                  S\S\S\S
\\R$                  \R$                  \R$                  4   4S jj5       r  S,S\R$                  S\R$                  S\R$                  S\R$                  S\R$                  S\S\S\S\S-  S
\R$                  \\R$                  \R$                  4   -  4S jjrS\S\\S4   S\S
S4S jr\R                   R#                  S 0 S9S!\R$                  S\R$                  S\R$                  S\R$                  S"\R$                  S#\R$                  S\R$                  S\R$                  S\S\S\S$\R$                  S
\\R$                  \R$                  \R$                  4   4S% j5       r\R*                  S!\R$                  S\R$                  S\R$                  S\R$                  S"\R$                  S#\R$                  S\R$                  S\R$                  S\S\S\S$\R$                  S
\\R$                  \R$                  \R$                  4   4S& j5       rS\S!\R$                  S'\R$                  S(\R$                  S
\\R$                  S-  S4   4
S) jr\R9                  \\S*9  g)-z
Variable-length attention implementation using Flash Attention.

This module provides a high-level Python interface for variable-length attention
that calls into the optimized Flash Attention kernels.
    N)	lru_cache)Any
NamedTuplevarlen_attn
AuxRequest   )maxsizedevice_indexreturnc                     g)z;Cache device capability check to avoid repeated CUDA calls.F )r
   s    [/var/www/html/dynamic-report/venv/lib/python3.13/site-packages/torch/nn/attention/varlen.py_should_use_cudnnr      s         c                   (    \ rS rSr% SrSr\\S'   Srg)r      z
Request which auxiliary outputs to compute from varlen_attn.

Each field is a boolean indicating whether that auxiliary output should be computed.
Flser   N)	__name__
__module____qualname____firstlineno____doc__r   bool__annotations____static_attributes__r   r   r   r   r      s     Cr   ztorch_attn::_varlen_attn)mutates_argsquerykeyvaluecu_seq_qcu_seq_kmax_qmax_k	is_causalc                    U R                   =(       a    [        U R                  R                  5      nU(       aX  [        R                  S5        [        R                  R                  R                  U UUSUUUUSSUS5      n	U	S   U	S   U	S   pn
OJ[        R                  S	5        [        R                  R                  R                  U UUUUUUSUSS
9
u  pn  n[        R                  " S[        R                  U R                  S9nXU4$ )z
Private custom op for variable-length attention.

This is the internal implementation. Users should use the public varlen_attn function instead.
#Using cuDNN backend for varlen_attnNT        Fr         -Using Flash Attention backend for varlen_attn)return_debug_mask   dtypedevice)is_cudar   r0   indexloginfotorchopsaten_cudnn_attention_forward_flash_attention_forwardzerosuint64)r   r   r   r    r!   r"   r#   r$   	use_cudnnresultoutputsoftmax_lse	rng_state_
rng_state_s                  r   _varlen_attnrC   $   s   " G"3ELL4F4F"GI6788
 *0F1IvayYY@A/4yy~~/V/V# 0W 0
,Y1 ELLJ 
**r   c                 4   [         R                  " U 5      nU R                  S5      n	U R                  S5      n
[         R                  " X4[         R                  U R
                  S9n[         R                  " S[         R                  U R
                  S9nXU4$ )z
Fake implementation for meta tensor computation and tracing.

Based on the 3D varlen path from meta__flash_attention_forward:
- query shape: (total, num_heads, head_dim)
- logsumexp shape: (num_heads, total_q)
r   r(   r.   r,   )r5   
empty_likesizeemptyfloatr0   r;   )r   r   r   r    r!   r"   r#   r$   r>   total_q	num_heads	logsumexpr@   s                r   _varlen_attn_fakerL   ^   sw    & e$F jjmG

1I	EKKI DU\\JIi''r   
return_auxc	           
          [         R                  R                  R                  XX#XEXg5      u  pnUb  UR                  (       a  X4$ U	$ )a]
  
Compute variable-length attention using Flash Attention.
This function is similar to scaled_dot_product_attention but optimized for
variable-length sequences using cumulative sequence position tensors.
Args:
- query (Tensor): Query tensor; shape :math:`(T_q, H, D)`
- key (Tensor): Key tensor; shape :math:`(T_k, H, D)`
- value (Tensor): Value tensor; shape :math:`(T_k, H, D)`
- cu_seq_q (Tensor): Cumulative sequence positions for queries; shape :math:`(N+1,)`
- cu_seq_k (Tensor): Cumulative sequence positions for keys/values; shape :math:`(N+1,)`
- max_q (int): Maximum query sequence length in the batch.
- max_k (int): Maximum key/value sequence length in the batch.
- is_causal (bool, optional): If set to True, applies causal masking (default: False).
- return_aux (Optional[AuxRequest]): If not None and ``return_aux.lse`` is True, also returns the logsumexp tensor.

Shape legend:
- :math:`N`: Batch size
- :math:`T_q`: Total number of query tokens in the batch (sum of all query sequence lengths)
- :math:`T_k`: Total number of key/value tokens in the batch (sum of all key/value sequence lengths)
- :math:`H`: Number of attention heads
- :math:`D`: Head dimension

Returns:
- Tensor: Output tensor from attention computation
- If ``return_aux`` is not None and ``return_aux.lse`` is True, returns a tuple of Tensors:
(output, lse), where lse is the logsumexp

Example::

    >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
    >>> batch_size, max_seq_len, embed_dim, num_heads = 2, 512, 1024, 16
    >>> head_dim = embed_dim // num_heads
    >>> seq_lengths = []
    >>> for _ in range(batch_size):
    ...     length = torch.randint(1, max_seq_len // 64 + 1, (1,)).item() * 64
    ...     seq_lengths.append(min(length, max_seq_len))
    >>> seq_lengths = torch.tensor(seq_lengths, device="cuda")
    >>> total_tokens = seq_lengths.sum().item()
    >>>
    >>> # Create packed query, key, value tensors
    >>> query = torch.randn(
    ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
    ... )
    >>> key = torch.randn(
    ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
    ... )
    >>> value = torch.randn(
    ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
    ... )
    >>>
    >>> # Build cumulative sequence tensor
    >>> cu_seq = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32)
    >>> cu_seq[1:] = seq_lengths.cumsum(0)
    >>> max_len = seq_lengths.max().item()
    >>>
    >>> # Call varlen_attn
    >>> output = varlen_attn(
    ...     query, key, value, cu_seq, cu_seq, max_len, max_len, is_causal=False
    ... )
)r5   r6   
torch_attnrC   r   )r   r   r   r    r!   r"   r#   r$   rM   outr   rA   s               r   r   r      sF    N ))&&33EXeKCa *..xJr   ctxinputs.r>   c           
      h    Uu  p4pVpxpUu  pnU R                  X4XVX{X5        Xl        Xl        Xl        g N)save_for_backwardr"   r#   r$   )rQ   rR   r>   r   r   r   r    r!   r"   r#   r$   rP   r   r@   s                 r   _setup_contextrV      s=    EKBE% Ci%excUIIMr   z!torch_attn::_varlen_attn_backwardgrad_outrP   r   r@   c                    [         R                  " SUR                  S9nUR                  =(       a    [	        UR                  R
                  5      nU(       aO  [        R                  S5        [         R                  R                  R                  U UUUUUUUUU	SU
UU5      u  pnON[        R                  S5        [         R                  R                  R                  U UUUUUUUUU	SU
UU5      u  pnXU4$ )Nr   )r0   r&   r'   r*   )r5   rG   r0   r1   r   r2   r3   r4   r6   r7   _cudnn_attention_backward_flash_attention_backward)rW   r   r   r   rP   r   r    r!   r"   r#   r$   r@   unusedr<   dqdkdvs                    r   _varlen_attn_backwardr_      s     [[5<<0FG"3ELL4F4F"GI67YY^^==

" 	@AYY^^==

  2:r   c                     [         R                  " U5      n[         R                  " U5      n[         R                  " U5      nXU4$ )z>
Fake implementation for meta tensor computation and tracing.
)r5   rE   )rW   r   r   r   rP   r   r    r!   r"   r#   r$   r@   
grad_querygrad_key
grad_values                  r   _varlen_attn_backward_fakerd     s?    & !!%(J$H!!%(J++r   grad_lsegrad_rngc                     U R                   u  pEpgppU R                  nU R                  nU R                  n[        R
                  R                  R                  UUUUU	U
UUUUUU5      u  nnnUUUS S S S S S 4	$ rT   )saved_tensorsr"   r#   r$   r5   r6   rO   r_   )rQ   rW   re   rf   r   r   r   r    r!   rP   r   r@   r"   r#   r$   r\   r]   r^   s                     r   	_backwardri   ,  s     BEARAR>EIIEIIEI%%;;JBB r2tT4tT99r   )setup_context)F)FN)r   logging	functoolsr   typingr   r   r5   	getLoggerr   r3   __all__intr   r   r   library	custom_opTensortuplerC   register_fakerL   r   rV   r_   rd   ri   register_autogradr   r   r   <module>rw      s     "  !,
' 1C D  
  3"E 6+<<6+	6+ <<6+ ll	6+
 ll6+ 6+ 6+ 6+ 5<<u||346+ F6+r  (<<(	( <<( ll	(
 ll( ( ( ( 5<<u||34( (P $(L<<L	L <<L ll	L
 llL L L L T!L \\E%,,455L^ U38_ c d  <2N5ll5<<5 
5 <<	5
 
5 
5 ll5 ll5 5 5 5 ||5 5<<u||345 O5p $$,ll,<<, 
, <<	,
 
, 
, ll, ll, , , , ||, 5<<u||34, %,2:	::05:HM:
5<<$#$:4   y  Gr   