
    ?Kio                     r   S SK Jr  S SKrSSKJr  SSKJr   \" 5       (       a	  S SKJr  \r	O\
" S5      e       SS	\R                  R                   S
\R"                  S\R"                  S\R"                  S\\R"                     S\S\R"                  4S jjrg! \ a  r\" \5      rS r	 SrCN|SrCff = f)    )OptionalN   )PagedAttentionCache)is_flash_attn_2_available)flash_attn_varlen_funczFlash Attention 2 is not installed. Please refer to https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install itc                  &    [        S[         35      e)Nz)flash_attn_varlen_func is not available: )	Exceptionmsg)argskwargss     g/var/www/html/dynamic-report/venv/lib/python3.13/site-packages/transformers/integrations/flash_paged.pyFLASH_ATTN_VARLEN_FUNCr      s    CC5IJJ    moduleqkvattention_maskcachereturnc           	         [        U SS5      (       d  SOU R                  S-
  S4nUS:X  a  SOSnUb   UR                  " X#U R                  40 UD6u  p#[	        U[
        5      (       a  X}   nX   n	U
b  [        U
S	5      (       a  U
R                  nO[        nS
U;   a  S
UR                  S
5      0O0 nU" UR                  SS5      R                  S5      R                  5       UR                  5       UR                  5       UR                  [        R                  5      UR                  [        R                  5      R!                  5       UU	4U R"                  SUS.UD6n[	        U[$        5      (       a  US   nUS4$ )a  Perform the forward pass of attention with paged key-value cache.

This function handles the cache updates and performs the attention computation
using the flash_attn_varlen_func for efficient processing.

Args:
    q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
    k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.  but if there is a block table it can be the full k
    v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.  but if there is a block table it can be the full v
    cu_seq_lens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
       of the sequences in the batch, used to index into q.
    cu_seq_lens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
       of the sequences in the batch, used to index into kv.
    max_seqlen_q: int. Maximum query sequence length in the batch.
    max_seqlen_k: int. Maximum key sequence length in the batch.
    dropout_p: float. Dropout probability.
    softmax_scale: float. The scaling of QK^T before applying softmax.
        Default to 1 / sqrt(headdim).
    causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
    window_size: (left, right). If not (-1, -1), implements sliding window local attention.
    softcap: float. Anything > 0 activates softcapping attention.
sliding_windowF)r      r   full_attentionsliding_attentionNr   s_auxr   T)softmax_scalecausalwindow_size)getattrr   update	layer_idx
isinstancedicthasattrr   r   get	transposesqueeze
contiguoustotorchint32clonescalingtuple)r   r   r   r   r   r   cu_seq_lens_qcu_seq_lens_kmax_seqlen_qmax_seqlen_kimplementationr   r   
layer_typer   custom_kwargsattn_outputs                    r   paged_attention_forwardr9      su   H &-V5Eu%M%MXTZTiTilmTmopSqN%3x%?!EXJ ||A&"2"2=f= -&&%1#/!gn>V&W&W!/!F!F!76=6GWfjj12RM(	Aq!!!$//1		%%++- nn" K +u%%!!nr   )NNNNNNN)typingr   r,   generation.continuous_batchingr   utilsr   
flash_attnr   r   RuntimeErrorr	   ereprr
   nnModuleTensorr9    r   r   <module>rE      s      @ -K ""5!7 ]
 	
 .2!%FHHOOF||F ||F ||	F
 U\\*F F \\F  K
q'CKKs   B B B6!B11B6