
    iS3                     .   S SK r S SKJr  S SKrSqSqSqSqSqSq	S r
 SS\R                  S\R                  S\R                  S	\R                  S
\S\R                  4S jjrS\R                  4S\R                  S
\S\R                  S\\R                  \R                  4   4S jjrS\R                  4S\R                  S
\S\\R                  \R                  4   4S jjr SS\R                  S\R                  S
\S\R                  4S jjrg)    N)TupleFc                    ^^^ [         (       a  g Sq SSKJn   U " 5       (       d  Sqg SqSS KmSS KJm  SSKJn  S VVVs/ s H!  nS  H  nS  H  nU" X#S	.US
S9PM     M     M#     nnnnTR                  U/ SQS9TR                  STR                  STR                  STR                  STR                  STR                  STR                  4U4S jj5       5       m[        R                  R                  SSS9 S$S[        R                  S[        R                  S[        R                  S[        R                  S[        S[        R                  4UU4S jjj5       nUR                   S$S j5       nUqTR                  S TR                  4U4S! jj5       nUqTR                  S TR                  4U4S" jj5       n	U	qTR                  S TR                  4U4S# jj5       n
U
qg s  snnnf )%NTr   )
has_tritonF)Config)       @      )r   r	   r
   )            )BLOCK_SIZE_MBLOCK_SIZE_N   )
num_stages	num_warps)NKM_BUCKETBLOCK_SIZE_K)configskeyr   r   r   r   r   r   c                 <  > TR                  SS9nTR                  SS9nTR                  X{5      nX-  TR                  SU	5      -   U-  nX-  TR                  SU
5      -   U-  nTR                  SU5      nXS S 2S 4   U-  -   US S S 24   -   nUUS S S 24   U-  -   US S 2S 4   -   nX?U-  -   nUUU-  U-  -   nTR                  X4TR                  S9n[        U5       H  nTR                  UUS S S 24   UUU-  -
  :  SS9nTR                  UUS S 2S 4   UUU-  -
  :  SS9nTR                  U5      nTR                  U5      nUTR                  UU5      US S 2S 4   -  US S S 24   -  -  nUU-  nUU-  nUS-  nUS-  nM     UR                  UR                  R                  5      nX-  TR                  SU	5      -   nX-  TR                  SU
5      -   nX/S S 2S 4   U-  -   US S S 24   -   nUS S 2S 4   U:  US S S 24   U:  -  nTR                  UUUS9  g )Nr   axis   dtypeg        )maskotherr    )
program_idcdivarangezerosfloat32rangeloaddottor   
element_tystore) a_ptrb_ptrc_ptra_s_ptrb_s_ptrMr   r   r   r   r   r   pid_mpid_nkoffs_moffs_noffs_ka_ptrsb_ptrsa_s_ptrsb_s_ptrsaccumulatoriaba_sb_scc_ptrsr    tls                                   d/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/torchao/kernel/blockwise_quantization.pyblockwise_fp8_gemm_kernel4_lazy_init_triton.<locals>.blockwise_fp8_gemm_kernel:   st   $ 1%1%GGA$&1l)CCqH&1l)CCqH1l+41,,vdAg>a1,,vag>aZ'f499hh;2::hNqAVD!G_q1|;K7K%KSVWAVAtG_q1|;K7K%KSVWA''(#C''(#C266!Q<#ag,6T1WEEKl"Fl"FMHMH  NN5;;112%		!\(BB%		!\(BB41,,vdAg>q$w!#tQw!(;<
&    zao::blockwise_fp8_gemm )mutates_argsr@   rB   rA   rC   
block_sizereturnc                   >^	^
 U R                  5       (       d   eUR                  5       (       d   eUR                  5       (       d   eUR                  5       (       d   eU R                  S5      nU R                  5       U-  m	UR                  S5      m
[        R                  " [        R
                  " T	5      5      nU R                  " / U R                  5       S S QT
P7S[        R                  06nU	U
U4S jnTU   " XXqUT	T
XVUS9
  U$ )Nr   r   c                 X   > TR                  TU S   5      TR                  TU S   5      4$ )Nr   r   r$   )METAr3   r   tritons    rG   <lambda>C_lazy_init_triton.<locals>._blockwise_fp8_gemm_op.<locals>.<lambda>{   s.    KK4/0KK4/0
rJ   )r   )	is_contiguoussizenumelmathceillog2	new_emptytorchbfloat16)r@   rB   rA   rC   rM   r   r   rD   gridr3   r   rH   rT   s            @@rG   _blockwise_fp8_gemm_op1_lazy_init_triton.<locals>._blockwise_fp8_gemm_opj   s               """"  """"FF2JGGINFF1I99TYYq\*KK@#2@@@
 	"$'!#q!Qz	
 rJ   c                     UR                  S5      nU R                  " / U R                  5       S S QUP7S[        R                  06nU$ )Nr   rP   r   )rX   r]   r^   r_   )r@   rB   rA   rC   rM   r   rD   s          rG   __lazy_init_triton.<locals>._   s?    FF1IKK@#2@@@rJ   
BLOCK_SIZEc                   > T	R                  SS9nXC-  T	R                  SU5      -   nT	R                  X-   5      R                  T	R                  5      nT	R                  T	R                  U5      5      S-  nXg-  nUR                  UR                  R                  5      nT	R                  X-   U5        T	R                  X$-   U5        g)a  
Quantizes the input tensor `x_ptr` and stores the result in `y_ptr` and the scaling factor in `s_ptr`.

Args:
    x_ptr (triton.Pointer): Pointer to the input tensor.
    y_ptr (triton.Pointer): Pointer to the output tensor where quantized values will be stored.
    s_ptr (triton.Pointer): Pointer to the output tensor where scaling factors will be stored.
    BLOCK_SIZE (tl.constexpr): The size of the block to be processed by each program instance.

Returns:
    None
r   r         |@N)
r#   r%   r)   r+   r'   maxabsr   r,   r-   )
x_ptry_ptrs_ptrrf   pidoffsxsyrF   s
            rG   $_fp8_blockwise_act_quant_kernel_impl?_lazy_init_triton.<locals>._fp8_blockwise_act_quant_kernel_impl   s      mmm#"))Az"::GGEL!$$RZZ0FF266!9%EDD''(
q!
a rJ   c                 d  > TR                  SS9nTR                  SS9nTR                  XE5      nXe-  TR                  SU5      -   n	Xu-  TR                  SU5      -   n
U	SS2S4   U-  U
SSS24   -   nU	SS2S4   U:  U
SSS24   U:  -  nTR                  X-   US9R	                  TR
                  5      nTR                  TR                  U5      5      S-  nX-  nUR	                  UR                  R                  5      nTR                  X-   XS9  TR                  X&U-  -   U-   U5        g)a"  
Quantizes the input tensor `x_ptr` and stores the result in `y_ptr` and the scaling factors in `s_ptr`.

Args:
    x_ptr (tl.pointer): Pointer to the input tensor.
    y_ptr (tl.pointer): Pointer to the output tensor where quantized values will be stored.
    s_ptr (tl.pointer): Pointer to the output tensor where scaling factors will be stored.
    M (int): Number of rows in the weight matrix.
    N (int): Number of columns in the weight matrix.
    BLOCK_SIZE (tl.constexpr): The size of the block to be processed by each program instance.
r   r   r   Nr"   rh   )r#   r$   r%   r)   r+   r'   ri   rj   r   r,   r-   )rk   rl   rm   r3   r   rf   r4   r5   nr7   r8   ro   r    rp   rq   rr   rF   s                   rG   '_fp8_blockwise_weight_quant_kernel_implB_lazy_init_triton.<locals>._fp8_blockwise_weight_quant_kernel_impl   s/    1%1%GGA"#bii:&>>#bii:&>>ag"VD!G_4q$w!#tQw!(;<GGELtG,//

;FF266!9%EDD''(
q,
"U*A.rJ   c                   > TR                  SS9nTR                  SS9nTR                  XE5      nXe-  TR                  SU5      -   n	Xu-  TR                  SU5      -   n
U	SS2S4   U-  U
SSS24   -   nU	SS2S4   U:  U
SSS24   U:  -  nTR                  X-   US9R	                  TR
                  5      nTR                  XU-  -   U-   5      nX-  nTR                  X+-   XS9  g)a  
Dequantizes weights using the provided scaling factors and stores the result.

Args:
    x_ptr (tl.pointer): Pointer to the quantized weights.
    s_ptr (tl.pointer): Pointer to the scaling factors.
    y_ptr (tl.pointer): Pointer to the output buffer for dequantized weights.
    M (int): Number of rows in the weight matrix.
    N (int): Number of columns in the weight matrix.
    BLOCK_SIZE (tl.constexpr): Size of the block for tiling.

Returns:
    None
r   r   r   Nr"   )r#   r$   r%   r)   r+   r'   r-   )rk   rm   rl   r3   r   rf   r4   r5   rv   r7   r8   ro   r    rp   rq   rr   rF   s                   rG   )_fp8_blockwise_weight_dequant_kernel_implD_lazy_init_triton.<locals>._fp8_blockwise_weight_dequant_kernel_impl   s    $ 1%1%GGA"#bii:&>>#bii:&>>ag"VD!G_4q$w!#tQw!(;<GGELtG,//

;GGEAI%-.E
q,rJ   r
   )_triton_initializedtorch.utils._tritonr   _triton_availablerT   triton.languagelanguager   autotunejit	constexprr^   library	custom_opTensorintregister_fake_blockwise_fp8_gemm_impl_fp8_blockwise_act_quant_kernel"_fp8_blockwise_weight_quant_kernel$_fp8_blockwise_weight_dequant_kernel)r   r   block_mblock_nr   fp8_gemm_configsra   rd   rs   rw   rz   rH   rF   rT   s              @@@rG   _lazy_init_tritonr      s?    .<<!  )	 )G$G&J 	$>!	
 '	
 %	

 )  	 __ &L   ZZ*' <<*' <<*' ,,*' ll*' ll*' ll*' *'X ]]5BG <<\\ << \\	
  
  H2 )) *
  6ZZ!)+! !0 'K#ZZ//1||/ /8 *Q&ZZ-/1||- -: ,U(k	s   (G(r
   r@   rB   rA   rC   rM   rN   c                 \    [        5         [        (       d  [        S5      e[        XX#U5      $ )Nunsupported without triton)r   r   AssertionErrorr   )r@   rB   rA   rC   rM   s        rG   blockwise_fp8_gemmr      s+     9::#AAJ??rJ   rp   r   c                   ^ ^ [        5         [        (       d  [        S5      eSSKmT R	                  5       (       d   S5       eT R                  S5      U-  S:X  d   SU S35       eU[        R                  [        R                  4;   d   S5       e[        R                  " T US	9nT R                  " / T R                  5       SS QT R                  S5      U-  P7S
[        R                  06nUU 4S jn[        U   " T X4US9  X44$ )a  
Quantizes the input tensor `x` using block-wise quantization with block size being BLOCK_SIZEx1.

Args:
    x (torch.Tensor): The input tensor to be quantized. Must be contiguous and its last dimension size must be divisible by `block_size`.
    block_size (int, optional): The size of the blocks to be used for quantization. Default is 128.
    dtype (torch.dtype, optional): The dtype to use for the quantized tensor. Default is `torch.float8_e4m3fn`.


Returns:
    Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
        - The quantized tensor with dtype `dtype`.
        - A tensor of scaling factors with dtype `torch.float32`.
r   r   NInput tensor must be contiguousrP   z@Last dimension size must be divisible by block_size (block_size=)6dtype must be torch.float8_e4m3fn or torch.float8_e5m2r   r   c                 L   > TR                  TR                  5       U S   5      4$ Nrf   )r$   rY   )metarT   rp   s    rG   rU   )fp8_blockwise_act_quant.<locals>.<lambda>  s    QWWY\0BCErJ   rf   )r   r   r   rT   rW   rX   r^   float8_e4m3fnfloat8_e5m2
empty_liker]   r'   r   )rp   rM   r   rr   rq   r`   rT   s   `     @rG   fp8_blockwise_act_quantr      s	   " 9::?????66":
"a' 
J:,VWX'   @ @@  	%(A	RQVVXcr]RAFF2J*$<REMMRAED#D)!QjI4KrJ   c           	      Z  ^^^ [        5         [        (       d  [        S5      eSSKmU R	                  5       (       d   S5       eU R                  5       S:X  d   S5       eU R                  S5      U-  S:X  a  U R                  S5      U-  S:X  d   SU S	35       eU[        R                  [        R                  4;   d   S
5       eU R                  5       u  mm[        R                  " XS9nU R                  TU-  TU-  [        R                  S9nUUU4S jn[        U   " XUTTUS9  X44$ )a9  
Quantizes the given weight tensor using block-wise quantization with block size being BLOCK_SIZExBLOCK_SIZE.

Args:
    x (torch.Tensor): The weight tensor to be quantized.
    block_size (int, optional): The block size to use for quantization. Defaults to 128.
    dtype (torch.dtype, optional): The dtype to use for the quantized tensor. Defaults to `torch.float8_e4m3fn`.

Returns:
    Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
        - The quantized weight tensor with dtype `dtype`.
        - A tensor of scaling factors with dtype `torch.float32`.
r   r   Nr      z#Input tensor must have 2 dimensionsr   zABoth dimensions of x must be divisible by block_size (block_size=r   r   r   c                 X   > TR                  TU S   5      TR                  TU S   5      4$ r   rR   r   r3   r   rT   s    rG   rU   ,fp8_blockwise_weight_quant.<locals>.<lambda><  .    AtL)*AtL)*rJ   r   )r   r   r   rT   rW   dimrX   r^   r   r   r   r]   r'   r   )	rp   rM   r   rr   rq   r`   r3   r   rT   s	         @@@rG   fp8_blockwise_weight_quantr     s2     9::?????557a<>>><66!9z!Q&166!9z+AQ+F 
KJ<WXYF   @ @@  668DAq(A	AOQ*_EMMJAD 't,Q1azR4KrJ   rq   c           	        ^^^ [        5         [        (       d  [        S5      eSSKmU R	                  5       (       a  UR	                  5       (       d   S5       eU R                  5       S:X  a  UR                  5       S:X  d   S5       eU R                  5       u  mm[        R                  " U [        R                  " 5       S9nUUU4S jn[        U   " XUTTUS	9  U$ )
a  
Dequantizes the given weight tensor using the provided scale tensor.

Args:
    x (torch.Tensor): The quantized weight tensor of shape (M, N).
    s (torch.Tensor): The scale tensor of shape (M, N).
    block_size (int, optional): The block size to use for dequantization. Defaults to 128.

Returns:
    torch.Tensor: The dequantized weight tensor of the same shape as `x`.

Raises:
    AssertionError: If `x` or `s` are not contiguous or if their dimensions are not 2.
r   r   Nz Input tensors must be contiguousr   z$Input tensors must have 2 dimensionsr   c                 X   > TR                  TU S   5      TR                  TU S   5      4$ r   rR   r   s    rG   rU   .fp8_blockwise_weight_dequant.<locals>.<lambda>_  r   rJ   r   )r   r   r   rT   rW   r   rX   r^   r   get_default_dtyper   )rp   rq   rM   rr   r`   r3   r   rT   s        @@@rG   fp8_blockwise_weight_dequantr   D  s    " 9::??!2!2V4VV2557a<AEEGqLP*PP(668DAq%"9"9";<AD ).qQ1THrJ   r|   )rZ   typingr   r^   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rK   rJ   rG   <module>r      s_         "& %) "'+ $OUn 
@||
@	
@ ||
@ 
	
@
 
@ \\
@ (+ATAT#||#!$#38;;#
5<<%&#N (+%2E2E'||'!$'
5<<%&'V 9< ||  25 
\\ rJ   