
    iV                        S SK r S SKJr  S SKrS SKJr  S SKJrJrJ	r	  S SK
Jr  \ R                  " SS5      rSSSSSSSSSS	.	S
 jr        SS jrSSSSSSSSS.S\R                   S\R                   S\R                   S\\R                      S\\R                      S\\R                      S\S\\\\   \\   \\   4      S\\   4S jjrSqSqS rg)    N)Optional)	warn_once)get_metaminimizeupdate)
has_tritonBSR_AUTOTUNEF   )	betaalpha
left_alpharight_alphaoutstoreverboseforceopnamec       	           ^^^^#^$ SSK m$Uc  SnUR                  S   nUR                  5       nUR                  5       nUR	                  5       S-
  nUR                  XS-    u  nnUR                  US-   US-    u  nn[        SSS[        UU-  S5      S	9n[        SUR                  5       U-  U-  UU-  -  -
  S5      nUR                  nUc  UnOUR                  nUUL a  UnOUU4nSUU4nUUUUUTS:H  TS:H  US:H  4n[        UUUS
S9nUc  Sn[        UUSUS4S
S9nUc  UnOU
(       d  U$ S
nXX$U4UUUU$4S jjnUUUUU4S jn[        UUUUSU	S9u  m#nn n!U	(       a  [        SU! SUS SU S S35        U(       a[  U(       a  T#U:X  a  UULdI  [        R                  R                  5       n"[!        UU"UU[#        U#4S j[%        T#5       5       5      5        T#$ )zTune bsr_dense_addmm kernel parameters against the given inputs.

When store is True, the tuning results will be stored in the
database of kernel parameters.
r   Nbsr_dense_addmmr
            )GROUP_SIZE_ROW
num_stages	num_warpsSPLIT_NT)versionexactF      ?c           	      b   >^ ^^^^^ UUUUUUU UU	4	S jnT
R                   R                  USSS9$ )Nc                  &   >	 [        TTTTT TTTTS9	$ )N)r   r   r   r   metar   )r   )	r   r   bsrdenseinputr   r#   r   r   s	   \/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/torchao/kernel/bsr_triton_ops.py	test_func6tune_bsr_dense_addmm.<locals>.bench.<locals>.test_funcX   s+    "%'
 
    i  d   )warmuprep)testingdo_bench)r#   r&   r$   r%   r   r   r(   r   r   r   tritons   `````` r'   bench#tune_bsr_dense_addmm.<locals>.benchW   s+    	 	 ~~&&y#&FFr*   c	                 D   U S;   n	[        SSSSS9U    n
[        [        XW-  S5      S9R                  U 5      n[        SSSSS9U    nU	(       a  US:  a  XU-  -  OX[        U5      -  -  nOXU-  -   nU
b  [        X5      nUb  [	        X5      nU S:X  a
  X]-  S:w  a  U$ U$ )N>   r   r   r
   )r   r   r   r   )r   r   r   r   )dictmaxgetabsmin)namevalue	directionr#   MNKBMBKis_log	min_value	max_value
value_step
next_values                 r'   step_meta_parameter1tune_bsr_dense_addmm.<locals>.step_meta_parameterh   s     11aAaPQUV	QWa155d;	!qQqQRVW
 q= I--S^;<  i!77J Z3J Z3J91!4Lr*   )max_stepr   z-> z
, speedup=z.1fz %, timing=z.3fz msc              3   .   >#    U  H
  nTU   v   M     g 7fN ).0kr#   s     r'   	<genexpr>'tune_bsr_dense_addmm.<locals>.<genexpr>   s     0<a$q'<s   )r0   shapevaluescrow_indicesdimr4   r5   round_nnzdtyper   r   printtorchcudaget_device_namer   tuplesorted)%r&   r$   r%   r   r   r   r   r   r   r   r   r   r=   rQ   rR   
batch_ndimr<   r>   r?   r@   reference_metasparsityrV   	out_dtypeversion_dtyper   keyinitial_metamay_skip_updater1   rF   speeduptimingsensitivity_messagedevice_namer#   r0   s%      ` ``                            @@r'   tune_bsr_dense_addmmri      s@   ( ~"BAZZ\F##%L!!#a'J99Zq.1DAq\\*q.:>:FB Q!Sb!_N
 Qb2-Q77;HIIE
{	II	E	*-*GaBDAItqy%1*
=C FCELa_DQ)L   G G" =>aBSU 2 2:2.D'6. '(7C.VSMMNDL0\5Wjj00206$<00	
 Kr*   c                    Uc  [         R                  nUc  UnUc  SnXyX1S 1:X  GaQ  [         R                  R                  5       nXX#XES:H  US:H  US:H  4nXL a  UnOX4n[	        SUUUUU4S9nUc  US:w  a  [	        SUUUUS4S9nUc  XLa  [	        SUUXS4S9nUc  [	        S/ US S QSPUSS  Q7UUUS4S9nUc  XLa  [	        S/ US S QSPUSS  Q7UXS4S9n[        U=(       d    0 5       H=  nUU   nUS   nUS	   nUU-  nUU-  S:X  d  M"  UU::  d  M*  [        U5      nUU-  US	'   M?     Ub  UR                  " S0 UD6  U$ [        S
U < SU< SU< SU< SU< SU< SU< SU< SU< S35        U=(       d    [        X#-  S5      nU=(       d    SnU
=(       d    Sn
U	=(       d    Sn	[        SUUU
U	S.UD6$ )Nr    r   r
   r   )r   r   *r   r   z@bsr_dense_addmm uses non-optimal triton kernel parameters for M=z K=z N=z Ms=z, Ks=z beta=z alpha=z dtype=z out_dtype=zC. To find optimal triton kernel parameters, run with BSR_AUTOTUNE=1r   )r   r   r   r   rK   )
rX   float16rY   rZ   r   r\   r4   r   r   r5   )r<   r>   r=   MsKsr   r   r   r   r   r   r_   rV   r`   _versionextrarh   rb   ra   r#   matching_metamkeymeta_nsplit_ncs                             r'   bsr_dense_addmm_metarw      s   * }	J7D6Ajj002QB	419eqjA!M!,M}h7	
 <HO!!=#6	D <E2!3hs=SD <$!)#bq')3)QR)!=#6	M $)? (%-c"1g-s-SW-%c2	! }23%d+G	*Lq5A:!q&;D&'1fDO 4 KK % Kt4QD!Ure6bU'D7(E88UHLi\ ZTT (QWaG#(qNqJQI %	
  r*   )r   r   r   r   r   skip_checksmax_gridr#   r&   r$   r%   r   r   r   rx   ry   r#   c                H	  ^^^
^$^%^&^'^(^)^*^+ [        5         SnUR                  5       nUR                  5       nUR                  5       nUR	                  5       S-
  nUR
                  XS-    u  nnUR
                  US-   US-    nUR
                  S   nSSKJm+  SSKJ	nJ
nJnJm)Jn  U" XU5      nUc  UR                  UUU4-   5      nUR                  5       S:X  d  TS:X  d  US:X  d  US:X  d  US:X  aB  TS:X  a  UR!                  5         U$ UR#                  U 5        TS:w  a  UR%                  T5        U$ T
c{  ['        SUR                  5       US   -  US   -  UU-  -  -
  S5      n[(        (       a  [+        U UUTTUUUS	S
S	SS9m
O,[-        UUUUS   US   TTUUR.                  UR.                  S9
m
S
m(S
m*Uc(  S	m(UR                  S5      R0                  " / UQUPUP76 nO*UR2                  " / UQUPSP76 R0                  " / UQUPUP76 nUc(  S	m*UR                  S5      R0                  " / UQUPUP76 nO*UR2                  " / UQSPUP76 R0                  " / UQUPUP76 nUR5                  5       S   S:X  d   eUR5                  5       S   S:X  d   eUnU" XX%Xg5      u  nnnn nnnnUu  m%m$T
R7                  S[9        UT%-  S5      5      nUU-  m&UnU" UT%T&45      nU" UT$T&45      nU" U T%T&45      n U" UT%T&45      nU" UT%T&45      n[:        R<                  T+R>                  [:        R@                  T+R>                  [:        R>                  T+RB                  [:        RB                  T+RB                  [:        RD                  T+RF                  [:        RF                  T+RF                  0UR.                     m'URI                  S5      nURI                  S5      S-
  nURI                  S5      nUUU4n U	b*  [K        U	SS SSS2   5      SS[M        U	SS 5      -
  -  -   n!OSn!USUSUSU SUSUSUSUS0n"TS:w  d   eU$U%U&UUU'U(U
U)U*U+4S jn#U" U#U"U U!5        URO                  5       URO                  5       :w  a*  UR#                  UR3                  UR
                  5      5        U$ )zCompute

  out = beta * input + left_alpha.reshape(-1, 1) * (alpha * (bsr @ dense)) * right_alpha.reshape(1, -1)

where left_alpha, right_alpha are (* + 1)-D tensors when
specified, otherwise, these are treated as tensors filled with
ones.
r   r
   r   r   r   r   N)broadcast_batch_dimslaunch_kernelprepare_inputsptr_stride_extractortile_to_blocksizeTF)	r   r   r   r   r   r   r   r   r   )r_   rV   r`   rK   r   rJ   )r   NN)r   Nr   )r   r   )r   r   Nc                 z   > [         U    " / T
" U6 QTPTP7TS:H  TS:g  TS:H  TTTTTTTR                  :H  TS.
T	D6  g )Nr
   r   )
beta_is_onebeta_is_nonzeroalpha_is_oneleft_alpha_is_oneright_alpha_is_oneBLOCKSIZE_ROWBLOCKSIZE_INNERBLOCKSIZE_COL
allow_tf32	acc_dtype)_bsr_strided_addmm_kernelfloat32)gridsliced_tensorsr@   r?   BNr   r   dot_out_dtyper   r#   r~   r   tls     r'   kernelbsr_dense_addmm.<locals>.kernel  ss    !$' 	
!>2	
	
 	
 	 AI!/1$

2#	
 	
r*   )(_lazy_init_tritonrQ   rR   col_indicesrS   rP   triton.languagelanguagetorch.sparse._triton_opsr{   r|   r}   r~   r   	new_emptyrU   zero_copy_mul_rT   AUTOTUNEri   rw   rV   expandviewstrider6   r5   rX   rl   r   bfloat16float64int8int32sizer[   lendata_ptr),r&   r$   r%   r   r   r   r   r   rx   ry   r#   f_namerQ   rR   r   r]   r<   r>   	blocksizer=   r{   r|   r}   r   original_batch_dims_broadcastedr_   
out_backupr   out_untiled	n_batchesn_block_rowsn_block_cols	full_gridgrid_blockstensor_dims_mapr   r@   r?   r   r   r   r~   r   r   s,      ``     `                         @@@@@@@@r'   r   r      s7   . FZZ\F##%L//#K!!#a'J99Zq.1DAqZ!^j1n=IBA !  ';6&N#
{oo=AFG
xxzQ%1*Q!q&AF19IIK
 
 IIeqy
|SXXZ)A,61EQOOQRS8'%'(D (!!!kk))D  __R(// 
,
./
12

  __L&ELqL!LSS 
,
./
12

 !oob)00 
,
./
12
 "&&N(GNNANUU 
,
./
12
 r"a'''#q(((J 	s5kG	 FBhhy#a2gq/2G	
gBK
C"b
*Ceb"X.Eeb"X.E":Bx8J#K"b:K 	rzz

rzzrzz

BHHRXX 
iiM 

1I$$R(1,L::b>LL,7IHRaL2./'QXbq\AR=R2SS 	m_{}K[[	O A::
 
$ &/9kB
||~,,.. 	))**:*:;<r*   c            !        ^ [         (       a  g Sq [        5       (       d  g SS Kn SS KJm  U R
                  STR                  STR                  STR                  STR                  STR                  STR                  S	TR                  S
TR                  STR                  STR                  STR                  STR                  STR                  STR                  STR                  STR                  4 U4S jj5       nUqg )NTr   left_alpha_tiled_col_strideleft_alpha_col_block_strideright_alpha_tiled_row_strideright_alpha_row_block_strider   r   r   r   r   r   r   r   r   r   r   r   c7           	        > US:X  d   eUS:X  d   eUS:X  d   eU!S:X  d   eTPR                  SS9n7TPR                  SS9n8TPR                  SS9n9TPR                  SS9n:TPR                  SS9n;TPR                  U8U9U:U;U55      u  n8n9UUU7-  -   UU8-  -   n<TPR                  U<5      n=TPR                  U<U-   5      n>U>U=-
  n?TPR	                  SU05      n@TPR	                  SU25      nAU1S:  d	  U1S-  S:w  a  SnBOU1nBTPR	                  SWB5      nCU UU7-  -   UU=-  -   UW@S S 2S 4   -  -   UWAS S S 24   -  -   nDUUU7-  -   UU9-  -   UUAS S 2S 4   -  -   UUCS S S 24   -  -   nEU#U$U7-  -   U%U8-  -   U&U9-  -   U'U@S S 2S 4   -  -   U(UCS S S 24   -  -   nFUU	U7-  -   U
U=-  -   nGTPR                  U0UB4U3S9nH[        U?5       Hd  nITPR                  WD5      nJTPR                  WG5      nKTPR                  WEUUK-  -   WCS S S 24   U1:  S9nLWHTPR                  UJULU4U3S9-  nHUDU-  nDUGU
-  nGMf     U-(       d  WHU*-  nHU.(       dB  UUU7-  -   UU8-  -   UU9-  -   UW@S S 2S 4   -  -   UWCS S S 24   -  -   nMWHTPR                  UM5      -  nHU/(       dB  UUU7-  -   UU8-  -   U U9-  -   U!W@S S 2S 4   -  -   U"WCS S S 24   -  -   nNWHTPR                  UN5      -  nHU,(       aa  UUU7-  -   UU8-  -   UU9-  -   UW@S S 2S 4   -  -   UWCS S S 24   -  -   nOU+(       a  WHTPR                  WO5      -  nHOWHU)TPR                  WO5      -  -  nHTPR                  WFWHR                  U#R                  R                  5      WCS S S 24   U1:  S9  g )	Nr   r   )axisr
      )rV   )mask)r   r`   )
program_idnum_programs	swizzle2dloadarangezerosrangedotr   torV   
element_ty)Q
values_ptrvalues_batch_stridevalues_nnz_stridevalues_row_block_stridevalues_col_block_stridecrow_indices_ptrcrow_indices_batch_stridecrow_indices_stridecol_indices_ptrcol_indices_batch_stridecol_indices_stride	input_ptrinput_batch_strideinput_tiled_row_strideinput_tiled_col_strideinput_row_block_strideinput_col_block_stride	dense_ptrdense_batch_stridedense_tiled_row_stridedense_tiled_col_stridedense_row_block_stridedense_col_block_strideleft_alpha_ptrleft_alpha_batch_strideleft_alpha_tiled_row_strider   left_alpha_row_block_strider   right_alpha_ptrright_alpha_batch_strider   right_alpha_tiled_col_strider   right_alpha_col_block_stride
output_ptroutput_batch_strideoutput_tiled_row_strideoutput_tiled_col_strideoutput_row_block_strideoutput_col_block_strider   r   r   r   r   r   r   r   r   r   r   r   r   r   	batch_pidrow_block_pidcol_block_pidr   r   crow_indices_offset_ptr
nnz_offsetnnz_offset_nextrow_nnzrow_block_arangeinner_block_arangePADDED_BLOCKSIZE_COLcol_block_arangevalues_block_ptrsdense_block_ptrsoutput_ptrscol_index_nnz_ptroutput_acc_block_values_blockdense_row_idxdense_blockleft_alpha_ptrsright_alpha_ptrs
input_ptrsr   sQ                                                                                   r'   _bsr_strided_addmm_kernel_impl9_lazy_init_triton.<locals>._bsr_strided_addmm_kernel_impl  s<   V +a///*a///+q000+q000MMqM)	1-1-A.A.')||=,n(
$}
 ')34!M12 	 
 WW45
''"9<O"OP "J.99Q6YYq/:2!3q!813 1> 99Q(<= !I-.*,- &(8D(AAB &(:47(CC	D 	  9,-$}45 %'9!T''BBC %'7a'@@	A 	 !I-.%56 &56 &(8D(AA	B
 &(8q(AAB 	 &23 :-. 	 8801 $ 
 wA77#45L GG$56M'' #9M#II%dAg.> " K kjI !' ! 
 !22!33%  ( % )I56-=> .=> .0@D0II	J
 .0@q0IIJ   88!*Y67.>? />? /1A!T'1JJ	K
 /1A$'1JJK  (8 99$y01(=89 )=89 )+;AtG+DD	E
 )+;D!G+DDE   BGGJ$77  D277:+>$>>  	
 0 0 ; ;<!$'*]: 	 	
r*   )_triton_initializedr   r0   r   r   jit	constexprr   )r0   r  r   s     @r'   r   r     sS   << ZZU
L &(\\MU
P &(\\QU
Z ')ll[U
^ ')ll_U
x \\yU
z {U
| ll}U
~ <<U
@ LLAU
B ||CU
D ||EU
F GU
H <<IU
J LLKU
L MU
N OU
 U
n !?r*   )NNNNNNNr   )ostypingr   rX   torch._dynamo.utilsr   torch.sparse._triton_ops_metar   r   r   torch.utils._tritonr   getenvr   ri   rw   Tensorboolr[   intr4   r   r  r   r   rK   r*   r'   <module>r     sM   
   ) D D *99^U+ 



E` 
^L 

)-*."&MQB<<B	B <<B &B %,,'B 
%,,	B B uXc]HSM8C=HIJB 4.BL    d?r*   