
    iX                        S SK JrJrJr  S SKrS SKrSrSrSrSr	SSS	S
SS.SSSSS
SS.SSSSSSS.SSSSS
SS.SSSS
SS.SSSSS.S.r
S3S\\   4S  jjrS!r S4S"\S#\\   S$\\   S%\\\R                  \4      4S& jjr S3S'\R                  S(\R                  S)\R                  S$\\   S\\   S%\R                  4S* jjrS'\R                  S(\R                  S)\R                  S#\\   S$\\   S\\   4S+ jr S3S#\\   S$\\   S,\S\\   4S- jjr S4S"\S.\\   S%\\\R                  \4      4S/ jjr S3S.\\   S\\   4S0 jjrS3S\\   4S1 jjr S3S'\R                  S(\R                  S)\R                  S.\\   S\\   4
S2 jjrg)5    )ListOptionalUnionNg      ?         g vCg 
`Cg   .YvBg(\?gq=
ףp?)bf16_peak_topsfp8_peak_topspeak_mem_bw_bytes_secpct_achievable_gemm_topspct_achievable_mem_bwg sCg s/Cg s?Cg   gB)r	   r
   fp4_peak_topsr   r   r   g mvHCg   Bg=
ףp=?gzG?g ֒Cg ֒"Cg   xHBg  @Bg  @Bg  @Cg  BwC)r	   r
   r   r   )zNVIDIA H100zNVIDIA B200zNVIDIA B300 SXM6 ACzNVIDIA GB200zAMD Instinct MI300XzNVIDIA GeForce RTX 5090gpu_namec                 X    U c  [         R                  R                  S5      n [        U    $ )Nr   )torchcudaget_device_namegpu_name_to_specs)r   s    f/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/torchao/testing/training/roofline_utils.py	get_specsr   e   s'    ::--a0X&&    g>tensor_rolefloat8_recipe_namemx_recipe_namereturnc                 :   X-  nSnUS:X  ae  US:X  a/  U(       a  Sn	O	[         U-  n	[         U-  [        U-  -   n
U
nU	SX/nGOU(       a  Sn	O	[         U-  n	[         U-  S[        -  U-  -   n
U	SU
/nGOWUS:X  a  US:X  aL  U(       a  S[        U-  -   n	O[         U-  [        U-  -   n	[         U-  S-  n[         U-  [        U-  -   n
XU
/nGOU(       a  S[        U-  -   [         U-  -   n	O[         U-  [        U-  -   [         U-  -   n	[         U-  [        U-  -   nX/nGOUS:X  am  US;   a,  U(       a  S[        U-  -   n	O[         U-  [        U-  -   n	U	/nGOtUS:X  a.  [         U-  [        U-  -   n	Sn[         U-  [        U-  -   n
XU
/nGO@ S	5       eUS
:X  a,  U(       a  S[        U-  -   n	O[         U-  [        U-  -   n	U	/nGOUS:X  ar  US;   a;  U(       a  S[        U-  -   n	O[         U-  [        U-  -   n	[         U-  [        U-  -   nO-US:X  a   [         U-  [        U-  -   n	[        U-  S-  nO S	5       eX/nOUS;   d   SU< 35       eUS:X  a;  U(       a  S[        U-  -   n	O[         U-  [        U-  -   n	[         U-  [        U-  -   nO:U(       a  S[        U-  -   n	O[         U-  [        U-  -   n	[         U-  [        U-  -   nX/nU Vs/ s H  nXS   -  U S   -  PM     nnU Vs/ s H  n[        R                  " U[
        5      PM      nnU$ s  snf s  snf )a  
Calculates the roofline estimate of casting one of the gemm inputs
(input, weight or grad_output) to float8 in fwd+bwd.

Inputs: dim0 and dim1 (shape), tensor_role (input|weight|grad_output), recipe names
Outputs: list of read/write traffic overhead in seconds, one for each kernel
N
tensorwiseweightr   r   rowwiserowwise_with_gw_hp)inputgrad_outputunsupported mxfp8_32x32_flexible_gemm_layoutmxfp8_32x32_weight)mxfp8_emulatedmxfp8_cublasmxfp8_cublas_rceilmxfp4_cutlasszunsupported mx_recipe_name=r)   r   r   )BYTES_PER_EL_BF16BYTES_PER_EL_FLOAT8BYTES_PER_EL_FLOAT4sympyMaxKERNEL_LAUNCH_OVERHEAD_SEC)specsdim0dim1r   r   r   fuse_with_prevnumel	res_byteskernel_1_rwkernel_3_rwkernel_4_rwkernel_2_rwxres_ss                  r    get_tensor_memory_traffic_ovhd_sr<   q   s   " KEI\)("  0%7+e36IE6QQK%K$aBI  0%7+e3a:M6MPU6UUK$a5I	y	((" "5"==/%7:MPU:UU+e3a7K+e36IE6QQK$;?I  +e336G%6OO 
 &-)E12'%/0 
 ,e36IE6QQK$2I	3	322 "5"==/%7:MPU:UU$IH$
 ,e36IE6QQKK+e36IE6QQK$;?I'-'5	=	= 1E99K+e36IE6QQK M		/	/
 22 "5"==/%7:MPU:UU+e36IE6QQKH$ ,e36IE6QQK-59K (-'5 .	  "
 
 	,
 *.*+	, 
 _,"5"==/%7:MPU:UU+e36IE6QQK"5"==/%7:MPU:UU+e36IE6QQK .	
 A 	
)**U3J-KK 
  @EEu!UYYq45uEEL Fs   L*%LMKNc                    [        U5      nSU -  U-  U-  nU[        R                  L a  US   nOMU[        R                  [        R                  4;   a  US   nO#U[        R
                  L a  US   nO
 SU 35       eXx-  US   -  n	X-  X-  -   n
X-  nUb|  UR                  S5      (       d
   SU 35       eU[        R                  [        R                  [        R
                  4;   d   S	5       eUR                  S
5      (       a  SOSnX-  nX-   n
U[        R                  L a  U
[        -  U[        -  -   nOiU[        R                  [        R                  4;   a  U
[        -  U[        -  -   nO1U[        R
                  L a  U
[        -  U[        -  -   nO
 SU 35       eXS   -  US   -  n[        R                  " X[        5      $ )Nr   r	   r
   r   zunsupported dtype: r   )mxfp8mxfp4nvfp4zUnsupported recipe r#   mx       r   r   )r   r   bfloat16float8_e4m3fnfloat8_e5m2float4_e2m1fn_x2
startswithr*   r+   r,   r-   r.   r/   )r=   r>   r?   dtyper   r   r0   gemm_ops	peak_topscompute_gemm_time_s	num_reads
num_writes
block_sizenum_scale_readsbytes_rwmem_gemm_time_ss                   r   get_individual_gemm_time_sympyrV   *  s    hE1uqy1}H*+		5&&(9(9:	:/*		%((	(/*	3+E733u".7Q1RR IJ!(()DEE 	
!.!12	
E ""
 
 	 		 
 *44T::R
#1/	00:@Q3QQ	5&&(9(9:	:22ZBS5SS	%((	(22ZBS5SS3+E733u011E:Q4RR  99(;UVVr   c                     UUUpnUS:X  a  [         R                  n	[        XX'XV5      n
[        XXXV5      n[        XX)XV5      nX-   U-   nU$ )Nr    )r   rG   rV   )r=   r>   r?   rL   r   r   r   gemm_dtype_inputgemm_dtype_grad_inputgemm_dtype_grad_weightgemm_output_time_sgemm_grad_input_time_sgemm_grad_weight_time_stotals                 r   get_gemm_time_sympyr_   b  sz     	 .D
 11!&7	a> <	a =	a 7:QQELr   enable_fusion_modelingc           
          [        U5      n[        UU USUUUS9n[        UUUSUUSS9n	[        UU USUUUS9n
[        / UQU	QU
Q5      nU$ )Nr!   )r   r   r   r3   r   Fr"   )r   r<   sum)r=   r>   r?   r   r   r`   r   r0   fwd_fp8_input_memfwd_fp8_weight_memgi_fp8_grad_output_memress               r   get_float8_mem_sympyrg     s     hE 9		-%- :		-% >		!-%- P!P$6P9OP
QCJr   recipe_namec                 p   USL d   S5       eUS:X  d   S5       eX-  nSn1 SknU=S:X  a"    [         U-  n	[         U-  [        U-  -   n
X/nGO=S:X  a$    [         U-  [        U-  -   n	U	[        U-  -  n	U	/nO=o(       a@  UR                  S	5      (       a*    [         U-  [        U-  -   n	U	[        U-  US
-  -  -  n	U	/nO=S:X  a(    [         U-  [        U-  -   [        U-  US
-  -  -   n	U	/nOw=S:X  a3    [         U-  n	[         U-  [        U-  -   [        U-  US-  -  -   n
X-   /nO>S:X  a'  [         U-  [        U-  -   [        U-  US-  -  -   n	U	/nO [        SU SU 35      eU Vs/ s H  nXS   -  U S   -  PM     nnU Vs/ s H  n[        R                  " U[        5      PM      nnU$ s  snf s  snf )zw
Inference version of `get_tensor_memory_traffic_ovhd_s`.
The only thing happening here is we quantize the activation.
Fr#   r!   z*inference only quantizes input activationsN>   r   r   mxfp4*mxfp8*nvfp4*r   r   rA   rE   rB   rC   rF   nvfp4_staticzUnknown recipe name: z. Allowed recipes: r   r   )	r*   r+   BYTES_PER_EL_FLOAT32rK   r,   
ValueErrorr-   r.   r/   )r0   r1   r2   r   rh   r3   r4   r5   allowed_recipesr6   r7   namer:   r;   s                 r   *get_inference_tensor_memory_traffic_ovhd_srr     sK    U"1M1"'!O#OO! KEIMO
 ,e3K+e36IE6QQK$2I ,e36IE6QQK/$66K$ITdoog66T ,e36IE6QQK.5DDK$I
 "E)%-. &,
;	<  %I ,e3K "E)%-. &,
;	<  %23I "E)%-. &,
;	<  %I'} 5$$3#46  A 	
)**U3J-KK 
  @EEu!UYYq45uEEL Fs   (F.%F3c           	      R    [        U5      n[        UU USUSS9n[        / UQ5      nU$ )Nr!   F)r   rh   r3   )r   rr   rb   )r=   r>   r?   rh   r   r0   rc   rf   s           r   get_inference_float8_mem_sympyrt   +  sF     hE C		 "!"
#CJr   c                     [        U5      n[        U -  U-  S-  nXTS   -  US   -  n[        R                  " U[        5      nU$ )Nr   r   r   )r   r*   r-   r.   r/   )r=   r>   r?   r   r0   	kernel_rwr;   s          r   'get_inference_bf16_activation_mem_sympyrw   A  sO    hE!A%)A-I566?V9WWEIIe78ELr   c                      [        XX#XE5      nU$ N)rV   )r=   r>   r?   rL   rh   r   r[   s          r   get_inference_gemm_time_sympyrz   K  s     8	a r   ry   )F)typingr   r   r   r-   r   r,   r+   r*   rn   r   strr   r/   Symbolfloatr<   rV   r_   boolrg   rr   rt   rw   rz    r   r   <module>r      s   ) (      
 ! !'$( "& " "< %) "&%, "!%$(!%
 " "( %) "&$ " !' %) "& # !) SQ h' ' +  v 	v
 !v SMv 
%e#
$%v~ #5W||5W||5W ||5W
 SM5W sm5W \\5Wp|||| ||
 ! SM smR #3 !	3
 SM3 !3 sm3x n 	n
 #n 
%e#
$%nl # #	
 sm,x}   #|||| ||
 # smr   