
    i8              &          S SK r S SKJr  S SKrS SKJr  \R
                  R                  SS5      r\R                  S5        \R                  S5        \R                  S5        \R                  S	5        \R                  S
\R                  R                  R                  /S9  \R                  S5        \R                  S5        \R                  S5        \R                  S5        \R                  S5        \R                  S5        S rS r\ R                  S 5       r              SgS\S\S\S\\   S\S\S\\   S\S\S\S\S \S!\S"\S#\S$\S%\S&\4$S' jjr\" S(5                    SgS\S\S\S\\   S\S\S\\   S\S\S\S\S \S!\S"\S#\S$\S%\S&\4$S) jj5       r  ShS*\S+\S,\S-\S.\S/\\   S0\\R*                     S&\4S1 jjr\" S25        ShS*\S+\S,\S-\S.\S/\\   S0\\R*                     S&\4S3 jj5       rS,\S&\\44S4 jr\" S55      S,\S&\\44S6 j5       rS7\S8\S9\S:\S&\4
S; jr\" S<5      S7\S8\S9\S:\S&\4
S= j5       rS7\S8\S9\S:\S>\S?\S/\\   S@\\   S0\\R*                     S&\4SA jr\" SB5      S7\S8\S9\S:\S>\S?\S/\\   S@\\   S0\\R*                     S&\4SC j5       r\ R                  " 5       SD 5       rSE r\" SF5      SG\SH\SI\SJ\4SK j5       rS,\SL\SM\S&\4SN jr\" SO5      S,\SL\SM\S&\4SP j5       rS*\SQ\SR\S,\SS\ST\SU\S/\\   S0\R*                  4SV jr\" SW5      S*\SQ\SR\S,\SS\ST\SU\S/\\   S0\R*                  S&\4SX j5       r\" SY5      SZ\S[\S\\S]\S$\S^\S_\S0\R*                  S&\4S` j5       rS,\SL\S&\4Sa jr\" Sb5      S,\SL\S&\4Sc j5       rS*\SQ\S,\SS\S/\\   S0\R*                  4Sd jr \" Se5      S*\SQ\S,\SS\S/\\   S0\R*                  S&\4Sf j5       rg)i    N)Optional)TensortorchaoFRAGMENTzrowwise_scaled_linear_sparse_cutlass_f8f8(Tensor input, Tensor input_scale, Tensor weight, Tensor weight_meta, Tensor weight_scale, Tensor? bias=None, ScalarType? out_dtype=None) -> TensorzLto_sparse_semi_structured_cutlass_sm9x_f8(Tensor weight) -> (Tensor, Tensor)z\swizzle_mm(Tensor mat1, Tensor mat2, bool mat1_is_swizzled, bool mat2_is_swizzled) -> Tensorzswizzle_scaled_mm(Tensor mat1, Tensor mat2, bool mat1_is_swizzled, bool mat2_is_swizzled, Tensor scale_a, Tensor scale_b, Tensor? bias=None, Tensor? scale_result=None, ScalarType? out_dtype=None) -> TensorzImx_fp8_bf16(Tensor a, Tensor b, Tensor a_scale, Tensor b_scale) -> Tensor)tagsa6  qscaled_dot_product(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, float? scale=None, float q_scale=1.0, int q_zp=0, float k_scale=1.0, int k_zp=0, float v_scale=1.0, int v_zp=0, float a_scale=1.0, int a_zp=0, float o_scale=1.0, int o_zp=0) -> Tensorzida8w4_linear_prepack_cpu(Tensor weight, Tensor scales, Tensor qzeros) -> (Tensor, Tensor, Tensor, Tensor)zda8w4_linear_cpu(Tensor input, Tensor input_scales, Tensor input_qzeros, Tensor weight, Tensor weight_scales, Tensor weight_qzeros, Tensor compensation, Tensor? bias, ScalarType output_dtype) -> Tensorz_scaled_embedding_bag(Tensor qweight, Tensor indices, Tensor offsets, Tensor weight_scale, float o_scale, int mode, bool include_last_offset, ScalarType output_dtype) -> TensorzKfloat8_linear_prepack_cpu(Tensor weight, Tensor scales) -> (Tensor, Tensor)zfloat8_linear_cpu(Tensor input, Tensor input_scales, Tensor weight, Tensor weight_scales, Tensor? bias, ScalarType output_dtype) -> Tensorc                    ^  U 4S jnU$ )Nc                 P   > [         R                  R                  T 5      " U 5      $ N)torchlibraryregister_fakefuncnames    J/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/torchao/ops.py	decorator%register_custom_op.<locals>.decorator4   s    }}**dV5d;;     r   r   s   ` r   register_custom_opr   3   s    < r   c                    ^  U 4S jnU$ )Nc                 N   > [         R                  R                  T SS9" U 5      $ )Nr   )mutates_args)r   r   	custom_opr   s    r   r   *register_custom_op_impl.<locals>.decorator;   s$    }}&&$r&B4HHr   r   r   s   ` r   register_custom_op_implr   :   s    I r   c                      [         R                  R                  [         R                  R                  5       5      n U R                  S-  U R
                  -   nU$ )N
   )r   cudaget_device_propertiescurrent_devicemajorminor)device_propscompute_capabilitys     r   cached_compute_capabilityr'   A   sE    ::33EJJ4M4M4OPL%++b0<3E3EEr   querykeyvalue	attn_mask	dropout_p	is_causalscaleq_scaleq_zpk_scalek_zpv_scalev_zpa_scalea_zpo_scaleo_zpreturnc                     [         R                  R                  R                  R	                  U UUUUUUUUU	U
UUUUUU5      $ )a  
Quantized SDPA with quantized inputs and outputs.
Arguments
    query: input query tensor,
    key: input key tensor,
    value: input value tensor,
    attn_mask: attention mask tensor,
    dropout_p: dropout probability,
    is_causal: causal flag,
    scale: scaling factor applied prior to softmax,
    q_scale: scale for query from linear quantization,
    q_zp: zero point for query from linear quantization,
    k_scale: scale for key from linear quantization,
    k_zp: zero point of key from linear quantization,
    v_scale: zero point for value from linear quantization,
    v_zp: zero point of value from linear quantization,
    a_scale: scale for attention from softmax quantization,
    a_zp: zero point for attention from softmax quantization,
    o_scale: scale for output from linear quantization,
    o_zp: zero point for output from linear quantization,
Returns
    output of quantized SDPA
)r   opsr   qscaled_dot_productdefaultr(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   s                    r   r<   r<   H   s[    T 990088# r   ztorchao::qscaled_dot_productc                     U $ r
   r   r>   s                    r   _r@      s	    ( Lr   inputinput_scaleweightweight_metaweight_scalebias	out_dtypec           	      n    [         R                  R                  R                  R	                  XX#XEU5      $ )a  
CUTLASS-based row-wise scaled F8F8 linear operator, for sparsified weight case.
Args:
    input: quantized input tensor, in row-major layout.
    input_scale: scale factors for input tensor, has to be tensor of the same shape as the input tensor, minus the last dimension.
    weight: sparsified quantized weight matrix, in row-major layout.
    weight_meta: sparsify metadata for weight tensor.
    weight_scale: scale factors for weight tensor, one value per row of weight matrix (thus also tensor of the same shape as the weight tensor, minus the last dimension).
    bias: an optional vector of size equal to number of rows of weight tensor, or None.
    out_dtype: optional data type for output tensor.
Returns:
    output: result tensor, in row-major layout.
)r   r;   r   )rowwise_scaled_linear_sparse_cutlass_f8f8r=   )rA   rB   rC   rD   rE   rF   rG   s          r   rI   rI      s0    . 99FFNNFY r   z2torchao::rowwise_scaled_linear_sparse_cutlass_f8f8c                     Ub  UOUR                   nU R                  n[        R                  " / U R                  S S QUR                  S   P7XxS9$ )Nr   dtypedevice)rM   rN   r   emptyshape)	rA   rB   rC   rD   rE   rF   rG   rM   rN   s	            r   r@   r@      sN     #.IK4E4EE\\F;;;Sb);6<<?;5XXr   c                 h    [         R                  R                  R                  R	                  U 5      $ )a  
CUTLASS-based conversion from sparsified input tensor to corresponding compressed tensor, along with corresponding metadata tensor.
Args:
    weight: input tensor, in row-major layout.
Returns:
    weight_compressed: compressed weight tensor, with sparsity eliminated, in row-major layout.
    weight_meta: metadata tensor, describing the sparsity structure of the input tensor, also in row-major layout.
)r   r;   r   )to_sparse_semi_structured_cutlass_sm9x_f8r=   rC   s    r   rR   rR      s%     99FFNNvVVr   z2torchao::to_sparse_semi_structured_cutlass_sm9x_f8c                     U R                  U S   U S   S-  5      U R                  U S   [        U S   S-  S5      [        R                  S94$ )Nr               rM   )	new_emptymaxr   charrS   s    r   r@   r@      sV     	F1IN3Cq	Q$;5::N r   mat1mat2mat1_is_swizzledmat2_is_swizzledc                 j    [         R                  R                  R                  R	                  XX#5      $ )zG
Similar to torch.mm but Tensor inputs can be SwizzleTensor instances.
)r   r;   r   
swizzle_mmr=   r]   r^   r_   r`   s       r   rb   rb      s-     99''//$ r   ztorchao::swizzle_mmc                 Z    U R                  U R                  S   UR                  S   5      $ Nr   rU   rZ   rP   rc   s       r   r@   r@      s%     >>$**Q-A77r   scale_ascale_bscale_resultc	                 x    [         R                  R                  R                  R	                  U UUUUUUUU5	      $ )zH
Similar to torch.mm but Tensor inputs can be SwizzleTensor instances.

)r   r;   r   swizzle_scaled_mmr=   	r]   r^   r_   r`   rg   rh   rF   ri   rG   s	            r   rk   rk      sB     99..66
 
r   ztorchao::swizzle_scaled_mmc	                 Z    U R                  U R                  S   UR                  S   5      $ re   rf   rl   s	            r   r@   r@     s%     >>$**Q-A77r   c                      [        [        S5      (       a   [        R                  [        R                  4$ [        R                  4$ )zGTODO: when e8m0 is hardened and major release lets remove uint8 supportfloat8_e8m0fnu)hasattrr   uint8ro   r   r   r   _get_dtypesrr   %  s3     u&''U1122KK>r   c                    ^ ^ [        5       n[        R                  " T R                  U;   U 4S j5        [        R                  " TR                  U;   U4S j5        g )Nc                  "   > ST R                    3$ )Nz4A_scale tensor must be uint8 or float8_e8m0fnu, got rY   )A_scales   r   <lambda>%_check_scale_dtypes.<locals>.<lambda>2      Fw}}oVr   c                  "   > ST R                    3$ )Nz4B_scale tensor must be uint8 or float8_e8m0fnu, got rY   )B_scales   r   rv   rw   6  rx   r   )rr   r   _checkrM   )ru   rz   allowed_dtypess   `` r   _check_scale_dtypesr}   -  sB     ]N	LL'V 
LL'Vr   ztorchao::mx_fp8_bf16ABru   rz   c                     [         R                  " U R                  S5      UR                  S5      4[         R                  U R                  S9$ )zMeta impl for mx_fp8_bf16r   rU   rL   )r   rO   sizebfloat16rN   )r~   r   ru   rz   s       r   meta_mx_fp8_bf16r   :  s4     ;;q	166!9-U^^AHHUUr   scalesqzerosc                 j    [         R                  R                  R                  R	                  XU5      $ )z
Prepack weights for DA8W4 linear operator on CPU.
Args:
    weight: weight tensor.
    scales: scales for weight tensor.
    qzeros: zero points for weight tensor.
Returns:
    packed weight, scales, and zero points.
)r   r;   r   da8w4_linear_prepack_cpur=   rC   r   r   s      r   r   r   @  s'     9955==ffUUr   z!torchao::da8w4_linear_prepack_cpuc                 2    XU[         R                  " 5       4$ r
   )r   r   r   s      r   r@   r@   Q  s    65<<>11r   input_scalesinput_qzerosweight_scalesweight_qzeroscompensationc	                 x    [         R                  R                  R                  R	                  U UUUUUUUU5	      $ )a  
DA8W4 linear operator on CPU.
Args:
    input: input tensor.
    input_scales: scales for input tensor.
    input_qzeros: zero points for input tensor.
    weight: weight tensor.
    weight_scales: scales for weight tensor.
    weight_qzeros: zero points for weight tensor.
    compensation: compensation tensor for weight.
    bias: optional bias tensor.
    out_dtype: output data type.
Returns:
    output tensor in out_dtype.
)r   r;   r   da8w4_linear_cpur=   )	rA   r   r   rC   r   r   r   rF   rG   s	            r   r   r   V  sB    4 99--55
 
r   ztorchao::da8w4_linear_cpuc	                     UR                  5       S:X  d   eUR                  S5      UR                  S5      -  S-  n	U R                  " / U R                  S S QU	P7SU06$ )N   r      rV   rK   rM   dimr   rZ   rP   )
rA   r   r   rC   r   r   r   rF   rG   Ns
             r   r@   r@   }  s^     ::<1AQ'!+A??AEKK,AaAyAAr   ztorchao::_scaled_embedding_bagqweightindicesoffsetsw_scalesmodeinclude_last_offsetc                 p    US:X  d   eUR                   S   S-
  nU R                  XR                   S   US9$ )NTr   rU   rY   )rP   rZ   )	r   r   r   r   r7   r   r   rG   
batch_sizes	            r   r@   r@     sD     $&&&q!A%JZq)9KKr   c                 h    [         R                  R                  R                  R	                  X5      $ )z
Prepack weights for float8 linear operator on CPU.
Args:
    weight: weight tensor.
    scales: scales for weight tensor.
Returns:
    packed weight, packed scales
)r   r;   r   float8_linear_prepack_cpur=   rC   r   s     r   r   r     s%     9966>>vNNr   z"torchao::float8_linear_prepack_cpuc                     X4$ r
   r   r   s     r   r@   r@     s
    >r   c                 r    [         R                  R                  R                  R	                  U UUUUU5      $ )a  
float8 linear operator on CPU.
Args:
    input: input tensor.
    input_scales: scales for input tensor.
    weight: weight tensor.
    weight_scales: scales for weight tensor.
    bias: optional bias tensor.
    out_dtype: output data type.
Returns:
    output tensor in out_dtype.
)r   r;   r   float8_linear_cpur=   )rA   r   rC   r   rF   rG   s         r   r   r     s9    ( 99..66 r   ztorchao::float8_linear_cpuc                    UR                  5       S;   d   eUR                  5       S:X  a#  UR                  S5      UR                  S5      -  OUR                  S5      nU R                  " / U R                  S S QUP7SU06$ )N)rV   r   r   r   r   rK   rM   r   )rA   r   rC   r   rF   rG   r   s          r   r@   r@     sp     ::<6!!!+1::<1+<AQ'&++a.A??AEKK,AaAyAAr   )Ng        FN      ?r   r   r   r   r   r   r   r   r   )NN)!	functoolstypingr   r   r   r   Librarylibdefine_CTagneeds_fixed_stride_orderr   r   	lru_cacher'   floatboolintr<   r@   rM   rI   rR   rb   rk   rr   r}   r   r   r   r   r   r   r   r   <module>r      s	      mmIz2 

 C 

R 

b 

 T
 

O
((,,
/
/	0   

 } 

o 

 P 

 w 

Q 

 Q
   #'!#<<	< < 	<
 < < E?< < < < < < < < <  !<" #<$ %<~ 23
 #'!#	  	
   E?          !" #$ % 48 "'+  	
  6
 $ 8 HI "'+YYY Y 	Y
 Y 6
Y $Y Y JY"WWfW HI		f	 J	
26JN )*8
88268JN88 +8

  	
   6
 6" $ 8 018
8
8 8 	8
 8 8 6
8 6"8 $8 8 28  
 *+V V6 VF VV V ,V
VVV V 	V" 782f 2f 2f 2 2 92$$$ $ 	$
 $ $ $ 6
$ {{$N /0BBB B 	B
 B B B 6
B {{B B 1B  45LLL L 	L
 L L L {{L L 6L OOO O 89f f   :  	
 6
 {{< 01
B
B
B 
B 	
B
 6

B {{
B 
B 2
Br   