
    iP                     0   % S SK JrJrJrJr  S SKrS SKJr  S SKJ	r	J
r
  S SKJrJr  S SKJr  \R                   R"                  r\R                   R$                  r\R                   R&                  r0 r\\\4   \S'      S(S\R,                  S	\R,                  S
\R,                  S\R,                  S\R.                  S\\R,                     S\\R,                     S\S\R,                  4S jjrS rS r\" \R8                  R:                  \R<                  R:                  \R>                  R:                  \R@                  R,                  \RB                  RD                  \RF                  R:                  /5      S)S j5       r$\" \RJ                  R:                  /5      S)S j5       r&\" \RN                  R:                  \RP                  RR                  /5      S)S j5       r*\" \RV                  R:                  /5      S)S j5       r,\" \RZ                  R,                  /5      S)S j5       r.\" \R^                  R:                  /5      S)S j5       r0\" \Rb                  Rd                  /5      S)S j5       r3S\	S\	4S jr4\" \Rj                  R:                  \Rl                  R:                  /5      S)S j5       r7\" \Rp                  R:                  /5      S)S j5       r9\" \Rt                  R:                  /5      S)S  j5       r;\" \Rx                  R:                  /5      S)S! j5       r=\" \R|                  R:                  \R|                  R:                  /5      S)S" j5       r?\" \R                  R:                  \R                  R:                  /5      S)S# j5       rA\" S$5      (       a%  \" \R                  R:                  /5      S)S% j5       rC\" \R                  R:                  /5      S)S& j5       rE\" \R                  R:                  /5      S)S' j5       rGg)*    )AnyDictOptionalTupleN)tree_map)Float8TrainingTensorchoose_scaled_mm_config)is_row_majorpad_tensor_for_matmul)torch_version_at_leastFLOAT8_OPS_TABLEa_dataa_scaleb_datab_scaleoutput_dtypeoutput_scalebiasuse_fast_accumreturnc                    UR                  5       nUR                  5       n	Sn
UR                  U R                  S   S4:H  =(       a    UR                  SUR                  S   4:H  nU(       a-  U(       d&  X-  n
UR                  S5      nUR                  S5      n	UnU[        R                  [        R
                  4;   a  U(       a  [        R                  nSnU[        R
                  :X  a  UnSn[        R                  " U UUU	UUUUS9nU
b  X-  nUb  X-  nU[        R                  [        R
                  4;   a  U(       a  UR                  U5      nU$ )z
This is the unwrapped version of addmm_float8, which does not take in Float8TrainingTensors
as inputs. This is used to standardize the logic between subclassed and non subclassed
versions of the linear module.
Nr       )scale_ascale_br   scale_result	out_dtyper   )	
reciprocalshapenew_onestorchfloat16float32bfloat16
_scaled_mmto)r   r   r   r   r   r   r   r   a_inverse_scaleb_inverse_scalepost_inverse_scaleis_rowwise_scaling
orig_dtype	post_biasoutputs                  X/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/torchao/float8/float8_ops.pyaddmm_float8_unwrappedr/      sM    ((*O((*O 6<<?A*>> 7==	QU D
 . ->)2226)2226 JemmU]]338J~~Iu}}$	!%	F %$emmU]]338J:&M    c                 J    [        UR                  5      S;   d
   U  S35       eg )N)r   r   z+ with axiswise scaling is not supported yet)lenr   )aten_opscales     r.   _assert_tensorwise_scaler5   _   s1     	EKKF"? =>	? 	#r0   c                    ^  U 4S jnU$ )z(Register aten ops to the float8 op tablec                    > T H8  nU[         ;   a"  [        SU S[         U   R                   35      eU [         U'   M:     U $ )Nz
Float8 op z is already registered to )r   RuntimeError__name__)funcopaten_opss     r.   	decoratorimplements.<locals>.decoratorj   sV    B%%" $>?OPR?S?\?\>]^  $(R   r0   r   )r<   r=   s   ` r.   
implementsr?   g   s     r0   c                     [        XS   R                  5        U " US   R                  /USS  Q70 UD6n[        UUS   R                  US   R                  US   R
                  US   R                  5      $ Nr   r   )r5   _scale_datar   _orig_dtype_linear_mm_config_gemm_input_role)r3   argskwargsnew_datas       r.   float8_desugar_oprJ   v   sx     W1gnn5tAw}}:tABx:6:HQQQ!!Q   r0   c                     U " US   R                   /USS  Q70 UD6nU " US   R                  /USS  Q70 UD6n[        UUUS   R                  US   R                  US   R
                  5      $ rA   )rC   rB   r   rD   rE   rF   )r3   rG   rH   rI   	new_scales        r.    float8_desugar_data_and_scale_oprM      s     tAw}}:tABx:6:HQ<ab<V<IQQ!!Q   r0   c                    U " US   R                   /USS  Q70 UD6nUS   R                  R                  S:  a  U " US   R                  /USS  Q70 UD6nOUS   R                  nU [        R                  R
                  :X  a  [        XS   R                  5        US   R                  nUnUb  US:X  a  US:H    OUS:H    [        UUUS   R                  US   R                  US   R                  U5      $ )Nr   r   )rC   rB   ndimaten	transposeintr5   _axiswise_dimr   rD   rE   rF   )r3   rG   rH   rI   rL   old_axiswise_dimnew_axiswise_dims          r.   float8_transposerW      s     tAw}}:tABx:6:HAw~~QDGNN@T!"X@@	GNN	$..$$$ q'..9Aw,,'#q "!QQ!!Q   r0   c                 .   US   US   pCU[        UR                  R                  5      :X  am  U " US   R                  /USS  Q70 UD6n[        UUS   R                  US   R
                  US   R                  US   R                  US   R                  5      $ [        US   R                  R                  5      S:  a  [        XU5      $ UR                  n[        U5      S:X  a  US:X  ag  U " UR                  U40 UD6nSUS   /nU " UR                  U40 UD6n[        UUUR
                  UR                  UR                  UR                  5      $ US:X  d  U[        UR                  5      S-
  :X  a_  U " UR                  U40 UD6nUS   S/nU " UR                  U40 UD6nSn	[        UUUR
                  UR                  UR                  U	5      $ [        U  SUR                   SUR                  R                   SUR                   SU S	3
5      e)
Nr   r      rO   z# with axiswise scaling and t.shape z t._scale.shape z t._axiswise_dim z new_shape z is not supported yet.)listrC   r   r   rB   rD   rE   rF   rT   r2   rJ   AssertionError)
r3   rG   rH   t	new_shaperI   axiswise_dimnew_scale_shaperL   rV   s
             r.   float8_viewr`      sI   7DGy D''47==>48>v>#GNNGG%%G$$G!!
 	
 47>> 1$ 77 ??L
9~1qww	<V<H )B-0O/DVDI'##""  R<CL14D#Eqww	<V<H(|Q/O/DVDI!'##""   )6qwwi?OPQPXPXP^P^O__pqr  rA  rA  qB  BM  NW  MX  Xn  	o r0   c                    ^ U " TS   R                   /TSS  Q70 UD6n[        U TS   R                  5        U4S jn[        XC5      n[	        U5      $ )Nr   r   c                    > [        U TS   R                  TS   R                  TS   R                  TS   R                  5      $ )Nr   )r   rB   rD   rE   rF   )datarG   s    r.   make_float8!float8_split.<locals>.make_float8   sE    #GNNGG%%G$$
 	
r0   )rC   r5   rB   maprZ   )r3   rG   rH   new_data_tensorsrd   outs    `    r.   float8_splitri      sS    tAw}}BtABxB6BWd1gnn5
 k
,C9r0   c                    US   nUS   R                   nUS   R                  nUS   R                  nUS   R                  R                  nUS   R
                  n/ n	U H  n
[        U
[        5      (       d   S5       eU
R                   U:X  d   S5       eU
R                  UL d   S5       eU
R                  UL d   S5       eU
R                  R                  U:X  d   S5       eU
R
                  UL d   S5       e[        X
R                  5        U	R                  U
R                  R                  [        R                  5      5        M     U " U	/USS  Q70 UD6nUR                  U5      n[        XXFU5      $ )	Nr   z7Expecting all chunks to be of type Float8TrainingTensorz,Expecting all chunks to be of the same dtypezCExpecting all chunks to have thee same scale as a result of a splitzGExpecting all chunks to have thee same mm config as a result of a splitzCExpecting all chunks to be of the same dtype as a result of a splitzLExpecting all chunks to have the same gemm_input_role as a result of a splitr   )rD   rB   rE   rC   dtyperF   
isinstancer   r5   appendviewr!   uint8)r3   rG   rH   chunked_tensorsr+   r4   	mm_config	fp8_dtypegemm_input_role
chunk_datachunkrI   s               r.   
float8_catrv   	  s   377O #//JA%%E"44I"((..I%a(99OJ %!566 	
E	
6   J. 	
:	
. ||u$ 	
Q	
$ &&)3 	
U	
3 {{  I- 	
Q	
- %%8 	
Z	
8 	!,,7%++**5;;78) !, z7DH77H}}Y'HXXr0   c                 t    [        XS   R                  5        S n[        X15      n[        X25      nU " U0 UD6$ )a  Be careful with this function, this is a "fallback" op that
casts the output of the op to the original precision. And performs the op.

We currently need this to support the backward for admmm bias.
"addmm" -> out
"hp_gradBias" <-"sum" <- "identity" <- gradOut <- "hp_gradOut"
r   c                 P    [        U [        5      (       a  U R                  5       $ U $ N)rl   r   to_original_precision)xs    r.   unwrap!float8_cast_up_op.<locals>.unwrap9  s$    a-..**,,r0   )r5   rB   r   )r3   rG   rH   r|   new_args
new_kwargss         r.   float8_cast_up_opr   .  s?     W1gnn5
 %H&)JH+
++r0   abc                     U R                   nU R                  nUR                   n[        U R                  U R                  UR                  UR                  5      nUR
                  (       a  U R                   R                  S5      UR                   R                  S5      :X  d?   SU R                   R                  S5       SUR                   R                  S5       35       e[        USS9n[        USS9n[        UR                  5       5      (       d  UR                  5       n[        UR                  5       5      (       a,  UR                  5       R                  5       R                  5       nUR                  nU R                  c<  UR                  b/  UR                  UR                  S   5      R                  SS5      nOHU R                  b;  UR                  c.  UR                  UR                  S   5      R                  SS5      nX#XF4$ )Nr   r   z"Inner dims must match for mm, got z and )dimsrO   )rC   rB   r	   rF   rE   pad_inner_dimsizer   r
   stride
contiguousr\   rT   repeatr   reshape)r   r   r   r   r   scaled_mm_configr   s          r.   preprocess_addmmr   C  s   WWFhhGWWF.					 %%ww||A!'',,q/1 	
0a0Aqww||TUFWX	
1 'vA6&vA6((""$FMMO$$&&(**,hhG 	1??#>..a199"a@	
	$)@..a199!R@F++r0   c                    US   nUS   n[        U[        5      (       a  [        U[        5      (       d)   SR                  [        U5      [        U5      5      5       e[	        X45      u  pVpxUR
                  n	[        UR                  UR                  UR                  UR                  5      n
U
R                  (       ap  [        R                  " UR                  R                  5       UR                  -  UR                  R                  5       UR                  -  5      R                  U	5      $ [!        UUUUU	S S U
R"                  S9nU$ )Nr   r   zFExpecting  both Float8TrainingTensor for mm inputs but found {} and {}r   r   r   )rl   r   formattyper   rD   r	   rF   rE   emulater!   mmrC   floatrB   r&   r/   r   )r3   rG   rH   r   r   r   r   r   r   r   r   
tensor_outs               r.   	float8_mmr   l  s+   QAQAa-..:	4 4 OVVQa 
 (8'=$FV==L.					 xx!((2AGGMMOahh4NORR
 	
 ('66	J r0   c                    [        US   [        R                  5      (       a0  [        US   [        5      (       a  [        US   [        5      (       d   eUS   nUS   nUS   n[	        XE5      u  pgpUR
                  n
UR                  U
:X  d   S5       e[        UR                  UR                  UR                  UR                  5      nUR                  (       at  [        R                  " UR                  R                  5       UR                  -  UR                  R                  5       UR                  -  5      R                  U
5      nX-   $ [!        UUUU	U
S UUR"                  S9nU$ )Nr   r   rY   z"bias dtype must match output dtyper   )rl   r!   Tensorr   r   rD   rk   r	   rF   rE   r   r   rC   r   rB   r&   r/   r   )r3   rG   rH   r   r   r   r   r   r   r   r   r   rh   r   s                 r.   float8_addmmr     sL    	47ELL))tAw 455tAw 455	6 7DQAQA'7'=$FV==L::%K'KK%.					 hhqww}}1177==?QXX3MNQQ
 z''66	J r0   c                 p    [        XS   R                  5        US   R                  US   R                  :H  $ rA   )r5   rB   r   r3   rG   rH   s      r.   float8_is_same_sizer     s-    W1gnn57==DGMM))r0   c           	      r   [        US   [        5      (       d   e[        U5      S:X  a  SU;   d   S5       eUS   [        R                  [        R
                  1;   d   S5       e[        US   R                  US   R                  US   US   R                  US   R                  US   R                  5      $ )z{This gets called when running matmul under autocast
when the input is a Float8TrainingTensor, presenting as a fp32
tensor.
r   r   rk   z%Only support dtype kwarg for autocastzKOnly support floating point conversion for autocast w/ Float8TrainingTensor)rl   r   r2   r!   r"   r$   rC   rB   rE   rF   rT   r   s      r.   autocast_to_copyr     s     d1g34444v;!6 1 /1 '?  U UU   QQwQ!!Q  Q r0   c                 P   [        XS   R                  5        US   n[        U[        5      (       d   S[	        U5       35       eUR
                  nUR                  5       nU " U/USS Q70 UD6n[        UUR                  UR                  UR                  UR                  5      $ )z#
override funcol with FP8 handling
r   z9expecting a Float8TrainingTensor for allgather but found r   N)
r5   rB   rl   r   r   rC   r   rD   rE   rF   r3   rG   rH   	fp8_inputfp8_datafp8_outs         r.   allgather_fp8r     s     W1gnn5QIi!566 
CDOCTU6 H""$Hh4ab4V4G##"" r0   c                    [        XS   R                  5        US   n[        U[        5      (       d   eUR                  nU " U/USS  Q70 UD6n[        UUR                  UR
                  UR                  UR                  5      $ rA   r5   rB   rl   r   rC   rD   rE   rF   r   s         r.   wait_tensor_fp8r     s    W1gnn5QIi!56666Hh4ab4V4G##"" r0   z
2.11.0.devc                    [        XS   R                  5        US   n[        U[        5      (       d   eUR                  nU " U/USS Q70 UD6n[        UUR                  UR
                  UR                  UR                  5      $ )z
Handle _wrap_tensor_autograd for Float8TrainingTensor.
This wraps the underlying fp8 data in AsyncCollectiveTensor while
preserving the Float8TrainingTensor wrapper with its scale and metadata.
r   r   Nr   r   s         r.   wrap_tensor_autograd_fp8r     s     	!q'..9G	)%9::::??(8T!"X88#!!''&&
 	
r0   c                    US   nUS   n[        U[        5      (       d   e[        U[        5      (       d   e[        X1S   R                  5        UR                  UR                  :X  d   eUR                  UR                  :X  d   eUR
                  UR
                  :X  d   eUR                  nUR                  nU " XQS   U/USS  Q70 UD6n[        UUR                  UR
                  UR                  UR                  5      $ )Nr   rY   r      )	rl   r   r5   rB   rk   rD   rC   rE   rF   )r3   rG   rH   fp8_self
fp8_valuesr   fp8_values_datar   s           r.   index_put_fp8r     s    AwHaJh 45555j"67777XAw~~6??j/////>>Z-----:#9#9999~~H &&OhQN48NvNG""!! r0   c                    US   nUS   n[        U[        5      (       dI  [        U[        5      (       a4  UR                  5       n[        XR                  5        U " X5/USS  Q70 UD6$ [        U[        5      (       Ga?  [        U[        5      (       Ga)  [        XR                  5        UR
                  UR
                  :X  d   S5       eUR                  UR                  :X  d   S5       eUR                  UR                  :X  d   S5       eUR                  R                  UR                  R                  :X  d   S5       eUR                  UR                  :X  d   S5       eU " UR                  UR                  /USS  Q70 UD6n[        UUR                  UR
                  UR                  UR                  5      $ [        S	5      e)
Nr   r   rY   z<Expecting both Float8TrainingTensors to be of the same dtypez<Expecting both Float8TrainingTensors to have thee same scalez@Expecting both Float8TrainingTensors to have thee same mm configz=Expecting both Float8TrainingTensors to be of the same dtypetzEExpecting both Float8TrainingTensors to have the same gemm_input_rolez7Unsupported semantics for copy_ in Float8TrainingTensor)rl   r   rz   r5   rB   rD   rE   rC   rk   rF   r8   )r3   rG   rH   selfsrcsrc_hpr   s          r.   copy_fp8r   /  s    7D
q'Cd011j!7 7 **, **5t9d12h9&99	D.	/	/J!5 5 	!**53??2 	
J	
2 {{cjj( 	
J	
( %%)>)>> 	
N	
> zz399??2 	
K	
2 $$(<(<< 	
S	
< $**ciiE$qr(EfE#KK""!!
 	
 TUUr0   )NNFry   )Htypingr   r   r   r   r!   torch.utils._pytreer   %torchao.float8.float8_training_tensorr   r	   torchao.float8.float8_utilsr
   r   torchao.utilsr   opsrQ   c10d_functional_c10d_functionalr   __annotations__r   rk   boolr/   r5   r?   _unsafe_viewdefault
as_stridedcloneslicefill_Scalarr   rJ   detachrM   r\   rR   rS   rW   rn   r`   splitri   catrv   sumdim_IntListr   r   r   matmulr   addmmr   is_same_sizer   _to_copyr   all_gather_into_tensorr   wait_tensorr   _wrap_tensor_autogradr   
index_put_r   copy_r   r   r0   r.   <module>r      s   . -  ( L 0yy~~))++99-- #% $sCx. % ,0#' ?LL?\\? LL? \\	?
 ++? 5<<(? 5<<
 ? ? \\?D? !!





				 
	
	 8 TYY 3 !3l TZZ  !$ TXX!Y  !YH TXX!!"#, $,(&,, &,1E &,R TWW__dkk1123 4D TZZ !! "!H T&&'(* )*
 T]]""#$ %. ..66//77, _((002B2N2N2V2VWX Y" ,''!77??@A
 B
* T__$$%& ', TZZ !-V "-Vr0   