
    i                        S SK r S SKrS SKJr  S SKJr  S SKJr  S SKJrJ	r	  \R                  R                  r\R                  R                  r\R                  R                  r\R                  rS\S\4S jr " S	 S
\5      r\R'                  \R(                  R*                  5      S 5       r\R'                  \R.                  R*                  5      S 5       r\R'                  \R0                  R2                  5      S 5       r\R'                  \R4                  R*                  5      S 5       r\R6                  R*                  \R6                  R*                  \R8                  R*                  \R8                  R*                  \R:                  R*                  /r\	" S5      (       a%  \R?                  \R@                  R*                  5        \R'                  \5      S 5       r\R'                  \RB                  R*                  5      S 5       r\R'                  \RD                  R                  5      S 5       r\" \/5        g)    N)Tensor)add_safe_globals)return_and_correct_aliasing)TorchAOBaseTensortorch_version_at_leastinput
block_sizec                 z   U R                   nU R                  SU5      n U R                  5       R                  S5      R	                  S5      [
        R                  " [        5      R                  -  nXR                  SS5      -  n U R                  [        5      R                  S5      nUR                  U5      U4$ )Ng-q=   )
shapeviewabsamaxcliptorchfinfoDTYPEmaxto)r   r	   r   scalecodess        Y/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/torchao/optim/subclass_fp8.pyquantize_fp8r      s    KKEJJr:&EIIKR %%e,u{{5/A/E/EEEJJr1%%EHHUO  $E::ee##    c            	       (   \ rS rSrSS/r\ SS\S\S\R                  S-  4S jj5       r	SS\S\S\R                  S-  4S jjr
S r\ SS	 j5       rSS
 jr\   SS\S\R                   R"                  S\R                  S-  4S jj5       rS rSrg)OptimStateFp8"   r   r   Ndtypec                 V    [         R                  " XR                  UR                  US9$ )N)devicer   )r   _make_wrapper_subclassr   r!   )clsr   r   r   s       r   __new__OptimStateFp8.__new__&   s&     ,,U\\
 	
r   c                     UR                   [        L d   eUR                  S:X  d   eXl        X l        UR                  5       UR                  5       -  U l        g)a  Create quantized FP8 optimizer state.

Args
    codes: quantized FP8 E4M3FN data. Has the same shape as the original float tensor.
    scale: scale data for block-wise quantization.

NOTE: To get block-wise scale, the original float tensor is first reshape to (-1, block_size).
Thus, the last dimension of the original float tensor is not necessarily divisible by block size.
Given `codes` and `scale`, `block_size` is calculated as `codes.numel() // scale.numel()`.
r   N)r   r   ndimr   r   numelr	   )selfr   r   r   s       r   __init__OptimStateFp8.__init__1   sH     {{e###zzQ

++-5;;=8r   c                 4    U R                   U R                  /4$ N)tensor_attrsr   r)   s    r   __tensor_flatten__ OptimStateFp8.__tensor_flatten__B   s      4::,..r   c                 Z    U " / U R                    Vs/ s H  oQU   PM	     snQUQ76 $ s  snf r-   )r.   )r#   tensor_data_dicttensor_attributes
outer_sizeouter_stridenames         r   __tensor_unflatten__"OptimStateFp8.__tensor_unflatten__E   s>      
141A1AB1At$1AB
EV
 	
Bs   (
c                    U R                   R                  5       nUR                  SU R                  5      U R                  R                  SS5      -  nUb  UR                  U5      nUR                  U R                   R                  5      $ )Nr   r   )r   floatr   r	   r   r   r   )r)   output_dtype
float_datas      r   
dequantizeOptimStateFp8.dequantizeM   sj    ZZ%%'
__R9DJJOOBPQ<RR
##|4Jtzz//00r   r	   r!   c                     [         R                  " U[        US9n[         R                  " UR                  5       U-  US9nU " XVUS9$ )N)r   r!   r!   r   )r   zerosr   r(   )r#   r   r	   r!   r   r   r   s          r   rC   OptimStateFp8.zerosU   s>     Ev>EKKMZ7G5u--r   c                     U R                   R                   SU R                   S[        U R                  5       SU R
                   SU R                   SU R                   S3$ )Nz(block_size=z, shape=z, dtype=z	, device=z, requires_grad=))	__class____name__r	   tupler   r   r!   requires_gradr/   s    r   __repr__OptimStateFp8.__repr__a   sd    ~~&&'|DOO3D E4::&'x

|9T[[M R!//03	
r   )r	   r   r   r-   )NN)   NN)rH   
__module____qualname____firstlineno__r.   staticmethodr   r   r   r$   r*   r0   classmethodr8   r>   inttypesDevicerC   rK   __static_attributes__ r   r   r   r   "   s    W%L 
 %)	

 
 {{T!	
 
9f 9V 9EKK$<N 9"/ PT
 
1  %)$(	. 	. ""		.
 {{T!	. 	.
r   r   c                 D   US   nUS   n[        U[        5      (       a}  [        U[        5      (       ah  UR                  UR                  :X  d   eUR                  R	                  UR                  5        UR
                  R	                  UR
                  5        U$ [        U[        5      (       aO  [        XTR                  5      u  pgUR                  R	                  U5        UR
                  R	                  U5        U$ UR	                  UR                  5       5        U$ )Nr   r   )
isinstancer   r	   r   copy_r   r   r>   )funcrT   argskwargsdstsrcr   r   s           r   _r`   i   s    
q'C
q'C#}%%*S-*H*H~~///				"				" J 
C	'	'#C8				
 J 			#.."#Jr   c                     UR                  SUS   R                  5      nUR                  SS 5      n[        US   R                  R	                  US9US   R
                  R	                  US9US9n[        XX65      $ )Nr   r   r!   rA   rB   )getr   r   r   r   r   r   )r[   rT   r\   r]   r   r!   outs          r   r`   r`   ~   sz     JJwQ.EZZ$'F
Q'Q'C
 't6??r   c                     U Vs/ s H*  n[        U[        5      (       a  UR                  5       OUPM,     nnU " U0 UD6$ s  snf r-   )rY   r   r>   r[   rT   r\   r]   xs        r   r`   r`      sC    KOP4ajM::ALLNA4DP    Qs   1A c                 h    Uu  pE[        UR                  R                  U5      UR                  5      $ r-   )r   r   r   r   )r[   rT   r\   r]   rf   r   s         r   r`   r`      s'    HAe,agg66r   z
2.11.0.devc           	          US   n[        U[        5      (       d  [        S[        U5       35      e[        U " UR                  /USS  Q70 UD6U " UR
                  /USS  Q70 UD65      $ )Nr   z$expecting a OptimStateFp8 but found r   )rY   r   
ValueErrortyper   r   re   s        r   r`   r`      sv    QAa''?QyIJJ QWW*tABx*6*QWW*tABx*6* r   c                     US   R                   R                  5       =(       a    US   R                  R                  5       $ )Nr   )r   	is_pinnedr   )r[   rT   r\   r]   s       r   r`   r`      s/    7==""$Ba)@)@)BBr   c                    US S u  pEpg[        U5      S:  a  US   OSnUS:w  a  [        S5      eUS:w  a  [        S5      eUR                  n	[        R                  " UR
                  SS  5      n
Xj-  U	-  S:w  d  Xz-  U	-  S:w  a"  [        SUR
                   SU	 SU S	U S
3	5      e[        UR                  Xg UR                  Xj-  U	-  Xz-  U	-   5      $ )N   r   r   z+Only support aten.slice along the first dimz#Only support aten.slice with step=1zInvalid start or end for shape=z and block_size=zD. Make sure start and end align with block boundary. Received start=z, end=.)	lenri   r	   mathprodr   r   r   r   )r[   rT   r\   r]   rf   dimstartendstepr	   strides              r   r`   r`      s    bqAE$i!m47D axFGGqy>??JYYqwwqr{#F 	*$)clj-HA-M-aggY6Fzl S#WF3%q2
 	
 		*,s|z/IJ r   )#rq   r   r   torch.serializationr   torch.utils._python_dispatchr   torchao.utilsr   r   opsatenc10d_functional_c10d_functionalfloat8_e4m3fnr   rS   r   r   
implementsrZ   defaultr`   _to_copylerpScalarr   all_gather_into_tensorwait_tensordetach_optim_state_fp8_c10d_opsappend_wrap_tensor_autogradrl   slicerW   r   r   <module>r      s      0 D Cyy~~))++99-- $ $C $D
% D
N $**,,- .( $--//0	@ 1	@ $))**+! ,! $))++,7 -7 **22++33''  ((KK  ,''$$%5%K%K%S%ST 34	 5	 $..001C 2C
 $**++, -8 - !r   