
    i&                        S SK r S SKrS SKJr  S SKJr  S SKJr  S SKJrJ	r	  SSK
JrJrJrJr  \R                  R                   r\R                  R"                  r\R                  R$                  rS SKJr  \" SS	9S
 5       r\" SS	9S 5       r " S S\5      r\R1                  \R2                  R4                  5      S 5       r\R1                  \R8                  R4                  5      S 5       r\R1                  \R:                  R<                  5      S 5       r\R1                  \R>                  R4                  5      S 5       r\R@                  R4                  \R@                  R4                  \RB                  R4                  \RB                  R4                  \RD                  R4                  /r#\	" S5      (       a%  \#RI                  \RJ                  R4                  5        \R1                  \#5      S 5       r\R1                  \RL                  R4                  5      S 5       r\R1                  \RN                  R                  5      S 5       r\" \/5        g)    N)Tensor)add_safe_globals)return_and_correct_aliasing)TorchAOBaseTensortorch_version_at_least   )create_dynamic_mapdequant_with_qmapquantize_4bit_with_qmapscale_tensor)	lru_cache)maxsizec                      [        SSS5      $ )NT      )r	        Z/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/torchao/optim/subclass_4bit.pyget_qmap_signedr   "   s    dAq))r   c                  R    [         R                  " SSSSS9SS  R                  5       $ )Nr   r      cpudevice)torchlinspacetolistr   r   r   get_qmap_unsignedr   '   s'    >>!Q51!"5<<>>r   c                   @   \ rS rSr/ SQr\ SS\S\S\S\S\R                  S-  4
S	 jj5       r
 SS\S\S\S\S\R                  S-  4
S
 jjrS r\ SS j5       rSS jr\    SS\S\S\R"                  R$                  S\R                  S-  4S jj5       rS rSrg)OptimState4bit,   )codesscaleqmapNr"   r#   r$   signeddtypec                 B    [         R                  " XUR                  US9$ )N)r   r&   )r   _make_wrapper_subclassr   )clsr"   r#   r$   r%   shaper&   s          r   __new__OptimState4bit.__new__0   s"     ,,u||5
 	
r   c                 V   UR                   [        R                  L d   eUR                  S:X  d   eUR                  S:X  d   eUR                   [        R                  L d   eXl        X l        X0l        X@l        XPl	        UR                  5       S-  UR                  5       -  U l        g)a  Create quantized 4-bit optimizer state as proposed in https://arxiv.org/abs/2309.01507

Args
    codes: quantized and packed 4-bit data stored as uint8.
    scale: scale data for block-wise quantization.
    qmap: lookup table that maps between quantized value (code) and float value.
    signed: whether the tensor is signed or unsigned.
    shape: shape of original float tensor.

NOTE: To get block-wise scale, the original float tensor is first reshape to (-1, block_size).
Thus, the last dimension of the original float tensor is not necessarily divisible by block size.
Given `codes` and `scale`, `block_size` is calculated as `codes.numel() * 2 // scale.numel()`.
The extra `* 2` is because `codes` is 4-bit data packed in 8-bit storage.
r      N)r&   r   uint8ndimfloat32r"   r#   r$   r%   _shapenumel
block_size)selfr"   r#   r$   r%   r*   r&   s          r   __init__OptimState4bit.__init__>   s    . {{ekk)))zzQzzQzzU]]***

	++-!+u{{}<r   c                 `    U R                   U R                  U R                  U R                  /4$ N)tensor_attrsr%   r2   r&   r5   s    r   __tensor_flatten__!OptimState4bit.__tensor_flatten__`   s&      4;;TZZ"HHHr   c                 Z    U " / U R                    Vs/ s H  oQU   PM	     snQUQ76 $ s  snf r9   )r:   )r)   tensor_data_dicttensor_attributes
outer_sizeouter_stridenames         r   __tensor_unflatten__#OptimState4bit.__tensor_unflatten__c   s>      
141A1AB1At$1AB
EV
 	
Bs   (
c                    [         R                  " U R                  S-	  U R                  S-  /SS9n[        X R                  U R
                  5      nUb  UR                  U5      nUR                  U R                  5      $ )Nr      )dim)	r   stackr"   r
   r$   r#   toviewr2   )r5   output_dtyper"   
float_datas       r   
dequantizeOptimState4bit.dequantizek   sc    TZZ1_djj6.ABK&uiiD
##|4Jt{{++r   r4   r   c           	      l   [        U[        5      (       a  U4OUn[        R                  " U5      n[        R
                  " US-  [        R                  US9n[        R
                  " Xc-  US9nU(       a
  [        5       O	[        5       n	[        R                  " U	[        R                  US9n
U " XxXXS9$ )Nr.   )r&   r   r   r&   )
isinstanceintmathprodr   zerosr/   r   r   tensorr1   )r)   r*   r%   r4   r   r&   n_elemsr"   r#   	qmap_listr$   s              r   rW   OptimState4bit.zerosr   s     'uc22))E"GqLFKG1&A)/O%5F5H	||IU]]6J5uBBr   c                     U R                   R                   SU R                   SU R                   S[	        U R
                  5       SU R                   SU R                   SU R                   S3$ )Nz(signed=z, block_size=z, shape=z, dtype=z	, device=z, requires_grad=))		__class____name__r%   r4   tupler*   r&   r   requires_gradr;   s    r   __repr__OptimState4bit.__repr__   sn    ~~&&'x}M$//IZ [4::&'x

|9T[[M R!//03	
r   )r2   r4   r"   r$   r#   r%   r9   )NN)T   NN)r_   
__module____qualname____firstlineno__r:   staticmethodr   boolr   r&   r+   r6   r<   classmethodrD   rO   rT   typesDevicerW   rb   __static_attributes__r   r   r   r    r    ,   s'   -L  %)

 
 	

 
 {{T!
 
( %) = =  = 	 =
  = {{T! =DI PT
 
,  %)$(C C 	C
 ""C {{T!C C"
r   r    c                    US   nUS   n[        U[        5      (       a  [        U[        5      (       a  UR                  UR                  :X  a4  UR                  UR                  :X  a  UR                  UR                  :X  d   eUR
                  R                  UR
                  5        UR                  R                  UR                  5        U$ [        U[        5      (       a  [        UR                  S5      UR                  5      u  pg[        XdR                  5      nUR
                  R                  US S S2   S-  USS S2   -  5        UR                  R                  U5        U$ UR                  UR                  5       5        U$ )Nr   r   rH   r.   r   )rS   r    r%   r4   r2   r"   copy_r#   r   rL   r   r$   rO   )	funcrk   argskwargsdstsrc
scaled_srcr#   r"   s	            r   _rv      s,   
q'C
q'C#~&&:c>+J+JJJ#**$#..0

cjj(	
) 					"				" J 
C	(	(("s~~F
'
HH=		ssqE!$Q$K78		
 J 			#.."#Jr   c           	      h   UR                  SUS   R                  5      nUR                  SS 5      n[        US   R                  R	                  US9US   R
                  R	                  US9US   R                  R	                  US9US   R                  US   R                  US9n[        XX65      $ )Nr&   r   r   r   rR   )
getr&   r    r"   rK   r#   r$   r%   r*   r   )rp   rk   rq   rr   r&   r   outs          r   rv   rv      s     JJwQ.EZZ$'F
Q'Q'Qv&QQC 't6??r   c                     U Vs/ s H*  n[        U[        5      (       a  UR                  5       OUPM,     nnU " U0 UD6$ s  snf r9   )rS   r    rO   )rp   rk   rq   rr   xs        r   rv   rv      sC    LPQDqjN;;ALLNBDDQ    Rs   1A c                    Uu  pE[        UR                  5      [        U5      :X  aA  [        UR                  UR                  UR
                  UR                  UR                  5      $ [        U5      S:X  aO  US   S:X  aF  [        UR                  UR                  UR
                  UR                  UR                  5       45      $ [        UR                  R                   S35      e)Nr   r   rH   z4 only supports .view() with same shape or shape=[-1])r`   r*   r    r"   r#   r$   r%   r2   lenr3   
ValueErrorr^   r_   )rp   rk   rq   rr   r{   r*   s         r   rv   rv      s    HAQWW~u%aggqww!((KK
5zQ58r>aggqww1779,OO
;;
  TU r   z
2.11.0.devc                    US   n[        U[        5      (       d  [        S[        U5       35      eU " UR                  /USS  Q70 UD6nU " UR
                  /USS  Q70 UD6nUR                  S   UR                  5       -  UR                  R                  5       -  4UR                  SS  -   n[        XVUR                  R                  5       UR                  U5      $ )Nr   z%expecting a OptimState4bit but found r   )rS   r    r~   typer"   r#   r2   r3   r$   cloner%   )rp   rk   rq   rr   r{   r"   r#   r*   s           r   rv   rv      s    QAa((@a	JKK.48.v.E.48.v.E XXa[5;;=(AGGMMO;=LE %%HHr   c                     US   R                   R                  5       =(       aA    US   R                  R                  5       =(       a    US   R                  R                  5       $ )Nr   )r"   	is_pinnedr#   r$   )rp   rk   rq   rr   s       r   rv   rv      sO     	Q! 	%GMM##%	%GLL""$r   c                    US S u  pEpg[        U5      S:  a  US   OSnUS:w  a  [        S5      eUS:w  a  [        S5      eUR                  n	[        R                  " UR
                  SS  5      n
Xj-  U	-  S:w  d  Xz-  U	-  S:w  a"  [        SUR
                   SU	 SU S	U S
3	5      eUR                  Xj-  S-  Xz-  S-   nUR                  Xj-  U	-  Xz-  U	-   nUR
                  S   UR                  5       -  UR                  R                  5       -  4UR
                  SS  -   n[        XUR                  R                  5       UR                  U5      $ )Nr   r   r   z+Only support aten.slice along the first dimz#Only support aten.slice with step=1zInvalid start or end for shape=z and block_size=zD. Make sure start and end align with block boundary. Received start=z, end=.r.   )r}   r~   r4   rU   rV   r*   r"   r#   r3   r    r$   r   r%   )rp   rk   rq   rr   r{   rI   startendstepr4   strider"   r#   r*   s                 r   rv   rv      s^   bqAE$i!m47D axFGGqy>??JYYqwwqr{#F 	*$)clj-HA-M-aggY6Fzl S#WF3%q2
 	
 GGENa'#,!*;<EGGENj03<:3MNE WWQZ%++-'177==?:<qwwqr{JE%%HHr   )(rU   r   r   torch.serializationr   torch.utils._python_dispatchr   torchao.utilsr   r   quant_utilsr	   r
   r   r   opsatenc10d_functional_c10d_functional	functoolsr   r   r   r    
implementsro   defaultrv   _to_copylerpScalarrL   all_gather_into_tensorwait_tensordetach_optim_state_4bit_c10d_opsappend_wrap_tensor_autogradr   slicer   r   r   <module>r      s=      0 D C  yy~~))++99--    1* * 1? ?]
& ]
@ 4::--. /4 4==001@ 2@ 499++,! -! 499,,- ." **22++33''  ((KK  ,''%%&6&L&L&T&TU 56I 7I$ 4>>112 3 4::,,-I .IB .! "r   