
    i%*                       S r SSKrSSKrSSKrSSKrSSKrSSKJr  SSKJ	r	J
r
  SSKJr  SSKJrJrJrJrJrJr  SSKJr  SSKrSSKJr  SSKJs  Js  Jr  SSKrSSKJr  SSKJrJ r J!r!J"r"  SS	K#J$r$  SS
K%J&r&  SSK'J(r(  SSK)J*r*J+r+J,r,J-r-J.r.  SSK/J0r0  SSK1J2r2  SSK3J4r4  SSK5J6r6  SSK7J8r8J9r9J:r:J;r;J<r<J=r=J>r>J?r?J@r@JArAJBrBJCrCJDrDJErEJFrFJGrG  SSKHJIrIJJrJ  SSKKJLrLJMrMJNrNJOrO  SSKPJQrQJRrR  SSKSJTrTJUrUJVrVJWrWJXrX  SSKYJZrZJ[r[  SSK\J]r]J^r^  SSK_J`r`  SSKaJbrbJcrcJdrdJere  SSKfJgrgJhrh  SSKJiri  \R                  " \k5      rl/ SQrm\!\dR                  /0ro\!S0rp   SrS\\\S 4      S!S4S" jjrqS# rrSsS$ jrsStS% jrtSS&.S'\R                  S(\\4   S)\\4   S*\\\R.                  R                  \v/\w4      4S+ jjrxS, rySSS-.S. jrz\rS4S'\R.                  R                  S/\S*\\\R.                  R                  \v/\w4      S0\\R                  R                     4S1 jjr|S2\R                  S!\R                  4S3 jr~S2\R                  S!\R                  4S4 jrS2\R                  S!\R                  4S5 jr\	 " S6 S7\5      5       rSSS8.S9\\R                     S:\\R                     4S; jjr\J" \5      S<SSS=.S>\R.                  R                  S/\S?\vS9\\R                     S:\\R                     S!\R.                  R                  4S@ jj5       r\	 " SA SB\5      5       rSC r\J" \5      S<SD.S>\R.                  R                  S/\S?\vS!\R.                  R                  4SE jj5       r\	 " SF SG\5      5       r\J" \5      S<SD.S>\R.                  R                  S/\S?\vS!\R.                  R                  4SH jj5       r\	 " SI SJ\5      5       rSK r\J" \5      S<SD.S>\R.                  R                  S/\S?\v4SL jj5       rS2\R                  S!\R                  4SM jrS2\R                  S!\R                  4SN jrSO\TSP\TS!S4SQ jr\	 " SR SS\5      5       rST r\J" \5      S<SD.S>\R.                  R                  S/\S!\R.                  R                  4SU jj5       r\	 " SV SW\5      5       r\J" \5      S<SD.S>\R.                  R                  S/\4SX jj5       r\	 " SY SZ\5      5       rS[ r\J" \5      S<SD.S>\R.                  R                  S/\S?\vS!\R.                  R                  4S\ jj5       r\	 " S] S^\5      5       rS_ r\J" \5      S<SD.S>\R.                  R                  S/\S?\v4S` jj5       rSa\DSb\R                  Sc\GR4                  S!S4Sd jr\	 " Se Sf\5      5       rSSS8.S9\\R                     S:\\R                     4Sg jjr\J" \5      S<SSS=.S>\R.                  R                  S/\S?\vS9\\R                     S:\\R                     S!\R.                  R                  4Sh jj5       r\	 " Si Sj\5      5       r\rSk\\R.                  R                  \/\R.                  R                  4   S!\w4Sl jrS>\R.                  R                  Sm\vS/\4Sn jrSm\vS/\4So jrS>\R                  Sm\vS/\4Sp jrS>\(S!\GRJ                  4Sq jr\GRN                  GRQ                  \~\/5        g)uap  
Quantization APIs

Generally these APIs can be applied directly to any model
with Linear modules to obtain quantized linear ops. The intended
usage involves applying torch.compile to the model afterwards
both because primitives were designed based on the fusions that
come along with it and because that is how we access the intended quantized
and mixed GEMM kernels
    N)OrderedDict)	dataclassfield)partial)AnyCallableListOptionalTupleUnion)AOBaseConfig)AffineQuantizedTensorPlainLayoutTensorCoreTiledLayoutto_affine_quantized_intx)Layout)
e4m3_dtype)Float8Linear)Float8MMConfigFP8Granularity_check_hardware_support!_granularity_is_a_1_128_w_128_128_normalize_granularity)(Float8StaticActivationFloat8WeightConfig)$LinearActivationWeightObservedTensor)AffineQuantizedObserverBase)KernelPreference)Float8PackingFormatFloat8TensorInt4ChooseQParamsAlgorithmInt4PackingFormatInt4PlainInt32TensorInt4PreshuffledTensor
Int4TensorInt4TilePackedTo4dTensor
Int8TensorIntxChooseQParamsAlgorithmIntxOpaqueTensorIntxPackingFormatIntxUnpackedToInt8TensorQuantizeTensorToFloat8KwargsQuantizeTensorToInt8KwargsSparse2x4CUTLASSFloat8Tensor)_QUANTIZE_CONFIG_HANDLER register_quantize_module_handler)_fp8_mm_compat_linear_extra_repr_module_extra_repr_quantization_type)is_MI300is_sm_at_least_89   )GranularityPerAxisPerGroupPerRow	PerTensor)LinearActivationQuantizedTensorto_linear_activation_quantized)Int4WeightOnlyQuantizerInt8DynActInt4WeightQuantizer) intx_quantization_aware_training)_DTYPE_TO_QVALUE_BOUNDSMappingTypeZeroPointDomainquantize_affine)	QuantizerTwoStepQuantizer)_get_per_token_block_size)	swap_conv2d_1x1_to_linearrE   rF   r>   _get_subclass_inserter	quantize_r@   r?   ModuleFqnToConfigF
extra_args.returnc           	      $   U" XSS 5      (       a  Ub  U R                  US9  U" U /UQ76 n U $ [        U R                  5       5      nU H1  u  px[        UUUU U S3UU5      n	XLd  M   U	c  M%  [	        XU	5        M3     Ub  U R                  US9  U $ )a  
Recursively replaces each child module in `model` with the result of `replacement_fn(child)`
if `filter_fn(child)` returns `True`.

Args:
    model (torch.nn.Module): The model containing modules to be replaced.
    replacement_fn (Callable[[torch.nn.Module], torch.nn.Module]): The function to replace matching modules.
    filter_fn (Callable[[torch.nn.Module], bool]): The filter function to determine which modules to replace.
    cur_fqn (str, optional): The current fully qualified name of the module being processed. Defaults to "".
    device (device, optional): Device to move the model to before applying `filter_fn`. Defaults to None.
    extra_args (Tuple[Any, ...], optional): optional extra args to pass to `replacement_fn`.

Returns:
    None
Ndevice.)tolistnamed_children)_replace_with_custom_fn_if_matches_filtersetattr)
modelreplacement_fn	filter_fncur_fqnrQ   rL   named_children_listnamechild	new_childs
             ]/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/torchao/quantization/quant_api.pyrV   rV      s    . %%HHFH#u2z2"5#7#7#9:.KDA)D6#I %)*?Y/ / HHFH#    c                    SSK Jn  [        U [        R                  R
                  5      =(       a    [        U S5      =(       a    [        U R                  [        5      (       + =(       a{    [        U R                  [        5      (       + =(       aU    [        U R                  U5      (       + =(       a3    [        U [        R                  R                  R                  5      (       + $ )Nr   )_AffineFakeQuantizedTensorweight)5torchao.quantization.qat.affine_fake_quantized_tensorrc   
isinstancetorchnnLinearhasattrrd   r   r<   moduleslinearNonDynamicallyQuantizableLinear)modargsrc   s      r`   
_is_linearrp      s     	3( 	SC"	S3::'<==	S 3::'FGG	S 3::'ABB		S
 3

 1 1 Q QRRra   c                 l   ^ ^^^^ TR                  SS5      mTR                  SS5      mU UUUU4S jnU$ )a  
Returns a function which inserts the given subclass into all linear modules
in the model. The inserted module will have its weight set to the result of
`cls(mod.weight, **kwargs)`. If parametrization is enabled then this will be done using
torch.nn.utils.parametrize instead of directly setting the attribute on the module.

Args:
    cls (torch.Tensor): The class to insert as a child module.
    kwargs (Any): Any additional arguments for the constructor.
constructorsubclass_constructormethod
from_floatc                   > T(       a  [         R                  R                  TR                  " U R                  40 TD6SS9U l        U R                  R                  5       u  p[        R                  " U S[        TT5      " U6 5        U $ [         R                  R                  [        TT5      " U R                  40 TD6SS9U l        U $ )NFrequires_gradrd   )	rg   rh   	Parameterru   rd   __tensor_flatten__parametrizeregister_parametrizationgetattr)lin_ro   clsrr   enable_parametrizationru   kwargss      r`   insert_subclass/_get_subclass_inserter.<locals>.insert_subclass   s    !++szz4V4E , CJ jj335GA00XwsK8$? 
 ++Z(>v># , CJ
 
ra   )pop)r   r   r   r   rr   ru   s   ``` @@r`   rI   rI      s8     **],BCKHl3J " ra   c                 x   ^  " S S[         R                  R                  5      mU4S jnUc  S n[        XUS9  g)za
Changes all conv2d 1x1 modules to equivalent linear modules so that they can then be quantized.
c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )2swap_conv2d_1x1_to_linear.<locals>.PermuteSandwich   c                 .   > [         TU ]  5         Xl        g N)super__init__rn   )selfrn   	__class__s     r`   r   ;swap_conv2d_1x1_to_linear.<locals>.PermuteSandwich.__init__   s    GHra   c                 r    U R                  US   R                  SSSS5      5      R                  SSSS5      $ )Nr         r6   )rn   permute)r   ro   s     r`   forward:swap_conv2d_1x1_to_linear.<locals>.PermuteSandwich.forward   s5    88DGOOAq!Q78@@Q1MMra   )rn   )__name__
__module____qualname____firstlineno__r   r   __static_attributes____classcell__)r   s   @r`   PermuteSandwichr      s    		N 	Nra   r   c                 T  > U R                   S:X  d   e[        R                  R                  U R                  U R
                  U R                  S L S9n[        R                  R                  U R                  R                  SS5      5      Ul        U R                  Ul        T" U5      $ )Nr6   r6   )biasrO   )
kernel_sizerg   rh   ri   in_channelsout_channelsr   ry   rd   squeeze)convr~   r   s     r`   replace_conv2d_1x15swap_conv2d_1x1_to_linear.<locals>.replace_conv2d_1x1  s    6)))hhood//tyyD7H  
 XX''(;(;B(CD
99s##ra   Nc                 v    [        U [        R                  R                  5      =(       a    U R                  S:H  $ )Nr   )rf   rg   rh   Conv2dr   )rn   ro   s     r`   <lambda>+swap_conv2d_1x1_to_linear.<locals>.<lambda>  s,    z(
 ((oo'((ra   rZ   )rg   rh   ModulerV   )rX   rZ   r   r   s      @r`   rH   rH      s<    
N%((// N$ (	 .Yra   r   rX   input_observerweight_observerrZ   c                p   ^^ S[         R                  4UU4S jjn[        U UUc  [        5        gU5        g)a6  
Converts the weight of a linear module to a LinearActivationWeightObservedTensor.

This function wraps the weight of the given linear module with a LinearActivationWeightObservedTensor,
which enables observation of both input and weight tensors during forward passes.
The wrapped weight is then re-wrapped as a nn.Parameter to maintain compatibility
with PyTorch's module system.

Example::

```
    import torch
    import torch.nn as nn
    from torchao.quantization import PerTensor
    from torchao.quantization.linear_observer_tensor import insert_observers_
    from torchao.quantization.observer import (
        AffineQuantizedMinMaxObserver,
        MappingType
    )

    # Create observers
    input_observer = AffineQuantizedMinMaxObserver(
        MappingType.SYMMETRIC,
        torch.float8_e4m3fn,
        granularity_type=PerTensor(),
        eps=torch.finfo(torch.float32).eps,
        scale_dtype=torch.float,
        zero_point_dtype=torch.int,
        zero_point_domain=ZeroPointDomain.NONE,
    )

    # Create a linear module
    linear_module = nn.Linear(10, 20)

    # Convert the linear module's weight to an observed tensor
    insert_observers_(linear_module, input_observer, weight_observer=None)

    # The linear_module can now be used as usual, with observers calculating statistics
    output = linear_module(torch.randn(10, 10))

    # Get the scale and zero point of the input observer
    scale, zero_point = linear_module.weight.input_observer.calculate_qparams()
```

Args:
    model (nn.Module): The nn.Module to convert.
    input_observer (Optional[AffineQuantizedObserverBase]): Observer for input tensor.
    weight_observer (Optional[AffineQuantizedObserverBase]): Observer for weight tensor.
    filter_fn (Optional[Callable[[torch.nn.Module, str], bool]]): Filter function to select which modules to convert.
        If not provided, all linear modules will be converted. This function should take a module and its fully qualified name.

Returns:
    nn.Linear: The modified linear module with its weight wrapped in a LinearActivationWeightObservedTensor.
linear_modulec                    > [         R                  " [        R                  " U R                  TTS9U R                  R
                  S9U l        U $ )N)r   r   rw   )rh   ry   r   ru   rd   rx   )r   r   r   s    r`   convert_to_linear_observer5insert_observers_.<locals>.convert_to_linear_observerT  sJ    !||0;;$$- /
 (..<< 
 ra   N)rh   ri   rV   rp   )rX   r   r   rZ   r   s    ``  r`   insert_observers_r     s;    |
")) 
 
 ."'
 .7ra   c                     SU R                   R                  S    SU R                   R                  S    S[        U R                   5       3$ )Nznum_embeddings=r   z, embedding_dim=r6   z	, weight=)rd   shaper3   r   s    r`   _embedding_extra_reprr   g  sb    T[[..q122B4;;CTCTUVCWBXXabtuy  vA  vA  cB  bC  D  Dra   )allow_requires_gradpropagate_biasc                    ^ ^^^ UU UU4S jnU$ )zHelper function to apply the constructor that quantizes the weight Tensor (with additional kwargs)
to the weight of linear module
c                   > T=(       a    U R                   R                  nTS:X  a  U R                  TS'   [        R                  R                  T" U R                   40 TD6US9U l         [        R                  " [        U 5      U l	        U $ )NTr   rw   )
rd   rx   r   rg   rh   ry   types
MethodTyper1   
extra_repr)r~   rx   r   rr   r   r   s     r`   r   6_get_linear_subclass_inserter.<locals>.insert_subclassr  sv    +H

0H0HT! XXF6NXX''

-f-] ( 

 ))*<cB
ra    )rr   r   r   r   r   s   ```` r`   _get_linear_subclass_inserterr   k  s      ra   configrQ   c                    [         R                  R                  S5        [        U[        5      (       a  Ub  [        S5      e[        U R                  5       5      nUR                  5        H  u  pV[        XQ5      (       d5  [        XeU5      (       d$  SUR                  ;   d  M8  [        U5      (       d  MJ  [        XeU5      nUb  UR                  US9nXvLd  Mn  US:w  d  Mv  UR                  S5      S   nUR!                  U5      R!                  S5      n	XI   n
[#        XU5        M     g[        U[$        5      (       a,  Uc  [        OUn[&        [)        U5         n[+        U UUUU4S	9  g[-        S
5      e)an  Convert the weight of linear modules in the model with `config`, model is modified inplace

Args:
    model (torch.nn.Module): input model
    config (AOBaseConfig): a workflow configuration object.
    filter_fn (Optional[Callable[[torch.nn.Module, str], bool]]): function that takes a nn.Module instance and fully qualified name of the module, returns True if we want to run `config` on
    the weight of the module
    device (device, optional): Device to move module to before applying `filter_fn`. This can be set to `"cuda"` to speed up quantization. The final model will be on the specified `device`.
        Defaults to None (do not change device).

Example::

    import torch
    import torch.nn as nn
    from torchao import quantize_

    # quantize with some predefined `config` method that corresponds to
    # optimized execution paths or kernels (e.g. int4 tinygemm kernel)
    # also customizable with arguments
    # currently options are
    # Int8DynamicActivationInt8WeightConfig (optimized with int8 mm op and torch.compile)
    # Int4WeightOnlyConfig (optimized with int4 tinygemm kernel and torch.compile)
    # Int8WeightOnlyConfig (optimized with int8 mm op and torch.compile
    from torchao.quantization.quant_api import Int4WeightOnlyConfig

    m = nn.Sequential(nn.Linear(32, 1024), nn.Linear(1024, 32))
    quantize_(m, Int4WeightOnlyConfig(group_size=32))

ztorchao.quantization.quantize_NzuCustom filter_fn and FqnToConfig were both specified. Only filter_fn=None is supported when FqnToConfig is specified._defaultrP    rR   rO   )rQ   rL   zPassing a generic Callable to `quantize_` is no longer recommended and will be deprecated at a later release. Please see https://github.com/pytorch/ao/issues/1690 for instructions on how to pass in workflow configuration instead.)rg   _C_log_api_usage_oncerf   FqnToConfig
ValueErrordictnamed_modulesitemsfqn_matches_fqn_config _module_param_matches_fqn_configfqn_to_configrp   _fqn_to_config_handlerrS   splitremovesuffixrW   r   r.   typerV   AssertionError)rX   r   rZ   rQ   r   
module_fqnmodulereplacement
child_name
parent_fqnparent_modulehandlers               r`   rJ   rJ     sb   F 
HH  !AB&+&&  H  U0023"/"5"5"7J&z::3FOO&"6"66:f;M;M4VP%"-..."?K,r1A!+!1!1#!6r!:J!+!8!8!D!Q!QRU!VJ$1$=MM{C #8 
FL	)	)"+"3J	*4<81y	
  x
 	
ra   xc           
         [         R                  n[        R                  n[        R                  n[        R
                  " [        R                  5      R                  n[        R                  n[        U U[        U 5      UUUUS9$ )zGThis is defined here instead of local function to support serialization)epsscale_dtypezero_point_dtype)	rB   
ASYMMETRICrg   int8float32finfor   r   rG   )r   mapping_typetarget_dtyper   r   r   s         r`   _int8_asymm_per_token_quantr     sh    ))L::L--K
++emm
$
(
(Czz#	!!$) ra   c                    [         R                  n[        R                  n[        R                  n[        R
                  " [        R                  5      R                  n[        R                  nSnSn[        U U[        U 5      UUUUUUS9	nU$ )Nr      )	quant_min	quant_maxr   r   r   )
rB   r   rg   uint8r   r   r   int32r   rG   )	r   r   r   r   r   r   r   r   outs	            r`   _uint8_asymm_per_token_quantr     s|    ))L;;L--K
++emm
$
(
(C{{II
"	!!$)
C Jra   c                     [         R                  n[        R                  nSnSnSn[	        U U[        U 5      UUUU[        R                  S9$ Nh㈵>   r   r   r   r   )rB   	SYMMETRICrg   r   r   rG   r   r   r   r   r   r   r   s         r`   _int8_symm_per_token_quantr     sS    ((L::L
CII#	!!$MM	 	ra   c                   .   \ rS rSr% Sr\R                  r\R                  \	S'   \
" S5      r\\	S'   \R                  r\\	S'   Sr\\R                     \	S'   \R$                  r\\	S	'   \R*                  r\\	S
'   \R0                  r\\	S'   Sr\\	S'   S rSrg)%Int8DynamicActivationIntxWeightConfigi  a\  
Configuration for dynamically quantizing activations to torch.int8 and weights to torch.intx, with 1 <= x <= 8.
More specifically, activations are dynamically quantized to 8-bits at a per-token granularity with scales/zeros.
Weights are quantized with scales/zeros in a groupwise or channelwise manner using the number of bits specified by weight_dtype.

args:
    `weight_dtype`: The dtype to use for weight quantization.  Must be torch.intx, where 1 <= x <= 8.
   ` weight_granularity`: The granularity to use for weight quantization.  Must be PerGroup or PerAxis(axis=0).
    `weight_mapping_type`: The type of mapping to use for the weight quantization.
        Must be one of MappingType.ASYMMETRIC or MappingType.SYMMETRIC.  MappingType.SYMMETRIC requires ZeroPointDomain.NONE
    `weight_scale_dtype`: The dtype to use for the weight scale.
    `act_mapping_type`: The type of mapping to use for the activation quantization.
        Must be one of MappingType.ASYMMETRIC or MappingType.SYMMETRIC.
    `intx_packing_format`: The format to use for the packed weight tensor (version 2 only).
        - unpacked_to_int8: this format is the default and is intended for export applications like ExecuTorch.
        - opaque_torchao_auto: this format is optimized for CPU performance.
    `intx_choose_qparams_algorithm`: The algorithm to use for choosing the quantization parameters.
    `version`: version of the config to use, only subset of above args are valid based on version, see note for more details.

Example:

.. literalinclude:: ../../examples/inference/int8_dynamic_activation_intx_weight.py
   :language: python
weight_dtype    weight_granularityweight_mapping_typeNweight_scale_dtypeact_mapping_typeintx_packing_formatintx_choose_qparams_algorithmr   versionc           	      2   [         R                  R                  S5        U R                  [	        SS5       Vs/ s H  n[        [         SU 35      PM     sn;   d   SU R                   35       e[        U R                  [        [        45      (       d   SU R                   35       e[        U R                  [        5      (       a8  U R                  R                  S:X  d   SU R                  R                   35       eU R                  [        R                  [        R                  [        R                  4;   d   S	U R                   35       eU R                   [        R                  [        R                  4;   d   S
U R                    35       eg s  snf )Nz:torchao.quantization.Int8DynamicActivationIntxWeightConfigr6   	   int<weight_dtype must be torch.intx, where 1 <= x <= 8, but got z8weight_granularity must be PerAxis or PerGroup, but got r   zaxis must be 0, but got z~weight_mapping_type must be MappingType.ASYMMETRIC or MappingType.SYMMETRIC or MappingType.SYMMETRIC_NO_CLIPPING_ERR, but got zRact_mapping_type must be MappingType.ASYMMETRIC or MappingType.SYMMETRIC, but got )rg   r   r   r   ranger}   rf   r   r8   r9   axisr  rB   r   r   SYMMETRIC_NO_CLIPPING_ERRr  r   bs     r`   __post_init__3Int8DynamicActivationIntxWeightConfig.__post_init__,  s   $$H	
   aQR$T1WUc!I%>$TT 	
J4K\K\J]^	
T $11GX3FGG 	
FtG^G^F_`	
G d--w77**//14 *4+B+B+G+G*HI4 ''""!!11,
 
 	

 M  NR  Nf  Nf  Mg  h	
 
 $$""!!)
 
 	
 aaeavav`wx		
 
# %Us   Fr   )r   r   r   r   __doc__rg   r   r   dtype__annotations__r9   r   r7   rB   r   r  r  r
   r   r  r)   UNPACKED_TO_INT8r  r'   AFFINEr  r  r	  r  r   r   ra   r`   r   r     s    2 !&

L%++*&.rl2'2'<'<<04-4$/$:$:k:->-O-O*O")) "#=  GS
ra   r   custom_scalecustom_zero_pointr  r  c                F   UR                   nUR                  nUR                  nUR                  nUR                  n	UR
                  n
UR                  nU R                  5       S:X  d   SU R                  5        35       e[        U[        5      (       a  UR                  nOW[        U[        5      (       a4  UR                  S:X  d   SUR                   35       eU R                  S   nO[        SU 35      eSU4nUR                  S:X  d   eU	[         R"                  :X  d   e[$        R&                  [$        R(                  [$        R*                  [$        R,                  /nU
[$        R.                  :X  d  X;   d
   SU
 35       eUb=  UR0                  [2        R4                  :X  a  UR7                  [2        R8                  5      n[:        R<                  " U UUUS	UUUS
9nUb  XR0                  :w  a  [?        XU5        UnX;   a  [@        RB                  " UUU
S9nS nUU4$ )Nr   zFInt8DynamicActivationIntxWeightConfig only works for 2-d Tensor, got: r   %axis must be 0 with PerAxis, but got rO   z4weight_granularity must be PerGroup or PerAxis, got r6   Unsupported packing format: int8_asym_per_token)r   activation_quantizationr  r  r  )r   r  )"r   r   r  r  r  r  r  dimrf   r9   
group_sizer8   r  r   r   r  rB   r   r)   OPAQUE_ATEN_KLEIDIAIOPAQUE_TORCHAO_AUTOOPAQUE_TORCHAO_KLEIDIAIOPAQUE_TORCHAO_LOWBITr  r  rg   r   rS   r   r*   from_hp+_adjust_scale_dtype_in_intx_unpacked_tensorr(   !from_intx_unpacked_to_int8_tensor)rd   r   r   r  r  r   r   r  r  r  r  r  r   
block_sizeopaque_formats
new_weightnew_biass                    r`   4_int8_dynamic_activation_intx_weight_quantize_tensorr,  I  sF    &&L22 4422.. 44$*$H$H!::<1 
PQWQ[Q[Q]P^_ $h//'22
	&	0	0!&&!+ 	
34F4K4K3LM	
+ \\"%
BCUBVW
 	
 ZJ>>Q{55555..--11//	N 	0AAA0< 
&&9%:;<	1 $):)@)@EKK)O-00<)11( 5&C!+	J %*<*L3 2	
 H ,%GGX;N

 xra   rd   )parameter_namer  r  r   r-  c          	         [        [        X5      U R                  UUUS9u  pV[        U U[        R
                  R                  USS95        Uc  S U l        [        R                  " [        [        U R                  US9U 5      U l        U $ )Nr  Frw   original_extra_reprr-  )r,  r}   r   rW   rg   rh   ry   r   r   r   r2   r   )r   r   r-  r  r  r*  r+  s          r`   ._int8_dynamic_activation_intx_weight_transformr1    s     P'!+J :U;
 (( & 1 1)	

 	F Mra   c                       \ rS rSr% SrSr\\S'   Sr\	\S'   \
R                  r\
\S'   \R                  r\\S'   S	r\\S
'   Sr\\S'   S rSrg)Int4WeightOnlyConfigi  a  
Configuration for int4 weight only quantization, only groupwise quantization is supported
right now, and we support version 1 and version 2, that are implemented differently although with
same support. In version 2, different target are mainly distinguished by `packing_format` arg, and in version 1, mainly by `layout`.

Args:
    `group_size`: parameter for quantization, controls the granularity of quantization, smaller
     size is more fine grained, choices are [256, 128, 64, 32], used in both version 1 and 2
    `int4_packing_format`: the packing format for int4 tensor, used in version 2 only
     `int4_choose_qparams_algorithm`: variants of choose qparams algorithm to use for int4,
     currently support TINYGEMM ("tinygemm") and HQQ ("hqq"), used in version 2 only
    `set_inductor_config`: if True, adjusts `torchinductor` settings to recommended values. used in both version 1 and 2
    `version`: version of the config to use, default is 2
    `int4_tile_packed_ntile`: ntile size for TILED_PACKED_TO_4D format, default is 8 for CUDA platform, 16 for ROCm platform

Example:

.. literalinclude:: ../../examples/inference/int4_weight_only.py
   :language: python
   r   Tset_inductor_configint4_packing_formatint4_choose_qparams_algorithm   int4_tile_packed_ntiler   r  c                 p    U R                   S;   d   S5       e[        R                  R                  S5        g )N)r8     z-int4_tile_packed_ntile must be either 8 or 16z)torchao.quantization.Int4WeightOnlyConfig)r9  rg   r   r   r   s    r`   r  "Int4WeightOnlyConfig.__post_init__  s5    **g5 	
;	
5 	$$%PQra   r   N)r   r   r   r   r  r   r	  r  r5  boolr!   PLAINr6  r    TINYGEMMr7  r9  r  r  r   r   ra   r`   r3  r3    sc    * J $$->-D-D*D"++ "#=  #$C#GSRra   r3  c                    UR                   nUR                  nUR                  nUR                  nU R                  S   U-  S:w  a'  [
        R                  SU R                   SU 35        U $ [        [        U R                  S-
  5       Vs/ s H  nSPM     snU/-   5      nUR                  S:X  d   e[        U5      nU[        R                  :X  a  U[        R                  :X  d   SU S35       eU[        R                   :X  a&  ["        R$                  " U U[&        R(                  S	9nU$ U[        R*                  :X  a  [,        R$                  " U U5      nU$ U[        R.                  :X  a  [0        R$                  " U U5      nU$ U[        R                  :X  a  [2        R$                  " U UUUS
9nU$ [5        SU 35      es  snf )NrO   r   zZSkipping quantizing weight with int4 weight only quantization because the shape of weight z# is not compatible with group_size r6   r   zBInt4ChooseQParamsAlgorithm.HQQ is not supported by packing format zF, it's only supported by Int4PackingFormat.TILE_PACKED_TO_4D currentlyactivation_dtype)r7  
ntile_sizez!Unsupported int4 packing format: )r   r7  r6  r9  r   loggerinfotupler  ndimr  rT   r    HQQr!   TILE_PACKED_TO_4DPRESHUFFLEDr#   r%  rg   bfloat16r>  r$   PLAIN_INT32r"   r%   r   )	rd   r   r   r7  r6  r9  r   r(  r*  s	            r`   !_int4_weight_only_quantize_tensorrM    s    ""J$*$H$H! 44#::||B*$)hioiuiuhv  wZ  [e  Zf  g	
 5q#9:#9a#9:j\IJJ>>Qj!J$(B(F(FF"&7&I&II 	
PQdPe fS T	
I
 /;;;*22"^^


 	 1 7 7	7''

 	 1 = =	=)11

 	 1 C C	C-55*G-	

 <=P<QRSSO ;s   Gr-  c          	         UR                   (       a(  [        R                  R                  R	                  5         [        X5      (       d   SU S3SU  S3-   5       e[        [        X5      U5      n[        U U[        R                  R                  USS95        [        R                  " [        [        U R                   US9U 5      U l        U $ )Nz8applying int4 weight only quant requires module to have 
 attribute but  does not have oneFrw   r/  )r5  torchaoquantizationutils"recommended_inductor_config_setterrj   rM  r}   rW   rg   rh   ry   r   r   r   r2   r   r   r   r-  r*  s       r`   _int4_weight_only_transformrX    s     !!""EEG6** 
B>BRR\]&+
,	-* 3'J :U;
 (( & 1 1)	

 	F Mra   c                   (    \ rS rSr% SrSr\\S'   Srg)'Float8DynamicActivationInt4WeightConfigi9  a  Configuration for apply float8 dynamic per row quantization and int4
per group weight quantization to linear
(only group_size 128 is supported right now since underlying kernel used only supports 128
and above and no benefits of making it bigger)

Args:
    `int4_packing_format`: how the weight is packed, supported values are "preshuffled" and "plain"

Example:

.. literalinclude:: ../../examples/inference/float8_dynamic_activation_int4_weight.py
   :language: python
preshuffledr6  r   N)	r   r   r   r   r  r6  r!   r  r   r   ra   r`   rZ  rZ  9  s     .;*:ra   rZ  c          	         [        X5      (       d   SU S3SU  S3-   5       eUR                  nUS;   d
   SU 35       e[        X5      nSn[        [	        UR
                  S-
  5       Vs/ s H  nSPM     snU/-   5      nUS	:X  a/  [        R                  " U R                  U[        R                  S
9nO.[        R                  " U R                  U[        R                  S
9n[        U U[        R                  R                  USS95        [        R                   " [#        [$        U R&                  US9U 5      U l        U $ s  snf )NzMapplying float8 dynamic activation int4 weight quant requires module to have rP  rQ  rR  )r[  plainzIonly preshuffled and plain int4_packing_format supported right now, got: r4  r6   r[  rA  Frw   r/  )rj   r6  r}   rT   r  rG  r#   r%  rd   rg   float8_e4m3fnr$   rW   rh   ry   r   r   r   r2   r   )	r   r   r-  r6  rd   r   r   r(  r*  s	            r`   0_float8_dynamic_activation_int4_weight_transformr_  L  sj    6** 
WXfWggqr&+
,	-* !44 #   TTgShi	  V,FJ%a"89"8Qq"89ZLHIJm+*22MM"00

  ''MM"00

 :U;
 (( & 1 1)	

 	F M; :s   +Ec                   n    \ rS rSr% SrSr\\   \S'   \	" 5       r
\\   \S'   Sr\\S'   Sr\\S	'   S
 rSrg)Int8WeightOnlyConfigi  a  
Configuration for applying int8 weight-only symmetric per-channel quantization to linear layers.

Args:
    group_size (version 1) - Controls the granularity of quantization.
    If None, applies per-channel quantization. Otherwise, applies per-group quantization with the specified group size.
    granularity (version 2) - Quantization granularity.
        PerRow() for per-channel quantization, PerTensor() for per-tensor quantization,
        PerGroup(group_size) for per-group quantization.
    set_inductor_config: bool = True - If True, adjusts `torchinductor` settings to recommended values
        for better performance with this quantization scheme.

Example:

.. literalinclude:: ../../examples/inference/int8_weight_only.py
   :language: python
Nr   granularityTr5  r6   r  c                 >   [         R                  R                  S5        U R                  S:X  an  U R                  b"   SU R                   SU R                   S35       e[        U R                  [        [        [        45      (       d   SU R                   35       eg g )Nz)torchao.quantization.Int8WeightOnlyConfigr   z1Only support version 2 with group_size=None, got z. Use granularity=PerGroup(z
) instead.z<granularity must be PerTensor, PerRow, or PerGroup, but got )
rg   r   r   r  r   rf   rb  r;   r:   r9   r   s    r`   r  "Int8WeightOnlyConfig.__post_init__  s    $$%PQ<<1??* CDOOCT U,,0OO+<JH* d..FH0MNN NtO_O_N`aN ra   r   )r   r   r   r   r  r   r
   r	  r  r:   rb  r7   r5  r=  r  r  r   r   ra   r`   ra  ra    sB    $ !%J$)/K+&1 $$GS	ra   ra  c           	      l   UR                   S:X  a  [        R                  " S5        [        R                  n[
        R                  n[
        R                  " [
        R                  5      R                  n[
        R                  nUR                  nUc  U R                  S   n[        [        U R                  5       S-
  5       Vs/ s H  nSPM     snU/-   5      n[!        U UUUUUS9n	U	$ UR                   S:X  d   SUR                    35       e["        R$                  " XR&                  S9n	U	$ s  snf )Nr6   zConfig Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more detailsrO   )r   r   r   Unexpected version: )rb  )r  warningswarnrB   r   rg   r   r   r   r   int64r   r   rF  r  r  r   r&   r%  rb  )
rd   r   r   r   r   r   r   r   r(  r*  s
             r`   !_int8_weight_only_quantize_tensorrj    s   ~~ \	
 #,,zzkk%--(,, ;;&&
b)JuVZZ\A-='>?'>!A'>?:,NO
--

  ~~"K&:6>>:J$KK"''<N<NO
 @s   D1c          	         UR                   (       a(  [        R                  R                  R	                  5         [        X5      (       d   S5       e[        [        X5      U5      n[        U U[        R                  R                  USS95        [        R                  " [        [        U R                   US9U 5      U l        U $ )Nzqapplying int8 weight only quant requires module to have {parameter_name} attribute but {module} does not have oneFrw   r/  )r5  rS  rT  rU  rV  rj   rj  r}   rW   rg   rh   ry   r   r   r   r2   r   r   r   r-  quantized_tensors       r`   _int8_weight_only_transformrn    s     !!""EEG6** 	,* 9' +5A
 (( & 1 1)	

 	F Mra   c                     [         R                  n[        R                  nSnSnSn[	        U U[        U 5      UUUUU R                  [        R                  :X  a  [        R                  S9$ S S9$ r   )	rB   r   rg   r   r   rG   r  float16r   r   s         r`   (_int8_symm_per_token_reduced_range_quantrq    ss    ((L::L
CII#	!!$%&WW%=EMM	 	 DH	 	ra   c                 
   [         R                  n[        R                  nSnSnSnU R                  S   S:X  a  U $ [        U U[        U 5      UUUUU R                  [        R                  :X  a  [        R                  S9$ S S9$ )Nr   r   r   r6   r   )
rB   r   rg   r   r   r   rG   r  rp  r   r   s         r`   4_int8_symm_per_token_reduced_range_quant_noop_decoders    s     ((L::L
CIIwwqzQ'%a()*EMM)A	
 		
 HL	
 		
ra   act_granularityr   c                 8   [         [        4n[        X5      (       d  [        S[	        U 5       S35      e[        U [        5      (       a)  U R
                  S:w  a  [        SU R
                   S35      e[        X5      (       d  [        S[	        U5       S35      eg )Nz)Unsupported activation granularity type: z*. Only PerTensor and PerRow are supported.rO   zMOnly PerRow(dim=-1) is supported for activation quantization, got PerRow(dim=zS). Per-feature activation quantization is not supported due to slicing limitations.z%Unsupported weight granularity type: )r;   r:   rf   r   r   r  )rt  r   	supporteds      r`   _validate_granularity_int8rw    s     F#Io117_8M7N O7 8
 	
 /6**/B/Bb/H-112 3_`
 	
 (443D9K4L3M N7 8
 	
 5ra   c                       \ rS rSr% SrSr\\   \S'   \	R                  r\\	   \S'   Sr\\S'   \" 5       r\\\\\\4   \\   4      \S'   S	r\\S
'   Sr\\S'   S rSrg)%Int8DynamicActivationInt8WeightConfigi%  a  
Configuration for applying int8 dynamic per-token activation and int8 per-channel weight
quantization to linear layers.

Args:
    layout: Optional[Layout] = PlainLayout() - Tensor layout for the quantized weights. Controls how the
        quantized data is stored and accessed.
    granularity: Optional[Union[Granularity, Tuple[Granularity, Granularity], List[Granularity]]] = PerRow()
        The granularity for quantization. Can be either a single granularity (applied to both
        activations and weights) or a tuple / list of two granularities (first for activations, second for weights).
        If None, defaults to PerRow for both. Only PerTensor and PerRow are supported.
    act_mapping_type: Optional[MappingType] = MappingType.SYMMETRIC - Mapping type for activation quantization.
        SYMMETRIC and ASYMMETRIC are supported for version 2.
    weight_only_decode: bool = False - If True, only quantizes weights during forward pass and keeps activations
        in original precision during decode operations.
    set_inductor_config: bool = True - If True, adjusts `torchinductor` settings to recommended values
        for better performance with this quantization scheme.
    version (int): the version of the config, version 1 is using AffineQuantizedTensor that we plan to deprecate/split, version 2 is using Int8Tensor

Example:

.. literalinclude:: ../../examples/inference/int8_dynamic_activation_int8_weight.py
   :language: python
Nlayoutr  Fweight_only_decoderb  Tr5  r6   r  c                     [         R                  R                  S5        U R                  c  [	        5       U l        U R
                  S:X  a.  [        R                  " U R                  5      u  p[        X5        g g )Nz:torchao.quantization.Int8DynamicActivationInt8WeightConfigr   )
rg   r   r   rz  r   r  r&   r   rb  rw  r   rt  r   s      r`   r  3Int8DynamicActivationInt8WeightConfig.__post_init__I  sc    $$H	
 ;;%-DK<<12<2S2S  3/O 'K	 ra   )rz  )r   r   r   r   r  rz  r
   r   r  rB   r   r  r{  r=  r:   rb  r   r7   r   rT   r5  r  r	  r  r   r   ra   r`   ry  ry  %  s    2  $FHV#.9.C.Ch{+C$$ 	 k5k!9:D<MMN  !%$GS
Lra   ry  c                 P   UR                   S:X  Ga%  UR                  nUR                  nUR                  nU R                  S   nUS::  a'  [
        R                  SU R                   SU 35        U $ [        R                  n[        R                  nS n[        R                  n	[        R                  " [        R                  5      R                  n
[        R                   nU(       a  ["        nO!U[        R                  :X  a  [$        nO[&        nU" U 5      n[)        U UUU	U
UUUS9n[+        X5      nU$ [,        R.                  " UR0                  5      u  nnUR                   S:X  d   S	UR                    35       e[,        R2                  " U U[5        UUR                  S
9S9nU$ )Nr6   rO   r;  zKSkipping applying Int8DynamicActivationInt8WeightConfig to weight of shape z  because `in_feature` is <= 16: c                     [        [        U R                  5       S-
  5       Vs/ s H  nSPM     snU R                  S   /-   5      $ s  snf )Nr6   rO   )rF  r  r  r   )r   r   s     r`   get_weight_block_sizeS_int8_dynamic_activation_int8_weight_quantize_tensor.<locals>.get_weight_block_sizei  s?    U1557Q;%78%7!%78AGGBK=HII8s   A)r   r   _layoutzero_point_domainr   rf  rb  r   )rb  act_quant_kwargs)r  rz  r  r{  r   rD  rE  rB   r   rC   NONErg   r   r   r   r   ri  rs  rq  r   r   r=   r&   r   rb  r%  r,   )rd   r   rz  r  r{  in_featuresr   weight_zero_point_domainr  r   r   r   input_quant_funcr(  r*  quantized_weightrt  r   s                     r`   4_int8_dynamic_activation_int8_weight_quantize_tensorr  V  s   ~~!22#66ll2&"KK]^d^j^j]k2;-A M #,,#2#7#7 	J zzkk%--(,, ;;S  ;#8#88#K #> *62
--6	

 :*W   /9.O.O/
++ ~~"K&:6>>:J$KK"%--*7+#44
 ra   c          	         UR                   (       a(  [        R                  R                  R	                  5         [        X5      (       d   SU S3SU  S3-   5       e[        [        X5      U5      n[        U U[        R                  R                  USS95        [        R                  " [        [        U R                   US9U 5      U l        U $ )NzKapplying int8 dynamic activation int8 weight quant requires module to have rP  rQ  rR  Frw   r/  )r5  rS  rT  rU  rV  rj   r  r}   rW   rg   rh   ry   r   r   r   r2   r   rW  s       r`   ._int8_dynamic_activation_int8_weight_transformr    s     !!""EEG6** 
UVdUeeop&+
,	-* F'J :U;
 (( & 1 1)	

 	F Mra   c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   \" 5       r\\\\\\4   \\   4      \	S'   \R$                  r\\   \	S'   Sr\\	S	'   S
r\\	S'   S rS\4S jrSrg)$Int8StaticActivationInt8WeightConfigi  a  
Configuration for applying int8 static quantization to both activation and weight

Args:
    act_quant_scale (torch.Tensor): The scale tensor for activation quantization.
    act_quant_zero_point (torch.Tensor): The zero_point tensor for activation quantization (asymmetric only).
    granularity (Optional[Union[Granularity, Tuple[Granularity, Granularity], List[Granularity]]] = PerRow()):
        The granularity for quantization. Can be either a single granularity (applied to both
        activations and weights) or a tuple / list of two granularities (first for activations, second for weights).
        If None, defaults to PerRow for both. Only PerTensor and PerRow are supported.
    act_mapping_type (MappingType): The mapping type for activation quantization. SYMMETRIC and ASYMMETRIC are supported.
    set_inductor_config (bool): if True, adjusts `torchinductor` settings to recommended values.
    version (int): the version of the config
Nact_quant_scaleact_quant_zero_pointrb  r  Tr5  r6   r  c                     [         R                  R                  S5        [        R                  " U R
                  5      u  p[        X5        g )Nz9torchao.quantization.Int8StaticActivationInt8WeightConfig)rg   r   r   r&   r   rb  rw  r}  s      r`   r  2Int8StaticActivationInt8WeightConfig.__post_init__  s?    $$G	
 /9.O.O/
+ 	#?Gra   rM   c                 n    [         R                  " U R                  5      u  p[        UU R                  S9$ )zGet the activation quantization kwargs for static quantization.

Returns:
    QuantizeTensorToInt8Kwargs with the configured granularity and mapping type.
r  )r&   r   rb  r,   r  r}  s      r`   get_act_quant_kwargs9Int8StaticActivationInt8WeightConfig.get_act_quant_kwargs  s;     /9.O.O/
+ *'..
 	
ra   r   )r   r   r   r   r  r  r
   rg   Tensorr  r  r:   rb  r   r7   r   rT   rB   r   r  r5  r=  r  r	  r  r,   r  r   r   ra   r`   r  r    s     /3OXell+237(5<<07 	 k5k!9:D<MMN  /:.C.Ch{+C $$GSH
&@ 
ra   r  c          	         [         R                  " UR                  5      u  p4[        X5      (       d   SU S35       eUR                  (       a(  [
        R                  R                  R                  5         S nUR                  b  UR                  R                  5       n[         R                  " [        X5      U[        UUR                  S9UR                  R                  5       US9n[!        U U["        R$                  R'                  USS95        [(        R*                  " [-        [.        U R0                  US9U 5      U l        U $ )Nz#Expected module to have attribute `z` but not foundr  )rb  r  r  r  Frw   r/  )r&   r   rb  rj   r5  rS  rT  rU  rV  r  detachr%  r}   r,   r  r  rW   rg   rh   ry   r   r   r   r2   r   )r   r   r-  activation_granularityr   r  rm  s          r`   -_int8_static_activation_int8_weight_transformr    s9    2<1R1R2. 6** 
-n-=_M* !!""EEG"".%::AAC!))'&3.00
 ..5571	 +5A
 (( & 1 1)	

 	F Mra   c                   ^    \ rS rSr% Sr\r\R                  \	S'   Sr
\\	S'   Sr\\	S'   S rS	rg
)Float8WeightOnlyConfigi  ah  
Configuration for applying float8 weight-only symmetric per-channel quantization to linear layers.

Args:
    weight_dtype (torch.dtype): The target data type for weight quantization. Default is torch.float8_e4m3fn.
    set_inductor_config (bool): if True, adjusts `torchinductor` settings to recommended values.
    version (int): the version of the config, version 1 is deprecated, version 2 is using Float8Tensor (default)

Note:
    The actual matmul will be computed in original precision of the weight tensor.

Example:

.. literalinclude:: ../../examples/inference/float8_weight_only.py
   :language: python
r   Tr5  r   r  c                 B    [         R                  R                  S5        g )Nz+torchao.quantization.Float8WeightOnlyConfig)rg   r   r   r   s    r`   r  $Float8WeightOnlyConfig.__post_init__2  s    $$%RSra   r   N)r   r   r   r   r  r   r   rg   r  r  r5  r=  r  r	  r  r   r   ra   r`   r  r    s2    " !+L%++* $$GSTra   r  c                     UR                   S:X  d   SUR                    35       eUR                  n[        R                  " X[	        5       S9nU$ )Nr   rf  )float8_dtyperb  )r  r   r   r%  r:   )rd   r   r   r*  s       r`    _float8_weight_only_quant_tensorr  6  sP    >>QG"6v~~6F GG&&L%%vxJ ra   c          	         UR                   (       a(  [        R                  R                  R	                  5         [        X5      (       d   S5       e[        U [        5      (       a  [        U 5      n [        [        X5      U5      n[        U U[        R                  R                  USS95        [        R                   " [#        [$        U R&                  US9U 5      U l        U $ )Nzsapplying float8 weight only quant requires module to have {parameter_name} attribute but {module} does not have oneFrw   r/  )r5  rS  rT  rU  rV  rj   rf   r   _unwrap_float8_linearr  r}   rW   rg   rh   ry   r   r   r   r2   r   rl  s       r`   _float8_weight_only_transformr  ?  s     !!""EEG6** 	,*
 &,''&v.7' +5A
 (( & 1 1)	

 	F Mra   c                   *   \ rS rSr% Sr\r\R                  \	S'   \r
\R                  \	S'   Sr\\\\\   4      \	S'   \R"                  r\\   \	S'   Sr\\   \	S'   Sr\\   \	S	'   Sr\\   \	S
'   \R2                  r\\	S'   Sr\\	S'   Sr\\	S'   S rSr g))Float8DynamicActivationFloat8WeightConfigie  a~  
Configuration for applying float8 dynamic symmetric quantization to both activations and weights of linear layers.

Args:
    activation_dtype (torch.dtype): The target data type for activation quantization. Default is torch.float8_e4m3fn.
    weight_dtype (torch.dtype): The target data type for weight quantization. Default is torch.float8_e4m3fn.
    granularity (Optional[Union[FP8Granularity, List[FP8Granularity]]]):
        The granularity for quantization. Can be either a single granularity (applied to both
        activations and weights) or a tuple of two granularities (one for activations, one for weights).
        If None, defaults to PerTensor for both. Currently both quantizations need to be the same type. And
        only PerTensor and PerRow are supported.
    mm_config (Float8MMConfig): Configuration for the matrix multiplication. Default uses fast accumulation.
    activation_value_lb (Optional[float]): the lower bound for activation value for calculating scale
    activation_value_ub (Optional[float]): the upper bound for activation value for calculating scale
    kernel_preference (KernelPreference): kernel preference for ops like matmul, grouped matmul etc. by defalut (KernelPreference.AUTO) it will be chosen for user based on hardware or other information, this only needs to be set in weight
    set_inductor_config (bool): if True, adjusts `torchinductor` settings to recommended values.
    version (int): the version of the config, version 1 is deprecated, version 2 is using Float8Tensor (default)

Example:

.. literalinclude:: ../../examples/inference/float8_dynamic_activation_float8_weight.py
   :language: python
rB  r   Nrb  packing_format	mm_configactivation_value_lbactivation_value_ubkernel_preferenceTr5  r   r  c                    [         R                  R                  S5        [        U R                  5      u  pX/U l        Sn[        U R                  5      (       aN  U R                  [        R                  [        R                  4;   d   S5       eU R                  S:  d   S5       eSn[         R                  R                  5       (       a  SnU R                  c  [        US9U l        g g )Nz>torchao.quantization.Float8DynamicActivationFloat8WeightConfigTunimplementedr   F)use_fast_accum)rg   r   r   r   rb  r   r  r   AUTOTORCHr  xpuis_availabler  r   )r   r  r   default_use_fast_accums       r`   r  7Float8DynamicActivationFloat8WeightConfig.__post_init__  s    $$L	
 6L6
2 3G!%,T-=-=>>)) %% &&.     <<1$5o5$%*"99!!##%*">>!+;QRDN "ra   )rb  r  )!r   r   r   r   r  r   rB  rg   r  r  r   rb  r
   r   r   r	   r   r>  r  r  r   r  floatr  r   r  r  r5  r=  r  r	  r  r   r   ra   r`   r  r  e  s    0 %/ekk. *L%++*IMK%^0D DEFM4G4M4MNH01M*.Ix'.+/%/+/%/*:*?*?'? $$GSSra   r  c           	         UR                   nUR                  nUR                  nUR                  nUR                  nUR
                  nUR                  nUR                  n	[        U5        Uu  pU R                  5       S;   a`  [        U
[        5      (       a  [        U[        5      (       d   S5       eU R                  S   S-  S:w  d  U R                  S   S-  S:w  a  U $ O[        U 5      (       d  U $ UR                  S:X  d   SUR                   35       eU	[        R                   :X  a:  [        U["        5      (       a%  U R$                  [&        R(                  :X  d   S5       e[+        UU
UUUS	9nU	[        R                   :X  a  [,        R.                  " U UUUUUS
9nU$ U	[        R0                  :X  a5  [        U["        5      (       d   S5       e[2        R.                  " U UUUS9nU$ g )N)      zH4D/5D tensor only supports per tensor activation and weight quantizationr   r;  r6   r   rf  zBPerRow quantization only works for bfloat16 precision input weight)hp_value_lbhp_value_ubr  )r  rb  r  r  r  z8Sparse packing format only supports per-row quantization)r  rb  r  )rB  r   rb  r  r  r  r  r  r   r  rf   r;   r   r0   r  r   r>  r:   r  rg   rK  r+   r   r%  SPARSE_CUTLASSr-   )rd   r   rB  r   rb  r  r  r  r  r  r  r   r  r  s                 r`   8_float8_dynamic_activation_float8_weight_quantize_tensorr    s   ..&&L$$K  I 44 4400**N K(1<. zz|v0)<<	B
 B
 	VU	V 
 <<?R1$Q"(<(AM )BF## >>QG"6v~~6F GG,222zF8 8 ||u~~- 	
P	
- 4''+ ,222'//%*/-
  	.==	=,f55 	
F	
5 8??%*-	
   
>ra   c          	      n   [         R                  R                  5       (       a%  [        5       (       d  [	        5       (       d   S5       eUR
                  (       a(  [        R                  R                  R                  5         [        X5      (       d   SU S3SU  S3-   5       e[        U [        5      (       a  [        U 5      n [        [        X5      U5      n[!        U U[         R"                  R%                  USS95        [&        R(                  " [+        [,        U R.                  US9U 5      U l        U $ )	NzPFloat8 dynamic activation quantization is only supported on CUDA>=8.9 and MI300+zKapplying float8 dynamic activation quant requires module to have parameter rP  rQ  rR  Frw   r/  )rg   cudar  r5   r4   r5  rS  rT  rU  rV  rj   rf   r   r  r  r}   rW   rh   ry   r   r   r   r2   r   rl  s       r`   2_float8_dynamic_activation_float8_weight_transformr    s     zz   ""hjj 	
^	
0 !!""EEG6** 
UVdUeeop&+
,	-* &,''&v.O' +5A
 (( & 1 1)	

 	F Mra   intx_unpacked_tensor	hp_tensorr   c           
         [        U [        5      (       d   eU R                  R                  U5      U l        [        U R
                     u  p4[        UU R                  U R                  U R                  [        R                  UUS9U l        g)aP  
Adjusts the scale_dtype on IntxUnpackedToInt8Tensor.
Updating the scale dtype requires updating the qdata because qdata is calculated after the scale.
This is used in IntxWeightOnlyConfig and Int8DynamicActivationIntxWeightConfig to make
version=2 and version=1 numerically equivalent when the scale_dtype differs from the input dtype
)output_dtyper   r   N)rf   r*   scalerS   rA   r   rD   r(  
zero_pointrg   r   qdata)r  r  r   qminqmaxs        r`   r&  r&    s     *,DEEEE!5!;!;!>!>{!K()=)J)JKJD!0''""''ZZ"ra   c                      \ rS rSr% Sr\R                  r\R                  \	S'   \
" S5      r\\	S'   \R                  r\\	S'   Sr\\R                     \	S'   \R&                  r\\	S	'   \R,                  r\\	S
'   Sr\\	S'   S rSrg)IntxWeightOnlyConfigi+  a  
Configuration for quantizing weights to torch.intx, with 1 <= x <= 8.
Weights are quantized with scales/zeros in a groupwise or channelwise
manner using the number of bits specified by weight_dtype.
args:
    `weight_dtype`: The dtype to use for weight quantization.  Must be torch.intx, where 1 <= x <= 8.
    `granularity`: The granularity to use for weight quantization.  Must be PerGroup or PerAxis(0).
    `mapping_type`: The type of mapping to use for the weight quantization.
        Must be one of MappingType.ASYMMETRIC or MappingType.SYMMETRIC.
    `scale_dtype`: The dtype to use for the weight scale.
    `intx_packing_format`: The format to use for the packed weight tensor (version 2 only).
    `intx_choose_qparams_algorithm`: The algorithm to use for choosing the quantization parameters.
    `version`: version of the config to use, only subset of above args are valid based on version, see note for more details.

Example:

.. literalinclude:: ../../examples/inference/intx_weight_only.py
   :language: python
r   r   rb  r   Nr   r  r  r   r  c           	         [         R                  R                  S5        U R                  [	        SS5       Vs/ s H  n[        [         SU 35      PM     sn;   d   SU R                   35       e[        U R                  [        [        45      (       d   SU R                   35       e[        U R                  [        5      (       a8  U R                  R                  S:X  d   SU R                  R                   35       eU R                  [        R                  [        R                  [        R                  4;   d   S	U R                   35       eg s  snf )
Nz)torchao.quantization.IntxWeightOnlyConfigr6   r  r	  r
  z1granularity must be PerAxis or PerGroup, but got r   r  zvmapping_type must be MappingType.ASYMMETRIC, MappingType.SYMMETRIC, or MappingType.SYMMETRIC_NO_CLIPPING_ERR, but got )rg   r   r   r   r  r}   rf   rb  r8   r9   r  r   rB   r   r   r  r  s     r`   r  "IntxWeightOnlyConfig.__post_init__K  sV   $$%PQ  aQR$T1WUc!I%>$TT 	
J4K\K\J]^	
T $**Wh,?@@ 	
?@P@P?QR	
@ d&&00##((A- 78H8H8M8M7NO-   ""!!11%
 
 	

 E  FJ  FW  FW  EX  Y	
 
 %Us   Er   )r   r   r   r   r  rg   r   r   r  r  r8   rb  r7   rB   r   r   r   r
   r)   r  r  r'   r  r  r  r	  r  r   r   ra   r`   r  r  +  s    ( !&

L%++*&qzK) + 5 5L+5)-K%++&-->-O-O*O")) "#=  GS
ra   r  c          
         UR                   nUR                  nUR                  nUR                  nUR                  nUR
                  n	U R                  5       S:X  a  Sn
O3U R                  5       S:X  a  Sn
O[        SU R                  5        35      e[        U[        5      (       a  UR                  nOW[        U[        5      (       a4  UR                  S:X  d   SUR                   35       eU R                  U
   nO[        SU 35      eU R                  5       S:X  a  SU4nOU R                  5       S:X  d   eSUSS4nUR                  S:X  d   eUR                  [        R                   :X  az  Ub=  UR"                  [$        R&                  :X  a  UR)                  [$        R*                  5      n[,        R.                  " U UUUUUU	S	9nUb  XpR"                  :w  a  [1        XU5        U$ [        S
U 35      e)Nr   rO   r  r6   z>IntxWeightOnlyConfig only works for 2-d and 4-d Tensors, got: r   r  z-granularity must be PerGroup or PerAxis, got )r   r  r  r  r  )r   rb  r   r   r  r  r  r   rf   r9   r   r8   r  r   r  r)   r  r  rg   r   rS   r   r*   r%  r&  )rd   r   r  r  r   rb  r   r   r  r  	input_dimr   r(  r*  s                 r`   !_intx_weight_only_quantize_tensorr  `  s    &&L$$K&&L$$K 44$*$H$H!zz|q				LVZZ\N[
 	
 +x(( ++
	K	)	)1$ 	
3K4D4D3EF	
$ \\),
HVWWzz|q_
 zz|q   Q*
>>Q!!%6%G%GG(->-D-D-S 1 4 4UZZ @-55%%/*G

 "{ll'B7
KX78K7LMNNra   c          	      ,   [        X5      (       d   SU S3SU  S3-   5       e[        [        X5      UUUS9n[        U U[        R
                  R                  USS95        [        R                  " [        [        U R                  US9U 5      U l        U $ )	Nz8applying intx weight only quant requires module to have rP  rQ  rR  r  Frw   r/  )rj   r  r}   rW   rg   rh   ry   r   r   r   r2   r   )r   r   r-  r  r  r*  s         r`   _intx_weight_only_transformr    s     6** 
B>BRR\]&+
,	-* 3'!+	J :U;
 (( & 1 1)	

 	F Mra   c                       \ rS rSr% Sr\" \S9r\\	\
\   4   \S'   \" \S9r\\	\
\   4   \S'   Sr\\S'   S rS	 rS
rg)r   i  a  Configuration class for applying different quantization configs to modules or parameters based on their fully qualified names (FQNs).

Args:
    `fqn_to_config`: typing.OrderedDict[str, Optional[AOBaseConfig]]: an
     ordered dictionary from
         (1). fully qualified name (fqn) of module or parameter
         (2). regex of fully qualified name (in python `re` module regex format), should
              start with prefix "re:" or
         (3). "_default"
     to the config that we want to apply to the module/param or None

     Config key ordered by precedence:
       * fully qualified parameter name, e.g. `language.layers.0.q_proj.weight`
       * fully qualified module name, e.g. `language.layers.0.q_proj`
       * regex for parameter names, must start with `re:`, e.g. `re:language\.layers\..+\.q_proj.weight`.
         The first regex that matches will be applied.
       * regex for module names, must start with `re:`, e.g. `re:language\.layers\..+\.q_proj`,
         whichever regex fully matches the module fqn first will be applied
         (order of keys for dictionary are kept consistent since we are using OrderedDict)
       * "_default", fallback if no match for all previous keys
         (Note, when using `_default`, the config is applied to all modules, to apply
          it to only a subset of modules, e.g. with some types, it's better to filter
          the modules that we don't want to quantize before hand and configure them to
          None, e.g. `{"re:.+norm.+": None, "_default": linear_config}`) "_default" is not supported when filter_fn is not specified.
    `module_fqn_to_config`: typing.OrderedDict[str, Optional[AOBaseConfig]]: To maintain BC with ModuleFqnToConfig, to be deprecated later
    `version`: int: Version of config to use.

Note:
    - The order of patterns in the OrderedDict may matter as only the first matching pattern is applied
    - "_default" is ignored for parameter replacement.
)default_factoryr   module_fqn_to_configr6   r  c                 J   [         R                  R                  S5        [        U R                  5      S:  a>  [        U R
                  5      S:  a%  U R                  U R
                  :w  a  [        S5      e[        U R
                  5      S:  a*  [        U R                  5      S:X  a  U R
                  U l        [        U R                  5      S:  a*  [        U R
                  5      S:X  a  U R                  U l        SU R                  ;   a  [        R                  " S5        g g )Nz torchao.quantization.FqnToConfigr   zP`fqn_to_config` and `module_fqn_to_config` are both specified and are not equal!r   zConfig Deprecation: _default is deprecated and will no longer be supported in a future release. Please see https://github.com/pytorch/ao/issues/3229 for more details.)	rg   r   r   lenr   r  r   rg  rh  r   s    r`   r  FqnToConfig.__post_init__  s    $$%GH ""#a'D--.2""d&?&??b 
 t(()A-#d6H6H2IQ2N!%!:!:Dt!!"Q&3t/H/H+IQ+N(,(:(:D% +++MM y ,ra   c                 l    SR                  S/S U R                  R                  5        5       QSP5      $ )N
zFqnToConfig({c              3   8   #    U  H  u  pS U SU S3v   M     g7f)z  'z':
    ,Nr   ).0keyvalues      r`   	<genexpr>&FqnToConfig.__str__.<locals>.<genexpr>	  s(      &@
 #hugQ/&@s   z}))joinr   r   r   s    r`   __str__FqnToConfig.__str__  sF    yy&*&8&8&>&>&@ 	
 		
ra   )r   r  N)r   r   r   r   r  r   r   r   OrderedDictTypestrr
   r   r  r  r  r	  r  r  r   r   ra   r`   r   r     so    @ CH#CM?3(>#>?  JO#J/#x/E*EF  GS2

ra   r   r   c                 f    [         R                  " U 5      R                  R                  SS5      SL$ )zk
Returns True if the handler function has a "parameter_name" kwarg in its type signature, False otherwise.
r-  N)inspect	signature
parametersget)r   s    r`   "_handler_supports_fqn_quantizationr    s/     W%00445EtLTXXXra   fqnc                 f   Sn/ n[        [        U R                  5       5      5       HD  u  nu  pgU[        U 5      ;   d  M  [	        U5      S:  a  U SU 3OUnUR                  XVXx45        MF     [        U5       H~  u  pVpxXR                  ;   d  M  SnUR                  U   n	U	c  UR                  U5        M>  [        [        U	5         n
[        U
5      (       a	  U
" X	US9n Mi  [        [        U	5       S35      e   U(       d=  XR                  ;   a.  UR                  U   n	U	b  [        [        U	5         n
U
" X	5      $ U $ U H  u  pVpxUR                   H  nUR                  S5      (       d  M  [        R                  " US	S U5      (       d  M<  SnUR                  U   n	U	c  MR  [        [        U	5         n
[        U
5      (       a	  U
" X	US9n M}  [        [        U	5       S35      e   M     U(       d|  UR                   Hl  nUR                  S5      (       d  M  [        R                  " US	S U5      (       d  M<  UR                  U   n	U	b  [        [        U	5         n
U
" X	5      s  $ U s  $    U(       d9  UR                  R                  S
S5      n	U	b  [        [        U	5         n
U
" X	5      $ U $ )aO  This function expects a module that either is specified in FqnToConfig or has a parameter that is specified in FqnToConfig.

Args:
    module (torch.nn.Module): The module to be processed.
    fqn (str): The fully qualified name of the module containing the parameters.
    config (FqnToConfig): Configuration object containing regex patterns / fqn mapped
        to quantization configurations.

Returns:
    torch.nn.Module: The modified module with quantized parameters.

Raises:
    NotImplementedError: If the quantization configuration is not yet supported for parameter quantization.
Fr   rR   TNrN  zs does not yet support parameter quantization! Please see https://github.com/pytorch/ao/issues/3252 for more detailsre:r   r   )	enumeraterT   named_parametersdirr  appendr   r   r.   r   r  NotImplementedError
startswithre	fullmatchr  )r   r  r   parameter_config_foundtop_level_paramsir-  paramparameter_fqncr   patterns               r`   r   r     s   & #&/V5L5L5N0O&P""NS[(-0X\3%q()~  ##Q$MN 'Q 488H3I/5000%)"$$]3Ay $$Q'247;5g>>$V~NF-7)  $W  X  4J$ "c-A-A&A  %=.tAw7G6%%M 4D/5++G!!%((R\\'!"+}-U-U)-&((1=6tAw?G9'BB!(>!R1#Awi  ([  \  , 4D  "++G!!%((R\\'!"+s-K-K((1=6tAw?G"6--!M , "  $$Z6=.tAw7G6%%Mra   c                     XR                   ;   a"  U R                  S5      (       a   SU  S35       egUR                    H<  nUR                  S5      (       d  M  [        R                  " USS U 5      (       d  M<    g   g)aM  Check if a given fqn matches the exact fqn or regex pattern specified in FqnToConfig.

Args:
    fqn (str): The fully qualified name of the module.
    config (FqnToConfig): Configuration object containing regex patterns or raw FQNs for quantization.

Returns:
    bool: True if the fqn is specified in FqnToConfig. False otherwise.
r  zError: Exact match but regex z specified.Tr   NF)r   r  r  r  )r  r   !maybe_module_or_param_fqn_patterns      r`   r   r   }  s     """>>%(( 	
+C5<	
( 171E1E-0;;EBBr||1!"5sH H 	 2F
 ra   c                     U R                  5        H@  u  p4U[        U 5      ;   d  M  [        U5      S:  a  U SU 3OUn[        XR5      (       d  M@    g   g)a  Check if a given module contains top-level parameters that match the exact fqn or regex pattern specified in FqnToConfig.

Args:
    module (nn.Module): The module to be checked.
    fqn (str): The fully qualified name of the module.
    config (FqnToConfig): Configuration object containing regex patterns or raw FQNs for quantization.

Returns:
    bool: True if the module contains top-level parameters that match the fqn or regex pattern specified in FqnTo
r   rR   TF)r  r  r  r   )r   r  r   r]   r  r  s         r`   r   r     sS     ..03v;/23x!|se1TFOM%m<<	 1 ra   c                     [         R                  " S5         [        R                  " U R                  U R
                  5      nSSS5        U R                  Wl        U R                  Ul        U$ ! , (       d  f       N2= f)aE  
Unwrap a torchao Float8Linear by returning a nn.Linear with the same weights and bias.

Torchao inference quantization techniques are generally only applicable to nn.Linear
layers, so this helper is useful for unwrapping models trained with torchao float8 training,
which replaces nn.Linear layers with Float8Linear layers.
metaN)rg   rQ   rh   ri   r  out_featuresrd   r   )r   
new_modules     r`   r  r    sW     
f	YYv1163F3FG
 
JkkJO	 
	s   ,A//
A=)r   Nr   )Fr   )r  r  loggingr  r   rg  collectionsr   dataclassesr   r   	functoolsr   typingr   r   r	   r
   r   r   r  rg   torch.nnrh   torch.nn.utils.parametrizerU  r{   rS  torchao.core.configr   torchao.dtypesr   r   r   r   torchao.dtypes.utilsr   torchao.float8.configr   torchao.float8.float8_linearr   torchao.float8.inferencer   r   r   r   r   (torchao.prototype.quantization.quant_apir   =torchao.quantization.linear_activation_weight_observed_tensorr   torchao.quantization.observerr   %torchao.quantization.quantize_.commonr   (torchao.quantization.quantize_.workflowsr   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   %torchao.quantization.transform_moduler.   r/   torchao.quantization.utilsr0   r1   r2   r3   torchao.utilsr4   r5   rb  r7   r8   r9   r:   r;   "linear_activation_quantized_tensorr<   r=   linear_quant_modulesr>   r?   qatr@   quant_primitivesrA   rB   rC   rD   unifiedrE   rF   rG   	getLoggerr   rD  __all__FLOATLAYOUT_TO_ZERO_POINT_DOMAINLAYOUT_TO_PRESERVE_ZEROSrV   rp   rI   rH   r   r  r=  r   r   r   DevicerJ   r  r   r   r   r   r,  r1  r3  rM  rX  rZ  r_  ra  rj  rn  rq  rs  rw  ry  r  r  r  r  r  r  r  r  r  r  r  r&  r  r  r  r   rK   r  r   r   r   ri   r  serializationadd_safe_globalsr   ra   r`   <module>r)     s  	   	   # (  > > 1   0 0  ,  ( , 5  F    $ 
   1 ,			8	$
 O112 
 5  ,.+ sCx)+ 
+\$DJ CGN99N89N 9:N
 %((//3!7!=>?NbD
 ).e. CM+/	H
88??H
H
 %((//3!7!=>?H
 U[[''(	H
V5<< ELL $ELL U\\ ,%,, 5<< & @
L @
 @
P ,004G 
 5<<(G   -G T ""GH
 #+/04HHOO1 	
 5<<(  - XX__ IB $R< $R $RN8Tv ""67
 #	HHOO  	
 XX__ 8@ ;l ; ;$ ""IJ
 #	1HHOO131 	1
 XX__1 K1h !< ! !H6 ""67
 #	HHOO  	 8@  $
||

\\
.
 
#
 

4 -LL -L -L`>B ""GH
 	HHOO1
 XX__ I@ /
< /
 /
d ""FG
 	,HHOO,0, H,^ T\ T T4 ""89
 #	"HHOO""" 	"
 XX__" :"J :S :S :SzE P ""KL
 #	#HHOO#5# 	# M#L2||  
	2 1
< 1
 1
p ,004;O 5<<(	;O
  -;O| ""67
 #+/04HHOO  	
 5<<(  - XX__ 8D M
, M
 M
b   Yuxx5uxxFGY	Y[HHOO[	[ [|	6II	 0, 299     $ $#0ra   