
    @KiK                         S SK JrJr  SSKJr  \(       a  SSKJr  SSKJrJ	r	J
r
JrJr  SSKJr  \
" 5       (       a  S SKr\R                   " \5      rSr " S	 S
\5      rg)    )TYPE_CHECKINGOptional   )HfQuantizer   )PreTrainedModel)is_accelerate_availableis_kernels_availableis_torch_availableis_triton_availablelogging)get_module_from_nameNc                   D  ^  \ rS rSrSrSrSrS/rU 4S jrS r	S r
S%S
 jrSSS\S	\4S jrSSSSS\SS4S jrS&S jrSSS\\   S\\   4S jr S'SSS\\\      4S jjrS\\   S\S	\\   4S jrS rS rS\S	\4S jrS(S \4S! jjrS'S" jr\S	\4S# j5       rS$rU =r$ ))Mxfp4HfQuantizer'   z'
FP4 quantization using fbgemm kernels
TF
acceleratec                 B   > [         TU ]  " U40 UD6  Xl        S U l        g N)super__init__quantization_configtriton_kernels_hub)selfr   kwargs	__class__s      i/var/www/html/dynamic-report/venv/lib/python3.13/site-packages/transformers/quantizers/quantizer_mxfp4.pyr   Mxfp4HfQuantizer.__init__1   s$    ,77#6 "&    c                     U R                   c    SSKJn  U" S5      U l         U R                   $ U R                   $ ! [         a    [        S5      ef = f)z3Lazy import and initialize kernels only when neededr   )
get_kernelz kernels-community/triton_kernelsz2kernels package is required for MXFP4 quantization)r   kernelsr    ImportError)r   r    s     r   _lazy_import_kernels%Mxfp4HfQuantizer._lazy_import_kernels6   s]    ""*X.*45W*X' &&&t&&&  X!"VWWXs	   : Ac                 0   [        5       (       d  [        S5      eU R                  R                  (       a  g [        R
                  R                  5       (       df  [        R                  R                  5       (       dC  U R                  (       a'  [        R                  S5        SU R                  l        g [        S5      e[        5       (       d  [        S5      e[        R                  R                  5       (       a  Sn[        S5      =(       a
    [        5       nO?[        R
                  R                  5       nUS:  n[        S5      =(       a
    [        5       nU R                  (       a]  U(       d'  [        R                  S	5        SU R                  l        g U(       d'  [        R                  S
5        SU R                  l        g O$U(       d  [!        S5      eU(       d  [!        S5      eU R                  (       d  U R#                  5         UR%                  S5      nUc  [        R                  S5        g Ub\  U R                  (       dJ  ['        U[(        5      (       a4  SUR+                  5       ;   d  SUR+                  5       ;   a  [!        S5      eg g g g )NzqUsing mxfp4 quantization requires torchPlease install the latest version of torch ( pip install --upgrade torch )z^Using MXFP4 quantized models requires a GPU, we will default to dequantizing the model to bf16Tz-Quantizing a model using MXFP4 requires a GPUz9Using mxfp4 requires Accelerate: `pip install accelerate`z3.5.0)      z3.4.0u   MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, H100, or B200) or XPUs (e.g Intel® Data Center GPU Max Series) We will default to dequantizing the model to bf16.zMXFP4 quantization requires Triton and kernels installed: CUDA requires Triton >= 3.4.0, XPU requires Triton >= 3.5.0, we will default to dequantizing the model to bf16u   MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, H100, or B200) or XPUs (e.g Intel® Data Center GPU Max Series) zuMXFP4 quantization requires Triton and kernels installed: CUDA requires Triton >= 3.4.0, XPU requires Triton >= 3.5.0
device_mapzYou have loaded an FP4 model on CPU and have a CUDA/XPU device available, make sure to set your model on a GPU/XPU device in order to run your model. To remove this warning, pass device_map = 'cuda' or device_map = 'xpu'. cpudiskzYou are attempting to load an FP4 model with a device_map that contains a CPU or disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the CPU or disk device from the device_map.)r   r"   r   
dequantizetorchcudais_availablexpupre_quantizedloggerwarning_onceRuntimeErrorr	   r   r
   get_device_capability
ValueErrorr#   get
isinstancedictvalues)r   argsr   gpu_is_supportedkernels_availablecompute_capabilityr(   s          r   validate_environment%Mxfp4HfQuantizer.validate_environmentA   s<   !##] 
 ##..

''))UYY-C-C-E-E!!##t 7;((3"#RSS&((YZZ99!!### 3G < WAUAW!&!A!A!C1V; 3G < WAUAW###I 7;((3$##  7;((3 % " r  # H  !!%%'ZZ-
V #&&z400j//11Vz?P?P?R5R n  6S 1 ' $r   returnc                 X    Uc&  [         R                  n[        R                  SU5        U$ )NzOverriding dtype=%s with `dtype=torch.bfloat16` due to requirements of `fbgemm-gpu` to enable model loading in fp4. Pass your own dtype to specify the dtype of the remaining non-linear layers or pass dtype=torch.bfloat16 to remove this warning.)r,   bfloat16r1   info)r   dtypes     r   update_dtypeMxfp4HfQuantizer.update_dtype   s.    =NNEKK@  r   modelr   
param_namec                 @   SSK Jn  SSKJn  U R                  R
                  (       a'  SU;   d  SU;   a  [        XS [        S5      *  5      u  pgO[        X5      u  pg[        Xd5      (       d+  [        Xe5      (       a#  U R                  R
                  (       a  US;   a  gg	g)
Nr   Mxfp4GptOssExpertsGptOssExpertsblocksscales_blocks)down_proj_biasgate_up_proj_biasFT)	integrationsrK   models.gpt_oss.modeling_gpt_ossrM   r   r+   r   lenr7   )r   rG   rH   r   rK   rM   moduletensor_names           r   param_needs_quantization)Mxfp4HfQuantizer.param_needs_quantization   s    5C ##..H
4JhZdNd"6uIZCPYN?>["\FK"6u"IFf11v--$2J2J2U2UEEr   param_valueztorch.Tensortarget_deviceztorch.devicec                    SSK JnJnJnJn	Jn
  SSKJn  U R                  (       d  U R                  5       n[        X5      u  p[        R                  " U5         [        X5      (       a  U	" X,5      u  nnUR                  R                  UR                  R                   UR                  R"                  nnnU
" UUU5      u  nnSU;   a  SOSn[%        UUU5        [%        UU S3U" UU" U" 5       S9S95        ['        UU S	35        ['        UU S
35        S S S 5        g UR)                  S5      nUR)                  S5      nUR)                  S5      nUR)                  S5      nUR)                  S5      nSU;   d  SU;   a6  U R*                  R                  (       a  [        XS [-        S	5      *  5      u  pO[        X5      u  pUUUUUUS.n[        X5      (       d+  [        X5      (       an  U R*                  R                  (       aR  U R*                  R                  (       a  US [-        S	5      *  nU" XX$U40 UD6  g U" UUUUU R                  5       40 UD6  g g g ! , (       d  f       g = f)Nr   )rK   r+   load_and_swizzle_mxfp4quantize_to_mxfp4swizzle_mxfp4rL   gate_up_proj	down_proj_precision_config)rhs_data)weight_scaleflex_ctxrP   _scalesempty_paramcasting_dtypeto_contiguousrankdevice_meshrN   rO   )rg   rh   ri   rj   rk   rG   )rS   rK   r+   r]   r^   r_   rT   rM   r0   r#   r   r,   devicer7   
matmul_ogsPrecisionConfigFlexCtx
InFlexDatasetattrdelattrr6   r   rU   )r   rG   rZ   rH   r[   r   rK   r+   r]   r^   r_   rM   r   rV   _triton_weight_tensorrd   rn   ro   rp   projrg   rh   ri   rj   rk   shard_kwargsdq_param_names                               r   create_quantized_param'Mxfp4HfQuantizer.create_quantized_param   sa   	
 	
 	D!!!%!:!:!<,U?IFm,f999J;9k6(,*55EE*55==*55@@ /9WO
 :G,l<N:6(, .<z-I>{DFD*>?& 12'\G]g]iLjk FtfG$45FtfG$45+ -,4 !**]3K"JJ7M"JJ7M::f%D **]3KJ&(j*@dF^F^FiFi0CTc)n_8UV	0C	  +!.!.*L &55611d6N6N6Y6Y++66 %//@#i.$AMv;}m`lm*"#%113 ' 7Z1_ -,s   B>I
I*c                 d   U R                   R                  (       a  U R                  U5        [        R                  R                  5       (       a  [        R                  R                  5         g [        R                  R                  5       (       a  [        R                  R                  5         g g r   )r   r+   remove_quantization_configr,   r-   r.   empty_cacher/   )r   rG   r   s      r   #_process_model_after_weight_loading4Mxfp4HfQuantizer._process_model_after_weight_loading  sj    ##..++E2::""$$JJ""$YY##%%II!!# &r   expected_keyscheckpoint_keysc                    / nU GHh  nUR                  S5      (       a9  US [        S5      *  nUR                  US-   5        UR                  US-   5        MS  UR                  S5      (       a9  US [        S5      *  nUR                  US-   5        UR                  US-   5        M  U R                  (       d  UR                  S	5      (       a%  US [        S5      *  nUR                  US-   5        M  UR                  S
5      (       a&  US [        S5      *  nUR                  US-   5        GM*  UR                  S5      (       a  GMC  UR                  U5        GMW  UR                  U5        GMk     U$ )Nz.mlp.experts.gate_up_projr`   gate_up_proj_blocksgate_up_proj_scalesz.mlp.experts.down_projra   down_proj_blocksdown_proj_scalesz.mlp.experts.down_proj_blocksz .mlp.experts.gate_up_proj_blocksrO   )endswithrU   appendr0   )r   rG   r   r   new_expected_keyskeybases          r   update_expected_keys%Mxfp4HfQuantizer.update_expected_keys  s]    C||7881c.112!((0E)EF!((0E)EF677.c+../!((0B)BC!((0B)BC''<< ?@@9#&8"9!9:D%,,TK-?@\\"DEE<#&;"<!<=D%,,TN-BC\\(++%,,S1!((-/ !0 ! r   keep_in_fp32_modulesc                 l   SSK Jn  U R                  XR                  R                  U5      U l        UR                  SS5      nU(       a&  [        R                  S5        SU R                  l        UR                  nU" UU R                  U R                  US9nU R                  UR                  l        g )Nr   )replace_with_mxfp4_linearuse_kernelsFzYou are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=FalseT)modules_to_not_convertr   config)
rS   r   get_modules_to_not_convertr   r   r6   r1   r2   r+   r   )r   rG   r   r   r   r   r   s          r   $_process_model_before_weight_loading5Mxfp4HfQuantizer._process_model_before_weight_loading(  s     	=&*&E&E++BBDX'
# jj6e 37D$$/)#'#>#> $ 8 8	
 ,0+C+C(r   missing_keysprefixc                 \   SSK Jn  / nUR                  5        Hr  u  pg[        Xt5      (       d  M  U HU  nXh;   d  Xc SU 3;   d  M  UR	                  S5      (       a  M,  UR	                  S5      (       a  MD  UR                  U5        MW     Mt     U V	s/ s H  oU;  d  M
  U	PM     sn	$ s  sn	f )Nr   rJ   .z.weightz.bias)rS   rK   named_modulesr7   r   r   )
r   rG   r   r   rK   not_missing_keysnamerV   missingks
             r   update_missing_keys$Mxfp4HfQuantizer.update_missing_keysG  s    5!//1LD&55+GDhay4I,I ' 0 0 ; ; ' 0 0 9 9(//8  , 2 (E<a4D+D<EEEs   	B) B)c                     SUR                   R                  ;   a.  [        USS 5      b   UR                  R	                  SSSSS.5        U$ )NGptOssConfigbase_model_tp_plangrouped_gemmz(layers.*.mlp.experts.gate_up_proj_blocksz(layers.*.mlp.experts.gate_up_proj_scalesz%layers.*.mlp.experts.down_proj_blocksz%layers.*.mlp.experts.down_proj_scales)r   __name__getattrr   updater   r   s     r   update_tp_planMxfp4HfQuantizer.update_tp_planV  R    V--666v3T:F))00DRDRAOAO	 r   c                     SUR                   R                  ;   a.  [        USS 5      b   UR                  R	                  SSSSS.5        U$ )Nr   base_model_ep_planr   r   )r   r   r   r   r   r   s     r   update_ep_planMxfp4HfQuantizer.update_ep_planc  r   r   c                 d   U R                   R                  (       a3  SU;   a  UR                  SS5      $ SU;   a  UR                  SS5      $  U$ U R                  (       dP  UR	                  S5      (       a  UR                  SS5      $ UR	                  S5      (       a  UR                  SS5      $ U$ )NrP    rf   r`   r   ra   r   )r   r+   replacer0   r   )r   rH   s     r   get_param_nameMxfp4HfQuantizer.get_param_namep  s    ##..J&!)))R88j(!)))R88 )  ##"">22!)).:OPP"";//!))+7IJJr   safe_serializationc                    SSK Jn  UR                  5       nUR                  5        GH  u  pV[	        Xc5      (       d  M  [        US5      (       d  M+  [        US5      (       d  M>  UR                  R                  R                  R                  UR                  R                  R                  5      R                  SS5      R                  SSSS	5      XE S
3'   UR                  R                  R                  R                  R                  UR                  R                  R                  R                  5      R                  SS5      XE S3'   UR                  R                  R                  R                  UR                  R                  R                  5      R                  SS5      R                  SSSS5      XE S3'   UR                   R                  R                  R                  R                  UR                   R                  R                  R                  5      R                  SS5      XE S3'   GM     0 nXG4$ )Nr   rJ   r`   ra       Z      z.gate_up_proj_blocksz.gate_up_proj_scalesi@  z.down_proj_blocksz.down_proj_scales)rS   rK   
state_dictr   r7   hasattrr`   storagelayoutunswizzle_datadata	transposereshapegate_up_proj_precision_configrd   ra   down_proj_precision_config)r   rG   r   rK   r   r   rV   metadatas           r   get_state_dict_and_metadata,Mxfp4HfQuantizer.get_state_dict_and_metadata}  s   5%%'
!//1LD666FN33FK00 ''//66EEfFYFYFaFaFfFfgYr2&WRR, V#789 88EEMMTTcc<<IIQQVViB' V#789 $$,,33BB6CSCSC[C[C`C`aYr2&WRr2. V#456 55BBJJQQ``99FFNNSSiB' V#456+ 26 ##r   c                     g)NT )r   r   s     r   is_serializable Mxfp4HfQuantizer.is_serializable  s    r   c                 .    [         R                  S5        g)NzMXFP4 quantization don't support training, please consider dequantizing the model first by passing quantization_config=Mxfp4Config(dequantize=True) to .from_pretrained()F)r1   r2   )r   s    r   is_trainableMxfp4HfQuantizer.is_trainable  s     x	
 r   )r   r   r   )rD   torch.dtyper@   r   )rG   r   r   )F)r   
__module____qualname____firstlineno____doc__ requires_parameters_quantizationrequires_calibrationrequired_packagesr   r#   r>   rE   strboolrX   rx   r}   listr   r   r   r   r   r   r   r   r   propertyr   __static_attributes____classcell__)r   s   @r   r   r   '   sI    (,$ %'
	'M^
.? S _c "R R $R 	R
 &Rh$!*; !DQTI !hlmphq !@ 59D D 'tCy1D>FtCy F# FRVWZR[ F  !$T !$F d  r   r   )typingr   r   r   r   modeling_utilsr   utilsr	   r
   r   r   r   quantizers_utilsr   r,   
get_loggerr   r1   r   r   r   r   r   <module>r      sV    +  0  3 			H	% A{ Ar   