
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Optional, Union

from ..utils import is_torch_available, logging
from ..utils.quantization_config import QuantizationConfigMixin, QuantizationMethod
from .quantizers_utils import get_module_from_name


if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

if is_torch_available():
    import torch
    from torch.nn import ModuleList
else:
    ModuleList = object

logger = logging.get_logger(__name__)


class HfQuantizer(ABC):
    """
    Abstract class of the HuggingFace quantizer. For now it supports quantizing HF transformers models for inference
    and/or quantization. This class is used only by `transformers.PreTrainedModel.from_pretrained` and cannot easily
    be used outside the scope of that method yet.

    Attributes
        quantization_config (`transformers.utils.quantization_config.QuantizationConfigMixin`):
            The quantization config that defines the quantization parameters of the model you want to quantize.
        modules_to_not_convert (`list[str]`, *optional*):
            The list of module names to not convert when quantizing the model.
        required_packages (`list[str]`, *optional*):
            The list of required pip packages to install prior to using the quantizer.
        requires_calibration (`bool`):
            Whether the quantization method requires calibrating the model before using it.
        requires_parameters_quantization (`bool`):
            Whether the quantization method requires creating a new Parameter. For example, for bitsandbytes, it is
            required to create a new xxxParameter in order to properly quantize the model.
    """

    requires_calibration = False
    required_packages = None
    requires_parameters_quantization = False

    def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
        self.quantization_config = quantization_config

        # Handle extra kwargs
        self.modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])
        self.pre_quantized = kwargs.pop("pre_quantized", True)

        if not self.pre_quantized and self.requires_calibration:
            raise ValueError(
                f"The quantization method {quantization_config.quant_method} does require the model to be pre-quantized."
                f" You explicitly passed `pre_quantized=False` meaning your model weights are not quantized. Make sure "
                f"to pass `pre_quantized=True` while knowing what you are doing."
            )

    def update_torch_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
        """
        Deprecated in favor of `update_dtype`!

        Args:
            dtype (`torch.dtype`):
                The input dtype that is passed in `from_pretrained`
        """
        logger.warning_once(
            "`update_torch_dtype` is deprecated in favor of `update_dtype`! It will be removed in version v4.57"
        )
        return self.update_dtype(dtype)

    def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
        """
        Some quantization methods require explicitly setting the dtype of the model to a target dtype. Override this
        method if you need to make sure that behavior is preserved.

        Args:
            dtype (`torch.dtype`):
                The input dtype that is passed in `from_pretrained`
        """
        return dtype

    def update_device_map(self, device_map: Optional[dict[str, Any]]) -> Optional[dict[str, Any]]:
        """
        Override this method if you want to override the existing device map with a new one. E.g. for bitsandbytes,
        since `accelerate` is a hard requirement, if no device_map is passed, the device_map is set to `"auto"`.

        Args:
            device_map (`Union[dict, str]`, *optional*):
                The device_map that is passed through the `from_pretrained` method.
        """
        return device_map

    def adjust_target_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
        """
        Override this method if you want to adjust the `target_dtype` variable used in `from_pretrained` to compute
        the device_map in case the device_map is a `str`. E.g. for bitsandbytes we force-set `target_dtype` to
        `torch.int8` and for 4-bit we pass a custom enum `accelerate.CustomDtype.int4`.

        Args:
            dtype (`torch.dtype`, *optional*):
                The dtype that is used to compute the device_map.
        """
        return dtype

    def update_missing_keys(self, model, missing_keys: list[str], prefix: str) -> list[str]:
        """
        Override this method if you want to adjust the `missing_keys`.

        Args:
            missing_keys (`list[str]`, *optional*):
                The list of missing keys in the checkpoint compared to the state dict of the model
        """
        return missing_keys

    def update_expected_keys(self, model, expected_keys: list[str], loaded_keys: list[str]) -> list[str]:
        """
        Override this method if you want to adjust the `expected_keys`.

        Args:
            expected_keys (`list[str]`, *optional*):
                The list of the expected keys in the initialized model.
            loaded_keys (`list[str]`, *optional*):
                The list of the loaded keys in the checkpoint.
        """
        return expected_keys

    def update_unexpected_keys(self, model, unexpected_keys: list[str]) -> list[str]:
        return unexpected_keys

    def get_special_dtypes_update(self, model, dtype: "torch.dtype") -> dict[str, "torch.dtype"]:
        """
        Returns dtypes for modules that are not quantized - used for the computation of the device_map in case one
        passes a str as a device_map. The method will use the `modules_to_not_convert` that is modified in
        `_process_model_before_weight_loading`.

        Args:
            model (`~transformers.PreTrainedModel`):
                The model to quantize
            dtype (`torch.dtype`):
                The dtype passed in `from_pretrained` method.
        """
        return {
            name: dtype
            for name, _ in model.named_parameters()
            if any(m in name for m in self.modules_to_not_convert)
        }

    def adjust_max_memory(self, max_memory: dict[str, Union[int, str]]) -> dict[str, Union[int, str]]:
        """adjust max_memory argument for infer_auto_device_map() if extra memory is needed for quantization"""
        return max_memory

    def check_quantized_param(self, *args, **kwargs) -> bool:
        """DEPRECATED -> remove in v5"""
        logger.warning_once(
            "`check_quantized_param` is deprecated in favor of `param_needs_quantization`, which is a much more "
            "self-explanatory name for what the method achieves. It will be removed in v5"
        )
        return self.param_needs_quantization(*args, **kwargs)

    def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
        """
        Check whether a given param needs quantization as defined by `create_quantized_param`.
        """
        return False

    def create_quantized_param(self, *args, **kwargs):
        """
        Take the needed components from the state_dict (those for which `param_needs_quantization` is True) and
        create a quantized param. It usually also loads the new param directly in the `model`.
        Note: only applicable if requires_parameters_quantization == True.
        """
        if not self.requires_parameters_quantization:
            raise AttributeError(
                f"`.create_quantized_param()` method is not supported by quantizer class {self.__class__.__name__}."
            )

    def validate_environment(self, *args, **kwargs):
        """
        This method is used to check for potential conflicts with arguments that are passed in `from_pretrained`.
        You need to define it for all future quantizers that are integrated with transformers. If no explicit check
        is needed, simply return nothing.
        """
        return

    def update_tp_plan(self, config):
        """updates the tp plan for the scales"""
        return config

    def update_ep_plan(self, config):
        """updates the ep plan for the scales"""
        return config

    def preprocess_model(self, model: "PreTrainedModel", **kwargs):
        """
        Setting model attributes and/or converting the model before weights loading. At this point the model should
        be initialized on the meta device so you can freely manipulate the skeleton of the model in order to replace
        modules in-place. Make sure to override the abstract method `_process_model_before_weight_loading`.

        Args:
            model (`~transformers.PreTrainedModel`):
                The model to quantize
            kwargs (`dict`, *optional*):
                The keyword arguments that are passed along `_process_model_before_weight_loading`.
        """
        model.is_quantized = True
        model.quantization_method = self.quantization_config.quant_method
        if self.pre_quantized:
            self._convert_model_for_quantization(model)
        return self._process_model_before_weight_loading(model, **kwargs)

    def postprocess_model(self, model: "PreTrainedModel", **kwargs):
        """
        Post-process the model after weights loading. Make sure to override the abstract method
        `_process_model_after_weight_loading`.

        Args:
            model (`~transformers.PreTrainedModel`):
                The model to quantize
            kwargs (`dict`, *optional*):
                The keyword arguments that are passed along `_process_model_after_weight_loading`.
        """
        return self._process_model_after_weight_loading(model, **kwargs)

    def remove_quantization_config(self, model: "PreTrainedModel") -> "PreTrainedModel":
        """
        Remove the quantization config from the model.
        """
        if hasattr(model, "hf_quantizer"):
            del model.hf_quantizer
        if hasattr(model.config, "quantization_config"):
            del model.config.quantization_config
        if hasattr(model.config, "_pre_quantization_dtype"):
            del model.config._pre_quantization_dtype
        if hasattr(model, "quantization_method"):
            del model.quantization_method
        model.is_quantized = False
        return model

    def dequantize(self, model):
        """
        Potentially dequantize the model to retrieve the original model, with some loss in accuracy / performance.
        Note that not all quantization schemes support this.
        """
        model = self._dequantize(model)

        # Delete quantizer and quantization config
        del model.hf_quantizer
        del model.config.quantization_config
        del model.config._pre_quantization_dtype
        del model.quantization_method
        model.is_quantized = False

        return model

    def get_accelerator_warm_up_factor(self):
        """
        The factor to be used in `caching_allocator_warmup` to get the number of bytes to pre-allocate to warm up the
        accelerator. A factor of 2 means we allocate all bytes in the empty model (since we allocate in fp16), a
        factor of 4 means we allocate half the memory of the weights residing in the empty model, etc...
        """
        return 4

    def _dequantize(self, model):
        raise NotImplementedError(
            f"{self.quantization_config.quant_method} has no implementation of `dequantize`, please raise an issue on GitHub."
        )

    def get_param_name(self, param_name: str) -> str:
        """
        Override this method if you want to adjust the `param_name`.
        """
        return param_name

    @staticmethod
    def get_modules_to_not_convert(
        model: "PreTrainedModel",
        skip_modules: Optional[list[str]] = None,
        keep_in_fp32_modules: Optional[list[str]] = None,
        add_default_skips: bool = False,
    ):
        from ..integrations import get_keys_to_not_convert

        if skip_modules is None or add_default_skips:
            modules_to_not_convert = get_keys_to_not_convert(model)
        else:
            modules_to_not_convert = []

        if skip_modules is not None:
            modules_to_not_convert.extend(skip_modules)
        if keep_in_fp32_modules is not None:
            modules_to_not_convert.extend(keep_in_fp32_modules)

        return modules_to_not_convert

    @property
    def is_qat_trainable(self) -> bool:
        """Flag indicating whether the quantized model can carry out quantization aware training"""
        return False

    @property
    def is_compileable(self) -> bool:
        """Flag indicating whether the quantized model can be compiled"""
        return False

    def get_state_dict_and_metadata(self, model, safe_serialization=False):
        """Get state dict and metadata. Useful when we need to modify the state dict a bit due to quantization"""
        return None, {}

    def update_state_dict_with_metadata(self, state_dict, metadata):
        """Update state dict with metadata. Default behaviour returns state_dict"""
        return state_dict

    @abstractmethod
    def _process_model_before_weight_loading(self, model, **kwargs): ...

    @abstractmethod
    def _process_model_after_weight_loading(self, model, **kwargs): ...

    @abstractmethod
    def is_serializable(self, safe_serialization=None): ...

    @property
    @abstractmethod
    def is_trainable(self): ...

    def _convert_model_for_quantization(self, model):
        from accelerate import init_empty_weights

        for name, module in model.named_modules():
            module_class_name = module.__class__.__name__
            if (
                module_class_name in MODULES_TO_PATCH_FOR_QUANTIZATION
                and self.quantization_config.quant_method
                in MODULES_TO_PATCH_FOR_QUANTIZATION[module_class_name]["quantization_methods"]
            ):
                with init_empty_weights():
                    parent_module, name = get_module_from_name(model, name)
                    parent_module._modules[name] = MODULES_TO_PATCH_FOR_QUANTIZATION[module_class_name][
                        "module_name"
                    ](model.config.get_text_config())


class SequentialLlama4TextExperts(ModuleList):
    """
    A module that implements a compressed version of a list of expert modules.
    This is specifically designed to work with Llama4TextExperts in MoE layers.
    """

    def __init__(self, config):
        from transformers.models.llama4.modeling_llama4 import Llama4TextMLP

        super().__init__([Llama4TextMLP(config) for _ in range(config.num_local_experts)])
        self.num_experts = config.num_local_experts

    def forward(
        self,
        hidden_states: "torch.Tensor",
    ) -> "torch.Tensor":
        hidden_states = hidden_states.reshape(self.num_experts, -1, hidden_states.shape[-1])
        routed_out = torch.zeros_like(hidden_states)
        for expert_idx in range(self.num_experts):
            routed_out[expert_idx] = self[expert_idx](hidden_states[expert_idx])
        return routed_out


MODULES_TO_PATCH_FOR_QUANTIZATION = {
    "Llama4TextExperts": {
        "module_name": SequentialLlama4TextExperts,
        "quantization_methods": [QuantizationMethod.COMPRESSED_TENSORS, QuantizationMethod.BITS_AND_BYTES],
    }
}
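

# Illustrative sketch only -- NOT part of the transformers API. `ExampleNoOpQuantizer` is a
# hypothetical name showing roughly what a concrete backend has to implement on top of
# `HfQuantizer`: the two weight-loading hooks plus the serializability/trainability flags.
# Real integrations (bitsandbytes, compressed-tensors, ...) additionally patch modules and
# implement `param_needs_quantization` / `create_quantized_param`.
class ExampleNoOpQuantizer(HfQuantizer):
    requires_calibration = False

    def validate_environment(self, *args, **kwargs):
        # A real backend would verify its pip dependencies here.
        if not is_torch_available():
            raise ImportError("This example quantizer requires torch to be installed.")

    def _process_model_before_weight_loading(self, model, **kwargs):
        # The model is still on the meta device at this point; module replacement would happen now.
        self.modules_to_not_convert = self.get_modules_to_not_convert(model)
        return model

    def _process_model_after_weight_loading(self, model, **kwargs):
        # Nothing to finalize for this no-op example.
        return model

    def is_serializable(self, safe_serialization=None):
        return True

    @property
    def is_trainable(self):
        return False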