
from bitsandbytes.optim.optimizer import Optimizer2State


class AdamW(Optimizer2State):
    def __init__(
        self,
        params,
        lr=1e-3,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=1e-2,
        amsgrad=False,
        optim_bits=32,
        args=None,
        min_8bit_size=4096,
        percentile_clipping=100,
        block_wise=True,
        is_paged=False,
    ):
        """
Base AdamW optimizer.

Arguments:
    params (`torch.Tensor`):
        The input parameters to optimize.
    lr (`float`, defaults to 1e-3):
        The learning rate.
    betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
        The beta values are the decay rates of the first and second-order moment of the optimizer.
    eps (`float`, defaults to 1e-8):
        The epsilon value prevents division by zero in the optimizer.
    weight_decay (`float`, defaults to 1e-2):
        The weight decay value for the optimizer.
    amsgrad (`bool`, defaults to `False`):
        Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
    optim_bits (`int`, defaults to 32):
        The number of bits of the optimizer state.
    args (`object`, defaults to `None`):
        An object with additional arguments.
    min_8bit_size (`int`, defaults to 4096):
        The minimum number of elements of the parameter tensors for 8-bit optimization.
    percentile_clipping (`int`, defaults to 100):
        Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
    block_wise (`bool`, defaults to `True`):
        Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
    is_paged (`bool`, defaults to `False`):
        Whether the optimizer is a paged optimizer or not.
adamis_pagedNsuper__init__selfparamslrbetasepsweight_decayamsgrad
optim_bitsargsmin_8bit_sizepercentile_clipping
block_wiser
   	__class__s                h/home/dmtnaga/Documents/work/airagagent/rag_env/lib/python3.13/site-packages/bitsandbytes/optim/adamw.pyr   AdamW.__init__
   s<    X 	 	 	
     MbP?g?g+?:0yE>{Gz?F    N   d   TF__name__
__module____qualname____firstlineno__r   __static_attributes____classcell__r   s   @r   r   r   	   0     9
 9
r   r   c                   B   ^  \ rS rSr           SU 4S jjrSrU =r$ )	AdamW8bitF   c                 |   > U(       a  [        S5      eUS:w  a  [        S5      e[        TU ]	  SUUUUUSUU	U
UUS9  g)a  
8-bit AdamW optimizer.

Arguments:
    params (`torch.Tensor`):
        The input parameters to optimize.
    lr (`float`, defaults to 1e-3):
        The learning rate.
    betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
        The beta values are the decay rates of the first and second-order moment of the optimizer.
    eps (`float`, defaults to 1e-8):
        The epsilon value prevents division by zero in the optimizer.
    weight_decay (`float`, defaults to 1e-2):
        The weight decay value for the optimizer.
    amsgrad (`bool`, defaults to `False`):
        Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
        Note: This parameter is not supported in AdamW8bit and must be False.
    optim_bits (`int`, defaults to 32):
        The number of bits of the optimizer state.
        Note: This parameter is not used in AdamW8bit as it always uses 8-bit optimization.
    args (`object`, defaults to `None`):
        An object with additional arguments.
    min_8bit_size (`int`, defaults to 4096):
        The minimum number of elements of the parameter tensors for 8-bit optimization.
    percentile_clipping (`int`, defaults to 100):
        Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
    block_wise (`bool`, defaults to `True`):
        Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
    is_paged (`bool`, defaults to `False`):
        Whether the optimizer is a paged optimizer or not.
        """
        if amsgrad:
            raise ValueError("AdamW8bit does not support amsgrad=True")
        if optim_bits != 32:
            raise ValueError("AdamW8bit only supports optim_bits=32 (default value for compatibility)")
        super().__init__(
            "adam",
            params,
            lr,
            betas,
            eps,
            weight_decay,
            8,
            args,
            min_8bit_size,
            percentile_clipping,
            block_wise,
            is_paged=is_paged,
        )
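
# Usage sketch for `AdamW8bit` (illustrative only): the model below is an
# assumption. Parameters with fewer than `min_8bit_size` elements keep 32-bit
# state, so small tensors such as biases and norm weights are not quantized.
#
#     import torch
#     import bitsandbytes as bnb
#
#     model = torch.nn.Sequential(
#         torch.nn.Linear(4096, 4096), torch.nn.GELU(), torch.nn.Linear(4096, 4096)
#     ).cuda()
#     optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=1e-4, min_8bit_size=4096)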
r   r2   c                   B   ^  \ rS rSr           SU 4S jjrSrU =r$ )
AdamW32bit   c                 6   > [         TU ]  SUUUUUSUU	U
UUS9  g)a  
32-bit AdamW optimizer.

Arguments:
    params (`torch.Tensor`):
        The input parameters to optimize.
    lr (`float`, defaults to 1e-3):
        The learning rate.
    betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
        The beta values are the decay rates of the first and second-order moment of the optimizer.
    eps (`float`, defaults to 1e-8):
        The epsilon value prevents division by zero in the optimizer.
    weight_decay (`float`, defaults to 1e-2):
        The weight decay value for the optimizer.
    amsgrad (`bool`, defaults to `False`):
        Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
    optim_bits (`int`, defaults to 32):
        The number of bits of the optimizer state.
    args (`object`, defaults to `None`):
        An object with additional arguments.
    min_8bit_size (`int`, defaults to 4096):
        The minimum number of elements of the parameter tensors for 8-bit optimization.
    percentile_clipping (`int`, defaults to 100):
        Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
    block_wise (`bool`, defaults to `True`):
        Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
    is_paged (`bool`, defaults to `False`):
        Whether the optimizer is a paged optimizer or not.
r   r%   r	   Nr   r   s                r   r   AdamW32bit.__init__   s<    X 	 	 	
r   r   r    r(   r/   s   @r   r:   r:      r0   r   r:   c                   @   ^  \ rS rSr          SU 4S jjrSrU =r$ )


class PagedAdamW(Optimizer2State):
    def __init__(
        self,
        params,
        lr=1e-3,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=1e-2,
        amsgrad=False,
        optim_bits=32,
        args=None,
        min_8bit_size=4096,
        percentile_clipping=100,
        block_wise=True,
    ):
        """
Paged AdamW optimizer.

Arguments:
    params (`torch.Tensor`):
        The input parameters to optimize.
    lr (`float`, defaults to 1e-3):
        The learning rate.
    betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
        The beta values are the decay rates of the first and second-order moment of the optimizer.
    eps (`float`, defaults to 1e-8):
        The epsilon value prevents division by zero in the optimizer.
    weight_decay (`float`, defaults to 1e-2):
        The weight decay value for the optimizer.
    amsgrad (`bool`, defaults to `False`):
        Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
    optim_bits (`int`, defaults to 32):
        The number of bits of the optimizer state.
    args (`object`, defaults to `None`):
        An object with additional arguments.
    min_8bit_size (`int`, defaults to 4096):
        The minimum number of elements of the parameter tensors for 8-bit optimization.
    percentile_clipping (`int`, defaults to 100):
        Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
    block_wise (`bool`, defaults to `True`):
        Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
r   Tr	   Nr   r   r   r   r   r   r   r   r   r   r   r   r   r   s               r   r   PagedAdamW.__init__   s<    R 	 	 	
r   r   
r!   r"   r#   r$   Fr%   Nr&   r'   Tr(   r/   s   @r   r?   r?      -     6
 6
r   r?   c                   @   ^  \ rS rSr          SU 4S jjrSrU =r$ )PagedAdamW8biti  c                 |   > U(       a  [        S5      eUS:w  a  [        S5      e[        TU ]	  SUUUUUSUU	U
USS9  g)	a/  
Paged 8-bit AdamW optimizer.

Arguments:
    params (`torch.Tensor`):
        The input parameters to optimize.
    lr (`float`, defaults to 1e-3):
        The learning rate.
    betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
        The beta values are the decay rates of the first and second-order moment of the optimizer.
    eps (`float`, defaults to 1e-8):
        The epsilon value prevents division by zero in the optimizer.
    weight_decay (`float`, defaults to 1e-2):
        The weight decay value for the optimizer.
    amsgrad (`bool`, defaults to `False`):
        Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
        Note: This parameter is not supported in PagedAdamW8bit and must be False.
    optim_bits (`int`, defaults to 32):
        The number of bits of the optimizer state.
        Note: This parameter is not used in PagedAdamW8bit as it always uses 8-bit optimization.
    args (`object`, defaults to `None`):
        An object with additional arguments.
    min_8bit_size (`int`, defaults to 4096):
        The minimum number of elements of the parameter tensors for 8-bit optimization.
    percentile_clipping (`int`, defaults to 100):
        Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
    block_wise (`bool`, defaults to `True`):
        Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
        """
        if amsgrad:
            raise ValueError("PagedAdamW8bit does not support amsgrad=True")
        if optim_bits != 32:
            raise ValueError("PagedAdamW8bit only supports optim_bits=32 (default value for compatibility)")
        super().__init__(
            "adam",
            params,
            lr,
            betas,
            eps,
            weight_decay,
            8,
            args,
            min_8bit_size,
            percentile_clipping,
            block_wise,
            is_paged=True,
        )
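
# Usage sketch for `PagedAdamW8bit` (illustrative only): the model is an
# assumption. `percentile_clipping` tracks the last 100 gradient norms and
# clips outliers at the given percentile, which can stabilize 8-bit training.
#
#     import torch
#     import bitsandbytes as bnb
#
#     model = torch.nn.Linear(4096, 4096).cuda()
#     optimizer = bnb.optim.PagedAdamW8bit(model.parameters(), lr=1e-4, percentile_clipping=95)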
r   rG   c                   @   ^  \ rS rSr          SU 4S jjrSrU =r$ )PagedAdamW32bitiJ  c                 6   > [         TU ]  SUUUUUSUU	U
USS9  g)a|  
Paged 32-bit AdamW optimizer.

Arguments:
    params (`torch.Tensor`):
        The input parameters to optimize.
    lr (`float`, defaults to 1e-3):
        The learning rate.
    betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
        The beta values are the decay rates of the first and second-order moment of the optimizer.
    eps (`float`, defaults to 1e-8):
        The epsilon value prevents division by zero in the optimizer.
    weight_decay (`float`, defaults to 1e-2):
        The weight decay value for the optimizer.
    amsgrad (`bool`, defaults to `False`):
        Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
    optim_bits (`int`, defaults to 32):
        The number of bits of the optimizer state.
    args (`object`, defaults to `None`):
        An object with additional arguments.
    min_8bit_size (`int`, defaults to 4096):
        The minimum number of elements of the parameter tensors for 8-bit optimization.
    percentile_clipping (`int`, defaults to 100):
        Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
    block_wise (`bool`, defaults to `True`):
        Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
r   r%   Tr	   Nr   rB   s               r   r   PagedAdamW32bit.__init__K  s<    R 	 	 	
r   r   rD   r(   r/   s   @r   rK   rK   J  rE   r   rK   N)bitsandbytes.optim.optimizerr   r   r2   r:   r?   rG   rK   r   r   r   <module>rO      s[    9:
O :
zE
 E
P:
 :
z7
 7
tB
_ B
J7
o 7
r   
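
# Usage sketch for `PagedAdamW32bit` (illustrative only): the model is an
# assumption. This variant keeps full 32-bit optimizer state but still
# allocates it as paged memory, so it trades the 8-bit memory savings for the
# paging behaviour alone.
#
#     import torch
#     import bitsandbytes as bnb
#
#     model = torch.nn.Linear(4096, 4096).cuda()
#     optimizer = bnb.optim.PagedAdamW32bit(model.parameters(), lr=1e-3)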