
from bitsandbytes.optim.optimizer import Optimizer2State


class AdamW(Optimizer2State):
    def __init__(
        self,
        params,
        lr=1e-3,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=1e-2,
        amsgrad=False,
        optim_bits=32,
        args=None,
        min_8bit_size=4096,
        percentile_clipping=100,
        block_wise=True,
        is_paged=False,
    ):
        """
Base AdamW optimizer.

Arguments:
    params (`torch.Tensor`):
        The input parameters to optimize.
    lr (`float`, defaults to 1e-3):
        The learning rate.
    betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
        The beta values are the decay rates of the first and second-order moment of the optimizer.
    eps (`float`, defaults to 1e-8):
        The epsilon value prevents division by zero in the optimizer.
    weight_decay (`float`, defaults to 1e-2):
        The weight decay value for the optimizer.
    amsgrad (`bool`, defaults to `False`):
        Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
    optim_bits (`int`, defaults to 32):
        The number of bits of the optimizer state.
    args (`object`, defaults to `None`):
        An object with additional arguments.
    min_8bit_size (`int`, defaults to 4096):
        The minimum number of elements of the parameter tensors for 8-bit optimization.
    percentile_clipping (`int`, defaults to 100):
        Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
    block_wise (`bool`, defaults to `True`):
        Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
    is_paged (`bool`, defaults to `False`):
        Whether the optimizer is a paged optimizer or not.
adamis_pagedNsuper__init__selfparamslrbetasepsweight_decayamsgrad
optim_bitsargsmin_8bit_sizepercentile_clipping
block_wiser
   	__class__s                h/home/dmtnaga/Documents/work/airagagent/rag_env/lib/python3.13/site-packages/bitsandbytes/optim/adamw.pyr   AdamW.__init__
   s<    X 	 	 	
     MbP?g?g+?:0yE>{Gz?F    N   d   TF__name__
__module____qualname____firstlineno__r   __static_attributes____classcell__r   s   @r   r   r   	   0     9
 9
r   r   c                   B   ^  \ rS rSr           SU 4S jjrSrU =r$ )	AdamW8bitF   c                 |   > U(       a  [        S5      eUS:w  a  [        S5      e[        TU ]	  SUUUUUSUU	U
UUS9  g)a  
8-bit AdamW optimizer.

Arguments:
    params (`torch.Tensor`):
        The input parameters to optimize.
    lr (`float`, defaults to 1e-3):
        The learning rate.
    betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
        The beta values are the decay rates of the first and second-order moment of the optimizer.
    eps (`float`, defaults to 1e-8):
        The epsilon value prevents division by zero in the optimizer.
    weight_decay (`float`, defaults to 1e-2):
        The weight decay value for the optimizer.
    amsgrad (`bool`, defaults to `False`):
        Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
        Note: This parameter is not supported in AdamW8bit and must be False.
    optim_bits (`int`, defaults to 32):
        The number of bits of the optimizer state.
        Note: This parameter is not used in AdamW8bit as it always uses 8-bit optimization.
    args (`object`, defaults to `None`):
        An object with additional arguments.
    min_8bit_size (`int`, defaults to 4096):
        The minimum number of elements of the parameter tensors for 8-bit optimization.
    percentile_clipping (`int`, defaults to 100):
        Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
    block_wise (`bool`, defaults to `True`):
        Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
    is_paged (`bool`, defaults to `False`):
        Whether the optimizer is a paged optimizer or not.
        """
        if amsgrad:
            raise ValueError("AdamW8bit does not support amsgrad=True")
        if optim_bits != 32:
            raise ValueError("AdamW8bit only supports optim_bits=32 (default value for compatibility)")
        super().__init__(
            "adam",
            params,
            lr,
            betas,
            eps,
            weight_decay,
            8,
            args,
            min_8bit_size,
            percentile_clipping,
            block_wise,
            is_paged=is_paged,
        )
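
# Usage sketch for `AdamW8bit` (illustrative only): the model below is an
# assumption. Parameters with fewer than `min_8bit_size` elements keep 32-bit
# state, so small tensors such as biases and norm weights are not quantized.
#
#     import torch
#     import bitsandbytes as bnb
#
#     model = torch.nn.Sequential(
#         torch.nn.Linear(4096, 4096), torch.nn.GELU(), torch.nn.Linear(4096, 4096)
#     ).cuda()
#     optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=1e-4, min_8bit_size=4096)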
r   r2   c                   B   ^  \ rS rSr           SU 4S jjrSrU =r$ )
AdamW32bit   c                 6   > [         TU ]  SUUUUUSUU	U
UUS9  g)a  
32-bit AdamW optimizer.

Arguments:
    params (`torch.Tensor`):
        The input parameters to optimize.
    lr (`float`, defaults to 1e-3):
        The learning rate.
    betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
        The beta values are the decay rates of the first and second-order moment of the optimizer.
    eps (`float`, defaults to 1e-8):
        The epsilon value prevents division by zero in the optimizer.
    weight_decay (`float`, defaults to 1e-2):
        The weight decay value for the optimizer.
    amsgrad (`bool`, defaults to `False`):
        Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
    optim_bits (`int`, defaults to 32):
        The number of bits of the optimizer state.
    args (`object`, defaults to `None`):
        An object with additional arguments.
    min_8bit_size (`int`, defaults to 4096):
        The minimum number of elements of the parameter tensors for 8-bit optimization.
    percentile_clipping (`int`, defaults to 100):
        Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
    block_wise (`bool`, defaults to `True`):
        Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
    is_paged (`bool`, defaults to `False`):
        Whether the optimizer is a paged optimizer or not.
r   r%   r	   Nr   r   s                r   r   AdamW32bit.__init__   s<    X 	 	 	
r   r   r    r(   r/   s   @r   r:   r:      r0   r   r:   c                   @   ^  \ rS rSr          SU 4S jjrSrU =r$ )


class PagedAdamW(Optimizer2State):
    def __init__(
        self,
        params,
        lr=1e-3,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=1e-2,
        amsgrad=False,
        optim_bits=32,
        args=None,
        min_8bit_size=4096,
        percentile_clipping=100,
        block_wise=True,
    ):
        """
Paged AdamW optimizer.

Arguments:
    params (`torch.Tensor`):
        The input parameters to optimize.
    lr (`float`, defaults to 1e-3):
        The learning rate.
    betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
        The beta values are the decay rates of the first and second-order moment of the optimizer.
    eps (`float`, defaults to 1e-8):
        The epsilon value prevents division by zero in the optimizer.
    weight_decay (`float`, defaults to 1e-2):
        The weight decay value for the optimizer.
    amsgrad (`bool`, defaults to `False`):
        Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
    optim_bits (`int`, defaults to 32):
        The number of bits of the optimizer state.
    args (`object`, defaults to `None`):
        An object with additional arguments.
    min_8bit_size (`int`, defaults to 4096):
        The minimum number of elements of the parameter tensors for 8-bit optimization.
    percentile_clipping (`int`, defaults to 100):
        Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
    block_wise (`bool`, defaults to `True`):
        Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
r   Tr	   Nr   r   r   r   r   r   r   r   r   r   r   r   r   r   s               r   r   PagedAdamW.__init__   s<    R 	 	 	
r   r   
r!   r"   r#   r$   Fr%   Nr&   r'   Tr(   r/   s   @r   r?   r?      -     6
 6
r   r?   c                   @   ^  \ rS rSr          SU 4S jjrSrU =r$ )PagedAdamW8biti  c                 |   > U(       a  [        S5      eUS:w  a  [        S5      e[        TU ]	  SUUUUUSUU	U
USS9  g)	a/  
Paged 8-bit AdamW optimizer.

Arguments:
    params (`torch.Tensor`):
        The input parameters to optimize.
    lr (`float`, defaults to 1e-3):
        The learning rate.
    betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
        The beta values are the decay rates of the first and second-order moment of the optimizer.
    eps (`float`, defaults to 1e-8):
        The epsilon value prevents division by zero in the optimizer.
    weight_decay (`float`, defaults to 1e-2):
        The weight decay value for the optimizer.
    amsgrad (`bool`, defaults to `False`):
        Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
        Note: This parameter is not supported in PagedAdamW8bit and must be False.
    optim_bits (`int`, defaults to 32):
        The number of bits of the optimizer state.
        Note: This parameter is not used in PagedAdamW8bit as it always uses 8-bit optimization.
    args (`object`, defaults to `None`):
        An object with additional arguments.
    min_8bit_size (`int`, defaults to 4096):
        The minimum number of elements of the parameter tensors for 8-bit optimization.
    percentile_clipping (`int`, defaults to 100):
        Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
    block_wise (`bool`, defaults to `True`):
        Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
        """
        if amsgrad:
            raise ValueError("PagedAdamW8bit does not support amsgrad=True")
        if optim_bits != 32:
            raise ValueError("PagedAdamW8bit only supports optim_bits=32 (default value for compatibility)")
        super().__init__(
            "adam",
            params,
            lr,
            betas,
            eps,
            weight_decay,
            8,
            args,
            min_8bit_size,
            percentile_clipping,
            block_wise,
            is_paged=True,
        )
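
# Usage sketch for `PagedAdamW8bit` (illustrative only): the model is an
# assumption. `percentile_clipping` tracks the last 100 gradient norms and
# clips outliers at the given percentile, which can stabilize 8-bit training.
#
#     import torch
#     import bitsandbytes as bnb
#
#     model = torch.nn.Linear(4096, 4096).cuda()
#     optimizer = bnb.optim.PagedAdamW8bit(model.parameters(), lr=1e-4, percentile_clipping=95)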
r   rG   c                   @   ^  \ rS rSr          SU 4S jjrSrU =r$ )PagedAdamW32bitiJ  c                 6   > [         TU ]  SUUUUUSUU	U
USS9  g)a|  
Paged 32-bit AdamW optimizer.

Arguments:
    params (`torch.Tensor`):
        The input parameters to optimize.
    lr (`float`, defaults to 1e-3):
        The learning rate.
    betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
        The beta values are the decay rates of the first and second-order moment of the optimizer.
    eps (`float`, defaults to 1e-8):
        The epsilon value prevents division by zero in the optimizer.
    weight_decay (`float`, defaults to 1e-2):
        The weight decay value for the optimizer.
    amsgrad (`bool`, defaults to `False`):
        Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
    optim_bits (`int`, defaults to 32):
        The number of bits of the optimizer state.
    args (`object`, defaults to `None`):
        An object with additional arguments.
    min_8bit_size (`int`, defaults to 4096):
        The minimum number of elements of the parameter tensors for 8-bit optimization.
    percentile_clipping (`int`, defaults to 100):
        Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
    block_wise (`bool`, defaults to `True`):
        Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
r   r%   Tr	   Nr   rB   s               r   r   PagedAdamW32bit.__init__K  s<    R 	 	 	
r   r   rD   r(   r/   s   @r   rK   rK   J  rE   r   rK   N)bitsandbytes.optim.optimizerr   r   r2   r:   r?   rG   rK   r   r   r   <module>rO      s[    9:
O :
zE
 E
P:
 :
z7
 7
tB
_ B
J7
o 7
r   
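
# Usage sketch for `PagedAdamW32bit` (illustrative only): the model is an
# assumption. This variant keeps full 32-bit optimizer state but still
# allocates it as paged memory, so it trades the 8-bit memory savings for the
# paging behaviour alone.
#
#     import torch
#     import bitsandbytes as bnb
#
#     model = torch.nn.Linear(4096, 4096).cuda()
#     optimizer = bnb.optim.PagedAdamW32bit(model.parameters(), lr=1e-3)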