
    ;i2>                         S SK Jr   " S S\5      r " S S\5      r " S S\5      r " S S	\5      r " S
 S\5      r " S S\5      rg)    )Optimizer2Statec                   B   ^  \ rS rSr           SU 4S jjrSrU =r$ )Adam	   c                 6   > [         TU ]  SUUUUUUUU	U
UUS9  g)a  
Base Adam optimizer.

Arguments:
    params (`torch.tensor`):
        The input parameters to optimize.
    lr (`float`, defaults to 1e-3):
        The learning rate.
    betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
        The beta values are the decay rates of the first and second-order moment of the optimizer.
    eps (`float`, defaults to 1e-8):
        The epsilon value prevents division by zero in the optimizer.
    weight_decay (`float`, defaults to 0.0):
        The weight decay value for the optimizer.
    amsgrad (`bool`, defaults to `False`):
        Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
    optim_bits (`int`, defaults to 32):
        The number of bits of the optimizer state.
    args (`object`, defaults to `None`):
        An object with additional arguments.
    min_8bit_size (`int`, defaults to 4096):
        The minimum number of elements of the parameter tensors for 8-bit optimization.
    percentile_clipping (`int`, defaults to 100):
        Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
    block_wise (`bool`, defaults to `True`):
        Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
    is_paged (`bool`, defaults to `False`):
        Whether the optimizer is a paged optimizer or not.
adamis_pagedNsuper__init__selfparamslrbetasepsweight_decayamsgrad
optim_bitsargsmin_8bit_sizepercentile_clipping
block_wiser
   	__class__s                g/home/dmtnaga/Documents/work/airagagent/rag_env/lib/python3.13/site-packages/bitsandbytes/optim/adam.pyr   Adam.__init__
   s<    X 	 	 	
     gMbP?)g?g+?g:0yE>r   F    Ni   d   TF__name__
__module____qualname____firstlineno__r   __static_attributes____classcell__r   s   @r   r   r   	   0     9
 9
r   r   c                   B   ^  \ rS rSr           SU 4S jjrSrU =r$ )Adam8bitF   c                 |   > U(       a  [        S5      eUS:w  a  [        S5      e[        TU ]	  SUUUUUSUU	U
UUS9  g)a  
8-bit Adam optimizer.

Arguments:
    params (`torch.tensor`):
        The input parameters to optimize.
    lr (`float`, defaults to 1e-3):
        The learning rate.
    betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
        The beta values are the decay rates of the first and second-order moment of the optimizer.
    eps (`float`, defaults to 1e-8):
        The epsilon value prevents division by zero in the optimizer.
    weight_decay (`float`, defaults to 0.0):
        The weight decay value for the optimizer.
    amsgrad (`bool`, defaults to `False`):
        Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
        Note: This parameter is not supported in Adam8bit and must be False.
    optim_bits (`int`, defaults to 32):
        The number of bits of the optimizer state.
        Note: This parameter is not used in Adam8bit as it always uses 8-bit optimization.
    args (`object`, defaults to `None`):
        An object with additional arguments.
    min_8bit_size (`int`, defaults to 4096):
        The minimum number of elements of the parameter tensors for 8-bit optimization.
    percentile_clipping (`int`, defaults to 100):
        Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
    block_wise (`bool`, defaults to `True`):
        Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
    is_paged (`bool`, defaults to `False`):
        Whether the optimizer is a paged optimizer or not.
z&Adam8bit does not support amsgrad=Truer!   zFAdam8bit only supports optim_bits=32 (default value for compatibility)r      r	   N
ValueErrorr   r   r   s                r   r   Adam8bit.__init__G   sa    ^ EFF eff 	 	
r   r   r    r#   r*   s   @r   r-   r-   F   2     D
 D
r   r-   c                   B   ^  \ rS rSr           SU 4S jjrSrU =r$ )	Adam32bit   c                 6   > [         TU ]  SUUUUUSUU	U
UUS9  g)a  
32-bit Adam optimizer.

Arguments:
    params (`torch.tensor`):
        The input parameters to optimize.
    lr (`float`, defaults to 1e-3):
        The learning rate.
    betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
        The beta values are the decay rates of the first and second-order moment of the optimizer.
    eps (`float`, defaults to 1e-8):
        The epsilon value prevents division by zero in the optimizer.
    weight_decay (`float`, defaults to 0.0):
        The weight decay value for the optimizer.
    amsgrad (`bool`, defaults to `False`):
        Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
    optim_bits (`int`, defaults to 32):
        The number of bits of the optimizer state.
    args (`object`, defaults to `None`):
        An object with additional arguments.
    min_8bit_size (`int`, defaults to 4096):
        The minimum number of elements of the parameter tensors for 8-bit optimization.
    percentile_clipping (`int`, defaults to 100):
        Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
    block_wise (`bool`, defaults to `True`):
        Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
    is_paged (`bool`, defaults to `False`):
        Whether the optimizer is a paged optimizer or not.
r   r!   r	   Nr   r   s                r   r   Adam32bit.__init__   s<    X 	 	 	
r   r   r    r#   r*   s   @r   r6   r6      r+   r   r6   c                   B   ^  \ rS rSr           SU 4S jjrSrU =r$ )	PagedAdam   c                 6   > [         TU ]  SUUUUUUUU	U
USS9  g)a  
Paged Adam optimizer.

Arguments:
    params (`torch.tensor`):
        The input parameters to optimize.
    lr (`float`, defaults to 1e-3):
        The learning rate.
    betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
        The beta values are the decay rates of the first and second-order moment of the optimizer.
    eps (`float`, defaults to 1e-8):
        The epsilon value prevents division by zero in the optimizer.
    weight_decay (`float`, defaults to 0.0):
        The weight decay value for the optimizer.
    amsgrad (`bool`, defaults to `False`):
        Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
    optim_bits (`int`, defaults to 32):
        The number of bits of the optimizer state.
    args (`object`, defaults to `None`):
        An object with additional arguments.
    min_8bit_size (`int`, defaults to 4096):
        The minimum number of elements of the parameter tensors for 8-bit optimization.
    percentile_clipping (`int`, defaults to 100):
        Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
    block_wise (`bool`, defaults to `True`):
        Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
    is_paged (`bool`, defaults to `False`):
        Whether the optimizer is a paged optimizer or not.
r   Tr	   Nr   r   s                r   r   PagedAdam.__init__   s<    X 	 	 	
r   r   r    r#   r*   s   @r   r;   r;      r+   r   r;   c                   B   ^  \ rS rSr           SU 4S jjrSrU =r$ )PagedAdam8biti  c                 |   > U(       a  [        S5      eUS:w  a  [        S5      e[        TU ]	  SUUUUUSUU	U
USS9  g)	a  
8-bit paged Adam optimizer.

Arguments:
    params (`torch.tensor`):
        The input parameters to optimize.
    lr (`float`, defaults to 1e-3):
        The learning rate.
    betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
        The beta values are the decay rates of the first and second-order moment of the optimizer.
    eps (`float`, defaults to 1e-8):
        The epsilon value prevents division by zero in the optimizer.
    weight_decay (`float`, defaults to 0.0):
        The weight decay value for the optimizer.
    amsgrad (`bool`, defaults to `False`):
        Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
        Note: This parameter is not supported in PagedAdam8bit and must be False.
    optim_bits (`int`, defaults to 32):
        The number of bits of the optimizer state.
        Note: This parameter is not used in PagedAdam8bit as it always uses 8-bit optimization.
    args (`object`, defaults to `None`):
        An object with additional arguments.
    min_8bit_size (`int`, defaults to 4096):
        The minimum number of elements of the parameter tensors for 8-bit optimization.
    percentile_clipping (`int`, defaults to 100):
        Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
    block_wise (`bool`, defaults to `True`):
        Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
    is_paged (`bool`, defaults to `False`):
        Whether the optimizer is a paged optimizer or not.
z+PagedAdam8bit does not support amsgrad=Truer!   zKPagedAdam8bit only supports optim_bits=32 (default value for compatibility)r   r0   Tr	   Nr1   r   s                r   r   PagedAdam8bit.__init__	  sa    ^ JKK jkk 	 	
r   r   r    r#   r*   s   @r   r@   r@     r4   r   r@   c                   B   ^  \ rS rSr           SU 4S jjrSrU =r$ )PagedAdam32bitiP  c                 6   > [         TU ]  SUUUUUSUU	U
USS9  g)a  
Paged 32-bit Adam optimizer.

Arguments:
    params (`torch.tensor`):
        The input parameters to optimize.
    lr (`float`, defaults to 1e-3):
        The learning rate.
    betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
        The beta values are the decay rates of the first and second-order moment of the optimizer.
    eps (`float`, defaults to 1e-8):
        The epsilon value prevents division by zero in the optimizer.
    weight_decay (`float`, defaults to 0.0):
        The weight decay value for the optimizer.
    amsgrad (`bool`, defaults to `False`):
        Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
    optim_bits (`int`, defaults to 32):
        The number of bits of the optimizer state.
    args (`object`, defaults to `None`):
        An object with additional arguments.
    min_8bit_size (`int`, defaults to 4096):
        The minimum number of elements of the parameter tensors for 8-bit optimization.
    percentile_clipping (`int`, defaults to 100):
        Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
    block_wise (`bool`, defaults to `True`):
        Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
    is_paged (`bool`, defaults to `False`):
        Whether the optimizer is a paged optimizer or not.
r   r!   Tr	   Nr   r   s                r   r   PagedAdam32bit.__init__Q  s<    X 	 	 	
r   r   r    r#   r*   s   @r   rD   rD   P  r+   r   rD   N)bitsandbytes.optim.optimizerr   r   r-   r6   r;   r@   rD   r   r   r   <module>rH      s[    9:
? :
zE
 E
P:
 :
z:
 :
zE
O E
P:
_ :
r   