
from bitsandbytes.optim.optimizer import Optimizer2State


class LAMB(Optimizer2State):
    def __init__(
        self,
        params,
        lr=1e-3,
        bias_correction=True,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=0,
        amsgrad=False,
        adam_w_mode=True,
        optim_bits=32,
        args=None,
        min_8bit_size=4096,
        percentile_clipping=100,
        block_wise=False,
        max_unorm=1.0,
    ):
        """
Base LAMB optimizer.

Arguments:
    params (`torch.tensor`):
        The input parameters to optimize.
    lr (`float`, defaults to 1e-3):
        The learning rate.
    bias_correction (`bool`, defaults to `True`):
        Whether to apply bias correction to the first and second-order moments.
    betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
        The beta values are the decay rates of the first and second-order moments of the optimizer.
    eps (`float`, defaults to 1e-8):
        The epsilon value prevents division by zero in the optimizer.
    weight_decay (`float`, defaults to 0.0):
        The weight decay value for the optimizer.
    amsgrad (`bool`, defaults to `False`):
        Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
    adam_w_mode (`bool`, defaults to `True`):
        Whether to use the AdamW variant.
    optim_bits (`int`, defaults to 32):
        The number of bits of the optimizer state.
    args (`object`, defaults to `None`):
        An object with additional arguments.
    min_8bit_size (`int`, defaults to 4096):
        The minimum number of elements of the parameter tensors for 8-bit optimization.
    percentile_clipping (`int`, defaults to 100):
        Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
    block_wise (`bool`, defaults to `False`):
        Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
    max_unorm (`float`, defaults to 1.0):
        The maximum gradient norm.
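
Example (illustrative usage sketch; the toy model and hyperparameter values below are placeholders, and a CUDA device is assumed):

    import torch

    import bitsandbytes as bnb

    # Toy model used only for illustration; bitsandbytes optimizers expect
    # the parameters to live on a CUDA device.
    model = torch.nn.Linear(64, 2).cuda()
    optimizer = bnb.optim.LAMB(model.parameters(), lr=1e-3, optim_bits=8)

    loss = model(torch.randn(8, 64, device="cuda")).sum()  # dummy forward pass
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()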
        """
        super().__init__(
            "lamb",
            params,
            lr,
            betas,
            eps,
            weight_decay,
            optim_bits,
            args,
            min_8bit_size,
            percentile_clipping,
            block_wise,
            max_unorm=1.0,
        )


class LAMB8bit(Optimizer2State):
    def __init__(
        self,
        params,
        lr=1e-3,
        bias_correction=True,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=0,
        amsgrad=False,
        adam_w_mode=True,
        args=None,
        min_8bit_size=4096,
        percentile_clipping=100,
        block_wise=False,
        max_unorm=1.0,
    ):
        """
8-bit LAMB optimizer.

Arguments:
    params (`torch.tensor`):
        The input parameters to optimize.
    lr (`float`, defaults to 1e-3):
        The learning rate.
    bias_correction (`bool`, defaults to `True`):
        Whether to apply bias correction to the first and second-order moments.
    betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
        The beta values are the decay rates of the first and second-order moments of the optimizer.
    eps (`float`, defaults to 1e-8):
        The epsilon value prevents division by zero in the optimizer.
    weight_decay (`float`, defaults to 0.0):
        The weight decay value for the optimizer.
    amsgrad (`bool`, defaults to `False`):
        Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
    adam_w_mode (`bool`, defaults to `True`):
        Whether to use the AdamW variant.
    args (`object`, defaults to `None`):
        An object with additional arguments.
    min_8bit_size (`int`, defaults to 4096):
        The minimum number of elements of the parameter tensors for 8-bit optimization.
    percentile_clipping (`int`, defaults to 100):
        Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
    block_wise (`bool`, defaults to `False`):
        Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
    max_unorm (`float`, defaults to 1.0):
        The maximum gradient norm.
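
Example (illustrative sketch; the model below is a placeholder and a CUDA device is assumed):

    import torch

    import bitsandbytes as bnb

    # Hypothetical model; only parameter tensors with at least `min_8bit_size`
    # elements (4096 by default) are held in 8-bit optimizer state.
    model = torch.nn.Linear(4096, 4096).cuda()
    optimizer = bnb.optim.LAMB8bit(model.parameters(), lr=1e-3, min_8bit_size=4096)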
        """
        super().__init__(
            "lamb",
            params,
            lr,
            betas,
            eps,
            weight_decay,
            8,
            args,
            min_8bit_size,
            percentile_clipping,
            block_wise,
            max_unorm=1.0,
        )


class LAMB32bit(Optimizer2State):
    def __init__(
        self,
        params,
        lr=1e-3,
        bias_correction=True,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=0,
        amsgrad=False,
        adam_w_mode=True,
        args=None,
        min_8bit_size=4096,
        percentile_clipping=100,
        block_wise=False,
        max_unorm=1.0,
    ):
        """
32-bit LAMB optimizer.

Arguments:
    params (`torch.tensor`):
        The input parameters to optimize.
    lr (`float`, defaults to 1e-3):
        The learning rate.
    bias_correction (`bool`, defaults to `True`):
        Whether to apply bias correction to the first and second-order moments.
    betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
        The beta values are the decay rates of the first and second-order moments of the optimizer.
    eps (`float`, defaults to 1e-8):
        The epsilon value prevents division by zero in the optimizer.
    weight_decay (`float`, defaults to 0.0):
        The weight decay value for the optimizer.
    amsgrad (`bool`, defaults to `False`):
        Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
    adam_w_mode (`bool`, defaults to `True`):
        Whether to use the AdamW variant.
    args (`object`, defaults to `None`):
        An object with additional arguments.
    min_8bit_size (`int`, defaults to 4096):
        The minimum number of elements of the parameter tensors for 8-bit optimization.
    percentile_clipping (`int`, defaults to 100):
        Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
    block_wise (`bool`, defaults to `False`):
        Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
    max_unorm (`float`, defaults to 1.0):
        The maximum gradient norm.
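
Example (illustrative sketch; the model below is a placeholder and a CUDA device is assumed):

    import torch

    import bitsandbytes as bnb

    # Placeholder model; LAMB32bit keeps the optimizer state in 32 bits,
    # matching LAMB with its default `optim_bits=32`.
    model = torch.nn.Linear(64, 2).cuda()
    optimizer = bnb.optim.LAMB32bit(model.parameters(), lr=1e-3, weight_decay=0.0)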
        """
        super().__init__(
            "lamb",
            params,
            lr,
            betas,
            eps,
            weight_decay,
            32,
            args,
            min_8bit_size,
            percentile_clipping,
            block_wise,
            max_unorm=1.0,
        )