
    9i@                     V   d Z ddlZddlmZmZmZmZ ddlZ	 ddl	m
Z
  G d dej                  j                  j                  j                         Z	 	 	 	 	 	 	 	 	 ddeded	ed
ededededee   dee   dededeee      fdZ G d de
      Z G d de      Zy# e$ r	 ddlm
Z
 Y w xY w)z?Functions and classes related to optimization (weight updates).    N)CallableListOptionalUnion)Adamc                   J     e Zd ZdZ	 	 d
dededededef
 fdZd Z	d	 Z
 xZS )WarmUpa  
    Applies a warmup schedule on a given learning rate decay schedule.

    Args:
        initial_learning_rate (`float`):
            The initial learning rate for the schedule after the warmup (so this will be the learning rate at the end
            of the warmup).
        decay_schedule_fn (`Callable`):
            The schedule function to apply after the warmup for the rest of training.
        warmup_steps (`int`):
            The number of steps for the warmup part of training.
        power (`float`, *optional*, defaults to 1.0):
            The power to use for the polynomial warmup (defaults is a linear warmup).
        name (`str`, *optional*):
            Optional name prefix for the returned tensors during the schedule.
    initial_learning_ratedecay_schedule_fnwarmup_stepspowernamec                 h    t         |           || _        || _        || _        || _        || _        y N)super__init__r
   r   r   r   r   )selfr
   r   r   r   r   	__class__s         f/var/www/html/backtest/airagagent/rag_env/lib/python3.12/site-packages/transformers/optimization_tf.pyr   zWarmUp.__init__0   s6     	%:"(
!2	    c                     t        j                   j                  xs d      5 }t        j                  t         j                        }t        j                   j
                  t         j                        }||z  } j                  t         j                  j                  | j                        z  t        j                  ||k  fd fd|      cd d d        S # 1 sw Y   y xY w)Nr	   c                       S r    )warmup_learning_rates   r   <lambda>z!WarmUp.__call__.<locals>.<lambda>I   s    , r   c                  @     j                   j                  z
        S r   )r   r   )r   steps   r   r   z!WarmUp.__call__.<locals>.<lambda>J   s    ..td6G6G/GH r   r   )tf
name_scoper   castfloat32r   r
   mathpowr   cond)r   r   r   global_step_floatwarmup_steps_floatwarmup_percent_doner   s   ``    @r   __call__zWarmUp.__call__?   s    ]]49901 	T !#bjj 9!#):):BJJ!G"36H"H#'#=#=L_aeakak@l#l 77!$66,H		 	 	s   B1C""C+c                 v    | j                   | j                  | j                  | j                  | j                  dS )Nr
   r   r   r   r   r+   r   s    r   
get_configzWarmUp.get_configN   s5    %)%?%?!%!7!7 --ZZII
 	
r   )      ?N)__name__
__module____qualname____doc__floatr   intstrr   r)   r-   __classcell__r   s   @r   r	   r	      sO    , $ $ 	
  
r   r	   init_lrnum_train_stepsnum_warmup_stepsmin_lr_ratio
adam_beta1
adam_beta2adam_epsilonadam_clipnormadam_global_clipnormweight_decay_rater   include_in_weight_decayc                 @   t         j                  j                  j                  j	                  | ||z
  | |z  |
      }|rt        | ||      }|	dkD  rt        ||	|||||g d|	      }||fS t         j                  j                  j                  ||||||      }||fS )a  
    Creates an optimizer with a learning rate schedule using a warmup phase followed by a linear decay.

    Args:
        init_lr (`float`):
            The desired learning rate at the end of the warmup phase.
        num_train_steps (`int`):
            The total number of training steps.
        num_warmup_steps (`int`):
            The number of warmup steps.
        min_lr_ratio (`float`, *optional*, defaults to 0):
            The final learning rate at the end of the linear decay will be `init_lr * min_lr_ratio`.
        adam_beta1 (`float`, *optional*, defaults to 0.9):
            The beta1 to use in Adam.
        adam_beta2 (`float`, *optional*, defaults to 0.999):
            The beta2 to use in Adam.
        adam_epsilon (`float`, *optional*, defaults to 1e-8):
            The epsilon to use in Adam.
        adam_clipnorm (`float`, *optional*, defaults to `None`):
            If not `None`, clip the gradient norm for each weight tensor to this value.
        adam_global_clipnorm (`float`, *optional*, defaults to `None`)
            If not `None`, clip gradient norm to this value. When using this argument, the norm is computed over all
            weight tensors, as if they were concatenated into a single vector.
        weight_decay_rate (`float`, *optional*, defaults to 0):
            The weight decay to use.
        power (`float`, *optional*, defaults to 1.0):
            The power to use for PolynomialDecay.
        include_in_weight_decay (`List[str]`, *optional*):
            List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is
            applied to all parameters except bias and layer norm parameters.
    )r
   decay_stepsend_learning_rater   )r
   r   r           )	LayerNorm
layer_normbias)	learning_raterA   beta_1beta_2epsilonclipnormglobal_clipnormexclude_from_weight_decayrB   )rJ   rK   rL   rM   rN   rO   )r   keras
optimizers	schedulesPolynomialDecayr	   AdamWeightDecayr   )r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   r   rB   lr_schedule	optimizers                 r   create_optimizerrX   X   s    \ ((%%//??%#&66!L0	 @ K ")))

 3#%/ "0&I$;

	, k!! HH'',,% "0 - 
	 k!!r   c                   ,    e Zd ZdZ	 	 	 	 	 	 	 	 	 ddeeej                  j                  j                  j                  f   dedededededeee      d	eee      d
ef fdZe fd       Z fdZd Zd fd	Zd Zd fd	Zd fd	Z fdZd Z xZS )rU   a{
  
    Adam enables L2 weight decay and clip_by_global_norm on gradients. Just adding the square of the weights to the
    loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that will interact
    with the m and v parameters in strange ways as shown in [Decoupled Weight Decay
    Regularization](https://arxiv.org/abs/1711.05101).

    Instead we want to decay the weights in a manner that doesn't interact with the m/v parameters. This is equivalent
    to adding the square of the weights to the loss with plain (non-momentum) SGD.

    Args:
        learning_rate (`Union[float, tf.keras.optimizers.schedules.LearningRateSchedule]`, *optional*, defaults to 0.001):
            The learning rate to use or a schedule.
        beta_1 (`float`, *optional*, defaults to 0.9):
            The beta1 parameter in Adam, which is the exponential decay rate for the 1st momentum estimates.
        beta_2 (`float`, *optional*, defaults to 0.999):
            The beta2 parameter in Adam, which is the exponential decay rate for the 2nd momentum estimates.
        epsilon (`float`, *optional*, defaults to 1e-07):
            The epsilon parameter in Adam, which is a small constant for numerical stability.
        amsgrad (`bool`, *optional*, defaults to `False`):
            Whether to apply AMSGrad variant of this algorithm or not, see [On the Convergence of Adam and
            Beyond](https://arxiv.org/abs/1904.09237).
        weight_decay_rate (`float`, *optional*, defaults to 0.0):
            The weight decay to apply.
        include_in_weight_decay (`List[str]`, *optional*):
            List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is
            applied to all parameters by default (unless they are in `exclude_from_weight_decay`).
        exclude_from_weight_decay (`List[str]`, *optional*):
            List of the parameter names (or re patterns) to exclude from applying weight decay to. If a
            `include_in_weight_decay` is passed, the names in it will supersede this list.
        name (`str`, *optional*, defaults to `"AdamWeightDecay"`):
            Optional name for the operations created when applying gradients.
        kwargs (`Dict[str, Any]`, *optional*):
            Keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, `decay`}. `clipnorm` is clip gradients by
            norm; `clipvalue` is clip gradients by value, `decay` is included for backward compatibility to allow time
            inverse decay of learning rate. `lr` is included for backward compatibility, recommended to use
            `learning_rate` instead.
    rJ   rK   rL   rM   amsgradrA   rB   rP   r   c
                 Z    t        |   ||||||	fi |
 || _        || _        || _        y r   )r   r   rA   _include_in_weight_decay_exclude_from_weight_decay)r   rJ   rK   rL   rM   rZ   rA   rB   rP   r   kwargsr   s              r   r   zAdamWeightDecay.__init__   s:     	$YRXY!2(?%*C'r   c                 >    dt         i}t        t        |   ||      S )z?Creates an optimizer from its config with WarmUp custom object.r	   )custom_objects)r	   r   rU   from_config)clsconfigr`   r   s      r   ra   zAdamWeightDecay.from_config   s&     #F+_c6vn6]]r   c                     t         t        |   |||       t        j                  | j
                  d      |||f   d<   y )Nadam_weight_decay_rater   rA   )r   rU   _prepare_localr   constantrA   )r   
var_device	var_dtypeapply_stater   s       r   rf   zAdamWeightDecay._prepare_local   sA    ot3J	;WDFKK"")AE
Z+,-@Ar   c                     | j                  |j                        }|rI|j                  ||z  ||j                  |j                  j
                  f   d   z  | j                        S t        j                         S )NrA   )use_locking)	_do_use_weight_decayr   
assign_subdevicedtype
base_dtype_use_lockingr   no_op)r   varrJ   rj   do_decays        r   _decay_weights_opz!AdamWeightDecay._decay_weights_op   sq    ,,SXX6>>#k3::syy?S?S2T&UVi&jj -- "   xxzr   c                 l    t        t        |       \  }}t        t        |   t        ||      fd|i|S )Nr   )listzipr   rU   apply_gradients)r   grads_and_varsr   r^   gradstvarsr   s         r   rz   zAdamWeightDecay.apply_gradients   s;    C01u_d;Cu<McTXc\bccr   c                     || j                   |   i fS |xs i }|j                  ||f      }|| j                  ||      }||||f<   |d   d|ifS )z1Retrieves the learning rate with the given state.lr_trj   )_decayed_lr_tget_fallback_apply_state)r   rh   ri   rj   coefficientss        r   _get_lrzAdamWeightDecay._get_lr   sw    %%i0"44!'R"
I'>?55j)LL3?KY/0F#m[%AAAr   c                    | j                  |j                  |j                  j                  |      \  }}| j	                  |||      }t        j                  |g      5  t        t        | &  ||fi |cd d d        S # 1 sw Y   y xY wr   )
r   ro   rp   rq   rv   r   control_dependenciesr   rU   _resource_apply_dense)r   gradrt   rj   r   r^   decayr   s          r   r   z%AdamWeightDecay._resource_apply_dense  s{    ||CJJ		0D0DkRf&&sD+>$$eW- 	[$EdCZSYZ	[ 	[ 	[s   A>>Bc                    | j                  |j                  |j                  j                  |      \  }}| j	                  |||      }t        j                  |g      5  t        t        | &  |||fi |cd d d        S # 1 sw Y   y xY wr   )
r   ro   rp   rq   rv   r   r   r   rU   _resource_apply_sparse)	r   r   rt   indicesrj   r   r^   r   r   s	           r   r   z&AdamWeightDecay._resource_apply_sparse  s~    ||CJJ		0D0DkRf&&sD+>$$eW- 	e$FtSRYd]cd	e 	e 	es   A??Bc                 ^    t         |          }|j                  d| j                  i       |S )NrA   )r   r-   updaterA   )r   rc   r   s     r   r-   zAdamWeightDecay.get_config  s-    #%*D,B,BCDr   c                     | j                   dk(  ry| j                  r)| j                  D ]  }t        j                  ||       y | j                  r)| j                  D ]  }t        j                  ||       y y)z0Whether to use L2 weight decay for `param_name`.r   FT)rA   r\   researchr]   )r   
param_namers      r   rm   z$AdamWeightDecay._do_use_weight_decay  s~    !!Q&((22  99Q
+7  **44 !99Q
+7 ! r   )	gMbP??+?gHz>FrF   NNrU   r   )r/   r0   r1   r2   r   r3   r   rQ   rR   rS   LearningRateScheduleboolr   r   r5   r   classmethodra   rf   rv   rz   r   r   r   r-   rm   r6   r7   s   @r   rU   rU      s   $P [`#&7;9=%DUBHH$7$7$A$A$V$VVWD D 	D
 D D !D "*$s)!4D $,DI#6D D$ ^ ^

dB[e
r   rU   c                   B    e Zd ZdZd Zed        Zed        Zd Zd Z	y)GradientAccumulatoraR  
    Gradient accumulation utility. When used with a distribution strategy, the accumulator should be called in a
    replica context. Gradients will be accumulated locally on each replica and without synchronization. Users should
    then call `.gradients`, scale the gradients if required, and pass the result to `apply_gradients`.
    c                      g | _         d| _        y)zInitializes the accumulator.N)
_gradients_accum_stepsr,   s    r   r   zGradientAccumulator.__init__9  s     r   c                 0   | j                   qt        j                  t        j                  dt        j                        dt        j
                  j                  t        j                  j                        | _         | j                   j                         S )zNumber of accumulated steps.r   )rp   F	trainablesynchronizationaggregation)
r   r   Variablerg   int64VariableSynchronizationON_READVariableAggregationONLY_FIRST_REPLICAvaluer,   s    r   r   zGradientAccumulator.step>  sk     $ "ARXX. " : : B B22EE	!D   &&((r   c                     | j                   st        d      | j                   D cg c]  }||j                         n| c}S c c}w )z1The accumulated gradients on the current replica.zBThe accumulator should be called first to initialize the gradients)r   
ValueErrorr   r   gradients     r   	gradientszGradientAccumulator.gradientsK  sB     abbW[WfWfg8H$8 hFgggs   Ac                    | j                   s| j                  }| j                   j                  |D cg c]b  }|\t        j                  t        j
                  |      dt        j                  j                  t        j                  j                        n|d c}       t        |      t        | j                         k7  r-t        dt        | j                          dt        |             t        | j                   |      D ]  \  }}|	||j                  |        | j                  j                  d       yc c}w )z/Accumulates `gradients` on the current replica.NFr   z	Expected z gradients, but got    )r   r   extendr   r   
zeros_liker   r   r   r   lenr   ry   
assign_addr   )r   r   _r   accum_gradients        r   r)   zGradientAccumulator.__call__R  s   		AOO"" %.
 !  + KKh/"'(*(B(B(J(J$&$:$:$M$M	 ""
 y>S11yT__)=(>>RSVW`SaRbcdd(+DOOY(G 	4$NH)h.B))(3	4 	$$Q''
s   A'D?c                     | j                   sy| j                  j                  d       | j                   D ])  }||j                  t        j                  |             + y)z8Resets the accumulated gradients on the current replica.Nr   )r   r   assignr   r   r   s     r   resetzGradientAccumulator.resetl  sN      # 	9H#h 78	9r   N)
r/   r0   r1   r2   r   propertyr   r   r)   r   r   r   r   r   r   .  s@    !
 
) 
) h h(49r   r   )	rF   r   r   g:0yE>NNrF   r.   N)r2   r   typingr   r   r   r   
tensorflowr   "tensorflow.keras.optimizers.legacyr   ImportErrortensorflow.keras.optimizersrQ   rR   rS   r   r	   r3   r4   r5   rX   rU   objectr   r   r   r   <module>r      s+   F 
 2 2 17
7
RXX  **?? 7
| %),0"37Q"Q"Q" Q" 	Q"
 Q" Q" Q" E?Q" #5/Q" Q" Q" &d3i0Q"h~d ~DE9& E9i  101s   B B('B(