
import functools
import math
from collections import OrderedDict

import torch
from torch import Tensor, nn

from .integrations.hub_kernels import use_kernel_forward_from_hub
from .utils import logging
from .utils.import_utils import is_torchdynamo_compiling


logger = logging.get_logger(__name__)


@use_kernel_forward_from_hub("GeluTanh")
class GELUTanh(nn.Module):
    """
    A fast C implementation of the tanh approximation of the GeLU activation function. See
    https://huggingface.co/papers/1606.08415.

    This implementation is equivalent to NewGELU and FastGELU but much faster. However, it is not an exact numerical
    match due to rounding errors.
    """

    def __init__(self, use_gelu_tanh_python: bool = False):
        super().__init__()
        if use_gelu_tanh_python:
            self.act = self._gelu_tanh_python
        else:
            self.act = functools.partial(nn.functional.gelu, approximate="tanh")

    def _gelu_tanh_python(self, input: Tensor) -> Tensor:
        return input * 0.5 * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))

    def forward(self, input: Tensor) -> Tensor:
        return self.act(input)


@use_kernel_forward_from_hub("NewGELU")
class NewGELUActivation(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
    the Gaussian Error Linear Units paper: https://huggingface.co/papers/1606.08415
    """

    def forward(self, input: Tensor) -> Tensor:
        return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))


@use_kernel_forward_from_hub("GeLU")
class GELUActivation(nn.Module):
    """
    Original Implementation of the GELU activation function in Google BERT repo when initially created. For
    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
    Also see the Gaussian Error Linear Units paper: https://huggingface.co/papers/1606.08415
    """

    def __init__(self, use_gelu_python: bool = False):
        super().__init__()
        if use_gelu_python:
            self.act = self._gelu_python
        else:
            self.act = nn.functional.gelu

    def _gelu_python(self, input: Tensor) -> Tensor:
        return input * 0.5 * (1.0 + torch.erf(input / math.sqrt(2.0)))

    def forward(self, input: Tensor) -> Tensor:
        return self.act(input)


@use_kernel_forward_from_hub("SiLU")
class SiLUActivation(nn.Module):
    """
    See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
    Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
    Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
    Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
    later.
    """

    def forward(self, input: Tensor) -> Tensor:
        return nn.functional.silu(input)


@use_kernel_forward_from_hub("FastGELU")
class FastGELUActivation(nn.Module):
    """
    Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs
    """

    def forward(self, input: Tensor) -> Tensor:
        return 0.5 * input * (1.0 + torch.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input)))


@use_kernel_forward_from_hub("QuickGELU")
class QuickGELUActivation(nn.Module):
    """
    Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
    """

    def forward(self, input: Tensor) -> Tensor:
        return input * torch.sigmoid(1.702 * input)


class ClippedGELUActivation(nn.Module):
    """
    Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as
    it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to
    https://huggingface.co/papers/2004.09602.

    Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
    initially created.

    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://huggingface.co/papers/1606.08415
    """

    def __init__(self, min: float, max: float):
        if min > max:
            raise ValueError(f"min should be < max (got min: {min}, max: {max})")

        super().__init__()
        self.min = min
        self.max = max

    def forward(self, x: Tensor) -> Tensor:
        return torch.clip(gelu(x), self.min, self.max)


class AccurateGELUActivation(nn.Module):
    """
    Applies GELU approximation that is faster than default and more accurate than QuickGELU. See:
    https://github.com/hendrycks/GELUs

    Implemented along with MEGA (Moving Average Equipped Gated Attention)
    """

    def __init__(self):
        super().__init__()
        self.precomputed_constant = math.sqrt(2 / math.pi)

    def forward(self, input: Tensor) -> Tensor:
        return 0.5 * input * (1 + torch.tanh(self.precomputed_constant * (input + 0.044715 * torch.pow(input, 3))))


class MishActivation(nn.Module):
    """
    See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://huggingface.co/papers/1908.08681). Also
    visit the official repository for the paper: https://github.com/digantamisra98/Mish
    """

    def __init__(self):
        super().__init__()
        self.act = nn.functional.mish

    def _mish_python(self, input: Tensor) -> Tensor:
        return input * torch.tanh(nn.functional.softplus(input))

    def forward(self, input: Tensor) -> Tensor:
        return self.act(input)


class LinearActivation(nn.Module):
    """
    Applies the linear activation function, i.e. forwarding input directly to output.
    """

    def forward(self, input: Tensor) -> Tensor:
        return input


class LaplaceActivation(nn.Module):
    """
    Applies elementwise activation based on Laplace function, introduced in MEGA as an attention activation. See
    https://huggingface.co/papers/2209.10655

    Inspired by squared relu, but with bounded range and gradient for better stability
    """

    def forward(self, input, mu=0.707107, sigma=0.282095):
        input = (input - mu).div(sigma * math.sqrt(2.0))
        return 0.5 * (1.0 + torch.erf(input))


class ReLUSquaredActivation(nn.Module):
    """
    Applies the relu^2 activation introduced in https://huggingface.co/papers/2109.08668v2
    """

    def forward(self, input):
        relu_applied = nn.functional.relu(input)
        squared = torch.square(relu_applied)
        return squared


class ClassInstantier(OrderedDict):
    def __getitem__(self, key):
        content = super().__getitem__(key)
        cls, kwargs = content if isinstance(content, tuple) else (content, {})
        return cls(**kwargs)


class XIELUActivation(nn.Module):
    """
    Applies the xIELU activation function introduced in https://arxiv.org/abs/2411.13010

    If the user has installed the nickjbrowning/XIELU wheel, we import xIELU CUDA
    Otherwise, we emit a single warning and use xIELU Python
    """

    def __init__(
        self,
        alpha_p_init=0.8,
        alpha_n_init=0.8,
        beta=0.5,
        eps=-1e-6,
        dtype=torch.bfloat16,
        with_vector_loads=False,
    ):
        super().__init__()
        self.alpha_p = nn.Parameter(torch.log(torch.expm1(torch.tensor(alpha_p_init, dtype=dtype))).unsqueeze(0))
        self.alpha_n = nn.Parameter(
            torch.log(torch.expm1(torch.tensor(alpha_n_init - beta, dtype=dtype))).unsqueeze(0)
        )
        self.register_buffer("beta", torch.tensor(beta, dtype=dtype))
        self.register_buffer("eps", torch.tensor(eps, dtype=dtype))
        self.with_vector_loads = with_vector_loads
        self._beta_scalar = float(self.beta.detach().cpu().float().item())
        self._eps_scalar = float(self.eps.detach().cpu().float().item())

        self._xielu_cuda_obj = None
        try:
            import xielu.ops  # noqa: F401

            self._xielu_cuda_obj = torch.classes.xielu.XIELU()
            msg = "Using experimental xIELU CUDA."
            try:
                from torch._dynamo import allow_in_graph

                self._xielu_cuda_fn = allow_in_graph(self._xielu_cuda)
                msg += " Enabled torch._dynamo for xIELU CUDA."
            except Exception as err:
                msg += f" Could not enable torch._dynamo for xIELU ({err}) - this may result in slower performance."
                self._xielu_cuda_fn = self._xielu_cuda
            logger.warning_once(msg)
        except Exception as err:
            logger.warning_once(
                "CUDA-fused xIELU not available (%s) – falling back to a Python version.\n"
                "For CUDA xIELU (experimental), `pip install git+https://github.com/nickjbrowning/XIELU`",
                str(err),
            )

    def _xielu_python(self, x: Tensor) -> Tensor:
        alpha_p = nn.functional.softplus(self.alpha_p)
        alpha_n = self.beta + nn.functional.softplus(self.alpha_n)
        return torch.where(
            x > 0,
            alpha_p * x * x + self.beta * x,
            (torch.expm1(torch.min(x, self.eps)) - x) * alpha_n + self.beta * x,
        )

    def _xielu_cuda(self, x: Tensor) -> Tensor:
        """Firewall function to prevent torch.compile from seeing .item() calls"""
        original_shape = x.shape
        while x.dim() < 3:
            x = x.unsqueeze(0)
        if x.dim() > 3:
            x = x.view(-1, 1, x.size(-1))
        if original_shape != x.shape:
            logger.warning_once(
                "Warning: xIELU input tensor expects 3 dimensions but got (shape: %s). Reshaping to (shape: %s).",
                original_shape,
                x.shape,
            )
        result = self._xielu_cuda_obj.forward(
            x,
            self.alpha_p.to(x.dtype),
            self.alpha_n.to(x.dtype),
            self._beta_scalar,
            self._eps_scalar,
            self.with_vector_loads,
        )
        return result.view(original_shape)

    def forward(self, input: Tensor) -> Tensor:
        if self._xielu_cuda_obj is not None and input.is_cuda:
            if not is_torchdynamo_compiling():
                return self._xielu_cuda_fn(input)
            else:
                logger.warning_once("torch._dynamo is compiling, using Python version of xIELU.")
        return self._xielu_python(input)


ACT2CLS = {
    "gelu": GELUActivation,
    "gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}),
    "gelu_fast": FastGELUActivation,
    "gelu_new": NewGELUActivation,
    "gelu_python": (GELUActivation, {"use_gelu_python": True}),
    "gelu_pytorch_tanh": GELUTanh,
    "gelu_python_tanh": (GELUTanh, {"use_gelu_tanh_python": True}),
    "gelu_accurate": AccurateGELUActivation,
    "laplace": LaplaceActivation,
    "leaky_relu": nn.LeakyReLU,
    "linear": LinearActivation,
    "mish": MishActivation,
    "quick_gelu": QuickGELUActivation,
    "relu": nn.ReLU,
    "relu2": ReLUSquaredActivation,
    "relu6": nn.ReLU6,
    "sigmoid": nn.Sigmoid,
    "silu": nn.SiLU,
    "swish": nn.SiLU,
    "tanh": nn.Tanh,
    "prelu": nn.PReLU,
    "xielu": XIELUActivation,
}
ACT2FN = ClassInstantier(ACT2CLS)


def get_activation(activation_string):
    if activation_string in ACT2FN:
        return ACT2FN[activation_string]
    else:
        raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}")


gelu_python = get_activation("gelu_python")
gelu_new = get_activation("gelu_new")
gelu = get_activation("gelu")
gelu_fast = get_activation("gelu_fast")
quick_gelu = get_activation("quick_gelu")
silu = get_activation("silu")
mish = get_activation("mish")
linear_act = get_activation("linear")
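

# ---------------------------------------------------------------------------
# Usage sketch (illustrative addition, not part of the upstream module): shows
# how get_activation() and the ACT2FN mapping defined above are typically
# consumed. The activation names and tensor shapes below are arbitrary examples
# chosen for demonstration only.
if __name__ == "__main__":
    # Look up an activation module by its config string (e.g. config.hidden_act).
    act = get_activation("gelu_new")
    x = torch.randn(2, 4)
    print(act(x).shape)  # torch.Size([2, 4])

    # ACT2FN behaves like a dict but instantiates on access (ClassInstantier),
    # so entries stored as (cls, kwargs) tuples come back as ready nn.Modules.
    clipped = ACT2FN["gelu_10"]  # ClippedGELUActivation(min=-10, max=10)
    print(type(clipped).__name__, clipped(x).min().item() >= -10)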