
    ;i                        S SK r S SKJrJrJrJrJr  S SKrS SKrS SKJ	r	J
r
JrJr  S SKJs  Jr  S SKrS SKJr  S SKJr  S SKJr  S SKJrJr  \" SS	S
9r " S S\R                  R8                  5      r " S S\R                  R8                  5      r " S S\R                  R<                  5      rS\S   4S jr  " S S\RB                  5      r" " S S\"5      r# " S S\"5      r$ " S S\R                  R<                  5      r%S r& " S S\R8                  5      r' " S S \R8                  5      r( " S! S"\(5      r) " S# S$\(5      r* " S% S&\RB                  5      r+ " S' S(\RB                  5      r, " S) S*\RB                  5      r-g)+    N)AnyOptionalTypeVarUnionoverload)Tensordevicedtypenn)HIP_ENVIRONMENT)
QuantState)GlobalOptimManager)*INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPINGOutlierTracerTztorch.nn.Module)boundc                      ^  \ rS rSrSr        SS\S\S\\   S\\   S\S	\S
\S\\	   SS4U 4S jjjr
SS jr SS jrS\	S\	4S jrSrU =r$ )StableEmbedding   a  
Custom embedding layer designed to improve stability during training for NLP tasks by using 32-bit optimizer states. It is designed to reduce gradient variations that can result from quantization. This embedding layer is initialized with Xavier uniform initialization followed by layer normalization.

Example:

```
# Initialize StableEmbedding layer with vocabulary size 1000, embedding dimension 300
embedding_layer = StableEmbedding(num_embeddings=1000, embedding_dim=300)

# Reset embedding parameters
embedding_layer.reset_parameters()

# Perform a forward pass with input tensor
input_tensor = torch.tensor([1, 2, 3])
output_embedding = embedding_layer(input_tensor)
```

Attributes:
    norm (`torch.nn.LayerNorm`): Layer normalization applied after the embedding.

Methods:
    reset_parameters(): Reset embedding parameters using Xavier uniform initialization.
    forward(input: Tensor) -> Tensor: Forward pass through the stable embedding layer.
Nnum_embeddingsembedding_dimpadding_idxmax_norm	norm_typescale_grad_by_freqsparse_weightreturnc                    > [         TU ]  UUUUUUUUU	U
5
        [        R                  R	                  X)S9U l        [        R                  " 5       R                  U SSS05        ga  
Args:
    num_embeddings (`int`):
        The number of unique embeddings (vocabulary size).
    embedding_dim (`int`):
        The dimensionality of the embedding.
    padding_idx (`Optional[int]`):
        Pads the output with zeros at the given index.
    max_norm (`Optional[float]`):
        Renormalizes embeddings to have a maximum L2 norm.
    norm_type (`float`, defaults to `2.0`):
        The p-norm to compute for the `max_norm` option.
    scale_grad_by_freq (`bool`, defaults to `False`):
        Scale gradient by frequency during backpropagation.
    sparse (`bool`, defaults to `False`):
        Computes dense gradients. Set to `True` to compute sparse gradients instead.
    _weight (`Optional[Tensor]`):
        Pretrained embeddings.
r	   weight
optim_bits    N)	super__init__torchr   	LayerNormnormr   get_instanceregister_module_override)selfr   r   r   r   r   r   r   r   r	   r
   	__class__s              g/home/dmtnaga/Documents/work/airagagent/rag_env/lib/python3.13/site-packages/bitsandbytes/nn/modules.pyr&   StableEmbedding.__init__0   sn    @ 		
 HH&&}&D	'')BB4T`bdSef    c                     [         R                  R                  R                  U R                  5        U R                  5         g Nr'   r   initxavier_uniform_r"   _fill_padding_idx_with_zeror,   s    r.   reset_parameters StableEmbedding.reset_parameters_   (    %%dkk2((*r0   c                     U R                   bG  [        R                  " 5          U R                  U R                      R	                  S5        S S S 5        g g ! , (       d  f       g = fNr   r   r'   no_gradr"   fill_r7   s    r.   r6   +StableEmbedding._fill_padding_idx_with_zeroj   F    'D,,-33A6 ! (    )A
A$inputc           	      b   [         R                  " UU R                  U R                  U R                  U R
                  U R                  U R                  5      nUR                  [        R                  " 5       5      nU R                  U5      R                  U R                  R                  5      $ r2   )F	embeddingr"   r   r   r   r   r   tor'   get_default_dtyper)   r
   r,   rC   embs      r.   forwardStableEmbedding.forwardo   s}    kkKKMMNN##KK
 ffU,,./yy~  !2!233r0   )r)   )NN       @FFNNNr   N)__name__
__module____qualname____firstlineno____doc__intr   floatboolr   r&   r8   r6   rK   __static_attributes____classcell__r-   s   @r.   r   r      s    : &*$(#($(-g-g -g c]	-g
 5/-g -g !-g -g &!-g 
-g -g^+7
4V 4 4 4r0   r   c                      ^  \ rS rSrSr       SS\S\S\\   S\\   S\S	\S
\S\\	   S\\
   SS4U 4S jjjrSS jr SS jrS\	S\	4S jrSrU =r$ )	Embedding   zK
Embedding class to store and retrieve word embeddings from their indices.
Nr   r   r   r   r   r   r   r   r	   r   c
                    > [         T
U ]  UUUUUUUUU	S9	  [        R                  " 5       R	                  U SSS05        gr    )r%   r&   r   r*   r+   )r,   r   r   r   r   r   r   r   r   r	   r-   s             r.   r&   Embedding.__init__   sV    > 	 	 
	
 	'')BB4T`bdSefr0   c                     [         R                  R                  R                  U R                  5        U R                  5         g r2   r3   r7   s    r.   r8   Embedding.reset_parameters   r:   r0   c                     U R                   bG  [        R                  " 5          U R                  U R                      R	                  S5        S S S 5        g g ! , (       d  f       g = fr<   r=   r7   s    r.   r6   %Embedding._fill_padding_idx_with_zero   rA   rB   rC   c           	          [         R                  " UU R                  U R                  U R                  U R
                  U R                  U R                  5      nU$ r2   )rE   rF   r"   r   r   r   r   r   rI   s      r.   rK   Embedding.forward   sH    kkKKMMNN##KK
 
r0    )NNrM   FFNNrN   )rO   rP   rQ   rR   rS   rT   r   rU   rV   r   r	   r&   r8   r6   rK   rW   rX   rY   s   @r.   r[   r[      s     &*$(#($(#'*g*g *g c]	*g
 5/*g *g !*g *g &!*g  *g 
*g *gX+7
V   r0   r[   c                   P  ^  \ rS rSrSSSSSS\R
                  SS4	S\\R                     S\\   S\\	   S	\
S
\S\R                  S\S   S\
SS 4S jjrS rS rS rS r\   S'S\R                  S\\\4   S\
S\S   SS 4
S jj5       rS rS rS(S\\\	\\4      S\
4S jjrS(S\\\	\\4      S\
4S jjr\   S)S\S\\\	\4      S\\\\4      S\
S\4
S  jj5       r\S*S\S\\\4   S\
S\4S! jj5       r\S*S\S"\S\
S\4S# jj5       rU 4S$ jr\S+U 4S% jj5       rS&r U =r!$ ),
Params4bit   NFTfp4dataquant_state	blocksizecompress_statistics
quant_typequant_storagemodule
Linear4bitbnb_quantizedr   c
                     Uc  [         R                  " S5      nUc  [        (       d  SOSn[         R                  R	                  XU5      n
XJl        XZl        Xjl        X:l        Xzl	        Xl
        Xl        Xl        U
$ )Nr   @   r\   )r'   emptyr   r   _make_subclassrl   rm   rn   rk   ro   rr   rj   rp   )clsrj   requires_gradrk   rl   rm   rn   ro   rp   rr   r,   s              r.   __new__Params4bit.__new__   sn     <;;q>D"1/sI||**3mD"#6 $&**	r0   c                 v    U R                   R                  5       nU R                  US'   U R                  US'   U$ )Nrj   rx   )__dict__copyrj   rx   r,   states     r.   __getstate__Params4bit.__getstate__   s6    ""$		f!%!3!3or0   c                     US   U l         US   U l        US   U l        US   U l        US   U l        US   U l        US   U l        US   U l        US	   U l        g )
Nrx   rl   rm   rn   rk   rj   ro   rr   rp   )	rx   rl   rm   rn   rk   rj   ro   rr   rp   r~   s     r.   __setstate__Params4bit.__setstate__   sr    "?3{+#()>#? - /&M	"?3"?3Hor0   c                    [        U 5      R                  [        U 5      5      nU R                  5       nUR                  U5        [        R
                  " US   5      Ul        [        R
                  " US   5      Ul        U$ )Nrk   rj   )typery   r   r   r}   deepcopyrk   rj   )r,   memonew_instancer   s       r.   __deepcopy__Params4bit.__deepcopy__   sg    Dz))$t*5!!#!!%(#'==}1E#F  MM%-8r0   c                     [        U 5      R                  [        U 5      5      nU R                  5       nUR                  U5        U$ r2   )r   ry   r   r   )r,   r   r   s      r.   __copy__Params4bit.__copy__  s<    Dz))$t*5!!#!!%(r0   quantized_statsrx   c                    [         R                  R                  XR                  U5      5      nX7l        [
        R                  " X$S9Ul        UR                  R                  Ul        UR                  R                  Ul
        UR                  R                  Ul        SUl        UR                  Ul        XWl        UR                  b  UR                  UR                  l        U$ )N)qs_dictr	   T)r'   r   rv   rG   rx   r   	from_dictrk   rl   nestedrm   rn   rr   r
   ro   rp   )rw   rj   r   rx   r	   rp   kwargsr,   s           r.   from_prequantizedParams4bit.from_prequantized  s     ||**3@*%//W))33#'#3#3#:#: **55!!ZZ;;"&*&6&6DKK#r0   c                 N   U R                   R                  5       R                  U5      n[        R                  R                  UU R                  U R                  U R                  U R                  S9u  p4X0l         X@l
        U R                  b  X@R                  l
        SU l        U $ )N)rl   rm   rn   ro   T)rj   
contiguousrG   bnb
functionalquantize_4bitrl   rm   rn   ro   rk   rp   rr   )r,   r	   ww_4bitrk   s        r.   	_quantizeParams4bit._quantize&  s    II  "%%f-!nn::nn $ 8 8,, ; 
 	&;;"&1KK#!r0   c                      U R                  SS9$ Ncpur!   rG   r7   s    r.   r   Params4bit.cpu6      wwew$$r0   r	   non_blockingc                 2    U R                  Uc  SUS9$ UUS9$ Ncudar	   r   r   r,   r	   r   s      r.   r   Params4bit.cuda9  '    wwfQ]w^^FQ]w^^r0   c                 2    U R                  Uc  SUS9$ UUS9$ Nxpur   r   r   s      r.   r   Params4bit.xpu<  '    wwv~eP\w]]6P\w]]r0   r,   r
   c                     g r2   re   r,   r	   r
   r   s       r.   rG   Params4bit.to?       r0   c                     g r2   re   r,   r
   r   s      r.   rG   r   G      NQr0   tensorc                     g r2   re   r,   r   r   s      r.   rG   r   J      DGr0   c                   > [         R                  R                  R                  " U0 UD6u  p4pVUb2  UR                  S:w  a"  U R
                  (       d  U R                  U5      $ U R                  b  U R                  R                  U5        [        [        TU ]!  X4US9U R                  U R                  U R                  U R                  U R                  U R                  U R
                  S9nU$ )Nmetar	   r
   r   )rx   rk   rl   rm   rn   ro   rr   )r'   _C_nn	_parse_tor   rr   r   rk   rG   rg   r%   rx   rl   rm   rn   ro   )	r,   argsr   r	   r
   r   convert_to_format	new_paramr-   s	           r.   rG   r   M  s    9>9O9OQU9`Y_9`6|&++"7@R@R>>&))+  ##F+"
&L
Q"00 ,,..$($<$<??"00"00	I r0   c                   >^ ^ Uc  0 nU[         R                  [         R                  4;   a  US   m[        TT ]  XX45      n[        U[        5      (       a  [        U U4S jU 5       5      $ T " UTR                  TR                  TR                  TR                  TR                  TR                  TR                  TR                  S9	$ [        TT ]  XX45      $ )Nr   c              3      >#    U  Hc  nT" UTR                   TR                  TR                  TR                  TR                  TR
                  TR                  TR                  S 9	v   Me     g7f)	rj   rx   rk   rl   rm   rn   ro   rp   rr   N)rx   rk   rl   rm   rn   ro   rp   rr   ).0chunkrw   r   s     r.   	<genexpr>0Params4bit.__torch_function__.<locals>.<genexpr>n  sj       "( "&,&:&:$*$6$6"("2"2,2,F,F#)#4#4&,&:&:%}}&,&:&:
 "(s   A+A.r   )r'   r   splitr%   __torch_function__
isinstancetuplerx   rk   rl   rm   rn   ro   rp   rr   )rw   functypesr   r   resultr   r-   s   `     @r.   r   Params4bit.__torch_function__c  s    >FEKK--!WFW/TJF&%((  "(   "("6"6 & 2 2$..(.(B(B%00"("6"6!=="("6"6
 
 w)$tDDr0   )	rl   rr   rm   rj   rp   rk   ro   rn   rx   )Fr   NNF....)re   N)"rO   rP   rQ   rR   r'   uint8r   r   r   rT   rV   strr
   ry   r   r   r   r   classmethoddictr   r   r   r   r   r	   r   r   r   r   rG   r   rW   rX   rY   s   @r.   rg   rg      s=    (,,0#'$(%*[[)-#u||$ j)	
 C= "  {{ &  
:	& 
 $)-ll c3h 	 & 
 2 %_8E#vs*:$;< _SW _^(5fc)9#:; ^RV ^  03-0 	sF{+, eSj)* 	
 
  QQ5,QDQ1Q QGGFG$GG G, %E %Er0   rg   rp   )Embedding4bitrq   c                 x   [        U R                  SS 5      b  g [        U SS 5      c  [        R                  " S5        U R                  R                  S   S:X  d   e[        U R                  [        5      (       d$  [        U R                  U R                  SS9U l        U R                  U R                  l        g )Nrk   zhFP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first.   T)ro   rr   )	getattrr"   warningswarnshaper   rg   ro   rk   )rp   s    r.   'fix_4bit_weight_quant_state_from_moduler     s    v}}mT2>v}d+3v	
 ==q!Q&&&fmmZ00"6==@T@Tdhi & 2 2FMMr0   c                      ^  \ rS rSrSrSSSS\R                  S4U 4S jjrS rU 4S jr	S	\R                  4S
 jrSrU =r$ )rq   i  aq  
This class is the base module for the 4-bit quantization algorithm presented in [QLoRA](https://arxiv.org/abs/2305.14314).
QLoRA 4-bit linear layers uses blockwise k-bit quantization under the hood, with the possibility of selecting various
compute datatypes such as FP4 and NF4.

In order to quantize a linear layer one should first load the original fp16 / bf16 weights into
the Linear4bit module, then call `quantized_module.to("cuda")` to quantize the fp16 / bf16 weights.

Example:

```python
import torch
import torch.nn as nn

import bitsandbytes as bnb
from bnb.nn import Linear4bit

fp16_model = nn.Sequential(
    nn.Linear(64, 64),
    nn.Linear(64, 64)
)

quantized_model = nn.Sequential(
    Linear4bit(64, 64),
    Linear4bit(64, 64)
)

quantized_model.load_state_dict(fp16_model.state_dict())
quantized_model = quantized_model.to(0) # Quantization happens here
```
TNri   c	           	         > [         T	U ]  XX85        [        U R                  R                  SUUUU S9U l        X@l        USLU l        SU l        Xpl        g)a/  
Initialize Linear4bit class.

Args:
    input_features (`str`):
        Number of input features of the linear layer.
    output_features (`str`):
        Number of output features of the linear layer.
    bias (`bool`, defaults to `True`):
        Whether the linear class uses the bias term as well.
Frx   rm   rn   ro   rp   N)	r%   r&   rg   r"   rj   compute_dtypecompute_type_is_setrk   ro   )
r,   input_featuresoutput_featuresbiasr   rm   rn   ro   r	   r-   s
            r.   r&   Linear4bit.__init__  s_    , 	$G KK 3!'
 +#0#< *r0   c                 v   UR                   [        R                  [        R                  4;   a  UR                   U l        g UR                   [        R
                  :X  a  U R                  S [        R                  4;   aL  UR                  5       UR                  S   :X  a+  [        R                  " S5        [        R                  " SSS9  U R                  S [        R                  4;   aN  UR                  5       UR                  S   :w  a,  [        R                  " S5        [        R                  " SSS9  g g g g )NzInput type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference.ignorez.*inference.)messagezInput type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed.z.*inference or training)r
   r'   float32bfloat16r   float16numelr   r   r   filterwarningsr,   xs     r.   set_compute_typeLinear4bit.set_compute_type  s    77u}}enn55 "#DWW%!!dEMM%::	QWWUW[@X  Y ''.I!!dEMM%::	QWWUW[@X k '':ST	 AY: &r0   c                   > [         TU ]  XU5        [        U R                  SS5      b\  U R                  R                  R                  SS9R                  5        H&  u  pEU(       a  UOUR                  5       XS-   U-   '   M(     gg)zK
save weight and bias,
then fill state_dict with components of quant_state
rk   NT)packedzweight.)r%   _save_to_state_dictr   r"   rk   as_dictitemsdetach)r,   destinationprefix	keep_varskvr-   s         r.   r   Linear4bit._save_to_state_dict  sw    
 	#KC4;;t4@//77t7DJJL;Da!((*Y.23 M Ar0   r   c                    [        U 5        U R                  bb  U R                  R                  UR                  :w  a>  U R                  R                  R	                  UR                  5      U R                  l        U R
                  (       d  U R                  U5        SU l        UR                  nU R                  b  UR	                  U R                  5      nU R                  c  S O$U R                  R	                  U R                  5      nU R                  R                  5       n[        R                  " XX0R                  R                  S9R	                  U5      $ )NT)r   rk   )r   r   r
   rj   rG   r   r   r   r"   tr   matmul_4bitrk   )r,   r   	inp_dtyper   r"   s        r.   rK   Linear4bit.forward  s    /5 99 TYY__%?!YY^^..qww7DIIN''!!!$'+D$GG	)T''(Ayy(tdiill4;M;M.NqtAXAXY\\]fggr0   )r   r   rk   ro   r"   )rO   rP   rQ   rR   rS   r'   r   r&   r   r   r   rK   rW   rX   rY   s   @r.   rq   rq     sK    H  kk#+JU(	Uh h hr0   rq   c                   N   ^  \ rS rSrSrSSS\R                  S4U 4S jjrSrU =r	$ )	LinearFP4i  z
Implements the FP4 data type.
TNc           
      2   > [         TU ]  UUUUUSUU5        g)  
Args:
    input_features (`str`):
        Number of input features of the linear layer.
    output_features (`str`):
        Number of output features of the linear layer.
    bias (`bool`, defaults to `True`):
        Whether the linear class uses the bias term as well.
ri   Nr%   r&   	r,   r   r   r   r   rm   ro   r	   r-   s	           r.   r&   LinearFP4.__init__  *    & 			
r0   re   
rO   rP   rQ   rR   rS   r'   r   r&   rW   rX   rY   s   @r.   r  r    s'      kk
 
r0   r  c                   N   ^  \ rS rSrSrSSS\R                  S4U 4S jjrSrU =r	$ )	LinearNF4i;  a
  Implements the NF4 data type.

Constructs a quantization data type where each bin has equal area under a standard normal distribution N(0, 1) that
is normalized into the range [-1, 1].

For more information read the paper: QLoRA: Efficient Finetuning of Quantized LLMs (https://arxiv.org/abs/2305.14314)

Implementation of the NF4 data type in bitsandbytes can be found in the `create_normal_map` function in
the `functional.py` file: https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L236.
TNc           
      2   > [         TU ]  UUUUUSUU5        g)r  nf4Nr  r  s	           r.   r&   LinearNF4.__init__G  r  r0   re   r  rY   s   @r.   r  r  ;  s'    	  kk
 
r0   r  c                     ^  \ rS rSr     SS\\R                     S\\R                     S\\R                     4S jjrU 4S jrS r	SS\\
\\\4      S	\4S
 jjrSS\\
\\\4      S	\4S jjrS r\   SS\S\\
\\4      S\\
\\4      S	\S\4
S jj5       r\SS\S\
\\4   S	\S\4S jj5       r\SS\S\S	\S\4S jj5       rU 4S jrSrU =r$ )
Int8Paramsif  rj   CBSCBc                     Uc  [         R                  " S5      n[         R                  R                  XU5      nXFl        XVl        X6l        U$ r<   )r'   ru   r   rv   r  r  has_fp16_weights)rw   rj   rx   r  r  r  objs          r.   ry   Int8Params.__new__g  sA     <;;q>Dll))#]C/
r0   c                   > U R                   (       a  [        TU ]	  U5      $ U R                  R	                  5       R                  U[
        R                  S9n[        R                  R                  U5      u  p4nX0l        X0l
        X@l        U $ )Nr	   r
   )r  r%   rG   rj   r   r'   r   r   r   int8_vectorwise_quantr  r  )r,   r	   Br  r  _r-   s         r.   r   Int8Params._quantizew  so      7:f%% II  "%%V5==%I^^99!<
	r0   c                      U R                  SS9$ r   r   r7   s    r.   r   Int8Params.cpu  r   r0   r	   r   c                 2    U R                  Uc  SUS9$ UUS9$ r   r   r   s      r.   r   Int8Params.cuda  r   r0   c                 2    U R                  Uc  SUS9$ UUS9$ r   r   r   s      r.   r   Int8Params.xpu  r   r0   c                 4   [        U 5      R                  [        U 5      [        R                  " U R                  U5      U R
                  U R                  [        R                  " U R                  U5      [        R                  " U R                  U5      S9nU$ )N)rj   rx   r  r  r  )	r   ry   r}   r   rj   rx   r  r  r  )r,   r   r   s      r.   r   Int8Params.__deepcopy__  sr    Dz))Jtyy$/,,!22}}TWWd+dhh- * 
 r0   r,   r
   r   c                     g r2   re   r   s       r.   rG   Int8Params.to  r   r0   c                     g r2   re   r   s      r.   rG   r1    r   r0   r   c                     g r2   re   r   s      r.   rG   r1    r   r0   c                 |  > [         R                  R                  R                  " U0 UD6u  p4pVU R                  R
                  [         R                  :H  nU(       dH  UbE  UR                  S:w  a5  U R                  R                  R                  S:X  a  U R                  U5      $ [        [        T	U ]1  X4US9U R                  U R                  S9nU(       ae  UR                  Ul        UbQ  U R                   bD  U R                   R                  R                  S:w  a   U R                   R                  U5      Ul        U$ )Nr   r   r   )rx   r  )r'   r   r   r   rj   r
   int8r   r	   r   r  r%   rG   rx   r  r  r  )
r,   r   r   r	   r
   r   r   is_quantizedr   r-   s
            r.   rG   r1    s    9>9O9OQU9`Y_9`6|yy%**4 2v{{f7LQUQZQZQaQaQfQfjoQo >>&)) GJfJM,,!22
	 $>>IL!dhh&:txx?S?SW]?] $F 3	r0   )r  r  rj   )NTFNNr   r   r   )rO   rP   rQ   rR   r   r'   r   ry   r   r   r   rT   r	   r   rV   r   r   r   r   r   r
   rG   rW   rX   rY   s   @r.   r  r  f  sq    (,%)&*u||$
 U\\" ell# %_8E#vs*:$;< _SW _^(5fc)9#:; ^RV ^
  03-0 	sF{+, eSj)* 	
 
  QQ5,QDQ1Q QGGFG$GG G r0   r  c                    U R                  U S35      nUc  g U R                  U S3S5      n[        U[        R                  5      (       a  UR                  5       n[        U[        5      (       a  U[        ;  a  [        SU 35      e[        U[        5      (       a  U[        ;   a	  [        U   nUS:w  a  [        SU 35      eg )Nr"   weight_formatrowz'Expected supported weight format - got z+Only 'row' weight format is supported, got )	getpopr   r'   r   itemrT   r   
ValueError)	
state_dictr  local_metadatastrictmissing_keysunexpected_keys
error_msgsr"   r8  s	            r.   maybe_rearrange_weightrD    s    ^^vhf-.F~NNfX]#;UCM-..%**, -%%-?i*iB=/RSS	M3	'	'M=g,gB=QF}oVWW r0   c                   H   ^  \ rS rSrSrS	U 4S jjrS rS\S\4S jrSr	U =r
$ )
Embedding8biti  a  
This class implements [LLM.int8()](https://arxiv.org/abs/2208.07339) algorithm for embedding layer

Quantization API is similar to Linear8bitLt:
```python
import torch
import torch.nn as nn

from bitsandbytes.nn import Embedding8bit

fp16_module = nn.Embedding(128, 64)
int8_module = Embedding8bit(128, 64)

int8_module.load_state_dict(fp16_module.state_dict())

int8_module = int8_module.to(0) # Quantization happens here
```
c                    > [         TU ]  XX4S9  U R                  R                  R                  U l        [        U R                  R                  SSS9U l        g )Nr#  Fr  rx   )r%   r&   r"   rj   r
   r  )r,   r   r   r	   r
   r-   s        r.   r&   Embedding8bit.__init__  sG    vS[[%%++
 !1!1EY^_r0   c                     [        S5      e)Nz.Saving Embedding8bit module is not implementedNotImplementedErrorr,   r  r  r  s       r.   r   !Embedding8bit._save_to_state_dict      !"RSSr0   rC   r   c                     [        U R                  S5      (       d  [        S5      eU R                  R                  nU R                  R                  nUR
                  U R                  U R                  4:X  d   eUR
                  U R                  4:X  d   e[        R                  " X5      n[        R                  " XR                  U R                  S5      5      nXES-  -  nUR                  U R                  5      $ )Nr  zKEmbedding layer is not quantized. Please call .cuda() or .to(device) first.r   g     _@)hasattrr"   RuntimeErrorrj   r  r   r   r   rE   rF   viewrG   r
   )r,   rC   rows	row_statscompressed_outputcompressed_output_statsoutputs          r.   rK   Embedding8bit.forward  s    t{{E**lmm{{KKOO	zzd1143E3EFFFF4#6#6"8888KK4"#++e^^DDWDWYZ5["\"&EFyy$$r0   r
   r"   )NN)rO   rP   rQ   rR   rS   r&   r   r   rK   rW   rX   rY   s   @r.   rF  rF    s,    &`T%V % % %r0   rF  c                   r   ^  \ rS rSrSrSS\R                  S4U 4S jjrS\4S jr	S r
S\S	\4S
 jrSrU =r$ )r   i  a  
This is the base class similar to Linear4bit. It implements the 4-bit quantization algorithm presented in
[QLoRA](https://arxiv.org/abs/2305.14314) for embeddings.

Quantization API is similar to Linear4bit:
```python
import torch
import torch.nn as nn

from bitsandbytes.nn import Embedding4bit

fp16_module = nn.Embedding(128, 64)
quantized_module = Embedding4bit(128, 64)

quantized_module.load_state_dict(fp16_module.state_dict())

quantized_module = quantized_module.to(0) # Quantization happens here
```
Nri   c           	      2  > [         TU ]  XXcS9  U R                  R                  R                  U l        [        U R                  R                  SS UUU S9U l        U R                  R                  nX'-  S:w  a  [        R                  " SU SU S35        g g )Nr#  Fr   r   zEmbedding size z  is not divisible by block size z#. This will lead to slow inference.)	r%   r&   r"   rj   r
   rg   rl   r   r   )	r,   r   r   r
   rn   ro   r	   rl   r-   s	           r.   r&   Embedding4bit.__init__  s     	vS[[%%++
 KK $!'
 KK))	$)MM!-0PQZP[ \4 4 *r0   rC   c                    U R                   U R                  R                  R                  -  S:X  d   eU R                  R                  R                  [        R                  5      R                  U R                  U R                   -  S-  S5      n[        R                  R                  R                  UR                  U R                  U R                   S-  5      US9R                  SS5      nUR                  UR                  5       U R                   -  S-  S4:X  d   eU R                   U R                  R                  -  nU R                  R                  R                  nUR                  U R                  U-  4:X  d   e[        R                  R                  R                  UR                  U R                  U5      US9R                  S5      nUR                  UR                  5       U-  4:X  d   e[        R                   " U R                  R                  5      nXgl        [        R"                  " / UR                  QU R                   P75      Ul        [$        R                  R'                  X75      nUR                  / UR                  QU R                   P7:X  d   eUR)                  U R*                  5      $ )Nr      r   r"   rC   r   )r   r"   rk   rl   rj   rS  r'   r   r   r   r   rF   r   r   absmaxr}   r   Sizer   dequantize_4bitrG   r
   )	r,   rC   w_4bit_uint8output_4bitblocks_per_embra  output_absmaxoutput_quant_staterX  s	            r.    _forward_with_partial_dequantize.Embedding4bit._forward_with_partial_dequantize8  sC   !!DKK$;$;$E$EEJJJ{{'',,U[[9>>t?R?RUYUgUg?gkl?lnophh))33$$T%8%8$:L:LPQ:QR 4 
 $r1+ 	   U[[]T5G5G%G1%La$PPPP++t{{/D/DD((//|| 3 3n DFFFF++55;;t22NC 6 
 $
 	 ""u{{}~'E&GGGG!]]4;;+B+BC$1!#(::.P.PT=O=O.P#Q //P||AAd.@.@AAAAyy$$r0   c                     [        S5      e)Nz.Saving Embedding4bit module is not implementedrK  rM  s       r.   r   !Embedding4bit._save_to_state_dictY  rO  r0   r   c                    [        U 5        U R                  U R                  R                  R                  -  S:X  a  U R                  U5      $ [        R                  R                  U R                  R                  U R                  R                  5      n[        R                  R                  R                  UUS9R                  U R                  5      $ )Nr   r`  )r   r   r"   rk   rl   ri  r   r   rc  rj   r'   r   rF   rG   r
   )r,   rC   dequantized_weights      r.   rK   Embedding4bit.forward\  s    /5 7 7 A AAQF88?? ^^;;DKK<L<LdkkNeNefxx"",,% - 
 "TZZ.	r0   rZ  )rO   rP   rQ   rR   rS   r'   r   r&   r   ri  r   rK   rW   rX   rY   s   @r.   r   r     sJ    0 kk:%f %BTV   r0   r   c                   F   ^  \ rS rSrS\R
                  S4U 4S jjrSrU =r$ )EmbeddingFP4ij  Nc           	      *   > [         TU ]  UUUSUUS9  g )Nri   r
   rn   ro   r	   r  r,   r   r   r
   ro   r	   r-   s         r.   r&   EmbeddingFP4.__init__k  )     	' 	 	
r0   re   	rO   rP   rQ   rR   r'   r   r&   rW   rX   rY   s   @r.   rq  rq  j      
 kk
 
r0   rq  c                   F   ^  \ rS rSrS\R
                  S4U 4S jjrSrU =r$ )EmbeddingNF4i}  Nc           	      *   > [         TU ]  UUUSUUS9  g )Nr  rs  r  rt  s         r.   r&   EmbeddingNF4.__init__~  rv  r0   re   rw  rY   s   @r.   rz  rz  }  rx  r0   rz  c                      ^  \ rS rSrSr     SS\S\4U 4S jjjrU 4S jrU 4S jrS r	U 4S	 jr
S
\R                  4S jrSrU =r$ )Linear8bitLti  a  
This class is the base module for the [LLM.int8()](https://arxiv.org/abs/2208.07339) algorithm.
To read more about it, have a look at the paper.

In order to quantize a linear layer one should first load the original fp16 / bf16 weights into
the Linear8bitLt module, then call `int8_module.to("cuda")` to quantize the fp16 weights.

Example:

```python
import torch
import torch.nn as nn

import bitsandbytes as bnb
from bnb.nn import Linear8bitLt

fp16_model = nn.Sequential(
    nn.Linear(64, 64),
    nn.Linear(64, 64)
)

int8_model = nn.Sequential(
    Linear8bitLt(64, 64, has_fp16_weights=False),
    Linear8bitLt(64, 64, has_fp16_weights=False)
)

int8_model.load_state_dict(fp16_model.state_dict())
int8_model = int8_model.to(0) # Quantization happens here
```
r   r   c                 R  > [         TU ]  XX75        [        R                  " 5       U l        X`l        XPR                  l        X@R                  l        US:  a  U(       d  SU R                  l        [        U R                  R                  XDS9U l
        U R                  [        5        g)a1  
Initialize Linear8bitLt class.

Args:
    input_features (`int`):
        Number of input features of the linear layer.
    output_features (`int`):
        Number of output features of the linear layer.
    bias (`bool`, defaults to `True`):
        Whether the linear class uses the bias term as well.
        TrH  N)r%   r&   r   MatmulLtStater   index	thresholdr  use_poolr  r"   rj   "_register_load_state_dict_pre_hookrD  )	r,   r   r   r   r  r  r  r	   r-   s	           r.   r&   Linear8bitLt.__init__  s{    * 	$G&&(

(

&6

#s?#3"&DJJ !1!1DTu//0FGr0   c                   > [         T	U ]  XU5        Sn[        U R                  U5      n[        U R                  U5      nX$ -   nUS-   nU R                  R
                  (       d  UbA  U(       a  UOUR                  5       X'   [        R                  " S[        R                  S9X'   g UbA  U(       a  UOUR                  5       X'   [        R                  " S[        R                  S9X'   g g g )Nr  r8  r   )r
   )
r%   r   r   r"   r   r  r   r'   r   r   )
r,   r  r  r  scb_nameparam_from_weightparam_from_statekey_nameformat_namer-   s
            r.   r    Linear8bitLt._save_to_state_dict  s    #KC  $DKK:"4::x8j) .zz** ,=F(9L]LdLdLf%+0<<+M(!-<E(8K[KbKbKd%+0<<+M( .	 +r0   c           	        > [         TU ]  UUUUUUU5        [        U5      nU H  n	U	[        U5      S  n
U
S:X  d  M  U R                  R
                  c  [        S5      eX   nU R                  R
                  R                  U5        U R                  R
                  b%  U R                  R
                  U R                  l        UR                  U	5        M     g )Nr  zLoading a quantized checkpoint into non-quantized Linear8bitLt is not supported. Please call module.cuda() before module.load_state_dict())
r%   _load_from_state_dictlistlenr"   r  rR  copy_r   remove)r,   r>  r  r?  r@  rA  rB  rC  unexpected_copykey
input_nameinput_paramr-   s               r.   r  "Linear8bitLt._load_from_state_dict  s     	%	
 /"CS[]+JU";;??*&c 
 )o%%k2::>>-%)[[__DJJN&&s+! #r0   c                     U R                   R                  U R                  l        U R                   R                  U R                  l        S U R                   l        S U R                   l        g r2   r"   r  r   r  r7   s    r.   init_8bit_stateLinear8bitLt.init_8bit_state  >    



r0   c                   > [         TU ]  " U0 UD6n[        R                  R                  R
                  " U0 UD6u  pEpgUb  UR                  R                  b4  UR                  R                  R                  U5      UR                  l        UR                  R                  b4  UR                  R                  R                  U5      UR                  l        U$ r2   )	r%   rG   r'   r   r   r   r   r  r  )	r,   r   r   r   r	   r
   r   r   r-   s	           r.   rG   Linear8bitLt.to  s    T,V,9>9O9OQU9`Y_9`6| ||*"(,,//"4"4V"<||+#)<<#3#3#6#6v#> r0   r   c                 ~   U R                   U R                  l        U R                  R                  b  U R                  5         U R                  bb  U R                  R                  UR                  :w  a>  U R                  R                  R                  UR                  5      U R                  l        [        R                  " XR                  U R                  U R                  S9nU R                  R                  (       d<  U R                  R                  b%  U R                  R                  U R                  l        U$ N)r   r   )trainingr   is_trainingr"   r  r  r   r
   rj   rG   r   matmulr  )r,   r   outs      r.   rK   Linear8bitLt.forward&  s    !%

;;>>%  " 99 TYY__%?!YY^^..qww7DIINjjKKdiitzzJzz**tzz}}/H#zz}}DKK
r0   r  r   r"   )TTr  NN)rO   rP   rQ   rR   rS   rT   r&   r   r  r  rG   r'   r   rK   rW   rX   rY   s   @r.   r~  r~    sf    F  H H  H  HDN0%,N  r0   r~  c                   >   ^  \ rS rSrSU 4S jjrS rS rS rSrU =r	$ )OutlierAwareLineari7  c                 B   > [         TU ]  XX45        S U l        SU l        g r   )r%   r&   outlier_dimr6  )r,   r   r   r   r	   r-   s        r.   r&   OutlierAwareLinear.__init__8  s"    $G!r0   c                     [        S5      e)NzJPlease override the `forward_with_outliers(self, x, outlier_idx)` functionrK  )r,   r   outlier_idxs      r.   forward_with_outliers(OutlierAwareLinear.forward_with_outliers=  s    !"noor0   c                     [        S5      e)NzEPlease override the `quantize_weights(self, w, outlier_idx)` functionrK  )r,   r   r  s      r.   quantize_weight"OutlierAwareLinear.quantize_weight@  s    !"ijjr0   c                    U R                   cV  [        R                  " 5       nUR                  5       (       d  [	        S5        UR                  U R                  5      nX0l         U R                  (       dS  U R                  U R                  U R                   5      nU R                  R                  R                  U5        SU l        g g )NzTPlease use OutlierTracer.initialize(model) before using the OutlierAwareLinear layerT)r  r   r*   is_initializedprintget_outliersr"   r6  r  rj   r  )r,   r   tracerr  r   s        r.   rK   OutlierAwareLinear.forwardC  s    #"//1F((**lm --dkk:K*  $$T[[$2B2BCAKK""1% $D !r0   )r6  r  )TN)
rO   rP   rQ   rR   r&   r  r  rK   rW   rX   rY   s   @r.   r  r  7  s    "
pk% %r0   r  c                   D   ^  \ rS rSr      SU 4S jjrS rS rSrU =r$ )SwitchBackLinearBnbiR  c	                 H  > [         T	U ]  XX85        [        R                  " 5       U l        Xpl        X`R                  l        X@R                  l        XPR                  l        US:  a  U(       d  SU R                  l	        [        U R                  R                  XDS9U l        g )Nr  TrH  )r%   r&   r   r  r   r  r  r  memory_efficient_backwardr  r  r"   rj   )
r,   r   r   r   r  r  r  r  r	   r-   s
            r.   r&   SwitchBackLinearBnb.__init__S  sv     	$G&&(

(

&6

#/H

,s?#3"&DJJ !1!1DTur0   c                     U R                   R                  U R                  l        U R                   R                  U R                  l        S U R                   l        S U R                   l        g r2   r  r7   s    r.   r  #SwitchBackLinearBnb.init_8bit_statej  r  r0   c                 .   U R                   U R                  l        U R                  R                  b  U R                  5         [        R                  " UR                  5       U R                  R                  5       S U R                  S9U R                  -   $ r  )
r  r   r  r"   r  r  r   matmul_mixedhalfr   r   s     r.   rK   SwitchBackLinearBnb.forwardp  sf    !%

;;>>%  "$++*:*:*<4tzzZ]a]f]fffr0   r  )TTFr  NN)	rO   rP   rQ   rR   r&   r  rK   rW   rX   rY   s   @r.   r  r  R  s.    
 "'v.g gr0   r  ).r}   typingr   r   r   r   r   r   r'   r   r	   r
   r   torch.nn.functionalr   rE   bitsandbytesr   bitsandbytes.cextensionr   bitsandbytes.functionalr   bitsandbytes.optimr   bitsandbytes.utilsr   r   r   r[   r   	Parameterrg   r   Linearrq   r  r  r  rD  rF  r   rq  rz  r~  r  r  re   r0   r.   <module>r     s\  
  : :   + +    3 . 1 XC()g4ehh(( g4TL"" L^zE## zEz3E:W4X 3"wh wht!

 !
H(

 (
VY## YxX*,%BLL ,%^aBLL aH
= 
&
= 
&d299 dN% %6$g")) $gr0   