
    o9iYW              	          d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	 d dlm
Z
 d dlZd dlmZ d Zej                   Z	  G d d      Zd	ed
eeef   fdZdej                   dej,                  dej                   fdZ G d dej0                  j2                        Zej6                  Zej6                  Zej6                  Zdej>                  de fdZ!d Z"d Z#e G d d             Z$ G d dej0                  j2                        Z% G d dej0                  j2                        Z&	 	 	 	 d#dededede$fd Z'd$deded!e	defd"Z(y)%    N)	dataclass)reduce)TupleOptionalList)warnc                 8    t        t        j                  | d      S )N   )r   operatormul)iterables    j/var/www/html/backtest/airagagent/rag_env/lib/python3.12/site-packages/bitsandbytes/autograd/_functions.pyprodr      s    (,,!,,    c                   8    e Zd ZdZd Zd Zed        Zd Zd Z	y)GlobalOutlierPoolerNc                     t        d      )NzCall get_instance() instead)RuntimeErrorselfs    r   __init__zGlobalOutlierPooler.__init__!   s    899r   c                 0    t               | _        d | _        y N)setoutliers	model_dimr   s    r   
initializezGlobalOutlierPooler.initialize$   s    r   c                     | j                   0| j                  |       | _         | j                   j                          | j                   S r   )	_instance__new__r   )clss    r   get_instancez GlobalOutlierPooler.get_instance(   s6    == KK,CMMM$$&}}r   c                     | j                   || _         || j                   k7  ry | j                  j                  |j                                y r   )r   r   updatetolist)r   outlier_idxfeature_dims      r   add_outliersz GlobalOutlierPooler.add_outliers/   s=    >>!(DN$..([//12r   c                     t        j                  t        | j                              j	                  t         j
                        S r   )torchTensorlistr   toint64r   s    r   get_current_outlier_idxz+GlobalOutlierPooler.get_current_outlier_idx7   s)    ||D/033EKK@@r   )
__name__
__module____qualname__r   r   r   classmethodr"   r(   r/    r   r   r   r      s0    I:  3Ar   r   transform_tile	tile_sizec                 j   |\  }}d||z  cxk  rdk  sJ  J t        j                  ||z  t         j                        j                  ||      }t        j                  |      }t        d      D ]  }t        j                  |d|z  d      dz  }|dz
  j                  t         j                        j                         }t        j                  |j                         dz   |k(        sJ d	        | |      }	|	j                  |j                        dz   }
||
d|z  z  z  }||z  d|z  k  s |S  |S )
a  
    Compute a permutation of indices that invert the specified (tiled) matrix transformation

    :param transform_tile: a function that applies forward transform to a tensor of shape [dim1, dim2]
    :param tile_size: higher-level tile dimensions, i.e. (8, 32) for Turing and (32, 32) for Ampere
    :note: we assume that tile_transform applies to a cpu-based int8 tensor of shape tile_size
    :example: transform_tile function for the turing layout (bitsandbytes.functional as F)
    :returns: indices
    r   l            dtype      trunc)rounding_mode   zint overflow)r*   aranger.   view
zeros_likerangedivr-   int8
contiguousallintr9   )r5   r6   d1d2tile_indicespermuted_tile_indicesiith_dim_indicessample_tile_ipermuted_tile_iith_permuted_indicess              r   get_inverse_transform_indicesrQ   ;   s7    FBrBw<<Ru{{;@@RHL!,,\:1X 	))L#q&PSVV(3.225::>IIKyy**,s2oEFVVF(7.11,2D2DEK!5a!@@7S!V  	 ! r   permuted_tensorrJ   returnc                    | j                   |j                   c\  }}\  }}||z  ||z  cxk(  rdk(  sJ d        J d       | j                  d|j                               j                         }t	        j
                  |      }|||j                         <   |j                  ||||z  ||z        }|j                  dddd      }|j                  ||      j                         S )a  
    Undo a tiled permutation such as turing or ampere layout

    :param permuted_tensor: torch tensor in a permuted layout
    :param tile_indices: reverse transformation indices, from get_inverse_transform_indices
    :return: contiguous row-major tensor
    r   z+tensor must contain a whole number of tiles      r
   )	shapereshapenumeltr*   
empty_likeflattenpermuterE   )rR   rJ   rowscols	tile_rows	tile_colstensoroutputss           r   undo_layoutre   V   s     ,;+@+@,BTBT(LT4(9i)ti/414c6cc4c6cc4$$R););)=>@@BFv&G&,GL  "#ooiDI4EtyGXYGooaAq)G??4&1133r   c                   .    e Zd Zedd       Zed        Zy)
MatMul8bitNc                 *   |g d}|d   dk7  r4t        j                         5  t        j                  ||      }d d d        nt        |j                        dk(  rd}nd}t        j                  |d|      \  }}	t        j                  |||      \  }
}t        j                  ||
      }t        j                  ||	||j                  |      }|j                  s|j                  r| j                  ||       || _        || _        S # 1 sw Y   CxY w)N)r:   r:   r:   r   r:   rW   r
   rU   dim
quant_type)r*   no_gradmatmullenrX   Fvectorwise_quantigemmvectorwise_mm_dequantr9   requires_gradsave_for_backwardrk   	precision)ctxABoutrk   ru   outputrj   qASAqBSBiouts                r   forwardzMatMul8bit.forwardi   s    !IQ<1 ,a+, , 177|q ''rjIFB''szJFB772r?D,,T2r177JOF??aoo!!!Q'#!%, ,s   D		Dc                    | j                   \  }}| j                  }| j                  }d x}}|j                  rIt	        |j
                        dk(  r	ddg}g d}	ndg}ddg}	|d   dk7  rDt        j                         5  t        j                  |j                  |	      |      }d d d        nt	        |j
                        dk(  r/t	        |j
                        dk(  r|j                         }|j                         s|j                          t        j                  |j                  d|j
                  d         d|      \  }
}|j                         s|j                         }t        j                  |j                  d|j
                  d         d|      \  }}t        j                  |j!                         |
      }t        j"                  ||j!                         ||j$                  |      }nt        j                  |||      \  }
}t        j                  |||      \  }}t        j                  |j                  |	      |
      }t        j"                  ||j                  |	      ||j$                  |      }|j                  rt	        |j
                        dk(  rdg}ndg}t	        |j
                        dk(  rg d}	|}nddg}	dg}|d   dk7  rCt        j                         5  t        j                  ||j                  |	            }d d d        nt        j                  |||      \  }
}t        j                  |||      \  }}t        j                  |
|j                  |	            }t        j"                  |||j                  |	      |j$                  |      }||d d d fS # 1 sw Y   ;xY w# 1 sw Y   xY w)	NrV   r   r
   )r   rW   r
   r:   rW   rU   ri   )saved_tensorsrk   ru   rs   rn   rX   r*   rl   rm   r^   rE   is_contiguousro   rp   r@   rq   r[   rr   r9   )rv   grad_outputrw   rx   rk   ru   grad_Agrad_Bdimspermute_dimqgrad_outputS1r{   S2igrad_Bdim_Br}   S3igrad_As                      r   backwardzMatMul8bit.backward   sk     1^^
MM	??177|q 1v's !f|q ]]_ O"\\!))K*@+NFO O qww<1$QWW):"-"8"8":K&446#..0'('9'9#(([->->q-AB#-($L"
 ??,LLN//r1771:.A*FB  ggbddfl;G44[->->
F ()'9'9#*($L" //t
FB  ggbjj&=|LG44

;/#))"F ??;$$%*ss177|q '  !!f|q ]]_ O"\\+qyy7MNFO O $%#5#5Tj$ b ++A5ZPB'',

;0GH00JJ{+%% vtT4//KO OjO Os   =&N30&O 3N= O	)NvectorNr0   r1   r2   staticmethodr   r   r4   r   r   rg   rg   h   s*     0 V0 V0r   rg   devicec                     t         j                  j                  |       dk  ryt         j                  j                  |       d}t	        fd|D              ryy)z7check if this device supports the optimized int8 kernel)r   )      F)zGTX 1630zGTX 1650zGTX 1660c              3   &   K   | ]  }|v  
 y wr   r4   ).0
model_namedevice_names     r   	<genexpr>z#supports_igemmlt.<locals>.<genexpr>   s     
G:$
Gs   T)r*   cudaget_device_capabilityget_device_nameany)r   nvidia16_modelsr   s     @r   supports_igemmltr      sP    zz''v'6?**,,F,;K:O

G
GGr   c                 0    | dv s
J d|         | dk(  rdS dS )N)
col_turing
col_amperez9please find this assert and manually enter tile size for r   )r:       )r   r   r4   )formats    r   _get_tile_sizer      sA       L 
C6(KL  ,7:(:r   c                       fd}t        j                         5  t        |t                     j	                        cd d d        S # 1 sw Y   y xY w)Nc                     t        j                  | j                        d      d   j                  | j                        S )Nrow)
from_orderto_orderr   )ro   	transformr-   r   )xr   r   s    r   <lambda>zget_tile_inds.<locals>.<lambda>   s4    !++add6luvVWXY\\]^]e]ef r   )r*   rl   rQ   r   r-   )r   r   r   s   `` r   get_tile_indsr      sE    fI	 [,Yv8NORRSYZ[ [ [s   $AAc                       e Zd ZU dZeej                     ed<   dZe	ed<   dZ
dZdZdZdZdZdZdZdZdZdZdZdZdZdZdZ ej6                         Zd Zed        Zy)	MatmulLtStateN_tile_indicesFforce_no_igemmlt        Tc                 f    d | _         d | _        d | _        d | _        d | _        d | _        d | _        y r   )CBCxBr~   SCBCxBtSBtCBtr   s    r   reset_gradszMatmulLtState.reset_grads  s3    	r   c                     | j                   /t        | j                  | j                  j                        | _         | j                   S r   )r   r   formatBr   r   r   s    r   rJ   zMatmulLtState.tile_indices  s5    %!.t||TXX__!MD!!!r   ) r0   r1   r2   r   r   r*   r+   __annotations__r   boolr   r   r~   r   r   r   r   subBoutlier_poolhas_accumulated_gradients	thresholdidxis_traininghas_fp16_weightsmemory_efficient_backwarduse_poolro   get_special_format_strr   r   propertyrJ   r4   r   r   r   r      s    ,0M8ELL)0"d"	B
C	B
CD
C
CDL %I
CK %H&a&&(G " "r   r   c                   4    e Zd Zeddefd       Zed        Zy)MatMul8bitLtNc                    t        |j                        xr |j                   }d| _        t	        |j
                        dk(  rd| _        || _        || _        || _        |j
                  d   |j
                  d   k(  rIt        j                  |j
                  d d |j
                  dd  z   |j                  |j                        S t        j                  |j
                  d d |j
                  d d z   |j                  |j                        S |j                  }|j
                  }|j                  t        j                         |_        |j                  t        j                   k7  r#t#        j$                  d|j                   d       t'        |j
                        d	k(  r|j)                  d|j
                  d         }t+        j,                  |j/                  t        j                         |j0                  
      \  }	}
}}}|j0                  dkD  r||j2                  rzt        j4                  |j6                        j9                         }d|	d d |f<   d|
d d |f<   |d d |f   }|d d |f   j;                         j=                         |_        ||_         n|jB                  {|ryt+        jD                  |jF                  |      \  |_!        |_$        nJ|j2                  s<|jB                  0|r.t+        jD                  |jF                  |      \  |_!        |_$        d }|j2                  rtK        |dd       dnd}|jM                          xr! |j
                  d   |jO                  d      k(  }|r|j=                         }|jP                  r|r|jB                  |jS                          t+        j,                  |j/                  t        j                               \  }|_*        |_+        |_,        }|r%t+        jD                  ||      \  |_!        |_$        n
||_#        nd}|{|j2                  snt        j4                  |j6                        }||_         |jB                  Dt+        jZ                  |jB                  |jH                  |j@                  j]                               }n9|jF                  d d |j@                  j9                         f   j_                         }||jV                  ja                  dd      z  dz  j;                         j=                         j/                  |j                        |_        d|	d d |j@                  j9                         f<   d|
d d |j@                  j9                         f<   |d d |j@                  j9                         f   }|jH                  r|jH                  d   n|j
                  }t'        |      d	k(  r|d   |d   |d   f}n
|d   |d   f}|rt+        jD                  |	d      \  }}t+        jb                  ||jB                  ||jH                        \  }}||j                  t        j                   k(  rAt+        jd                  ||||jV                  |      }|j/                  |j                        }n%t+        jd                  ||||jV                  d       }|j/                  |j                        jg                  |      }n|j_                         }|j@                  !d|d d |j@                  j9                         f<   t        jh                  jj                  jm                  ||jF                  j/                  |j                              }|jo                  |jV                  jq                  d      js                  d            }||jg                  |      }|%#|t        jt                  ||j>                        z  }|| _;        || _        || _<        |j                  |j                  |d n|j                  c| _=        | _>        | _?        t        | j                  d d       r|
|f| _B        ||j@                  f| _C        n#d d |g| _B        d| _C        | j                  d d        t'        |      d	k(  rt        j^                  nd } ||ja                  |            S )NFr   TrU   r
   r9   r   z'MatMul8bitLt: inputs will be cast from z to float16 during quantizationrV   )r   r   )r   gradg     _@col32)bias@ ?rW   NNc                     | S r   r4   )r   s    r   r   z&MatMul8bitLt.forward.<locals>.<lambda>  s    ! r   )Er   r   r   is_emptyr   rX   rw   rx   r   r*   emptyr9   r   r   r   r"   float16warningsr   rn   rY   ro   double_quantr-   r   r   uniquecolidxlongr[   rE   r   r   r   r   r   r~   getattrr   strider   r   r   r   SCBtextract_outliersrG   cloner@   igemmlt
mm_dequantadd_nn
functionallinearmul_	unsqueezer   rm   state
grad_shapedtype_Adtype_B
dtype_biasr   needs_input_gradtensorstensor_statesrt   )rv   rw   rx   ry   r   r   using_igemmltr   input_shapeCACAtSCASCAtcoo_tensorAr   subAhas_gradis_transposedr   coo_tensorBr&   r   shapeBoutput_shapeC32Ar|   out32Sout32rz   A_wo_outliers
clone_funcs                                  r   r   zMatMul8bitLt.forward'  s   (2Q5;Q;Q7Q=ACLCECECHwwr{aggaj({{1773B<!''!"+#=QWWUVU]U]^^{{1773B<!''"1+#=QWWUVU]U]^^ --gg%!4!A!A!CE 77emm#MMCAGG9Lklm qww<1		"aggbk*A*+..emm9LX]XgXg*h'CdK??S [%<%%ll;#5#56;;=1c6
AsFCyq#vY[[]557
	99$ +,++ehh*Q'EIux))eii.?M&'kk%((W&M#	58D !! '64 8 Dt5H ! 11OaggajAHHQK6OMLLN!!(uyy7H!!# NN144#67IIJ *+++b7*K'EIux!EHH"5+A+A  ,,{'9'9:K#EI yy$--eii599==?S 88Auyy~~'7$78>>@"UYY^^B%::UBEEGRRTWWXYX_X_`EJ&'Bq%))..""#'(C599>>##$Q		(()D %!agg{q 'NKNF1IFL'NF1I6L {{2w/HD"IIdEIIr588DME6|tzzU]]:eVS%))$O177+eVS%))$O177+006 GGIMyy$56a!112XX((//uxx{{177?STF[[!4!4Q!7!;!;K!HIFT* "t'7ell444F 	$3477AGGT\T_c_i_i0S[#.s##BQ'(a.CK!%uyy 1Cq/CK ,C!!$-$'$5$:U[[
&++l344r   c                 	   | j                   rn| j                  d nt        j                  | j                        }t        j                  | j                        t        j                  | j
                        d |d fS | j                  \  }}}}}| j                  \  }}}	| j                  \  }
}| j                  }| j                  }d x}x}}|r|j                  d| j                        }t        |j                        dk(  r-|j                  d|j                  d         j!                         }t#        j$                  |j'                  t        j(                              \  }}}}}|rt#        j*                  ||d      \  }}t#        j*                  |dd      \  }}t#        j,                  ||||      \  }}t#        j.                  ||||
      }|j0                  dkD  r5|3|d d |fxx   t        j2                  |j5                         |      z  cc<   |rh|j6                  t#        j*                  |d      \  }}|j8                  /t#        j*                  |j6                  |d	      \  |_        |_        t#        j,                  ||j8                  ||j:                        \  }}t#        j.                  ||||j<                        j?                  | j@                        j'                  | jB                        }n|jD                  |jD                  j'                  | jB                  d
      jG                  |jH                  jK                  d      jM                  d            }t        j2                  ||      j?                  | j@                        j'                  | jB                        }n|jN                  tQ        |jN                  |jR                        j'                  | jB                        jG                  |jH                  jK                  d      jM                  d            }t        j2                  ||      j?                  | j@                        j'                  | jB                        }ntU        d      ||d |d fS )Nr   r8   rV   rU   T)	transposer   r   )r   r  )copyr
   r   z>State must contain either CBt or CB or CxB matrix for backward)+r   r   r*   rA   rw   rx   r   r   r   r   r   sumr   rn   rX   rY   rE   ro   r   r-   r   r   r   r   r   rm   r[   r   r   r   r   r@   r   r   r   r   r   r   r   r   re   rJ   	Exception)rv   r   	bias_grad	req_gradA	req_gradB_req_gradBiasr   r   rw   r   r   r   r   r   r   	grad_biasCgradCgradtSCgradSCgradt
coo_tensorCxAtSAtC32gradSgradgradB32SgradB32gradA32SgradA32r   s                                  r   r   zMatMul8bitLt.backward  s|   << # 0e6F6Fsxx6PI##CEE*E,<,<SUU,CT9VZZZ363G3G0	9aq{{T1%%	c++		&***)#@I {  !Q&%--b+2C2CB2GHSSUK56^^KNNSXS`S`Da5b2vvw
CDAID#[[DINGU !		'4 DGX\\'8WdCF$)9q#v%,,{}}"EEyy$!"UG!<::%,-KK		G_c,d)EJ	$%IIguzz5%))$T!gxLQQRUR`R`addehepepq%XX[[4[8==eii>Q>QRS>T>X>XYd>efk26;;CNNKNNs{{[&		5+=+=>R_T%))--a044[AB 
 k26;;CNNKNNs{{[ `aavtY44r   )r0   r1   r2   r   r   r   r   r4   r   r   r   r   #  s4     #$m O5 O5b 15 15r   r   c                   .    e Zd Zedd       Zed        Zy)
MatMul4BitNc                 J   d| _         t        |j                        dk(  rd| _         || _        || _        || _        |d   }|j                  d   |d   k(  r?t        j                  |j                  d d |dd  z   |j                  |j                        S t        j                  |j                  d d |d d z   |j                  |j                        S t        j                  j                  j                  |t        j                  ||      j                  |j                        j!                         |      }|| _        |j                  |j                  |d n|j                  c| _        | _        | _        t+        | j,                  d d       r||f| _        |S d| _        |S )	NFr   Tr
   rU   r   rW   r   )r   r   rX   rw   rx   r   r*   r   r9   r   r   r   r   ro   dequantize_4bitr-   r[   r   r   r   r   r   r   r   )rv   rw   rx   ry   r   r   B_shaperz   s           r   r   zMatMul4Bit.forward  si    =ACLCECECHAhGwwr{gaj({{1773B<'!"+#=QWWUVU]U]^^{{1773B<'"1+#=QWWUVU]U]^^
 $$++Aq/@/@E/J/M/Magg/V/X/X/Z\`a 	3477AGGT\T_c_i_i0S[#.s##BQ'(a&CK  'CKr   c                 b   | j                   rn| j                  d nt        j                  | j                        }t        j                  | j                        t        j                  | j
                        d |d fS | j                  \  }}}}}| j                  \  }}| j                  }d\  }	}
}|r|j                  d| j                        }|r[t        j                  |t        j                  || j                        j                  |j                        j!                               }	|	|
d |d fS )NNNNr   r8   )r   r   r*   rA   rw   rx   r   r   r   r
  r   rm   ro   r"  r-   r9   r[   )rv   r   r  r  r  r  rw   rx   r   r   r   r  s               r   r   zMatMul4Bit.backward  s    << # 0e6F6Fsxx6PI##CEE*E,<,<SUU,CT9VZZZ*-*>*>(	1aq{{1		$4!	#@I u||K9J9J1cii9X9[9[\g\m\m9n9p9p9rsfvtY44r   r%  r   r4   r   r   r   r     s*      < 5 5r   r   rw   rx   ry   r   c                 h    |xs
 t               }|dkD  r||_        t        j                  | ||||      S )Nr   )r   r   r   apply)rw   rx   ry   r   r   r   s         r   rm   rm   (  s7     $]_E3#aCu55r   quant_statec                    |J | j                         | j                  d   k(  r| j                  dk(  r|\  }}}}}	}
}| j                  d   |z  dk7  r4t        d| d| j                          t        j                  | ||||      S t        j                  | |j                         ||      }|||z  }|S t        j                  | ||||      S )NrU   Fr   z4Some matrices hidden dimension is not a multiple of z^ and efficient inference kernels are not supported for these (slow). Matrix input size found: )r   )	rZ   rX   rs   r   r   r'  ro   	gemv_4bitr[   )rw   rx   r(  ry   r   absmaxrX   r9   	blocksizecompressed_statsrk   	data_types               r   matmul_4bitr/  6  s    """wwyAGGBKAOOu$<S^Pui)9:y772;"a'G	{  Sq  rs  ry  ry  qz  {  |##Aq#t[AA++a;?CtJ1c4==r   )NNr   Nr   ))r   r   dataclassesr   	functoolsr   typingr   r   r   r   r*   bitsandbytes.functionalr   ro   r   r+   rc   r   callablerG   rQ   
LongTensorre   autogradFunctionrg   r'  	mm_cublas
bmm_cublasmatmul_cublasr   r   r   r   r   r   r   r   rm   r/  r4   r   r   <module>r;     s     !  ( (   #- 

A A:!( !uSRUX !64 4U=M=M 4RWR^R^ 4$q0(( q0h 	
  U\\ d ;[
 &" &" &"RH55>>** H5V75(( 75z 	666 
6 	6>6 >f >4 >f >r   