
    m9i6             +       
   d dl Zd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	Z
d dlmZ d dlmZ d dlmZ ddlmZmZ d Zi Zer	 i Zej,                  ej.                  ej0                  fed	<   ej2                  ej4                  fed
<   ej6                  ej8                  fed<   ej:                  ej<                  ej>                  fed<   ej@                  ejB                  fed<   i Z"ejF                  ejH                  fe"d	<   ejJ                  ejL                  fe"d
<   ejN                  ejP                  fe"d<   ejR                  ejT                  fe"d<   ejF                  ejH                  fe"d<   ejJ                  ejL                  fe"d<   i Z+ejX                  ejZ                  ej\                  fe+d	<   ej^                  ej`                  fe+d
<   ejb                  ejd                  fe+d<   ejf                  ejh                  ejj                  fe+d<   ejl                  ejn                  fe+d<    G d d      Z8 G d d      Z9 G d d      Z:i Z;de;ejx                  <   de;ejz                  <   de;ej|                  <   de;ej~                  <   de;ej                  <   ejx                   ej                  dd       ddZBddZCddZDddZEddZFdd ZGdd!ZHdd"ZIdd#ZJdd$ZKdd%ZLd& ZMd' ZNd(ed)ej                  fd*ZPd+ ZQd, ZRdd-ZS	 dd.ZT	 	 	 	 	 dd/ZUdd(ed0ed1eVd)efd2ZWdd(ed3ed4ed0ed)ef
d5ZX	 	 	 	 	 	 dd(ed6eeef   d4ed3ed0ed7eYd)efd8ZZdd9Z[dd(ed4ed0efd:Z\dd(ed4ed0efd;Z]dd(ed4ed0ed)efd<Z^dd(ed6eeef   d4ed0ed7eYd)efd=Z_dd(ed6eeef   d4ed0ed7eYd)efd>Z`dd(ed6eeef   d4ed0ed7eYd)efd?Zadd(ed3ed0ed)efd@Zb	 	 	 	 dd(ed6eeef   d4ed3ed0ed)efdAZcdd(ed3ed0ed)efdBZddd(ed3ed0ed)efdCZe	 	 	 	 	 	 	 ddDefdEedFedGedHeVdIeVdJeYdKeVdLedMeVdNeVdOeVdPedQeVd)dfdRZg	 	 	 	 ddDefdEedFedGedLedHeVdMeVdIeVdJeYdKeVdSedTedUedVedWedXedNeVdOeVdPedQeVd)df*dYZh	 	 	 ddDefdEedFedGedLedHeVdMeVdIeVdJeYdKeVdSedTedZed[edNeVdOeVd)df"d\Zi	 dd]ed^edJeYd_eYfd`ZjdaedbedceddefdeZkej                  fdfZl	 	 	 	 dd(edged0efdhZm	 	 	 dd(edged0efdiZn	 	 	 dd(edged0efdjZoddej                  fdkZq	 	 	 	 ddlZr	 ddmZs G dn do      Zt G dp dq      Zu G dr ds      Zvdt Zwdu Zxej                  fdvZz	 ddwZ{ddxZ|ddyZ}ddzZ~d{Zdd}Zdd~Zej                  d|fdZej                  fdZd Zd Zy)    N)norm)reduce)Tuple)Tensor   )COMPILED_WITH_CUDAlibc                 8    t        t        j                  | d      S Nr   )r   operatormul)iterables    a/var/www/html/backtest/airagagent/rag_env/lib/python3.12/site-packages/bitsandbytes/functional.pyprodr      s    (,,!,,    adammomentumrmsproplionadagradlamblarsc                   4    e Zd ZdZd Zd Zed        ZddZy)GlobalPageManagerNc                     t        d      NzCall get_instance() insteadRuntimeErrorselfs    r   __init__zGlobalPageManager.__init__c       899r   c                     g | _         y N)paged_tensorsr   s    r   
initializezGlobalPageManager.initializef   s
    r   c                     | j                   0| j                  |       | _         | j                   j                          | j                   S r$   	_instance__new__r&   clss    r   get_instancezGlobalPageManager.get_instancei   6    == KK,CMMM$$&}}r   c                 J    | j                   d d d   D ]  }t        ||        y )N)r%   prefetch_tensor)r    to_cputs      r   prefetch_allzGlobalPageManager.prefetch_allp   s,     ##DbD) 	'AAv&	'r   F)	__name__
__module____qualname__r)   r!   r&   classmethodr-   r4    r   r   r   r   `   s*    I:   'r   r   c                   2    e Zd ZdZd Zd Zed        Zd Zy)CUBLAS_ContextNc                     t        d      r   r   r   s    r   r!   zCUBLAS_Context.__init__|   r"   r   c                     i | _         y r$   )contextr   s    r   r&   zCUBLAS_Context.initialize   s	    r   c                     | j                   0| j                  |       | _         | j                   j                          | j                   S r$   r(   r+   s    r   r-   zCUBLAS_Context.get_instance   r.   r   c                    |j                   | j                  vrt        j                  j	                         }t        j                  j                  |       t        j                  t        j                               | j                  |j                   <   t        j                  j                  |       | j                  |j                      S r$   )
indexr?   torchcudacurrent_device
set_devicectc_void_pr	   get_context)r    deviceprev_devices      r   rI   zCUBLAS_Context.get_context   s}    <<t||+**335KJJ!!&))+S__5F)GDLL&JJ!!+.||FLL))r   )	r6   r7   r8   r)   r!   r&   r9   r-   rI   r:   r   r   r<   r<   y   s*    I:  *r   r<   c                   ,    e Zd ZdZd Zd Zed        Zy)Cusparse_ContextNc                     t        d      r   r   r   s    r   r!   zCusparse_Context.__init__   r"   r   c                 \    t        j                  t        j                               | _        y r$   )rG   rH   r	   get_cusparser?   r   s    r   r&   zCusparse_Context.initialize   s    {{3#3#3#56r   c                     | j                   0| j                  |       | _         | j                   j                          | j                   S r$   r(   r+   s    r   r-   zCusparse_Context.get_instance   r.   r   )r6   r7   r8   r)   r!   r&   r9   r-   r:   r   r   rM   rM      s%    I:7  r   rM         rD   )rB   dtyperJ   c                    t         |    t        |      z  }t        j                  t	        j
                  |            }t	        j                  |t	        j                  t        j                              }t        j                  j                  ||      }t        j                  || t        |            j                  |      }d|_        |j                   |_        |S )N)shape)rU   countT)dtype2bytesr   r	   cget_managed_ptrrG   c_size_tcastPOINTERc_intnp	ctypeslibas_arrayrC   
frombufferviewis_pagedrB   page_deviceid)rU   rJ   rW   	num_bytescuda_ptrc_ptr	new_arrayouts           r   	get_pagedrk      s    E"4;.I##BKK	$:;HGGHbjj23E%%e5%9I


9Ee
E
J
J5
QCCLCJr   c                     | j                   sJ d       |rd}n| j                  }t        | j                     | j	                         z  }t        j                  t        |       t        j                  |      t        j                  |             y )Nz%Only paged tensors can be prefetched!r0   )rd   re   rY   rU   numelr	   	cprefetchget_ptrrG   r[   c_int32)Ar2   deviceidrf   s       r   r1   r1      sd    ::>>>:??AGG$QWWY.IMM'!*bkk)4bjj6JKr   c           	         d }|j                   t        j                  k(  r+t        t        d|  dd       }t        j                  |      }nG|j                   t        j                  k(  r*t        t        d|  dd       }t        j                  |      }|t        d|        t        |dd      }|r|rt        |       |t        |        |t        |      t        |      t        j                  |j                                      |j                  s|j                  rt        j                  j!                          y y )Nc_fp32_uint8zFunction not implemented: 
is_managedF)rU   rC   float32getattrr	   rG   c_floatuint8c_uint8NotImplementedErrorr1   ro   c_int64rm   rd   rD   synchronize)	func_namerq   Bvalueprefetchfunccvaluerw   s           r   elementwise_funcr      s    Dww%--sa	{%0$7E"	
EKK	sa	{&148E"|.1KI;/WXXL%0Jh=/!,WQZAGGI)>?zzQZZ
 	

   r   c                      t        d| d |       y )Nfillr   )rq   r   rJ   r   s       r   r   r      s    0@DRW0Xr   c                      t        d| d d       y )Naranger   r   )rq   rJ   s     r   r   r      s    ,Xq$Br   c                      t        d| |d       y )N_mulr   r   )rq   r   rJ   s      r   r   r      s    -faA>r   c                 N   | rdnd}d|z  }|s|dk  r| sd|z  nd|z  dz
  }t        j                  |d|      }d|j                         z
  }|dk(  r|S |j                         dz  }t        j                  |d | j	                         dg|z  z   ||d  j	                         z         S )	N              rS      r         ?   r   )rC   linspacerm   r   tolist)signed
total_bitsadd_zerosigntotal_valuesvaluesgapls           r   create_linear_mapr      s    DDj=L:>
 .4:Jq^^D#|4F

C
axLLNA||F2AJ--/1#c'9F12J<M<M<OOPPr   c                    |r~t        j                  t        j                  | dd      d d       j	                         }dgdz  }t        j                  t        j                  | dd      d d        j	                         }n}t        j                  t        j                  | dd      d d       j	                         }dgdz  }t        j                  t        j                  | dd      d d        j	                         }||z   |z   }t        j
                  |      }|j                         j                  }||j                         z  }|j                         dk(  sJ |S )	Ng      ?	   r0   r      r      r   )
r   ppfrC   r   r   r   sortr   maxrm   )offsetuse_extra_valuev1v2v3vr   s          r   create_normal_mapr      s!   XXennVS!4Sb9:AACS&\xxvsA6s;<<DDFXXennVS!4Sb9:AACS&\xxvsA6s;<<DDF
R"A\\!_F[[]!!F
fjjlF<<>S   Mr   c                 T   |}|}| rdnd}||z   ||z
  k(  sJ g }g }t        t        d||z
  z   d||z
  z  d            D ]  \  }	}
|j                  d|
z          g }t        t	        j
                  ddg|            }d|dz
  z  }t        d|z        D ]  }|D ]z  }|dk7  rdnd}t        t        |            D ]  \  }	}||d|	dz    z  z  z  } |dk(  r
|d| z  z  }n|d||z
  dz
   z  z  }|j                  |       | si|j                  |        |  t        |      d|z  k(  sJ |j                          |dk  r/dt        |      z
  }t        |      D ]  }	|j                  d        |j                          t        j                  |      }||j                         z  }|S )Nr   r   rS   )repeatr   r   )	enumeraterangeappendlist	itertoolsproductlenr   rC   r   r   )r   exponent_bitsprecision_bitsr   ephas_signevaluespvaluesivalr   lstbiasevaluebit_patternr   pvalr   codes                       r   create_fp8_mapr     s   AAqHQ3*X%%%%GGEQx)?%@"BAV^H^D_abcd 3q#v F
y  !Q?
@C}QDM*+ & 	&K A+Q1E$T+%67 *4qAaC&y))*{a4j( a6$;q=!111MM% uf%	&&  v;!Z-'''
KKMA~CKs 	AMM!	
KKM<<DDHHJDKr   c                 @   g }|| rdndz
  }d||z
  z  dz
  }| sd|z  }t        |      D ]  }t        | rd||z   |z
  z  dz   nd||z   |z
  dz   z  dz         }t        j                  dd|      }|dd |dd z   dz  }	|d|dz
   |z   z  |	z  j	                         z  }| r!|d|dz
   |z   z   |	z  j	                         z  }|dkD  st        j                  dd|dz         }|dd |dd z   dz  }	|d|dz
   |z   z  |	z  j	                         z  }| s|d|dz
   |z   z   |	z  j	                         z  }
 |j                  d       |j                  d	       d
t        |      z
  }
t        |
      D ]  }|j                  d        |j                          t        |      S )a+  
    Creates the dynamic quantiztion map.

    The dynamic data type is made up of a dynamic exponent and
    fraction. As the exponent increase from 0 to -7 the number
    of bits available for the fraction shrinks.

    This is a generalization of the dynamic type where a certain
    number of the bits and be reserved for the linear quantization
    region (the fraction). n determines the maximum number of
    exponent bits.

    For more details see
    (8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561]
    r   r   rS   g?Nr0          @
   r   r   )	r   intrC   r   r   r   r   r   r   )r   max_exponent_bitsr   datanon_sign_bitsadditional_itemsr   fraction_items
boundariesmeansr   s              r   create_dynamic_mapr   0  s0   " D v!15M]->>?!C//$% SRXaA$58I$IJQN^_dehudu  yJ  eJ  MN  eN  _O  RS  _S  U^^CN;
CR:ab>1S8",q01A56%?GGIIr 1A 56:;<uDLLNNDaQ0@10DEJ_z!"~5<Eb01459:eCKKMMD2$5$9":Q">?@5HPPRRS 	KKNKK
D	/C3Z A 	IIK$<r   c                 D   t        | d|z  dz
        }|j                         }|j                  d       dt        |      z
  }t	        |      D ]  }|j                  d        |j                          t        |      }||j                         j                         z  }|S )NrS   r   )num_quantilesr   r   )	estimate_quantilesr   r   r   r   r   r   absr   )rq   r   qr   r   s        r   create_quantile_mapr   b  s    1AzM!O<A	
AHHQK
A,C3Z 	 FFHq	A	!%%'++-AHr   c                      t         j                  j                         syt         j                  j                         \  } }| dk  ry| dk(  ryy)N
col_turing   r   
col_ampere)rC   rD   is_availableget_device_capability)major_minors     r   get_special_format_strr   q  sA    ::""$\JJ446ME6zzr   c                    d}t               }| D ]Z  }|t        |dd      }||j                  j                  dk(  xs |z  }|r6|j	                  |j                  j
                         \ |s2t        d| D cg c]  }|j                  |j                  f c}       t        |      dkD  r2t        d| D cg c]  }|j                  |j                  f c}       |S c c}w c c}w )NTrd   FrD   zZAll input tensors need to be on the same GPU, but found some tensors to not be on a GPU:
 r   zcInput tensors need to be on the same GPU, but found the following tensor and device combinations:
 )	setry   rJ   typeaddrB   	TypeErrorrW   r   )tensorson_gpugpu_idsr3   rd   s        r   	is_on_gpur   |  sW   FeG (9h1j%0188==F*6h7KK'( u  U\  w]  PQxyxx  BC  BJ  BJ  xK  w]  v^  _  `  	`
7|a~  ^e  @f  YZ  BC  BI  BI  KL  KS  KS  AT  @f  g  h  i  	iM w] @fs   8C"
8C'
rq   returnc                 b    | yt        j                  | j                  j                               S )z
    Get the ctypes pointer from a PyTorch Tensor.

    Parameters
    ----------
    A : torch.tensor
        The PyTorch tensor.

    Returns
    -------
    ctypes.c_void_p
    N)rG   rH   r   data_ptr)rq   s    r   ro   ro     s&     	y{{166??,--r   c                     t         j                  j                         }t         j                  j                  |        |S r$   )rC   rD   rE   rF   )rJ   rK   s     r   pre_callr     s,    **++-K	JJ&!r   c                 B    t         j                  j                  |        y r$   )rC   rD   rF   )rK   s    r   	post_callr     s    	JJ+&r   c           
          d| t         j                  k(  rdnd d| d| d|rdnd }t        t        |      s"t	        |       t        d| d	| d
|  d|       t        t        |      S )Nctransform_r       __to_r3   nz"Transform function not supported:  to z for data type z and transpose=)rC   int8hasattrr	   print
ValueErrorry   )rU   orderAorderOut	transposenames        r   get_transform_funcr     s    u

2!<AfXT(ST\eUXknTopD3d0XJoV[U\\klukvw
 	
 sD!!r   c                    t         j                  }t        |       }|dk(  r| d   }n|dk(  r| d   | d   z  }| d   }	| |f}
|r}|	}|}	| d d d   |f}
|dk(  s|dk(  r || ||      |
fS |d	k(  rd
|	dz   d
z  z  }	 ||	f||      |
fS |dk(  r%d
|	dz   d
z  z  }	ddz   dz  z  } |||	f||      |
fS |dk(  r%d
|	dz   d
z  z  }	d
dz   d
z  z  } |||	f||      |
fS t        d|       )NrS   r      r   r0   rowcolrT   col32r      r   r   r   r   zTo_order not supported: )rC   zerosr   r}   )rW   rU   rJ   to_order
from_orderr   	init_funcdimsrowscolsstatetmps               r   get_transform_bufferr    sj    Iu:DqyQx	Qx%("9DHEtth'5H-eF;UBB	W	dRiB&'$U6BEII	\	!dRiB&'TAX!O$$U6BEII	\	!dRiB&'dRiB&'$U6BEII!$<XJ"GHHr   c                    || j                   |f}n|d   }|-t        |d   | j                  | j                  ||d         \  }}n|d   |f}t	        | j                  |||      }|d   }	t        |	      dk(  r1t        j                  |	d         }
t        j                  |	d         }n|Ut        |	      }t        |D cg c]  }|	|   	 c}      }
t        j                  ||
z        }t        j                  |
      }
n6t        j                  |	d   |	d   z        }
t        j                  |	d         }t        j                         j                  | j                        } ||t        |       t        |      |
|       ||fS c c}w )Nr   r   rS   )rW   r  rU   rJ   r   r   rG   rp   r   r<   r-   rI   ro   )rq   r  r	  rj   r   r  ld	new_stater   rW   dim1dim2r   r   ptrs                  r   nvidia_transformr    sb    }*%1X

{-!Haggqxx58
Y 1Xx(	aggz8YGD!HE
5zQzz%(#zz%(#	Kr*!U1X*+zz!t)$zz$zz%(U1X-.zz%(#

%
%
'
3
3AHH
=Cgaj'#,d3	> +s   ?Frj   r   c           	      :   | j                         dk  rt        d| j                          d      |dkD  rt        d|       |dk  r|dk(  rdd|z  z  }|0t        j                  dt        j                  | j
                  	      }t        | |g       t        | j
                        }| j                  t        j                  k(  r_t        j                  t        |       t        |      t        j                  |      t        j                  | j                                      n| j                  t        j                  k(  r_t        j                   t        |       t        |      t        j                  |      t        j                  | j                                      nt        d
| j                         t#        |       |dk  rQt%        d|z        }t        j&                  dd|      j)                         j+                  | j
                        }||   }|S )a  
    Estimates 256 equidistant quantiles on the input tensor eCDF.

    Uses SRAM-Quantiles algorithm to quickly estimate 256 equidistant quantiles
    via the eCDF of the input tensor `A`. This is a fast but approximate algorithm
    and the extreme quantiles close to 0 and 1 have high variance / large estimation
    errors. These large errors can be avoided by using the offset variable which trims
    the distribution. The default offset value of 1/512 ensures minimum entropy encoding -- it
    trims 1/512 = 0.2% from each side of the distrivution. An offset value of 0.01 to 0.02
    usually has a much lower error but is not a minimum entropy encoding. Given an offset
    of 0.02 equidistance points in the range [0.02, 0.98] are used for the quantiles.

    Parameters
    ----------
    A : torch.Tensor
        The input tensor. Any shape.
    out : torch.Tensor
        Tensor with the 256 estimated quantiles.
    offset : float
        The offset for the first and last quantile from 0 and 1. Default: 1/(2*num_quantiles)
    num_quantiles : int
        The number of equally spaced quantiles.

    Returns
    -------
    torch.Tensor:
        The 256 quantiles in float32 datatype.
    r   zQQuantile estimation needs at least 256 values in the Tensor, but Tensor had only z values.zgCurrently only a maximum of 256 equally spaced quantiles are supported, but the argument num_quantiles=      `?r   rS   )r   rT   zNot supported data type r      )rm   r}   rC   r  rx   rJ   r   r   rU   r	   cestimate_quantiles_fp32ro   rG   rz   r^   float16cestimate_quantiles_fp16r   roundr   longto)rq   rj   r   r   rJ   stepidxs          r   r   r     s   : 	wwy31  5F  GH  GN  GN  GP  FQ  QY  3Z  [  [s"5  9`  an  `o  7p  #q  qsv0AmO$
{%++fEMM!((SCq#hahhFww%--$$WQZrzz&?QSUS[S[\]\c\c\eSfg	
EMM	!$$WQZrzz&?QSUS[S[\]\c\c\eSfg!$<QWWI"FGGfsS&'nnQ]388:==ahhG#hJr   r   absmaxc                 \   |;dt         vr*t               j                  | j                        t         d<   t         d   }|U| j	                         }||z  }|||z  dkD  rdndz  }t        j                  |f| j                  t
        j                        }|%t        j                  | t
        j                        }| j                  j                  dk7  r |dv sJ t        j                  |      }t        | j                        }	|j                  | j                        }t        || ||g       | j                  t
        j                  k(  rat!        j"                  t%        |      t%        |       t%        |      t%        |      |t        j&                  | j	                                      n| j                  t
        j(                  k(  r`t!        j*                  t%        |      t%        |       t%        |      t%        |      |t        j&                  | j	                                      n| j                  t
        j,                  k(  r`t!        j.                  t%        |      t%        |       t%        |      t%        |      |t        j&                  | j	                                      nt1        d	| j                         t3        | j                         n|j5                         }t!        j6                  t%        |      t%        |       t%        |      t%        |      t        j8                  |      t        j8                  | j	                                      |r=|j;                         }
||
z  }t=        ||d
      \  }}||||| j                  |
|g}||fS ||||| j                  ddg}||fS )a  
    Quantize tensor A in blocks of size 4096 values.

    Quantizes tensor A by dividing it into blocks of 4096 values.
    Then the absolute maximum value within these blocks is calculated
    for the non-linear quantization.

    Parameters
    ----------
    A : torch.Tensor
        The input tensor.
    code : torch.Tensor
        The quantization map.
    absmax : torch.Tensor
        The absmax values.
    out : torch.Tensor
        The output tensor (8-bit).

    Returns
    -------
    torch.Tensor:
        The 8-bit tensor.
    tuple(torch.Tensor, torch.Tensor):
        The quantization state to undo the quantization.
    Ndynamicr   r   rJ   rU   rU   cpu            r      @   ?Blockwise quantization only supports 16/32-bit floats, but got F)	blocksizenested)	name2qmapr   r   rJ   rm   rC   r  rx   
zeros_liker{   r   rG   rp   r   r   rU   r	   cquantize_blockwise_fp32ro   r^   r  cquantize_blockwise_fp16bfloat16cquantize_blockwise_bf16r   r   r(  cquantize_blockwise_cpu_fp32
c_longlongmeanquantize_blockwise)rq   r   r#  rj   r1  r2  r   blocks
cblocksizerK   r   qabsmaxstate2r  s                 r   r<  r<  :  s'   8 |I%#5#7#:#:188#DIi #~GGIiq9}q(!a/fYqxxu}}M
{q4xx}}AAAAZZ	*
qxx(wwqxx 4C()77emm#((
GFOU\]`Uacmoqowowxyxx  yB  pC  DWW%((
GFOU\]`Uacmoqowowxyxx  yB  pC  DWW&((
GFOU\]`Uacmoqowowxyxx  yB  pC  D^_`_f_f^ghii!(( xxz((
GFOU\]`Uacecpcpqzc{}  ~K  ~K  LM  LS  LS  LU  ~V  	W&,VyQVW$	6177FFK : y&!''4F:r   quant_stater1  c                 X   ||J |=|;dt         vr*t               j                  | j                        t         d<   t         d   }||||dt        j
                  ddf}|\  }}}}}}}	|r>t        ||	      }||z  }|j                  t        j
                  k7  r|j                         }|,t	        j                  | j                  || j                        }| j                  j                  dk7  r1t        | j                        }
|j                  | j                        }|dvrt        d| d      t        | ||g       |j                  t        j
                  k(  rtt        j                   t#        |      t#        |       t#        |      t#        |      t%        j&                  |      t%        j&                  | j)                                      n8|j                  t        j*                  k(  rst        j,                  t#        |      t#        |       t#        |      t#        |      t%        j&                  |      t%        j&                  | j)                                      n|j                  t        j.                  k(  rst        j0                  t#        |      t#        |       t#        |      t#        |      t%        j&                  |      t%        j&                  | j)                                      nt        d	| j                         t3        | j                         |S |j5                         }t        j6                  t#        |d
         t#        |       t#        |d         t#        |      t%        j8                  |      t%        j8                  | j)                                      |S )aQ  
    Dequantizes blockwise quantized values.

    Dequantizes the tensor A with maximum absolute values absmax in
    blocks of size 4096.

    Parameters
    ----------
    A : torch.Tensor
        The input 8-bit tensor.
    quant_state : tuple(torch.Tensor, torch.Tensor)
        Tuple of code and absmax values.
    absmax : torch.Tensor
        The absmax values.
    code : torch.Tensor
        The quantization map.
    out : torch.Tensor
        Dequantized output tensor (default: float32)


    Returns
    -------
    torch.Tensor:
        Dequantized tensor (default: float32)
    Nr%  FrT   r(  r+  r*  r,  r-  r   r.  r/  The blockwise of J is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]r0  r   r   )r3  r   r   rJ   rC   rx   dequantize_blockwiserU   floatemptyrW   r   r   r   r   r	   cdequantize_blockwise_fp32ro   rG   r^   rm   r  cdequantize_blockwise_fp16r7  cdequantize_blockwise_bf16r   r(  cdequantize_blockwise_cpu_fp32r:  )rq   rA  r#  r   rj   r1  r2  rU   r   r@  rJ   s              r   rF  rF    s   D "f&888|+I%#5#7#:#:188#DIi #dIuemmT4P{=H:FD)VUFF%ff5&<<5==(6<<>&
{kk!''qxx@xx}}!((#wwqxx AA0  <F  G  H  H1fc"#99%**74='!*gfoW^_bWcegememnwexz|  {C  {C  DE  DK  DK  DM  {N  OYY%--'**74='!*gfoW^_bWcegememnwexz|  {C  {C  DE  DK  DK  DM  {N  OYY%..(**74='!*gfoW^_bWcegememnwexz|  {C  {C  DE  DK  DK  DM  {N  O^_`_f_f^ghii!((
 J xxz**7;q>+BGAJPWXcdeXfPgipqtiuwy  xE  xE  FO  xP  RT  R_  R_  `a  `g  `g  `i  Rj  	kJr   c                 R   |d}d }| dk(  r	 g d}n4| dk(  rg d}n*| dk(  rg d}n | dk(  r|d	k(  rg d
d d d   }nt        d      |t        d|  d      t        |      }||j                         j                         z  }|j	                         dk(  sJ |j                  |      S )NrD   nf4)r   g    6Gg    fg    TFٿg   I4ҿg   ০ǿg    Or   g   __?g   `\?g   ?g   @g?g    4?g   ` ?g   `v"?r   fp4)r   g      ?g       @g      (@g      @g      @r   g      @r   g      g       g      (g      g      g       g      int4)r         rR   r  rS   r   r   r   r0   iiaf4r/  )r   g|8geg:Kڞ׿gH2퓊cпg}Yu-ÿgQ	#(Dr   gF?g`_?g
0E?gL_߹E?gƶ=?ga@?gкv-?r   r0   z94-bit AbnormalFloats currently only support blocksize 64.z	Typename z not supported   )r}   r   r   r   rm   r   )typenamerJ   r1  r   s       r   get_4bit_typerZ    s    ~vD5		) 
U	 l	V	G	U	 ???CtED &(acc|!IhZ~"FGG$<DDHHJNND::<2776?r   c                 "    t        | ||||d      S NrO  quantize_4bitrq   r#  rj   r1  compress_statisticss        r   quantize_fp4ra        FC4GOOr   c                 "    t        | ||||d      S NrN  r]  r_  s        r   quantize_nf4re    rb  r   c           
         | j                   j                  dk7  r"t        d| j                   j                         |dvrt        d| d      | j                         }| j                  }|E||z  }|||z  dkD  rdndz  }t        j                  |f| j                   t
        j                  	      }|8t        j                  |dz   d
z  dft
        j                  | j                         }|dv sJ t        | j                         }	t        | ||g       | j                  t
        j                  k(  r|dk(  rft        j                  t        d      t        |       t        |      t        |      t        j                   |      t        j"                  |             nXt        j$                  t        d      t        |       t        |      t        |      t        j                   |      t        j"                  |             n| j                  t
        j&                  k(  r|dk(  rft        j(                  t        d      t        |       t        |      t        |      t        j                   |      t        j"                  |             njt        j*                  t        d      t        |       t        |      t        |      t        j                   |      t        j"                  |             n| j                  t
        j,                  k(  r|dk(  ret        j.                  t        d      t        |       t        |      t        |      t        j                   |      t        j"                  |             n}t        j0                  t        d      t        |       t        |      t        |      t        j                   |      t        j"                  |             nt3        d| j                         t5        | j                          t7        || j                         }
|r?|j9                         }||z  }t;        |d      \  }}~||| j                  |||g||
g}||fS ||| j                  |d||
g}||fS )a  
    Quantize tensor A in blocks of 4-bit values.

    Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.

    Parameters
    ----------
    A : torch.Tensor
        The input tensor.
    absmax : torch.Tensor
        The absmax values.
    out : torch.Tensor
        The output tensor (8-bit).
    blocksize : int
        The blocksize used in quantization.
    quant_type : str
        The 4-bit quantization data type {fp4, nf4}

    Returns
    -------
    torch.Tensor:
        The 8-bit tensor with packed 4-bit values.
    tuple(torch.Tensor, torch.Size, torch.dtype, int):
        The quantization state to undo the quantization.
    rD   z0Device type not supported for FP4 quantization: rO  rN  4-bit quantization data type  is not implemented.Nr   r   r&  rS   rT   r)  rO  r0  )rJ   r   )r1  )rJ   r   r}   rm   rW   rC   r  rx   r{   r   r   rU   r	   cquantize_blockwise_fp32_fp4ro   rG   rp   r^   cquantize_blockwise_fp32_nf4r  cquantize_blockwise_fp16_fp4cquantize_blockwise_fp16_nf4r7  cquantize_blockwise_bf16_fp4cquantize_blockwise_bf16_nf4r   r   rZ  r;  r<  )rq   r#  rj   r1  r`  
quant_typer   input_shaper=  rK   datatyper   r?  r@  r  s                  r   r^  r^    s'   4 	xx}}!$TUVU]U]UbUbTc"dee'!$A*Ma"bcc		A''K~iq9}q(!a/fYqxxu}}M {kkAaC!8Q-u{{188L====188$Kq#vww%--,,WT]GAJPVY`adYegigqgqr{g|  A  G  G  HI  J  K,,WT]GAJPVY`adYegigqgqr{g|  A  G  G  HI  J  K	
EMM	!,,WT]GAJPVY`adYegigqgqr{g|  A  G  G  HI  J  K,,WT]GAJPVY`adYegigqgqr{g|  A  G  G  HI  J  K	
ENN	",,WT]GAJPVY`adYegigqgqr{g|  A  G  G  HI  J  K,,WT]GAJPVY`adYegigqgqr{g|  A  G  G  HI  J  KZ[\[b[bZcdeeahhZ9H&,VsC+qww	FF;KZYab : aggy$
HU:r   c                 "    t        | ||||d      S r\  dequantize_4bitrq   rA  r#  rj   r1  s        r   dequantize_fp4rw  W      1k63	5IIr   c                 "    t        | ||||d      S rd  rt  rv  s        r   dequantize_nf4rz  Z  rx  r   c           
         |dvrt        d| d      |dvrt        d| d      |||J |j                  }|j                  }n
|\  }}}}}}}	C|\  }
}t	        ||      }||
z  }|j                  t
        j                  k7  r|j                         }|"t        j                  ||| j                        }|j                         }t        | j                        }t        | ||g       |j                  t
        j                  k(  r|d	k(  rft        j                  t        d      t        |       t        |      t        |      t!        j"                  |      t!        j"                  |             nXt        j$                  t        d      t        |       t        |      t        |      t!        j"                  |      t!        j"                  |             n|j                  t
        j&                  k(  r|d	k(  rft        j(                  t        d      t        |       t        |      t        |      t!        j"                  |      t!        j"                  |             njt        j*                  t        d      t        |       t        |      t        |      t!        j"                  |      t!        j"                  |             n|j                  t
        j,                  k(  r|d	k(  ret        j.                  t        d      t        |       t        |      t        |      t!        j"                  |      t!        j"                  |             n}t        j0                  t        d      t        |       t        |      t        |      t!        j"                  |      t!        j"                  |             nt        d
| j                         t3        | j                         | j                  d   dk(  rdnd}|r|j5                         S |S )a  
    Dequantizes FP4 blockwise quantized values.

    Dequantizes the tensor A with maximum absolute values absmax in blocks of size blocksize.

    Parameters
    ----------
    A : torch.Tensor
        The input 8-bit tensor (packed 4-bit values).
    quant_state : tuple(torch.Tensor, torch.Size, torch.dtype)
        Tuple of absmax values, original tensor shape and original dtype.
    absmax : torch.Tensor
        The absmax values.
    out : torch.Tensor
        Dequantized output tensor.
    blocksize : int
        The blocksize used in quantization.
    quant_type : str
        The 4-bit quantization data type {fp4, nf4}


    Returns
    -------
    torch.Tensor:
        Dequantized tensor.
    rC  rD  rE  rg  rh  ri  NrT   rO  r0  r   r   TF)r   r}   rW   rU   rF  rC   rx   rG  rH  rJ   rm   r   r   r	   cdequantize_blockwise_fp32_fp4ro   rG   r^   cdequantize_blockwise_fp32_nf4r  cdequantize_blockwise_fp16_fp4cdequantize_blockwise_fp16_nf4r7  cdequantize_blockwise_bf16_fp4cdequantize_blockwise_bf16_nf4r   r3   )rq   rA  r#  rj   r1  rp  rW   rU   compressed_stats	data_typer   r@  r   rJ   is_transposeds                  r   ru  ru  ]  s   6 ==,YK  8B  C  D  	D'!$A*Ma"bcc!co55				S^Pui)9:y #)%ff5&<<5==(6<<>&
{kk%uQXX>		A ahhFq&#
yyEMM!..wt}gaj'RX/[bcf[gikiqiqr{i|  A  G  G  HI  J  K..wt}gaj'RX/[bcf[gikiqiqr{i|  A  G  G  HI  J  K	emm	#..wt}gaj'RX/[bcf[gikiqiqr{i|  A  G  G  HI  J  K..wt}gaj'RX/[bcf[gikiqiqr{i|  A  G  G  HI  J  K	enn	$..wt}gaj'RX/[bcf[gikiqiqr{i|  A  G  G  HI  J  K..wt}gaj'RX/[bcf[gikiqiqr{i|  A  G  G  HI  J  KZ[\[b[bZcdeeahhWWQZ1_T%MSUUWn*r   c                    |Vdt         vr*t               j                  | j                        t         d<   t         d   }|j                  | j                        }t	        j
                  |       j                         }|j                  t        j                  k7  r|j                         }| |z  }t        |||      }|||ffS )Nr%  )r3  r   r   rJ   rC   r   r   rU   rx   rG  quantize_no_absmax)rq   r   rj   r#  inps        r   quantizer    s    |I%#5#7#:#:188#DIi #wwqxx YYq\F||u}}$v||~f
f*C
S$
,Cr   c                     ||J |X|Vdt         vr*t               j                  | j                        t         d<   t         d   }|j                  | j                        }|||f}t	        | |d   |      }||d   z  S )Nr%  r   r   )r3  r   r   rJ   dequantize_no_absmax)rq   rA  r#  r   rj   s        r   
dequantizer    s     "f&888|+I%#5#7#:#:188#DIi #wwqxx tn
q+a.#
6CQr   c           	      V   t        | j                        }|%t        j                  | t        j                        }t        | |g       t        j                  t        |      t        |       t        |      t        j                  | j                                      t        |       |S )a  
    Quantizes input tensor to 8-bit.

    Quantizes the 32-bit input tensor `A` to the 8-bit output tensor
    `out` using the quantization map `code`.

    Parameters
    ----------
    A : torch.Tensor
        The input tensor.
    code : torch.Tensor
        The quantization map.
    out : torch.Tensor, optional
        The output tensor. Needs to be of type byte.

    Returns
    -------
    torch.Tensor:
        Quantized 8-bit tensor.
    r'  )r   rJ   rC   r4  r{   r   r	   	cquantizero   rG   r^   rm   r   rq   r   rj   rK   s       r   r  r    sr    * 188$K
{%**1EKK@Cq#hMM'$-WS\288AGGI;NOkJr   c           	      X   t        | j                        }|%t        j                  | t        j                        }t        || |g       t        j                  t        |      t        |       t        |      t        j                  | j                                      t        |       |S )a  
    Dequantizes the 8-bit tensor to 32-bit.

    Dequantizes the 8-bit tensor `A` to the 32-bit tensor `out` via
    the quantization map `code`.

    Parameters
    ----------
    A : torch.Tensor
        The 8-bit input tensor.
    code : torch.Tensor
        The quantization map.
    out : torch.Tensor
        The 32-bit output tensor.

    Returns
    -------
    torch.Tensor:
        32-bit output tensor.
    r'  )r   rJ   rC   r4  rx   r   r	   cdequantizero   rG   r^   rm   r   r  s       r   r  r    st    * 188$K
{%**1EMMBCtQnOOGDM71:ws|RXXaggi=PQkJr   optimizer_namegr   state1beta1epsr!  lrr@  beta2weight_decaygnorm_scale	unorm_vec	max_unormc                    d}|dkD  r-t        j                  |j                  j                               }d}|j                  t         j
                  k(  rt        |    d   }n|j                  t         j                  k(  rt        |    d   }nd|j                  t         j                  k(  r"t        t        |          dk(  rt        |    d   }n%t        d|j                   d|j                         t        |||||g       t        |j                        } |t        |      t        |      t        |      t        |      t        |      t        j                   |      t        j                   |      t        j                   |      t        j                   |	      t        j                   |      t        j                   |
      t        j"                  |      t        j                   |      t        j                   |      t        j$                  |      t        j"                  |j'                                      t)        |       y)	a$  
    Performs an inplace optimizer update with one or two optimizer states.

    Universal optimizer update for 32-bit state and 32/16-bit gradients/weights.

    Parameters
    ----------
    optimizer_name : str
        The name of the optimizer: {adam}.
    g : torch.Tensor
        Gradient tensor.
    p : torch.Tensor
        Parameter tensor.
    state1 : torch.Tensor
        Optimizer state 1.
    beta1 : float
        Optimizer beta1.
    eps : float
        Optimizer epsilon.
    weight_decay : float
        Weight decay.
    step : int
        Current optimizer step.
    lr : float
        The learning rate.
    state2 : torch.Tensor
        Optimizer state 2.
    beta2 : float
        Optimizer beta2.
    gnorm_scale : float
        The factor to rescale the gradient to the max clip value.
    unorm_vec : torch.Tensor
        The tensor for the update norm.
    max_unorm : float
        The maximum update norm relative to the weight norm.
    skip_zeros : bool
        Whether to skip zero-valued gradients or not (default: False).
    r   Nr   r   r  rS   AGradient+optimizer bit data type combination not supported: grad , optimizer )rC   r   r   rG  rU   rx   str2optimizer32bitr  r7  r   r   r   r   rJ   ro   rG   rz   rp   c_boolrm   r   )r  r  r   r  r  r  r!  r  r@  r  r  r  r  r  
skip_zeros
param_norm
optim_funcrK   s                     r   optimizer_update_32bitr    s   p J3ZZ/
 Jww%--'7:
	
EMM	!'7:

''U^^
#,>~,N(OQR(R'7:
\]^]d]d\eeqrxr~r~q  A  B  	Bq!VVY/0188$K

	


9


:


5


5


3


< 


4


2


;
		*


1779!" kr   qmap1qmap2max1max2new_max1new_max2c                 h   d}|dkD  r-t        j                  |j                  j                               }t	        |j
                        }t        ||||||
|||||g       |j                  t         j                  k(  rt|j                  t         j                  k(  rVt        |    d   t        |      t        |      t        |      t        |      t        |      t        j                  |      t        j                  |      t        j                  |      t        j                  |      t        j                  |      t        j                  |      t        j                  |	      t        |
      t        |      t        |      t        |      t        |      t        |      t        j                  |      t        j                  |      t        j                  |j                                      n|j                  t         j                   k(  rs|j                  t         j                  k(  rUt        |    d   t        |      t        |      t        |      t        |      t        |      t        j                  |      t        j                  |      t        j                  |      t        j                  |      t        j                  |      t        j                  |      t        j                  |	      t        |
      t        |      t        |      t        |      t        |      t        |      t        j                  |      t        j                  |      t        j                  |j                                      n%t#        d|j                   d|j                         t%        |       y)a  
    Performs an inplace Adam update.

    Universal Adam update for 32/8-bit state and 32/16-bit gradients/weights.
    Uses AdamW formulation if weight decay > 0.0.

    Parameters
    ----------
    optimizer_name : str
        The name of the optimizer. Choices {adam, momentum}
    g : torch.Tensor
        Gradient tensor.
    p : torch.Tensor
        Parameter tensor.
    state1 : torch.Tensor
        Adam state 1.
    state2 : torch.Tensor
        Adam state 2.
    beta1 : float
        Adam beta1.
    beta2 : float
        Adam beta2.
    eps : float
        Adam epsilon.
    weight_decay : float
        Weight decay.
    step : int
        Current optimizer step.
    lr : float
        The learning rate.
    qmap1 : torch.Tensor
        Quantization map for first Adam state.
    qmap2 : torch.Tensor
        Quantization map for second Adam state.
    max1 : torch.Tensor
        Max value for first Adam state update.
    max2 : torch.Tensor
        Max value for second Adam state update.
    new_max1 : torch.Tensor
        Max value for the next Adam update of the first state.
    new_max2 : torch.Tensor
        Max value for the next Adam update of the second state.
    gnorm_scale : float
        The factor to rescale the gradient to the max clip value.
    unorm_vec : torch.Tensor
        The tensor for the update norm.
    max_unorm : float
        The maximum update norm relative to the weight norm.
    r   r   r   r  r  N)rC   r   r   rG  r   rJ   r   rU   rx   r{   str2optimizer8bitro   rG   rz   rp   rm   r  r   r   )r  r  r   r  r@  r  r  r  r!  r  r  r  r  r  r  r  r  r  r  r  r  rK   s                         r   optimizer_update_8bitr  d  s   P J3ZZ/
188$Kq!VVYudD(T\]^ww%--FLLEKK$?.)!,AJAJFOFOIJJy!JJz"JJuJJuJJsOJJtJJrNENENDMDMHHJJ|$JJ{#JJqwwy!+	
. 
EMM	!fllekk&A.)!,AJAJFOFOIJJy!JJz"JJuJJuJJsOJJtJJrNENENDMDMHHJJ|$JJ{#JJqwwy!+	
0 OPQPWPWyXdekeqeqdrs
 	
 kr   absmax1absmax2c                 $   d }t        |j                        }t        |||||
|||g       |j                  t        j
                  k(  r*|j                  t        j                  k(  rt        |    d   }n|j                  t        j                  k(  r*|j                  t        j                  k(  rt        |    d   }n|j                  t        j                  k(  r?|j                  t        j                  k(  r"t        t        |          dk(  rt        |    d   }n%t        d|j                   d|j                         t        |       t        |||||
|||g       t        |j                        } |t        |      t        |      t        |      t        |      t        j                  |      t        j                  |      t        j                  |      t        j                   |      t        j                  |	      t        |
      t        |      t        |      t        |      t        j                  |      t        j                  |      t        j"                  |      t        j                   |j%                                      t        |       y )Nr   r   r  rS   r  r  )r   rJ   r   rU   rC   rx   r{   str2optimizer8bit_blockwiser  r7  r   r   r   ro   rG   rz   rp   r  rm   )r  r  r   r  r@  r  r  r  r!  r  r  r  r  r  r  r  r  r  rK   s                      r   optimizer_update_8bit_blockwiser    s   ( J188$Kq!VVUE7GDEww%--FLLEKK$?0@C
	
EMM	!fllekk&A0@C

''U^^
#(C
).9
:A
=0@C
OPQPWPWyXdekeqeqdrs
 	
 kq!VVUE7GDE188$K




5


5


3


4


2


< 


;
		*


1779#& kr   grad	gnorm_vec
percentilec           	      6   t        | j                        }t        | |g       | j                  t        j
                  k(  r_t        j                  t        |       t        |      t        j                  |      t        j                  | j                                      n| j                  t        j                  k(  r_t        j                  t        |       t        |      t        j                  |      t        j                  | j                                      nt        d| j                   d      t        |       t	        j                   ||dz           }t	        j"                  |      \  }}t	        j                   ||         }d}	||kD  r||z  }	|||	fS )a   Applies percentile clipping

    grad: torch.Tensor
        The gradient tensor.
    gnorm_vec: torch.Tensor
        Vector of gradient norms. 100 elements expected.
    step: int
        The current optimiation steps (number of past gradient norms).

    zGradient type z not supported!d   r   )r   rJ   r   rU   rC   rx   r	   cpercentile_clipping_g32ro   rG   rp   rm   r  cpercentile_clipping_g16r   r   sqrtr   )
r  r  r!  r  rK   current_gnormvalsr"  
clip_valuer  s
             r   percentile_clippingr  %  s9    4;;'KtY zzU]]"$$DMIJJtJJtzz|$		
 
u}}	$$$DMIJJtJJtzz|$		
 >$**_EFFkJJy45M

9%ID#D,-JKz! =0*k11r   	histogramindex1index2sourcec                 ,   t        | j                        dk(  sJ | j                  t        j                  k(  sJ |j                  t        j                  k(  sJ |j                  t        j
                  k(  sJ |j                  t        j
                  k(  sJ | j                  j                  dk(  sJ |j                  j                  dk(  sJ |j                  j                  dk(  sJ |j                  j                  dk(  sJ t        j                  | j                  d         }t        j                  |j                               }t        | |||g       t        j                  t        |       t        |      t        |      t        |      ||       y )NrS   rD   r   )r   rW   rU   rC   rx   int32rJ   r   rG   rp   rm   r   r	   chistogram_scatter_add_2dro   )r  r  r  r  maxdim1r   s         r   histogram_scatter_add_2dr  Q  s?    y1$$$??emm+++<<5==(((<<5;;&&&<<5;;&&&  F***=='''=='''=='''jj+,G


6<<>"Ay&&&12!!')"4gfowvX_`fXgiprstr   c                    t         j                  j                         st         j                  j                          | j                  |k7  s|j                  |k7  r%t        d| j                   d|j                         | j                  }|j                  }|}|}	d}
t        |      dk(  rt        |      dk(  r|s%|	s#| j                  d   |j                  d   k7  rd}
n|r%|	s#| j                  d   |j                  d   k7  rd}
n|r%|	r#| j                  d   |j                  d   k7  rd}
n|s|	r| j                  d   |j                  d   k7  rld}
nht        |      dk(  rt        |      dk(  r|s%|	s#| j                  d   |j                  d   k7  rd}
n%|r$|	s"| j                  d   |j                  d   k7  rd}
n|r$|	r"| j                  d   |j                  d   k7  rd}
n|s|	r| j                  d   |j                  d   k7  rd}
nt        |      dk(  rt        |      dk(  r|s$|	s"| j                  d   |j                  d   k7  rd}
nq|r$|	s"| j                  d   |j                  d   k7  rd}
nK|r$|	r"| j                  d   |j                  d   k7  rd}
n%|s#|	r!| j                  d   |j                  d   k7  rd}
|a|j                  }|
syt        |      dk(  rjt        |      dk(  r[|d   |d   k(  rO|d   |d   k(  rC|d   |d   k(  r7|d   |d   k(  r+d}
n't        |      dk(  rJt        |      dk(  r<|s|	s|d   |d   f}n|r|	r|d   |d   f}n|r|	s|d   |d   f}n|s|	r|d   |d   f}nt        |      dk(  rZt        |      dk(  rL|s|	s|d   |d   |d   f}n|r|	r|d   |d   |d   f}n|r|	s|d   |d   |d   f}nz|sx|	rv|d   |d   |d   f}ngt        |      dk(  rYt        |      dk(  rK|s|	s|d   |d   |d   f}n8|r|	r|d   |d   |d   f}n%|r|	s|d   |d   |d   f}n|s|	r|d   |d   |d   f}|
st        d	| d
| d| d
|	 d	      S )Nz3Expected torch.int8 input tensors A and B, but got  and TrS   r   r   Fr  z?Tensor dimensions incorrect for matrix mulitiplication: A x B:  x z with transpose for A x B: .)	rC   rD   is_initializedinitrU   r   rW   r   r   )rq   r   rj   transposed_Atransposed_Bexpected_typesAsBtAtBcorrectsouts               r   check_matmulr  d  su   ::$$&

(9ww-177m#;A!''%PQPWPWyY
 	
 
B	
B	B	BG
2w!|B1"qwwqz!9GqwwqzQWWQZ7GB1771:3GqwwqzQWWQZ7G	RA#b'Q,"qwwqz!9GqwwqzQWWQZ7GB1771:3GqwwqzQWWQZ7G	RA#b'Q,"qwwqz!9GqwwqzQWWQZ7GB1771:3GqwwqzQWWQZ7G
yy3r7a<CGqLQ2a5 Gr!u$qERUNqERUNr7a<CGqLb1r!u~1r!u~B1r!u~B1r!u~W\c"glb1r!ube,1r!ube,B1r!ube,B1r!ube,W\c"glb1r!ube,1r!ube,B1r!ube,B1r!ube,MbTQTUWTXXstvswwz{}z~~  A
 	
 Kr   r   c                 z   t        | j                        }|t        d      | j                         | j                  d   k7  rt        d      |d   }|d   }|\  }	}
}}}}}||\  }}t        |	|      }	|	|z  }	|t        | j                        dk(  rJt        j                  | j                  d   | j                  d   |f| j                  | j                        }n;t        j                  | j                  d   |f| j                  | j                        }d}|d   }|d   }|d   }|d   }| j                  d   dz   dz  }t        || ||	|d   g       t        j                  |      }t        j                  |      }t        j                  |      }t        j                  |      }t        j                  |      }t        j                  |      }|j                  t        j                  k(  r| j                  t        j                  k(  rht        j                   |||t#        |       t#        |      t#        |	      t#        |d         t#        |      |||t        j                  |d                n8| j                  t        j$                  k(  rgt        j&                  |||t#        |       t#        |      t#        |	      t#        |d         t#        |      |||t        j                  |d                n| j                  t        j(                  k(  rgt        j*                  |||t#        |       t#        |      t#        |	      t#        |d         t#        |      |||t        j                  |d                n0t-        d	| j                         t-        d	| j                         t/        |       |S )
NzGstate cannot None. gem_4bit( ) requires the state from quantize_4bit( )r0   zcDimensions of A are invalid. Must be a vector with the leading dimensions of "1", e.g. [1, 1, 2048]r   r   r  sizerU   rJ   rS   z%Matmul not implemented for data type )r   rJ   r   rm   rW   rF  r   rC   rH  rU   r   rG   rp   r{   r  r	   cgemm_4bit_inference_naive_fp16ro   r7  cgemm_4bit_inference_naive_bf16rx   cgemm_4bit_inference_naive_fp32r}   r   )rq   r   rj   r  r  r  rK   Bshapeboutr#  rW   rU   r1  r  rp  r  r   r@  r   mkldaldcldbs                           r   	gemv_4bitr    s    188$K}bddwwyAGGBK~  A  	A1XF!9DOTLFE5)%5z9#)%ff5&
{qww<1++AGGAJ
D#AYZYaYabC++AGGAJ#5QWWQXXVC	Aq	Aq	A
)C
)C772;q=1
Cq!S&%),-


1A


1A


1A
**S/C
**S/C
**S/Cww%++77emm#//1aWQZQXY_Q`bijoprjsbtv}  B  wC  EH  JM  OR  TV  T^  T^  _d  ef  _g  Th  iWW&//1aWQZQXY_Q`bijoprjsbtv}  B  wC  EH  JM  OR  TV  T^  T^  _d  ef  _g  Th  iWW%//1aWQZQXY_Q`bijoprjsbtv}  B  wC  EH  JM  OR  TV  T^  T^  _d  ef  _g  Th  i%(MaggY&WXX "$I!''"STTkJr   c                    t        | ||||      }|0t        j                  |t        j                  | j                        }t        | j                        dk(  rct        |j                        dk(  rK| j                  d   |j                  d   k(  r,| j                  d   |j                  d   k(  rt        | ||      S | j                  }|j                  }|rt        |      dk(  r|d   |d   f}n|rt        |      dk(  r|d   |d   |d   f}|rt        |      dk(  r|d   |d   f}n|rt        |      dk(  r|d   |d   |d   f}t        |      dk(  rx|j                         d   |j                  d   k(  rd}n%|j                         d   |j                  d   k(  rd}t        | j                        dk(  rL| j                         d   | j                  d   k(  rd}nq| j                         d   | j                  d   k(  rNd}nK| j                         d   | j                  d   k(  rd}n%| j                         d   | j                  d   k(  rd}t        |      dk(  r|d   }| j                         |rdnd   }	n,t        |      dk(  rt        |      dk(  r|d   |d   z  }|d   }	|d   }
|d   }|j                         |rdnd   }|d   }ngt        |      dk(  rYt        |      dk(  sJ |d   |d   k(  r|d   |d   k(  st        d| d	|       d}d}|d   }
|d   }|d   |d   z  }|
}|d   }	|
}t        j                         j                  | j                        }t        || |g       t        j                  |t!        j"                  |      t!        j"                  |      t!        j$                  
      t!        j$                        t!        j$                        t'        |      t'        |       t'        |      t!        j$                        t!        j$                  	      t!        j$                               |S )
Nr  r  r   rS   r   FTzMOnly bsi,bso->io supported for tensor contractions, but dims for A x B were: r  )r  rC   r  r  rJ   r   rW   batched_igemmstrider   r<   r-   rI   r   r	   cigemmrG   r  rp   ro   )rq   r   rj   r  r  r  r  r  r   r  r  r  r  r  r  s                  r   igemmr    s    1c<>D
{kkt5;;qxxH
177|qS\Q.771:#
aggaj(@ As++	
B	
BB1eRU^	#b'Q,eRUBqE"B1eRU^	#b'Q,eRUBqE" 2w!|88:a=AGGAJ& LXXZ]aggaj(Lqww<1xxz!}
*$A!''!*,#xxz!}
*$A!''!*,#r7a<1A((*,QA6CW\c"gl11AQ%CqEqEhhj|!4e	RA2w!||1A2a5BqE>_`b_ccfgifjk  qEqEqEBqEMe

%
%
'
3
3AHH
=C q!SkJJsBIIl+RYY|-DbjjQRmUWU_U_`aUbdfdndnopdqqz71:ws|RZZ_bjjQToWYWaWabeWfhJr   c                    t        | j                        dk(  rt        |j                        dk(  s%t        d| j                   d|j                         t        | ||||      }|0t	        j
                  |t        j                  | j                        }|j                         r|j                         d   }d}n|j                         }|d   |j                  d   k7  r$|j                         }|j                         d   }n|d   |j                  d   k(  rd	}|j                         d   }n{|d   dk(  r$|j                         }|j                         d   }nO|d   dk(  r$|j                         }|j                         d   }n#|j                         }|j                         d   }| j                         r| j                         d   }d}n| j                         }|d   | j                  d   k7  r&| j                         } | j                         d   }d}nP|d   | j                  d   k(  r| j                         d   }d	}n%| j                         } | j                         d   }d}| j                  d   }	| j                  d   }
|j                  d   }|j                  d   }|}|j                  d   |j                  d   z  }| j                  d   | j                  d   z  }| j                  d   |j                  d   z  }t        j                         j                  | j                        }t        || |g       t        j                   |t#        j$                  |      t#        j$                  |      t#        j&                  |      t#        j&                  |
      t#        j&                  |      t)        |      t)        |       t)        |      t#        j&                  |      t#        j&                  |      t#        j&                  |      t#        j*                  |      t#        j*                  |      t#        j*                  |      t#        j,                  |	             |S )
Nr  z@Expected 3-dimensional tensors for bmm, but got shapes A and B: r  r  r   Fr   rS   T)r   rW   r   r  rC   r  r  rJ   is_contiguousr  
contiguousr<   r-   rI   r   r	   cbatched_igemmrG   r  rp   ro   c_longc_uint32)rq   r   rj   r  r  r  r  sr  	num_batchr   r  r  r  strideAstrideBstrideCr  s                     r   r  r  S  sv    qww<1CLA$5NqwwiW\]^]d]d\ef
 	
 1c<>D
{kkt5;;qxxHhhjmHHJQ41771:A((*Q-CqTQWWQZL((*Q-CtqyLLNhhjm1LLNhhjmLLNhhjmhhjmHHJQ41771:A((*Q-C LqTQWWQZ((*Q-CLA((*Q-C L 
I	
A	
A	
A
Cggaj1771:%Gggaj1771:%Gggaj1771:%G

%
%
'
3
3AHH
=Cq!SksBIIl3RYY|5LbjjYZm]_]g]ghi]jlnlvlvwxlyqz71:ws|RZZ_bjjQToWYWaWabeWfyy!299W#5ryy7I2;;W`Kac Jr   c                 	   |d   }|d   }t        |      }	t        |      }
|
dk(  sJ d       |	dk(  r|d   }n|	dk(  r|d   |d   z  }|d   x}}t        t        |            dkD  s
J d|        |d   dk(  r:|	dk(  r5t        j                  d|d   f| j
                  t        j                        S |d   dk(  rH|	dk(  rCt        j                  t        |d d |d   gz         | j
                  t        j                        S |	dk(  r'|%t        |d   |d   f|| j
                  dd	      \  }}n/|	dk(  r*|(t        |d   |d   |d   f|| j
                  dd	      \  }}|
dk7  sJ d
       | j
                  j                  dk(  sJ |j
                  j                  dk(  sJ | j                  t        j                  k(  sJ |j                  t        j                  k(  sJ |j                  |k(  sJ |d   dk(  sJ |d   dv sJ |d   dk(  sJ |d   |d   k(  sJ d| d|        |d   }| j
                  }t        j                  j                  | j
                         t        j                         j!                  | j
                        }t#        |       }t#        |      }t#        |      }|d   }t%        j&                  dz        }|dk(  r"t%        j&                  |dz   dz  dz  dz        }n!t%        j&                  |dz   dz  dz  dz        }t%        j&                  |dz        }t%        j&                  |      }t%        j&                  |      }t%        j&                  |      }d}t#        d       }t)        | ||g       |dk(  rS|t        j*                  k(  r t-        j.                  |||||||||||      }nwt-        j0                  |||||||||||      }nW|dk(  rR|t        j*                  k(  r t-        j2                  |||||||||||      }nt-        j4                  |||||||||||      }|dk(  r.t7        d| d| d|d    d|||f d|||f 
       t9        d      t        j                  j                  |       ||fS )Nr   rS   z:Only two dimensional matrices are supported for argument Br  r   z(Input tensor dimensions need to be > 0: r&  r  r  zlen(B.shape)==3 not supportedrD   r   r   r0   zNMatmullt only supports A @ B^T. Inner matrix dimensions do not match: A @ B = z @ r   r   r   r   r  r   zA: z, B: z, C: z; (lda, ldb, ldc): z; (m, n, k): zcublasLt ran into an error!)r   r   r   rC   rH  rJ   r  tupler  r   rU   r   rD   rF   r<   r-   rI   ro   rG   rp   r   r  r	   cigemmlt_turing_32cigemmlt_turing_8cigemmlt_ampere_32cigemmlt_ampere_8r   	Exception)rq   r   SASBrj   SoutrU   shapeAshapeBdimsAdimsBr  r  r   formatBrK   r  ptrAptrBptrCr  r  r  r  	has_errorptrRowScales                             r   igemmltr    s   UFUFKEKEA:SSS:z1I	!1Iq	!ayD1V!V%MfX#VV! ayA~%1*{{Avay>!((%--PP	aEQJ{{5vayk!9:188SXS`S`aazck(AYq	"E188We
	T 
!(AYq	6!9-uahh
	T A:666:88==F"""88==F"""77ejj   77ejj   99a5Ga500007gr
fRj l	WX^W__bcibjkl eG((K	JJ!((#

%
%
'
3
3AHH
=C1:D1:D3<Dr
A
**QV
C, jj4!8/Q.34 jj4"9+r1B67
**QV
C


1A


1A


1AI$-Kq!Sk,EKK..Q1dD$S#sI --Q1dD$S#sI 
L	 EKK..Q1dD$S#sI --Q1dD$S#sI A~F85d1gY6I3PSUX/IZZgijlmophqgrst566	JJ+&9r   c                    | j                   t        j                  k(  sJ ||j                   t        j                  k(  sJ |d   }t	        |      dk(  r|d   |d   z  |d   f}|0t        j
                  |t        j                  | j                        }|3t        j
                  |d   t        j                  | j                        }|3t        j
                  |d   t        j                  | j                        }|j                  d   |j                  d   k(  s J |j                   d|j                          |j                  d   |j                  d   k(  s J |j                   d|j                          t        | j                        }	t        |       }
t        |      }t        |      }t        |      }t        |      }t        |      }t        |      }t        j                  |d         }t        j                  |d         }t        | ||||||g       t        j                  |
||||||||	       t!        |	       |S )Nr   r  r   rS   rT    vs )rU   rC   r  r  r   rH  rJ   rx   rW   r   ro   rG   rp   r   r	   cdequant_mm_int32_fp16r   )rq   rA  	row_stats	col_statsrj   new_row_statsnew_col_statsr   	out_shaperK   r  ptrOutptrRowStatsptrColStatsptrNewRowStatsptrNewColStatsptrBiasnumRowsnumColss                      r   
mm_dequantr    s"    77ekk!!!

emm ;; ;AI
9~q\IaL0)A,?	
{kk)5==JaLahh
 aLahh
 	A)//!"445


	d9??"3454 	A)//!"445


	d9??"3454 188$K1:DS\F)$K)$K]+N]+NdmGjj1&Gjj1&Gq)Y]M4PQt[+v~Wegnpw  zA  BkJr   c           	         | j                   t        j                  k(  sJ | j                  }| j                  d   }t        | j                        dk(  r | j                  d   | j                  d   z  }n| j                  d   }|dz   dz  }|dz   dz  dz  }	|6t        j                  |ft        j                  |	      j                  d
      }|6t        j                  |ft        j                  |	      j                  d
      }|2|dkD  r-t        j                  |	|z  dz   ft        j                  |	      }t        |       }
t        |      }t        |      }t        |      }t        j                  |      }t        j                  |      }t        | j                        }t        | |||g       t!        j"                  |
|||t        j$                  |      ||       t'        |       |dkD  r|j)                  d       |||fS )Nr0   r  r   r   r  r      rX  rT   g     jr   )rU   rC   r  rJ   rW   r   rH  rx   fill_r  r  ro   rG   rp   r   r   r	   cget_col_row_statsrz   r   cumsum_)rq   r  r  nnz_block_ptr	thresholdrJ   r  r  	col_tiles
tiled_rowsr  r  r  
ptrNnzrowsrK   s                  r   get_colrow_absmaxr'  :  s    77emm###XXF772;D
177|qwwqzAGGAJ&wwqz#I"9#r)JKKG5==

%/ 	 KKG5==

%/ 	 S9$)+5;;v
 1:D)$K)$K'J::dD::dD188$Kq)Y674k:rzzR[G\^bdhik3a i..r   c                       e Zd Zd Zy)COOSparseTensorc                    |j                   t        j                  k(  sJ |j                   t        j                  k(  sJ |j                   t        j                  k(  sJ |j	                         |k(  sJ |j	                         |k(  sJ |j	                         |k(  sJ || _        || _        || _        || _        || _	        || _
        y r$   )rU   rC   r  r  rm   r  r  nnzrowidxcolidxr   )r    r  r  r+  r,  r-  r   s          r   r!   zCOOSparseTensor.__init__i  s    ||u{{***||u{{***||u}},,,||~$$$||~$$$||~$$$		r   Nr6   r7   r8   r!   r:   r   r   r)  r)  h      r   r)  c                       e Zd Zd Zy)CSRSparseTensorc                    |j                   t        j                  k(  sJ |j                   t        j                  k(  sJ |j                   t        j                  k(  sJ |j	                         |k(  sJ |j	                         |k(  sJ |j	                         |dz   k(  sJ || _        || _        || _        || _        || _	        || _
        y r   )rU   rC   r  r  rm   r  r  r+  rowptrr-  r   )r    r  r  r+  r3  r-  r   s          r   r!   zCSRSparseTensor.__init__z      ||u{{***||u{{***||u}},,,||~$$$||~$$$||~)))		r   Nr.  r:   r   r   r1  r1  y  r/  r   r1  c                       e Zd Zd Zy)CSCSparseTensorc                    |j                   t        j                  k(  sJ |j                   t        j                  k(  sJ |j                   t        j                  k(  sJ |j	                         |k(  sJ |j	                         |k(  sJ |j	                         |dz   k(  sJ || _        || _        || _        || _        || _	        || _
        y r   )rU   rC   r  r  rm   r  r  r+  colptrr,  r   )r    r  r  r+  r8  r,  r   s          r   r!   zCSCSparseTensor.__init__  r4  r   Nr.  r:   r   r   r6  r6    r/  r   r6  c                    t        j                  | j                  d      \  }}|j                  d       t        j                  | j
                  dz   ft         j                  | j                  j                        }|j                  |j                         |j                         d       |j                  d       t        | j
                  | j                  | j                  || j                  | j                         S NTreturn_countsr   rT   r   )rB   srcdim)rC   uniquer,  add_r  r  r  rJ   scatter_r  r   r!  r1  r  r+  r-  r   )cooAr   countsr3  s       r   coo2csrrD    s    \\$++TBNFF
KKN[[	QDKK4F4FF OO&++-VZZ\qOA
NN1		499dhhT[[ r   c                 F   t        j                  | j                        \  }}| j                  |   }| j                  |   }t        j
                  |d      \  }}|j                  d       t        j                  | j                  dz   ft         j                  | j                  j                        }|j                  |j                         |j                         d       |j                  d       t        | j                   | j                  | j"                  |||      S r:  )rC   r   r-  r,  r   r?  r@  r  r  r  rJ   rA  r  r   r!  r6  r  r+  )rB  r   
col2rowidxr,  r   	colvaluesrC  r8  s           r   coo2cscrH    s    jj-OC[[$F[[$FS=IvNN1[[	QDKK4F4FF OO)..*

!OD
NN1		499dhh r   c                     t        j                  |ft         j                  |      }t        j                  |ft         j                  |      }t        j                  |f||      }t        | |||||      S )NrT   )rC   r  r  r)  )r  r  r+  rJ   rU   r,  r-  r   s           r   	coo_zerosrJ    s[    [[#u{{6BF[[#u{{6BF[[#uV<F4sFFFCCr   c                    | j                   }| j                  t        j                  k(  sJ |j                  dk(  sJ t        | j                         }| j                  d   }t        | j                        dk(  r | j                  d   | j                  d   z  }	n| j                  d   }	||t        | |      \  }}}
|0t        j                  | j                  |t        j                        }|0t        j                  | j                  |t        j                        }d }t        |       }t        |      }t        |      }t        |      }t        |      }t        | ||||g       |dkD  r
d   j                         }|dkD  r.t        | j                  d   | j                  d   |
d   j                         |      }t        |j                        }t        |j                         }t        |j"                        }t        |
      }t%        j&                  |||||||||t)        j*                  |      t)        j,                  |	      t)        j,                  |             t        j.                  |j                        \  }}||_        |j                   |   |_        |j"                  |   |_        nt%        j&                  |||||d d d d t)        j*                  d      t)        j,                  |	      t)        j,                  |             nYt%        j&                  |||||d d d d t)        j*                  |      t)        j,                  |	      t)        j,                  |             t1        |       |||||fS )	NrD   r0   r  r   r   )r#  r&  r   )rJ   rU   rC   halfr   r   rW   r   r'  r  r   ro   r   itemrJ  r,  r-  r   r	   cdouble_rowcol_quantrG   rz   rp   r   r   )rq   r  r  out_colout_rowr#  rJ   rK   r  r  nnz_row_ptr
coo_tensorr  r  r  	ptrOutCol	ptrOutRowr+  	ptrRowIdx	ptrColIdxptrVal	ptrRowPtrr   r"  s                           r   double_quantrY    s    XXF77ejj   ;;&   188$K772;D
177|qwwqzAGGAJ&wwqzI-,=-
)	9k ++aggfEJJG++aggfEJJGJ1:D)$K)$K I Iq)Y9:3"o""$7"
AGGAJB(<(<(>J  
 1 12I
 1 12IZ../F,I$$

9%

4 

4  zz*"3"34HC #J * 1 1# 6J * 1 1# 6J$$

3

4 

4  	  JJy!JJtJJt	
 kGY	:==r   c                    t        | j                        }|| j                  |f}n|d   }|.t        |d   | j                  | j                  ||d   |      \  }}n|d   |f}|d   }	t        |	      dk(  r1t        j                  |	d         }
t        j                  |	d         }n6t        j                  |	d   |	d   z        }
t        j                  |	d         }t        | |g       |dk(  rZ|r,t        j                  t        |       t        |      |
|       n\t        j                  t        |       t        |      |
|       n0|dk(  rX|r+t        j                  t        |       t        |      |
|       nt        j                  t        |       t        |      |
|       n|dk(  rX|r+t        j                  t        |       t        |      |
|       nt        j                   t        |       t        |      |
|       nv|dk(  r`|dk(  r+t        j"                  t        |       t        |      |
|       nA|dk(  r<t        j$                  t        |       t        |      |
|       nt'        d| d	|       t)        |       ||fS )
Nr   r   rS   r  r   r   r  z)Transform function not implemented: From r   )r   rJ   rW   r  rU   r   rG   rp   r   r	   ctransform_row2col32Tro   ctransform_row2col32ctransform_row2turingTctransform_row2turingctransform_row2ampereTctransform_row2amperectransform_turing2rowctransform_ampere2rowr}   r   )rq   r  r	  rj   r   r  r  rK   r  rW   r  r  s               r   	transformrc    s#   188$K}qww
3eQx*
{%9%(AGGQXXW_afghaikt%uNCQx*)!HE
5zQzz%(#zz%(#zz%(U1X-.zz%(#q#h7%%gaj'#,dK$$WQZtTJ	\	!&&wqz73<tL%%gaj'#,dK	\	!&&wqz73<tL%%gaj'#,dK	U	%%%gaj'#,dK<'%%gaj'#,dK!$Mj\Y]^f]g"hiik	>r   c                    |Et        j                  | j                  |j                  d   f|j                  |j
                        }| j                  }| j                  j                         |k(  sJ | j                  j                         |k(  sJ | j                  j                         |k(  sJ | j                  |j                  d   k(  sJ |j                         rdnd}|j                         |rdnd   }|j                  d   }t        j                         j                   }t#        | j                        }t#        | j                        }	t#        | j                        }
t#        |      }t#        |      }t%        j&                  | j                        }t%        j&                  | j                        }t%        j&                  | j                        }t%        j&                  |j                  d         }t%        j&                  |      }t%        j&                  |      }t)        | j                  | j                  | j                  ||g       t+        j,                  |||	|
||||||||t%        j.                  |             |S )Nr   r&  r   FT)rC   rH  r  rW   rJ   rU   r+  r,  rm   r-  r   r  r  r  rM   r-   r?   ro   rG   rp   r   r	   	cspmm_coor  )rB  r   rj   r+  r  r  r  r  	ptrRowidx	ptrColidx	ptrValuesr  r  cnnzcrowsAccolsAccolsBcldbcldcs                      r   spmm_cooro  K  s   
{kkYY
#AHHAGG
 ((C;;#%%%;;#%%%;;#%%%99
"""OO-54L
((*<aQ
0C
''!*C

'
'
)
1
1C$I$I$I1:D3<D::dhhDZZ		"FZZ		"FZZ
#F::c?D::c?Dt{{DKKa=>MM#y)YfffVZ\`bfhlnpnwnw  yE  oF  GJr   c                 b   |Ot        j                  | j                  |j                  d   f|j                  | j
                  j                        }| j                  }t        |j                        }| j                  j                         |k(  sJ | j                  j                         |k(  sJ | j
                  j                         |k(  sJ | j                  |j                  d   k(  s J | j                   d|j                          |j                         rdnd}|j                         |rdnd   }|j                  d   }t        j                  | j                  d      \  }	}
|
j!                  d      j#                         }t        j$                  |
d      \  }}|j#                         }|j#                         }|d   d	k  sJ d
|d    d       |j                  t         j&                  t         j(                  fv sJ t+        |      }t+        |      }t+        |      }t+        | j                        }t+        | j                        }t+        | j
                        }t+        |      }t+        |      }t+        |      }t-        j.                  |
j                               }t-        j.                  | j                        }t-        j.                  | j                        }t-        j.                  | j                        }t-        j.                  |j                  d         }t-        j.                  |j                  d         }t-        j.                  |      }t-        j.                  |      }t1        | j                  | j                  | j
                  |||g       |j                  t         j&                  k(  r#t3        j4                  ||||||||||||||       n?|j                  t         j(                  k(  r"t3        j6                  ||||||||||||||       t9        |       |S )Nr   r&  r   r  FTr;  )
descendingr   z)Current max count per row is 8 but found r  )rC   r  r  rW   rJ   r   rU   r+  r   r,  rm   r-  r  r  r  r?  cumsumr   r   r  r   ro   rG   rp   r   r	    cspmm_coo_very_sparse_naive_fp16 cspmm_coo_very_sparse_naive_int8r   )rB  r   dequant_statsrj   r+  rK   r  r  r  r   rC  r   	max_countmax_idx	ptrOffsetptrMaxCount	ptrMaxIdxrf  rg  rh  r  r  ptrDequantStats	cnnz_rowsri  rj  rk  crowsBrl  rm  rn  s                                  r   spmm_coo_very_sparser~  o  sa   
{kkYY
#AHHDKK<M<M
 ((C188$K;;#%%%;;#%%%;;#%%%99
"?tyykaggY$??"OO-54L
((*<aQ
0C
''!*C\\$++TBNFF]]1!!#FFt<IwkkmGI!C	29Q<.BC77u}}ejj1111I)$K I$I$I$I1:D3<Dm,O

6<<>*I::dhhDZZ		"FZZ		"FZZ
#FZZ
#F::c?D::c?Dt{{DKKamLMww%--,,	
  
EJJ	,,	
" kJr   g     _@vectorc                 (   |dk(  rmt        j                  |       j                         j                         }t        j                  | |z  dz        j                  t         j                        }||fS |dv rkt        j                  t        j                  |       |d      }t        j                  | t        |z  z        j                  t         j                        }||fS |dk(  r| j                  }| j                         } | j                         | j                         z
  }|dk(  rd}d	|z  }| j                         }t        j                  ||z        }	t        j                  || z  |	z
        |	z   } | |fS |d
v r| j                  }| j                         } t        j                  | |d      t        j                  | |d      z
  }d||dk(  <   d	|z  }t        j                  | |d      }t        j                  ||z        }	t        j                  || z  |	z
        |	z   } | |fS |dk(  rt        j                         5  t        j                  |       }
t        j                  |
|d      }|dz  }|
|j                  |
      kD  }t        j                  | |         }|j                  |
      |   |z  | |<   t        j                  | |z  t        z        j                  t         j                        }d d d        ||fS y # 1 sw Y   fS xY w)Nlinear   )r  r  T)r>  keepdim	zeropointr   r   g     o@)vector-zeropointrow-zeropointtruncated-vectorgffffff?)rC   r   r   rG  r  r   r   amaxCrU   minaminno_grad	expand_asr   )xr>  rp  r  xqrU   dynaqxminxzpxabsxr"  r   s                r   vectorwise_quantr    s   Xyy|!'')[[TC(++EJJ74x	(	(zz%))A,C>[[a$h(++EJJ74x	{	"GGIuuw 19DT\uuwkk$)$KKQ%+"u	<	<GGIzz!d3ejj37
 
 TQYT\zz!d3kk$)$KKQ%+"u	)	)]]_ 	:99Q<D::dT:D#:D--C::af%D^^D)#.5AcFQX\*--ejj9B	: 4x	: 4xs   B5LLc                 d    |dk(  r+| t         z  |z  j                  t        j                        }|S y )Nr  )r  r   rC   rx   )r  r  rp  r  s       r   vectorwise_dequantr    s.    X!Vd]u}}-r   c                    |dk(  r5||z  t         t         z  z  }| j                         |z  j                  |      S |dk(  r*d||z  z  }| j                         |z  j                  |      S |dk(  rd||z  z  }| j                         }t        |j                        dk(  r)t        |j                        dk(  r|j                  d      }t        |j                        dk(  r)t        |j                        dk(  r|j                  d      }t        |j                        dk(  r||z  }n||z  }|j                  |      S |dk(  r| j                         }t        |j                        dk(  r)t        |j                        dk(  r|j                  d      }t        |j                        dk(  r)t        |j                        dk(  r|j                  d      }t        |j                        dk(  r	|d|z  z  }n|d|z  z  }|d|j                         z  z  }|j                  |      S |d	k(  r| j                         }t        |j                        dk(  r)t        |j                        dk(  r|j                  d      }t        |j                        dk(  r)t        |j                        dk(  r|j                  d      }t        |j                        dk(  r|||z  t         t         z  z  z  }n|||z  t         t         z  z  z  }|j                  |      S |d
v r| j                         }t        |j                        dk(  r)t        |j                        dk(  r|j                  d      }t        |j                        dk(  r)t        |j                        dk(  r|j                  d      }t        |j                        dk(  r||t         z  z  }n||t         z  z  }||t         z  z  }|j                  |      S y )Nr  r  r   r  r  rS   r   r  r  )r  r  )r  rG  r   r   rW   squeezer3   )r  S1S2rU   rp  r   r  s          r   vectorwise_mm_dequantr    s4   XBw!a% 
T!%%e,,	{	"b2g
T!%%e,,		&b2gHHJrxx=A#agg,!"3ABrxx=A#agg,!"3ABrxx=AIAIAttE{	)	)HHJrxx=A#agg,!"3ABrxx=A#agg,!"3ABrxx=ArMArMA	S2446\ttE{	u	HHJrxx=A#agg,!"3ABrxx=A#agg,!"3ABrxx=AbAE""AbAE""AttE{	5	5HHJrxx=A#agg,!"3ABrxx=A#agg,!"3ABrxx=AaKAaKA	R!VttE{r   c                    |j                         j                         j                  d      |d   |d   z   z  }| j                         }t        | j                        dk(  r)t        |j                        dk(  r|j                  d      }t        |j                        dk(  r||j                         dz  z  }n||dz  z  }||d   dz  z  }||z  }|j                  |      S )Nr   r   rS   r  r  )rG  r3   sumr   rW   r  r   )r  rq   r   r  r  rU   r   r  s           r   dequant_min_maxr  9	  s    WWY[[]q!RURU]3F

A
288}c"((mq0ZZ]
288}	RTTVc\	R#XAAKA44;r   c                    |d   }|d   }|dv sJ | j                   j                  dk(  sJ t        j                  |d   |j	                         ft        j
                  | j                         }t        j                  |j	                               }t        j                  |d         }t        j                  |d         }t        |       }	t        |      }
t        |      }t        | j                         }|dk(  rt        j                  |	|
||||       n|dk(  rt        j                  |	|
||||       t        |       |S )Nr   r   r  rD   rT   r   r   )rJ   r   rC   r  rm   r   rG   rp   ro   r   r	   cextractOutliers_turingcextractOutliers_amperer   )rq   r  r"  r  formatArj   idx_sizer  r  r  ptrIdxr  rK   s                r   extract_outliersr  G	  s   UFeG222288==F"""
++	CIIK 

188C zz#))+&H::fQi D::fQi D1:DS\FS\F188$K,##D&&(D$O	L	 ##D&&(D$OkJr   c                     t        j                  |       }t        j                  t	        |       t	        |      t        j                  | j                               t        j                  |             |S r$   )rC   r4  r	   cpipeline_testro   rG   r[   rm   )rq   
batch_sizerj   s      r   pipeline_testr  a	  sL    


1
Cwqz73<QWWY1GU_I`aJr   r5   )T)NTr$   )Tr   T)g+ew?T)TrR  rS   r   )Tr   r   )r   )r  F)r  NFNN)Nr  r   )NNNr*  F)NNNNr*  F)Nr/  )NNr/  F)NNr/  FrO  )NNNr/  )NNNr/  rO  )NN)NNNN)Nr   r   r   Nr   F)r   r   Nr   )r   r   F)rR  )NFFN)NFF)NNNr   )NNNNr   )r   r  )r  )ctypesrG   r   r   randomrC   mathscipy.statsr   numpyr_   	functoolsr   typingr   r   
cextensionr   r	   r   r3  r  cadam32bit_grad_fp32cadam32bit_grad_fp16cadam32bit_grad_bf16cmomentum32bit_grad_32cmomentum32bit_grad_16crmsprop32bit_grad_32crmsprop32bit_grad_16clion32bit_grad_fp32clion32bit_grad_fp16clion32bit_grad_bf16cadagrad32bit_grad_32cadagrad32bit_grad_16r  cadam_static_8bit_grad_32cadam_static_8bit_grad_16cmomentum_static_8bit_grad_32cmomentum_static_8bit_grad_16crmsprop_static_8bit_grad_32crmsprop_static_8bit_grad_16clion_static_8bit_grad_32clion_static_8bit_grad_16r  cadam_8bit_blockwise_grad_fp32cadam_8bit_blockwise_grad_fp16cadam_8bit_blockwise_grad_bf16"cmomentum_8bit_blockwise_grad_fp32"cmomentum_8bit_blockwise_grad_fp16!crmsprop_8bit_blockwise_grad_fp32!crmsprop_8bit_blockwise_grad_fp16clion_8bit_blockwise_grad_fp32clion_8bit_blockwise_grad_fp16clion_8bit_blockwise_grad_bf16!cadagrad_8bit_blockwise_grad_fp32!cadagrad_8bit_blockwise_grad_fp16r   r<   rM   rY   rx   r  r7  r{   r   rJ   rk   r1   r   r   r   r   r   r   r   r   r   r   r   rH   ro   r   r   r   r  r  rG  r   r<  r   rF  rZ  ra  re  r^  rw  rz  ru  r  r  r  r  strr  r  r  r  r  r  r  r  r  r  r  r  r'  r)  r1  r6  rD  rH  rL  rJ  rY  rc  ro  r~  r  r  r  r  r  r  r  r:   r   r   <module>r     s
  
             /- 	$"%":":C<T<TVYVnVn!ov""""&z"
 	!!!!%y! #&":":C<T<TVYVnVn!ov!!!!%y!
 %%%%!f
 	))))%j!
 	(((($i 
 	%%%%!f
 	%%%%!f
 	))))!f
 #%******+' 	..../
+
 	----.	*
 	******+' 	----.	*
' '2* *2   EMM EMM ENN EKK EJJ !MM,%,,vQ2O L!0 Y B >Q$,*\0d.v ."++ .&'" AF&IX 
%P3& 3v 3e 3ek 3lF& F Fv FSY F  DJ FV *.HHvv~&H H 	H
 
H H HT0hPF PF P PPF PF P PNV NV N N  GM N`Jf J5+@ JQW Jek J  @C J  MS JJf J5+@ JQW Jek J  @C J  MS JKv K5+@ KQW Kek K  @C K  _e K\ f & F   *.  vv~&    	 
 
   (&  V v :F & v  L ZZZ Z 	Z
 Z 
Z Z 	Z Z Z Z Z Z Z  
!Z\ )BBB B 	B
 B B B 
B B 	B B B B B B  !B" #B$ %B& 'B( )B* 
+Bh #::: : 	:
 : : : 
: : 	: : : : : :  !:$ 
%:z CD)2
)2#)2+.)2<?)2Xuu%u/5u?Eu& GLjj Wx 
;;; 
;@ VVV 
Vx PPP 
Pf #U[[ ^L 		0h FI+/\ " " "
  .3ZZ D NQ]>@)X!HSl 
+\ -2JJ8 8v -2JJ 4r   