
"""
A set of basic tensor ops compatible with tpu, gpu, and multigpu
"""

import pickle
import warnings
from collections.abc import Mapping
from contextlib import contextmanager, nullcontext
from functools import update_wrapper, wraps
from typing import Any

import torch

from ..state import AcceleratorState, PartialState
from .constants import TORCH_DISTRIBUTED_OPERATION_TYPES
from .dataclasses import DistributedType, TensorInformation
from .imports import is_npu_available, is_torch_distributed_available, is_torch_xla_available
from .versions import is_torch_version


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

if is_torch_distributed_available():
    from torch.distributed import ReduceOp


def is_torch_tensor(tensor):
    return isinstance(tensor, torch.Tensor)


def is_torch_xpu_tensor(tensor):
    return isinstance(
        tensor,
        (
            torch.xpu.FloatTensor,
            torch.xpu.ByteTensor,
            torch.xpu.IntTensor,
            torch.xpu.LongTensor,
            torch.xpu.HalfTensor,
            torch.xpu.DoubleTensor,
            torch.xpu.BFloat16Tensor,
        ),
    )


def is_tensor_information(tensor_info):
    return isinstance(tensor_info, TensorInformation)

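# Note (illustrative): these small predicates are mainly used as the `test_type` argument
# of `recursively_apply` defined below, e.g.
#
#   shapes = recursively_apply(lambda info: info.shape, data, test_type=is_tensor_information)
#
# which walks a nested structure and only touches `TensorInformation` leaves.
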
def is_namedtuple(data):
    """
    Checks if `data` is a `namedtuple` or not. Can have false positives, but only if a user is trying to mimic a
    `namedtuple` perfectly.
    """
    return isinstance(data, tuple) and hasattr(data, "_asdict") and hasattr(data, "_fields")

def honor_type(obj, generator):
    """
    Cast a generator to the same type as obj (list, tuple, or namedtuple)
    """
    # Namedtuples cannot be built from a generator directly, they need positional arguments
    if is_namedtuple(obj):
        return type(obj)(*list(generator))
    else:
        return type(obj)(generator)

def recursively_apply(func, data, *args, test_type=is_torch_tensor, error_on_other_type=False, **kwargs):
    """
    Recursively apply a function on a data structure that is a nested list/tuple/dictionary of a given base type.

    Args:
        func (`callable`):
            The function to recursively apply.
        data (nested list/tuple/dictionary of `main_type`):
            The data on which to apply `func`
        *args:
            Positional arguments that will be passed to `func` when applied on the unpacked data.
        main_type (`type`, *optional*, defaults to `torch.Tensor`):
            The base type of the objects to which apply `func`.
        error_on_other_type (`bool`, *optional*, defaults to `False`):
            Whether to return an error or not if after unpacking `data`, we get on an object that is not of type
            `main_type`. If `False`, the function will leave objects of types different than `main_type` unchanged.
        **kwargs (additional keyword arguments, *optional*):
            Keyword arguments that will be passed to `func` when applied on the unpacked data.

    Returns:
        The same data structure as `data` with `func` applied to every object of type `main_type`.
    """
    if isinstance(data, (tuple, list)):
        return honor_type(
            data,
            (
                recursively_apply(
                    func, o, *args, test_type=test_type, error_on_other_type=error_on_other_type, **kwargs
                )
                for o in data
            ),
        )
    elif isinstance(data, Mapping):
        return type(data)(
            {
                k: recursively_apply(
                    func, v, *args, test_type=test_type, error_on_other_type=error_on_other_type, **kwargs
                )
                for k, v in data.items()
            }
        )
    elif test_type(data):
        return func(data, *args, **kwargs)
    elif error_on_other_type:
        raise TypeError(
            f"Unsupported types ({type(data)}) passed to `{func.__name__}`. Only nested list/tuple/dicts of "
            f"objects that are valid for `{test_type.__name__}` should be passed."
        )
    return data

def send_to_device(tensor, device, non_blocking=False, skip_keys=None):
    """
    Recursively sends the elements in a nested list/tuple/dictionary of tensors to a given device.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to send to a given device.
        device (`torch.device`):
            The device to send the data to.

    Returns:
        The same data structure as `tensor` with all tensors sent to the proper device.
    """
    if is_torch_tensor(tensor) or hasattr(tensor, "to"):
        # `.to("npu")` needs an explicit index the first time it is called
        if device == "npu":
            device = "npu:0"
        try:
            return tensor.to(device, non_blocking=non_blocking)
        except TypeError:  # .to() doesn't accept non_blocking as kwarg
            return tensor.to(device)
        except AssertionError as error:
            # `torch.Tensor.to(<int num>)` is not supported by `torch_npu`, so retry with an
            # explicit `npu:<index>` device string when an NPU is available
            if is_npu_available():
                if isinstance(device, int):
                    device = f"npu:{device}"
            else:
                raise error
        try:
            return tensor.to(device, non_blocking=non_blocking)
        except TypeError:  # .to() doesn't accept non_blocking as kwarg
            return tensor.to(device)
    elif isinstance(tensor, (tuple, list)):
        return honor_type(
            tensor, (send_to_device(t, device, non_blocking=non_blocking, skip_keys=skip_keys) for t in tensor)
        )
    elif isinstance(tensor, Mapping):
        if isinstance(skip_keys, str):
            skip_keys = [skip_keys]
        elif skip_keys is None:
            skip_keys = []
        return type(tensor)(
            {
                k: t if k in skip_keys else send_to_device(t, device, non_blocking=non_blocking, skip_keys=skip_keys)
                for k, t in tensor.items()
            }
        )
    else:
        return tensor

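# Usage sketch (illustrative only, kept as comments so nothing runs at import time):
#
#   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#   batch = {"pixel_values": torch.randn(8, 3, 224, 224), "labels": torch.randint(0, 10, (8,))}
#   batch = send_to_device(batch, device, non_blocking=True)
#   # Keys listed in `skip_keys` are left untouched, which is useful for metadata entries.
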
def get_data_structure(data):
    """
    Recursively gathers the information needed to rebuild a nested list/tuple/dictionary of tensors.

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`):
            The data to send to analyze.

    Returns:
        The same data structure as `data` with [`~utils.TensorInformation`] instead of tensors.
    """

    def _get_data_structure(tensor):
        return TensorInformation(shape=tensor.shape, dtype=tensor.dtype)

    return recursively_apply(_get_data_structure, data)

def get_shape(data):
    """
    Recursively gathers the shape of a nested list/tuple/dictionary of tensors as a list.

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`):
            The data to send to analyze.

    Returns:
        The same data structure as `data` with lists of tensor shapes instead of tensors.
    """

    def _get_shape(tensor):
        return list(tensor.shape)

    return recursively_apply(_get_shape, data)

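# Usage sketch (illustrative only, kept as comments so nothing runs at import time):
# `get_data_structure` and `initialize_tensors` (defined just below) form a round trip:
# one process can describe its payload, and the others can allocate matching buffers.
#
#   data = {"logits": torch.randn(4, 10)}
#   info = get_data_structure(data)    # TensorInformation(shape=..., dtype=...) leaves
#   empty = initialize_tensors(info)   # same structure, uninitialized tensors
#   assert get_shape(empty) == get_shape(data)
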
def initialize_tensors(data_structure):
    """
    Recursively initializes tensors from a nested list/tuple/dictionary of [`~utils.TensorInformation`].

    Returns:
        The same data structure as `data` with tensors instead of [`~utils.TensorInformation`].
    """

    def _initialize_tensor(tensor_info):
        return torch.empty(*tensor_info.shape, dtype=tensor_info.dtype)

    return recursively_apply(_initialize_tensor, data_structure, test_type=is_tensor_information)

def find_batch_size(data):
    """
    Recursively finds the batch size in a nested list/tuple/dictionary of lists of tensors.

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`): The data from which to find the batch size.

    Returns:
        `int`: The batch size.
    """
    if isinstance(data, (tuple, list, Mapping)) and (len(data) == 0):
        raise ValueError(f"Cannot find the batch size from empty {type(data)}.")

    if isinstance(data, (tuple, list)):
        return find_batch_size(data[0])
    elif isinstance(data, Mapping):
        for k in data.keys():
            return find_batch_size(data[k])
    elif not isinstance(data, torch.Tensor):
        raise TypeError(f"Can only find the batch size of tensors but got {type(data)}.")
    return data.shape[0]

def ignorant_find_batch_size(data):
    """
    Same as [`utils.operations.find_batch_size`] except will ignore if `ValueError` and `TypeErrors` are raised

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`): The data from which to find the batch size.

    Returns:
        `int`: The batch size.
    """
    try:
        return find_batch_size(data)
    except (ValueError, TypeError):
        return None

def listify(data):
    """
    Recursively finds tensors in a nested list/tuple/dictionary and converts them to a list of numbers.

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`): The data from which to convert to regular numbers.

    Returns:
        The same data structure as `data` with lists of numbers instead of `torch.Tensor`.
    """

    def _convert_to_list(tensor):
        tensor = tensor.detach().cpu()
        if tensor.dtype == torch.bfloat16:
            # NumPy does not support bfloat16, so convert to float32 before `.tolist()`
            tensor = tensor.to(torch.float32)
        return tensor.tolist()

    return recursively_apply(_convert_to_list, data)


def _tpu_gather(tensor):
    def _tpu_gather_one(tensor):
        if tensor.ndim == 0:
            tensor = tensor.clone()[None]

        # Can only gather contiguous tensors
        if not tensor.is_contiguous():
            tensor = tensor.contiguous()
        return xm.all_gather(tensor)

    res = recursively_apply(_tpu_gather_one, tensor, error_on_other_type=True)
    xm.mark_step()
    return res


def _gpu_gather(tensor):
    state = PartialState()
    gather_op = torch.distributed.all_gather_into_tensor
    # Work around an XPU synchronization issue on torch <= 2.8 before gathering
    if state.device.type == "xpu" and is_torch_version("<=", "2.8"):
        torch.xpu.synchronize()

    def _gpu_gather_one(tensor):
        if tensor.ndim == 0:
            tensor = tensor.clone()[None]

        # Can only gather contiguous tensors
        if not tensor.is_contiguous():
            tensor = tensor.contiguous()

        if state.backend is not None and state.backend != "gloo":
            # `all_gather_into_tensor` gathers into a single flat buffer, which is more
            # efficient than gathering a list of tensors
            output_tensors = torch.empty(
                state.num_processes * tensor.numel(),
                dtype=tensor.dtype,
                device=state.device,
            )
            gather_op(output_tensors, tensor)
            return output_tensors.view(-1, *tensor.size()[1:])
        else:
            # A backend of `None` is always CPU, and gloo does not support
            # `all_gather_into_tensor`, so fall back to the list-based gather
            output_tensors = [torch.empty_like(tensor) for _ in range(state.num_processes)]
            torch.distributed.all_gather(output_tensors, tensor)
            return torch.cat(output_tensors, dim=0)

    return recursively_apply(_gpu_gather_one, tensor, error_on_other_type=True)

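# Usage sketch (illustrative only, kept as comments so nothing runs at import time):
# `listify` is handy right before logging or JSON-serializing gathered tensors.
#
#   metrics = {"loss": torch.tensor(0.25), "accuracy": torch.tensor(0.875)}
#   print(listify(metrics))  # {'loss': 0.25, 'accuracy': 0.875}
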
class DistributedOperationException(Exception):
    """
    An exception class for distributed operations. Raised if the operation cannot be performed due to the shape of the
    tensors.
    """

    pass

def verify_operation(function):
    """
    Verifies that `tensor` is the same shape across all processes. Only ran if `PartialState().debug` is `True`.
    """

    @wraps(function)
    def wrapper(*args, **kwargs):
        if PartialState().distributed_type == DistributedType.NO or not PartialState().debug:
            return function(*args, **kwargs)
        operation = f"{function.__module__}.{function.__name__}"
        if "tensor" in kwargs:
            tensor = kwargs["tensor"]
        else:
            tensor = args[0]
        # First make sure the data lives on the device the `Accelerator` expects
        tensor_device = find_device(tensor)
        if tensor_device is not None and PartialState().device.type != tensor_device.type:
            raise DistributedOperationException(
                f"One or more of the tensors passed to {operation} were not on the {tensor_device.type} "
                f"while the `Accelerator` is configured for {PartialState().device.type}. "
                f"Please move it to the {PartialState().device.type} before calling {operation}."
            )
        # Then make sure every process passes the same shapes
        shapes = get_shape(tensor)
        output = gather_object([shapes])
        if output[0] is not None:
            are_same = output.count(output[0]) == len(output)
            if not are_same:
                process_shape_str = "\n  - ".join([f"Process {i}: {shape}" for i, shape in enumerate(output)])
                raise DistributedOperationException(
                    "Cannot apply desired operation due to shape mismatches. "
                    "All shapes across devices must be valid.\n\n"
                    f"Operation: `{operation}`\nInput shapes:\n  - {process_shape_str}"
                )
        return function(*args, **kwargs)

    return wrapper

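# Usage note (illustrative): the shape verification above only runs when debug mode is
# active, e.g. `accelerate launch --debug train.py` (or the `ACCELERATE_DEBUG_MODE=1`
# environment variable, depending on the accelerate version). With it enabled, a
# mismatched gather such as
#
#   tensor = torch.ones(PartialState().process_index + 1)  # different length per rank
#   gather(tensor)
#
# raises a `DistributedOperationException` listing the per-process shapes.
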
def chained_operation(function):
    """
    Checks that `verify_operation` failed and if so reports a more helpful error chaining the existing
    `DistributedOperationException`.
    """

    @wraps(function)
    def wrapper(*args, **kwargs):
        try:
            return function(*args, **kwargs)
        except DistributedOperationException as e:
            operation = f"{function.__module__}.{function.__name__}"
            raise DistributedOperationException(
                f"Error found while calling `{operation}`. Please see the earlier error for more details."
            ) from e

    return wrapper

@verify_operation
def gather(tensor):
    """
    Recursively gather tensor in a nested list/tuple/dictionary of tensors from all devices.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to gather.

    Returns:
        The same data structure as `tensor` with all tensors sent to the proper device.
    """
    if PartialState().distributed_type == DistributedType.XLA:
        return _tpu_gather(tensor)
    elif PartialState().distributed_type in TORCH_DISTRIBUTED_OPERATION_TYPES:
        return _gpu_gather(tensor)
    else:
        return tensor


def _gpu_gather_object(object: Any):
    output_objects = [None for _ in range(PartialState().num_processes)]
    torch.distributed.all_gather_object(output_objects, object)
    # `all_gather_object` returns a list of lists, so flatten it
    return [x for y in output_objects for x in y]

def gather_object(object: Any):
    """
    Recursively gather object in a nested list/tuple/dictionary of objects from all devices.

    Args:
        object (nested list/tuple/dictionary of picklable object):
            The data to gather.

    Returns:
        The same data structure as `object` with all the objects sent to every device.
    """
    if PartialState().distributed_type == DistributedType.XLA:
        raise NotImplementedError("gather objects in TPU is not supported")
    elif PartialState().distributed_type in TORCH_DISTRIBUTED_OPERATION_TYPES:
        return _gpu_gather_object(object)
    else:
        return object


def _gpu_broadcast(data, src=0):
    def _gpu_broadcast_one(tensor, src=0):
        torch.distributed.broadcast(tensor, src=src)
        return tensor

    return recursively_apply(_gpu_broadcast_one, data, error_on_other_type=True, src=src)


def _tpu_broadcast(tensor, src=0, name="broadcast tensor"):
    if isinstance(tensor, (list, tuple)):
        return honor_type(tensor, (_tpu_broadcast(t, src=src, name=f"{name}_{i}") for i, t in enumerate(tensor)))
    elif isinstance(tensor, Mapping):
        return type(tensor)({k: _tpu_broadcast(v, src=src, name=f"{name}_{k}") for k, v in tensor.items()})
    return xm.mesh_reduce(name, tensor, lambda x: x[src])


TENSOR_TYPE_TO_INT = {
    torch.float: 1,
    torch.double: 2,
    torch.half: 3,
    torch.bfloat16: 4,
    torch.uint8: 5,
    torch.int8: 6,
    torch.int16: 7,
    torch.int32: 8,
    torch.int64: 9,
    torch.bool: 10,
}

TENSOR_INT_TO_DTYPE = {v: k for k, v in TENSOR_TYPE_TO_INT.items()}


def gather_tensor_shape(tensor):
    """
    Grabs the shape of `tensor` only available on one process and returns a tensor of its shape
    """
    # Allocate a generously sized zero buffer to store the shape and coded dtype
    max_tensor_dimension = 2**20
    state = PartialState()
    base_tensor = torch.zeros(max_tensor_dimension, dtype=torch.int, device=state.device)

    # Since PyTorch can't just send a tensor to another GPU without knowing its size,
    # store the size of the tensor (plus its coded dtype) in the buffer
    if tensor is not None:
        shape = tensor.shape
        tensor_dtype = TENSOR_TYPE_TO_INT[tensor.dtype]
        base_tensor[: len(shape) + 1] = torch.tensor(list(shape) + [tensor_dtype], dtype=int)

    # Perform a reduction to copy the size data onto all devices
    base_tensor = reduce(base_tensor, reduction="sum")
    base_tensor = base_tensor[base_tensor.nonzero()]
    # The last non-zero entry is the coded dtype of the source tensor
    dtype = int(base_tensor[-1:][0])
    base_tensor = base_tensor[:-1]
    return base_tensor, dtype

def copy_tensor_to_devices(tensor=None):
    """
    Copies a tensor that only exists on a single device and broadcasts it to other devices. Differs from `broadcast` as
    each worker doesn't need to know its shape when used (and tensor can be `None`)

    Args:
        tensor (`torch.tensor`):
            The tensor that should be sent to all devices. Must only have it be defined on a single device, the rest
            should be `None`.
    """
    state = PartialState()
    shape, dtype = gather_tensor_shape(tensor)
    if tensor is None:
        tensor = torch.zeros(shape, dtype=TENSOR_INT_TO_DTYPE[dtype]).to(state.device)
    return reduce(tensor, reduction="sum")

@verify_operation
def broadcast(tensor, from_process: int = 0):
    """
    Recursively broadcast tensor in a nested list/tuple/dictionary of tensors to all devices.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to gather.
        from_process (`int`, *optional*, defaults to 0):
            The process from which to send the data

    Returns:
        The same data structure as `tensor` with all tensors broadcasted to the proper device.
    """
    if PartialState().distributed_type == DistributedType.XLA:
        return _tpu_broadcast(tensor, src=from_process, name="accelerate.utils.broadcast")
    elif PartialState().distributed_type in TORCH_DISTRIBUTED_OPERATION_TYPES:
        return _gpu_broadcast(tensor, src=from_process)
    else:
        return tensor

def broadcast_object_list(object_list, from_process: int = 0):
    """
    Broadcast a list of picklable objects from one process to the others.

    Args:
        object_list (list of picklable objects):
            The list of objects to broadcast. This list will be modified inplace.
        from_process (`int`, *optional*, defaults to 0):
            The process from which to send the data.

    Returns:
        The same list containing the objects from process 0.
    """
    if PartialState().distributed_type == DistributedType.XLA:
        for i, obj in enumerate(object_list):
            object_list[i] = xm.mesh_reduce(
                "accelerate.utils.broadcast_object_list", obj, lambda x: x[from_process]
            )
    elif PartialState().distributed_type in TORCH_DISTRIBUTED_OPERATION_TYPES:
        torch.distributed.broadcast_object_list(object_list, src=from_process)
    return object_list

def slice_tensors(data, tensor_slice, process_index=None, num_processes=None):
    """
    Recursively takes a slice in a nested list/tuple/dictionary of tensors.

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`):
            The data to slice.
        tensor_slice (`slice`):
            The slice to take.

    Returns:
        The same data structure as `data` with all the tensors slices.
    """

    def _slice_tensor(tensor, tensor_slice):
        return tensor[tensor_slice]

    return recursively_apply(_slice_tensor, data, tensor_slice)

def concatenate(data, dim=0):
    """
    Recursively concatenate the tensors in a nested list/tuple/dictionary of lists of tensors with the same shape.

    Args:
        data (nested list/tuple/dictionary of lists of tensors `torch.Tensor`):
            The data to concatenate.
        dim (`int`, *optional*, defaults to 0):
            The dimension on which to concatenate.

    Returns:
        The same data structure as `data` with all the tensors concatenated.
    """
    if isinstance(data[0], (tuple, list)):
        return honor_type(data[0], (concatenate([d[i] for d in data], dim=dim) for i in range(len(data[0]))))
    elif isinstance(data[0], Mapping):
        return type(data[0])({k: concatenate([d[k] for d in data], dim=dim) for k in data[0].keys()})
    elif not isinstance(data[0], torch.Tensor):
        raise TypeError(f"Can only concatenate tensors but got {type(data[0])}")
    return torch.cat(data, dim=dim)


class CannotPadNestedTensorWarning(UserWarning):
    pass

@chained_operation
def pad_across_processes(tensor, dim=0, pad_index=0, pad_first=False):
    """
    Recursively pad the tensors in a nested list/tuple/dictionary of tensors from all devices to the same size so they
    can safely be gathered.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to gather.
        dim (`int`, *optional*, defaults to 0):
            The dimension on which to pad.
        pad_index (`int`, *optional*, defaults to 0):
            The value with which to pad.
        pad_first (`bool`, *optional*, defaults to `False`):
            Whether to pad at the beginning or the end.
    """

    def _pad_across_processes(tensor, dim=0, pad_index=0, pad_first=False):
        if getattr(tensor, "is_nested", False):
            warnings.warn(
                "Cannot pad nested tensors without more information. Leaving unprocessed.",
                CannotPadNestedTensorWarning,
            )
            return tensor
        if dim >= len(tensor.shape) or dim < -len(tensor.shape):
            return tensor

        # Convert negative dimensions to non-negative
        if dim < 0:
            dim += len(tensor.shape)

        # Gather all sizes
        size = torch.tensor(tensor.shape, device=tensor.device)[None]
        sizes = gather(size).cpu()
        # Then pad to the maximum size
        max_size = max(s[dim] for s in sizes)
        if max_size == tensor.shape[dim]:
            return tensor

        old_size = tensor.shape
        new_size = list(old_size)
        new_size[dim] = max_size
        new_tensor = tensor.new_zeros(tuple(new_size)) + pad_index
        if pad_first:
            indices = tuple(
                slice(max_size - old_size[dim], max_size) if i == dim else slice(None) for i in range(len(new_size))
            )
        else:
            indices = tuple(slice(0, old_size[dim]) if i == dim else slice(None) for i in range(len(new_size)))
        new_tensor[indices] = tensor
        return new_tensor

    return recursively_apply(
        _pad_across_processes, tensor, error_on_other_type=True, dim=dim, pad_index=pad_index, pad_first=pad_first
    )

def pad_input_tensors(tensor, batch_size, num_processes, dim=0):
    """
    Takes a `tensor` of arbitrary size and pads it so that it can work given `num_processes` needed dimensions.

    New tensors are just the last input repeated.

    E.g.:
      Tensor: ([3,4,4]) Num processes: 4 Expected result shape: ([4,4,4])
    """

    def _pad_input_tensors(tensor, batch_size, num_processes, dim=0):
        remainder = batch_size // num_processes
        last_inputs = batch_size - (remainder * num_processes)
        if batch_size // num_processes == 0:
            to_pad = num_processes - batch_size
        else:
            to_pad = num_processes - (batch_size // num_processes)
        # In the rare case that `to_pad` ends up non-positive,
        # pad by the remaining inputs instead
        if last_inputs > to_pad and to_pad < 1:
            to_pad = last_inputs - to_pad
        old_size = tensor.shape
        new_size = list(old_size)
        new_size[0] = batch_size + to_pad
        new_tensor = tensor.new_zeros(tuple(new_size))
        indices = tuple(slice(0, old_size[dim]) if i == dim else slice(None) for i in range(len(new_size)))
        new_tensor[indices] = tensor
        return new_tensor

    return recursively_apply(
        _pad_input_tensors,
        tensor,
        error_on_other_type=True,
        batch_size=batch_size,
        num_processes=num_processes,
        dim=dim,
    )

def reduce(tensor, reduction="mean", scale=1.0):
    """
    Recursively reduce the tensors in a nested list/tuple/dictionary of lists of tensors across all processes by the
    mean of a given operation.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to reduce.
        reduction (`str`, *optional*, defaults to `"mean"`):
            A reduction method. Can be of "mean", "sum", or "none"
        scale (`float`, *optional*):
            A default scaling value to be applied after the reduce, only valid on XLA.

    Returns:
        The same data structure as `data` with all the tensors reduced.
    """

    def _reduce_across_processes(tensor, reduction="mean", scale=1.0):
        state = PartialState()
        cloned_tensor = tensor.clone()
        if state.distributed_type == DistributedType.NO:
            return cloned_tensor
        if state.distributed_type == DistributedType.XLA:
            # Some processes may have different HLO graphs than others; mark_step makes the
            # graphs consistent on all processes before and after the reduction
            xm.mark_step()
            xm.all_reduce(xm.REDUCE_SUM, [cloned_tensor], scale)
            xm.mark_step()
        elif state.distributed_type.value in TORCH_DISTRIBUTED_OPERATION_TYPES:
            torch.distributed.all_reduce(cloned_tensor, ReduceOp.SUM)
        if reduction == "mean":
            cloned_tensor /= state.num_processes
        return cloned_tensor

    return recursively_apply(
        _reduce_across_processes, tensor, error_on_other_type=True, reduction=reduction, scale=scale
    )

def convert_to_fp32(tensor):
    """
    Recursively converts the elements of a nested list/tuple/dictionary of tensors in FP16/BF16 precision to FP32.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to convert from FP16/BF16 to FP32.

    Returns:
        The same data structure as `tensor` with all tensors that were in FP16/BF16 precision converted to FP32.
    """

    def _convert_to_fp32(tensor):
        return tensor.float()

    def _is_fp16_bf16_tensor(tensor):
        return (is_torch_tensor(tensor) or hasattr(tensor, "dtype")) and tensor.dtype in (
            torch.float16,
            torch.bfloat16,
        )

    return recursively_apply(_convert_to_fp32, tensor, test_type=_is_fp16_bf16_tensor)

class ConvertOutputsToFp32:
    """
    Decorator to apply to a function outputting tensors (like a model forward pass) that ensures the outputs in FP16
    precision will be converted back to FP32.

    Args:
        model_forward (`Callable`):
            The function which outputs we want to treat.

    Returns:
        The same function as `model_forward` but with converted outputs.
    """

    def __init__(self, model_forward):
        self.model_forward = model_forward
        update_wrapper(self, model_forward)

    def __call__(self, *args, **kwargs):
        return convert_to_fp32(self.model_forward(*args, **kwargs))

    def __getstate__(self):
        raise pickle.PicklingError(
            "Cannot pickle a prepared model with automatic mixed precision, please unwrap the model with "
            "`Accelerator.unwrap_model(model)` before pickling it."
        )


def convert_outputs_to_fp32(model_forward):
    model_forward = ConvertOutputsToFp32(model_forward)

    def forward(*args, **kwargs):
        return model_forward(*args, **kwargs)

    # Keep a handle to the wrapper so it can be removed when unwrapping the model later
    forward.__wrapped__ = model_forward

    return forward

def find_device(data):
    """
    Finds the device on which a nested dict/list/tuple of tensors lies (assuming they are all on the same device).

    Args:
        (nested list/tuple/dictionary of `torch.Tensor`): The data we want to know the device of.
    """
    if isinstance(data, Mapping):
        for obj in data.values():
            device = find_device(obj)
            if device is not None:
                return device
    elif isinstance(data, (tuple, list)):
        for obj in data:
            device = find_device(obj)
            if device is not None:
                return device
    elif isinstance(data, torch.Tensor):
        return data.device

@contextmanager
def GatheredParameters(params, modifier_rank=None, fwd_module=None, enabled=True):
    """
    Wrapper around `deepspeed.runtime.zero.GatheredParameters`, but if Zero-3 is not enabled, will be a no-op context
    manager.
    """
    # `AcceleratorState` is needed here since it has access to the deepspeed plugin
    if AcceleratorState().distributed_type != DistributedType.DEEPSPEED or (
        AcceleratorState().deepspeed_plugin is not None
        and not AcceleratorState().deepspeed_plugin.is_zero3_init_enabled()
    ):
        gather_param_context = nullcontext()
    else:
        import deepspeed

        gather_param_context = deepspeed.zero.GatheredParameters(
            params, modifier_rank=modifier_rank, fwd_module=fwd_module, enabled=enabled
        )

    with gather_param_context:
        yield
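

# End-to-end usage sketch (illustrative only, kept as comments so nothing runs at import
# time). In a distributed evaluation loop launched via `accelerate launch`, a common
# pattern combining the helpers above is (the model and dataloader are hypothetical):
#
#   all_preds = []
#   for batch in dataloader:
#       batch = send_to_device(batch, PartialState().device)
#       logits = model(**batch)
#       preds = logits.argmax(dim=-1)
#       # pad so every process contributes the same sequence length, then gather
#       all_preds.append(gather(pad_across_processes(preds, dim=1, pad_index=-100)))
#   predictions = concatenate(all_preds)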