
    ;i3                         S SK r S SKrS SKJr  S SKJrJrJr  S SKJ	r	J
r
  S SKJr  \(       a  S SKJr  \ " S S5      5       rg)	    N)	dataclass)TYPE_CHECKINGOptionalUnion)TorchContextParallelConfigTorchTensorParallelConfig)is_torch_version)Acceleratorc                      \ rS rSr% SrSr\\   \S'   Sr	\\   \S'   Sr
\\   \S'   Sr\\   \S'   Sr\S\4   \S'   Sr\S\4   \S	'   SrS
 rS r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r \S 5       r!S\"4S jr#S%S\\"   4S jjr$S\%\%\S4   \%\"S4   4   4S jr&S r'S \"S!\4S" jr(S&S# jr)S$r*g)'ParallelismConfig   a,  
A dataclass to configure parallelisms applied to the model. Inspired by torchtitan's `ParallelDims`
https://github.com/pytorch/torchtitan/blob/main/torchtitan/distributed/parallel_dims.py

Args:
    dp_replicate_size (`int`, defaults to `1`):
        The size of the data parallel group. If `dp_replicate_size` is set to 1, the data parallel replication
        group will not be used.
    dp_shard_size (`int`, defaults to `1`):
        The size of the model shard group. If `dp_replicate_size > 1` and `tp_size > 1`, `dp_shard_size` must also
        be greater than 1, as composing DDP + TP is currently not supported.
    tp_size (`int`, defaults to `1`):
        The size of the tensor parallel group. If `tp_size` is set to `1`, the tensor parallel group will not be
        used.
    cp_size (`int`, defaults to `1`):
        The size of the context parallel group. Currently not supported, but reserved for future use and enabled
        for downstream libraries.
    tp_handler (`~utils.TorchTensorParallelConfig`, defaults to `None`):
        The handler for the tensor parallel group.

You may obtain different distributed data parallel paradigms by configuring `dp_replicate_size` and `dp_shard_size`
together:
    - `dp_replicate_size == 1` and `dp_shard_size > 1`, we obtain Fully Sharded Data Parallel (FSDP).
    - `dp_replicate_size > 1` and `dp_shard_size > 1`, we obtain Hybrid Sharded Data Parallel (HSDP).
    - `dp_replicate_size > 1` and `dp_shard_size == 1` is an invalid configuration, to use pure DP, use
      `DistributedDataParallelKwargs` instead.

Ndp_replicate_sizedp_shard_sizetp_sizecp_size
tp_handler
cp_handlerc                     SU R                    SU R                   SU R                   SU R                   SU R                   SU R
                   SU R                   S3$ )	Nz'ParallelismConfig(
 	dp_replicate_size=z,
	dp_shard_size=z,
	tp_size=z,
	cp_size=z,
	total_size=z
	tp_handler=z,
	cp_handler=z)
)r   r   r   r   
total_sizer   r   selfs    m/home/dmtnaga/Documents/work/airagagent/rag_env/lib/python3.13/site-packages/accelerate/parallelism_config.py__repr__ParallelismConfig.__repr__F   sy    ##'#9#9": ;#112 3 ' ' OO, - OO, - OO,C1		
    c                    SS K nS/nUR                  U R                  R                  5        VVs0 s H;  u  p4X2;  d  M  U[	        US5      (       a  UR                  UR                  5      OU_M=     snn5        g s  snnf )Nr   device_mesh__dict__)copydeepcopyr   itemshasattr)r   r   _non_serializable_fieldskvs        r   to_jsonParallelismConfig.to_jsonR   ss    $1?  !MM//11DA4 N:0F0F4==,AM1	
s
   A;
3A;
c                 f    / nU R                   (       a  US/-  nU R                  (       a  US/-  nU$ )zENames of enabled dimensions across which data parallelism is applied.dp_replicatedp_shard)dp_replicate_enableddp_shard_enabledr   dimss     r   dp_dim_namesParallelismConfig.dp_dim_names_   s9     $$^$$D  ZL Dr   c                 f    / nU R                   (       a  US/-  nU R                  (       a  US/-  nU$ )z]Names of enabled dimensions which will receive the same batch (non-data parallel dimensions).tpcp)
tp_enabled
cp_enabledr-   s     r   non_dp_dim_names"ParallelismConfig.non_dp_dim_namesi   s2     ??TFND??TFNDr   c                 f    / nU R                   (       a  US/-  nU R                  (       a  US/-  nU$ )zlNames of enabled dimensions which will be flattened into a joint mesh across which is model sharded in FSDP.r*   r3   )r,   r5   r-   s     r   dp_shard_cp_dim_names'ParallelismConfig.dp_shard_cp_dim_namess   s5       ZL D??TFNDr   c                     / nU R                   (       a  US/-  nU R                  (       a  US/-  nU R                  (       a  US/-  nU$ )z@Names of enabled dimensions across which loss should be averagedr)   r*   r3   )r+   r,   r5   r-   s     r   dp_cp_dim_names!ParallelismConfig.dp_cp_dim_names}   sK     $$^$$D  ZL D??TFNDr   c                 D    / nU R                   (       a  US/-  nUS/-  nU$ )z^Names of enabled dimensions across which FSDP is applied, including data parallel replication.r)   dp_shard_cp)r+   r-   s     r   fsdp_dim_names ParallelismConfig.fsdp_dim_names   s0     $$^$$Dr   c                 h    U R                   U R                  -  U R                  -  U R                  -  $ )zSThe total size of the parallelism configuration, which is the product of all sizes.)r   r   r   r   r   s    r   r   ParallelismConfig.total_size   s-     %%(:(::T\\IDLLXXr   c                 4    U R                   U R                  -  $ )zhThe size of the non-data parallel dimensions, which is the product of tensor and context parallel sizes.)r   r   r   s    r   non_data_parallel_size(ParallelismConfig.non_data_parallel_size   s     ||dll**r   c                 4    U R                   U R                  -  $ )z_The size of the data parallel dimensions, which is the product of data parallel replication and)r   r   r   s    r   data_parallel_size$ParallelismConfig.data_parallel_size   s     %%(:(:::r   c                      U R                   S:  $ )zKTrue if data parallel replication is enabled, i.e. `dp_replicate_size > 1`.   )r   r   s    r   r+   &ParallelismConfig.dp_replicate_enabled   s     %%))r   c                      U R                   S:  $ )zDTrue if data parallel sharding is enabled, i.e. `dp_shard_size > 1`.rK   )r   r   s    r   r,   "ParallelismConfig.dp_shard_enabled   s     !!A%%r   c                      U R                   S:  $ )z:True if tensor parallelism is enabled, i.e. `tp_size > 1`.rK   )r   r   s    r   r4   ParallelismConfig.tp_enabled        ||ar   c                      U R                   S:  $ )z;True if context parallelism is enabled, i.e. `cp_size > 1`.rK   )r   r   s    r   r5   ParallelismConfig.cp_enabled   rQ   r   c                 4    U R                   U R                  -   $ )z$Names of all active mesh dimensions.)r/   r6   r   s    r   active_mesh_dims"ParallelismConfig.active_mesh_dims   s       4#8#888r   device_typec                    [        SS5      (       a  SSKJn  O[        S5      eU R	                  5       n[        U5      S:X  a  gUu  pEU" UUUS9nU R                  (       a  X`R                     R                  S5        U R                  (       a  X`R                     R                  S	5        U R                  (       a  X`R                     R                  S
5        U$ )a  Builds a device mesh for the given device type based on the parallelism configuration.
This method will also create required joint meshes (e.g. `dp_shard_cp`, `dp_cp`, `dp`).

Args:
    device_type (`str`): The type of device for which to build the mesh, e
z>=z2.2.0r   )init_device_meshz4Building a device_mesh requires to have torch>=2.2.0N)mesh_dim_namesdpr?   dp_cp)
r	   torch.distributed.device_meshrY   RuntimeError	_get_meshlenr/   _flattenr9   r<   )r   rW   rY   meshrZ   
mesh_shaper   s          r   build_device_mesh#ParallelismConfig.build_device_mesh   s     D'**FUVV~~t9>%)"&)

 ))*33D9%%223<<]K,,-66w?r   c                    U R                   c'  Ub"  U R                  U5      U l         U R                   $ SeUb@  U R                   R                  U:w  a&  [        SU R                   R                   SU S35      eU R                   $ )Nz@You need to pass a device_type e.g cuda to build the device meshz4The device_mesh is already created with device type z@. However, you are trying to get a device mesh with device_type z<. Please check if you correctly initialized your device_mesh)r   rd   rW   
ValueError)r   rW   s     r   get_device_mesh!ParallelismConfig.get_device_mesh   s    #&#'#9#9+#F   ZZ&##//;>$NtO_O_OkOkNl  mm  ny  mz  zv  w  r   return.c                    ^ U R                    Vs0 s H  oU R                  U   _M     nn/ SQm[        UR                  5       U4S jS9n[	        [        U6 5      $ s  snf )zQGenerate mesh shape and dimension names for torch.distributed.init_device_mesh().)r)   r*   r3   r2   c                 ,   > TR                  U S   5      $ )Nr   )index)x
mesh_orders    r   <lambda>-ParallelismConfig._get_mesh.<locals>.<lambda>   s    :++AaD1r   )key)rU   _sizessortedr!   tuplezip)r   parallelism	mesh_dimssorted_itemsro   s       @r   r_   ParallelismConfig._get_mesh   se     OSNcNcdNc{$++k"::Nc	d >
OO2
 S,'(( es   Ac                    U R                   c.  [        [        R                  R	                  SS5      5      U l         U R
                  c.  [        [        R                  R	                  SS5      5      U l        U R                  c.  [        [        R                  R	                  SS5      5      U l        U R                  c.  [        [        R                  R	                  SS5      5      U l        U R                  S:  a  U R                  c  [        5       U l        U R                  S:  a  U R                  c  [        5       U l
        U R                   S:  a  [        SU R                    35      eU R
                  S:  a  [        SU R
                   35      eU R                  S:  a  [        S	U R                   35      eU R                  S:  a  [        S
U R                   35      eU R                  S:  d  U R                  S:  a+  U R                   S:  a  U R
                  S:X  a  [        S5      eU R                   U R
                  U R                  U R                  S.U l        g )N$PARALLELISM_CONFIG_DP_REPLICATE_SIZE1 PARALLELISM_CONFIG_DP_SHARD_SIZEPARALLELISM_CONFIG_TP_SIZEPARALLELISM_CONFIG_CP_SIZErK   z.dp_replicate_size must be at least 1, but got z*dp_shard_size must be at least 1, but got z$tp_size must be at least 1, but got z$cp_size must be at least 1, but got aC  Tensor/Context parallelism (tp/cp_size > 1) cannot be used with pure data parallelism (dp_replicate_size > 1 and dp_shard_size == 1). Please set dp_shard_size > 1 and dp_replicate_size == 1 to compose FSDP + TP/CP for 2D parallel, or set dp_replicate_size == 1 and dp_shard_size > 1 to compose HSDP + TP/CP for 3D parallel.)r)   r*   r2   r3   )r   intosenvirongetr   r   r   r   r   r   r   rg   rs   r   s    r   __post_init__ParallelismConfig.__post_init__   s   !!)%(8^`c)d%eD"%!$RZZ^^4VX[%\!]D<<rzz~~.JCPQDL<<rzz~~.JCPQDL<<!&";"=<<!&"<">!!A%MdNdNdMefgg!I$J\J\I]^__<<!CDLL>RSS<<!CDLL>RSSLL1q 0d6L6Lq6PUYUgUgklUlo  !22**,,,,	
r   rw   sizec                     XR                   R                  5       ;   d"   SU R                   R                  5        35       eX R                   U'   [        X S3U5        g )NzParallelism must be one of _size)rs   keyssetattr)r   rw   r   s      r   	_set_sizeParallelismConfig._set_size  sV    kk..00d4OPTP[P[P`P`PbOc2dd0#'K U+T2r   c                    [        5       nUR                  (       d  U R                  S:X  a  g U R                  S:X  a  U R                  SUR                  5        U R                  UR                  :w  a&  [        SU R                   SUR                   S35      eU R                  S:  a;  UR                  (       d*  UR                  (       d  [        SUR                   S35      eU R                  R                  5        H7  u  p4US:X  d  M  [        X S3S 5      c  M  UR                  S	U S
U S35        M9     U(       a@  UR                  (       a.  [        R                  " SSR                  U5      -   [         5        g g g )NrK   r)   zParallelismConfig total_size (z ) does not match num_processes (zB). Please adjust dp_replicate_size/ dp_shard_size/tp_size/cp_size.zpParallelismConfig is only compatible DistributedType.FSDP (version 2) or DistributedType.Multi{Device}, but got ._handlerzParallelismConfig.z_handler is set, but z0_size is set to 1. This handler will be ignored.z.ParallelismConfig has the following warnings:

)setmulti_devicer   r   num_processesrg   is_fsdp2distributed_typers   r!   getattraddis_main_processwarningswarnjoinUserWarning)r   accelerator	_warningsrw   r   s        r   _validate_accelerator'ParallelismConfig._validate_accelerator"  st   E	''DOOq,@ ??aNN>;+D+DE??k77700A B""-";";!< =12  ??Q(<(<@X@X E  FQ  Fb  Fb  Ec  cd  e  "&!2!2!4KqyWT](+CTJV(5J;-  XH  I "5 44MMADIIiDXX 59r   )rs   r   r   r   r   r   r   r   )N)r   r
   )+__name__
__module____qualname____firstlineno____doc__r   r   r   __annotations__r   r   r   r   r   r   r   r   r   r   r&   propertyr/   r6   r9   r<   r@   r   rE   rH   r+   r,   r4   r5   rU   strrd   rh   ru   r_   r   r   r   __static_attributes__ r   r   r   r      s   : (,x}+#'M8C='!GXc]!!GXc]! :>Jd556=:>Jd667>K


       	 	   Y Y + + ; ; * * & &         9 9S < 8C=  )5sCx%S/!AB )'
R3S 3 3
 r   r   )r   r   dataclassesr   typingr   r   r   accelerate.utils.dataclassesr   r   accelerate.utils.versionsr	   
accelerater
   r   r   r   r   <module>r      s@    
  ! 1 1 ^ 6 & e e er   