
    ;iӓ              	          S SK r S SKrS SKrS SKrS SKrS SKrS SKJr  S SKJ	r	  S SK
Jr  S SKJr  S SKJrJr  S SKrSSKJr  S	S
KJrJrJrJr  S	SKJr  S	SKJrJr  S	SKJrJ r J!r!  S	SK"J#r#  \" \$5      r%S r&S r'S0S jr(S0S jr)S r*S1S jr+S1S jr,S2S jr-S1S jr.S3S\/S\/S\04S jjr1 S4S\/S\/S\0S\04S jjr2S\Rf                  Rh                  S \Rj                  4S! jr6S\Rf                  Rh                  S"\74S# jr8S$\Rr                  Rt                  S%\74S& jr;S\Rf                  Rh                  4S' jr<S\Rf                  Rh                  S(\Rf                  Rh                  4S) jr=S\Rf                  Rh                  S(\\Rf                  Rh                  /\04   4S* jr>S+ r?S,\7S(\74S- jr@S.\\	\Rf                  Rh                     \/4   S(\A\Rf                  R                     4S/ jrCg)5    N)defaultdict)Iterable)nullcontext)Path)CallableUnion   )
get_logger   )FSDP_MODEL_NAMEOPTIMIZER_NAMESAFE_WEIGHTS_NAMEWEIGHTS_NAME)get_module_class_from_name)get_non_persistent_buffersis_peft_model)get_module_children_bottom_upis_compiled_modulesave)is_torch_versionc                  x    S[         R                  ;  a  S[         R                  S'   S[         R                  S'   g)zS
Enables RAM efficient loading of Hugging Face models for FSDP in the environment.
ACCELERATE_USE_FSDPTrueFSDP_CPU_RAM_EFFICIENT_LOADINGNosenviron     k/home/dmtnaga/Documents/work/airagagent/rag_env/lib/python3.13/site-packages/accelerate/utils/fsdp_utils.py!enable_fsdp_ram_efficient_loadingr!   '   s.    
 BJJ.,2

()39BJJ/0r   c                  *    S[         R                  S'   g)zT
Disables RAM efficient loading of Hugging Face models for FSDP in the environment.
Falser   Nr   r   r   r    "disable_fsdp_ram_efficient_loadingr$   1   s     4;BJJ/0r   c                     U(       a&  [        U 5      (       a  SSKJn  U" X R                  S9$ Ub  SSKJn  U" XS9$ U R                  5       $ )Nr   )get_peft_model_state_dictadapter_name)get_model_state_dictoptions)r   peftr&   active_adapter'torch.distributed.checkpoint.state_dictr)   
state_dict)modeladapter_only
sd_optionsr&   r)   s        r    _get_model_state_dictr3   8   sH    e,,2(=Q=QRR P#E>>!!r   c                     U(       a'  [        U 5      (       a  SSKJn  U" XU R                  S9$ Ub  SSKJn  U" XUS9$ U R                  U5      $ )Nr   )set_peft_model_state_dictr'   )set_model_state_dictr*   )r   r,   r5   r-   r.   r6   load_state_dict)r0   r/   r1   r2   r5   r6   s         r    _set_model_state_dictr8   G   sN    e,,2(I]I]^^ P#EzJJ$$Z00r   c           	          S nU R                   S:X  aU  SSKJn  SSKJn  U" U R
                  UR                  :H  [        U R                  SS5      [        U R                  SS5      S9nU$ )	Nr	   r   )StateDictOptionsStateDictTypeoffload_to_cpuF
rank0_only)full_state_dictcpu_offloadbroadcast_from_rank0)	fsdp_versionr.   r:   2torch.distributed.fsdp.fully_sharded_data_parallelr<   state_dict_typeFULL_STATE_DICTgetattrstate_dict_config)fsdp_pluginr2   r:   r<   s       r    _prepare_sd_optionsrI   V   sj    J 1$LT%'77=;X;XX = =?OQVW!()F)FV[!\

 r   c                    SS K Js  Jn  SSKJn  SSKJn  SSKJn	  [        R                  " USS9  U R                  U	R                  :X  a/  UR                  S:  n
XR                  l        XR                  l        U R                   S:X  a1  UR                  X R                  U R                  U R"                  5      O	[%        5       n['        U 5      nU   [)        X%US9nU R                  U	R                  :X  a  US:X  a	  [*         S	3O[*         S
U S	3n[        R,                  R/                  X>5      nUR0                  S:X  aF  [2        R5                  SU 35        [6        R8                  " X5        [2        R5                  SU 35        GOfU R                  U	R:                  :X  a  US:X  a  [*         SUR0                   S	3O[*         S
U SUR0                   S	3n[        R,                  R/                  X>5      n[2        R5                  SU 35        [6        R8                  " X5        [2        R5                  SU 35        OU R                  U	R<                  :X  a  [        R,                  R/                  U[*         S
U 35      n[        R                  " USS9  [2        R5                  SU 35        SU0nUR9                  UUR?                  U5      U" 5       S9  [2        R5                  SU 35        S S S 5        g ! , (       d  f       g = f)Nr   DefaultSavePlannerFullyShardedDataParallelr;   Texist_okr   r1   r2   .bin_zSaving model to zModel saved to _rankr0   r/   storage_writerplanner) torch.distributed.checkpointdistributed
checkpoint,torch.distributed.checkpoint.default_plannerrL   rC   rN   r<   r   makedirsrD   rE   num_processesrG   r=   r>   rB   optim_state_dict_configr   rI   r3   r   pathjoinprocess_indexloggerinfotorchr   LOCAL_STATE_DICTSHARDED_STATE_DICTFileSystemWriter)rH   acceleratorr0   
output_dirmodel_indexr1   dist_cprL   FSDPr<   is_multi_processctxr2   r/   weights_nameoutput_model_fileckpt_dirs                    r    save_fsdp_modelrr   g   s   22OcPKK
T*""m&C&CC '44q87G%%43C%%0 ##q( 	..0M0M{OrOr	
 ]  %[1J	*5Xbc
&&-*G*GG7Ba7Go.d3P_O``abmannrMsL "Z F((A-./@.ABC

:9o.?-@AB((M,J,JJ !# ##5)B)B(C4H'(+eK<U<U;VVZ[ 
 !#Z FKK*+<*=>?JJz5KK/*;)<=>((M,L,LLww||J?2C1[M0RSHKK40KK*8*56!:.JLL%&77A*,  
 KK/(45? 
s   HK88
Lc                    SS K Js  Jn  SSKJn  SSKJn  SSKJn	  UR                  5         U R                  U	R                  :X  a/  UR                  S:  n
XR                  l        XR                  l        U R                  S:X  a1  UR                  X R                  U R                  U R                   5      O	[#        5       n[%        U 5      nU   U R                  U	R                  :X  Ga  ['        U5      ULaW  UR(                  S:w  aG  UR*                  (       d6  U R,                  (       d  U R                  S:X  a  [/        S5      e S S S 5        g US:X  a	  [0         S3O[0         SU S3n[2        R4                  R7                  X=5      n[8        R;                  S	U 35        UR*                  (       + =(       d    UR<                  nU(       a  [>        R@                  " US
S9nO0 n[8        R;                  SU 35        GOjU R                  U	RB                  :X  a  US:X  a  [0         SUR(                   S3O[0         SU SUR(                   S3n[2        R4                  R7                  X=5      n[8        R;                  S	U 35        [>        R@                  " US
S9n[8        R;                  SU 35        OU R                  U	RD                  :X  a  [0         U;  a)  [2        R4                  R7                  U[0         SU 35      OUn[8        R;                  S	U 35        S[G        X%US90nURA                  UURI                  U5      U" 5       S9  US   n[8        R;                  SU 35        [K        UWX\S9nS S S 5        U$ ! , (       d  f       W$ = f)Nr   )DefaultLoadPlannerrM   r;   r   zzSet the `sync_module_states` flag to `True` so that model states are synced across processes when initializing FSDP objectrR   rS   zLoading model from Tweights_onlyzModel loaded from rT   r0   rQ   )r/   storage_readerrW   )&rX   rY   rZ   r[   rt   rC   rN   r<   wait_for_everyonerD   rE   r]   rG   r=   r>   rB   r^   r   rI   typera   is_fsdp2sync_module_states
ValueErrorr   r   r_   r`   rb   rc   is_main_processrd   loadre   rf   r3   FileSystemReaderr8   )rH   rh   r0   	input_dirrj   r1   rk   rt   rl   r<   rm   rn   r2   ro   input_model_file
load_modelr/   rq   load_results                      r    load_fsdp_modelr      sO   22OcP!!#""m&C&CC '44q87G%%43C%%0 ##q( 	..0M0M{OrOr	
 ]  %[1J	&&-*G*GGE{$&;+D+D+IR]RfRf"55+:R:RVW:W$3   
 8Ca7Go.d3P_O``abmannrMsL!ww||IDKK-.>-?@A(111P[5P5PJ"ZZ(8tL

KK,-=,>?@((M,J,JJ !# ##5)B)B(C4H'(+eK<U<U;VVZ[ 
  "ww||IDKK-.>-?@A$44HJKK,-=,>?@((M,L,LL &&y8 Y?*;1[M(JK 
 KK-hZ89!#8fp#qrJLL%&77A*,  
 $G,JKK,XJ78+E:Lp[ 
\ ] 
\ s   A8M<H$M<<
Nc                 &   SS K Js  Jn  SSKJn  SSKJn  SSKJn	  [        R                  " USS9  U R                  S:X  a1  UR                  X0R                  U R                  U R                  5      O	[        5       n
[        U 5      nU
   U R                  S:X  a  SS	KJn  U" X2US
9nOUR%                  X25      nU R                  U	R&                  :X  a  UR(                  S:X  a  US:X  a	  [*         S3O[*         SU S3n[        R,                  R/                  XN5      n[0        R3                  SU 35        [4        R6                  " X5        [0        R3                  SU 35        O[        R,                  R/                  U[*         SU 35      n[        R                  " USS9  [0        R3                  SU 35        UR7                  SU0UR9                  U5      U" 5       S9  [0        R3                  SU 35        S S S 5        g ! , (       d  f       g = f)Nr   rK   rM   r;   TrO   r   r	   )get_optimizer_state_dictr*   rR   rS   zSaving Optimizer state to zOptimizer state saved in 	optimizerrU   )rX   rY   rZ   r[   rL   rC   rN   r<   r   r\   rB   rD   rG   r^   r   rI   r.   r   optim_state_dictrE   ra   r   r_   r`   rb   rc   rd   r   rg   )rH   rh   r   r0   ri   optimizer_indexrk   rL   rl   r<   rn   r2   r   optim_stateoptim_state_nameoutput_optimizer_filerq   s                    r    save_fsdp_optimizerr      s   22OcPKK
T* ##q( 	..0M0M{OrOr	
 ]  %[1J	##q(X25ZXK//AK&&-*G*GG((A-/>!/C~&d+NK[[\]l\mmqIr ! )+Z(R%89N8OPQ

;>78M7NOPww||J>2B!OCT0UVHKK40KK4XJ?@LL'5&77A*,  
 KK3H:>?5 
s   	E0H
Hc                 x   SS K Js  Jn  SSKJn  SSKJn	  UR                  5         U R                  S:X  a1  UR                  X0R                  U R                  U R                  5      O	[        5       n
[        U 5      nU
   U R                  U	R                  :X  a  S nUR                  S:X  d  U R                  R                  (       d  US:X  a	  [          S3O[          SU S3n["        R$                  R'                  XM5      n[(        R+                  SU 35        [,        R.                  " USS	9n[(        R+                  S
U 35        O[          U;  a)  ["        R$                  R'                  U[          SU 35      OUn[(        R+                  SU 35        SUR1                  5       0nUR/                  UUUR3                  U5      S9  US   n[(        R+                  SU 35        U R                  S:X  a"  UR5                  X2US9nUR7                  U5        OSSKJn  U" X2XS9  S S S 5        g ! , (       d  f       g = f)Nr   rM   r;   r   rR   rS   zLoading Optimizer state from Tru   zOptimizer state loaded from zLoading Optimizer from r   )checkpoint_idrw   zOptimizer loaded from )r0   optimr   )set_optimizer_state_dictr*   )rX   rY   rZ   rC   rN   r<   rx   rB   rD   rG   r^   r   rI   rE   ra   r>   r   r   r_   r`   rb   rc   rd   r~   r/   r   optim_state_dict_to_loadr7   r.   r   )rH   rh   r   r0   r   r   r1   rk   rl   r<   rn   r2   r   optimizer_nameinput_optimizer_filerq   flattened_osdr   s                     r    load_fsdp_optimizerr     s   22cP!!#
 ##q( 	..0M0M{OrOr	
 ]  %[1J	&&-*G*GGK((A-[5X5X5c5c/>!/C~&d+NK[[\]l\mmqIr  (*ww||I'N$;<P;QRS#jj)=DQ:;O:PQR %%i7 Y>*:!O;L(MN 
 KK1(<=&	(<(<(>?KLL&&77A  
 &k2KKK0
;<##q( 99it9uM%%m4X$U{WC 
s   >F$H++
H9checkpoint_dir	save_pathsafe_serializationc                 h   SSK Js  Jn  SSKJs  Js  Jn  0 n[        U5      nUR                  SS9  UR                  UUR                  U 5      UR                  5       SS9  U(       a	  U[        -  OU[        -  n[        UR                  5       5      S:X  a  U[        U5      S      n[        XQUS9  U$ )z
Passthrough to `torch.distributed.checkpoint.format_utils.dcp_to_torch_save`

Will save under `save_path` as either `model.safetensors` or `pytorch_model.bin`.
r   NTrO   )rw   rW   no_distr   )r   )rX   rY   rZ   )torch.distributed.checkpoint.format_utilsformat_utilsr   mkdir_load_state_dictr   _EmptyStateDictLoadPlannerr   r   lenkeyslistr   )r   r   r   rk   dist_cp_format_utilsr/   s         r    )_distributed_checkpoint_to_merged_weightsr   L  s     32LLJYIOOTO"))//?$??A	 *  2D	--UaIaI :??"Z 0 34
3EFr   output_pathremove_checkpoint_dirc                    [        U 5      n SSKJn  [        SS5      (       d  [	        S5      eU R                  5       (       d  U S-  R                  5       nU S-  R                  5       nSU  S	3nU(       a  U(       a  US
-  nUSU  SU  S3-  nUS-  nO+U(       a  US-  nUSU  S3-  nOU(       a  US-  nUSU  S3-  n[	        U5      eU" 5       nUR                  (       aq  [        R                  SU  35        [        XU5      n	[        R                  SU	 35        U(       a.  [        R                  SU  35        [        R                  " U 5        UR                  5         g)a  
Merge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if
`SHARDED_STATE_DICT` was used for the model. Weights will be saved to `{output_path}/model.safetensors` if
`safe_serialization` else `pytorch_model.bin`.

Note: this is a CPU-bound process.

Args:
    checkpoint_dir (`str`):
        The directory containing the FSDP checkpoints (can be either the model or optimizer).
    output_path (`str`):
        The path to save the merged checkpoint.
    safe_serialization (`bool`, *optional*, defaults to `True`):
        Whether to save the merged weights with safetensors (recommended).
    remove_checkpoint_dir (`bool`, *optional*, defaults to `False`):
        Whether to remove the checkpoint directory after merging.
r   )PartialStatez>=z2.3.0z/`merge_fsdp_weights` requires PyTorch >= 2.3.0`pytorch_model_fsdp_0optimizer_0zTried to load from z) but couldn't find a valid metadata file.zE However, potential model and optimizer checkpoint directories exist.zPlease pass in either z/pytorch_model_fsdp_0 or z/optimizer_0zinstead.z8 However, a potential model checkpoint directory exists.zPlease try passing in z/pytorch_model_fsdp_0 instead.z< However, a potential optimizer checkpoint directory exists.z/optimizer_0 instead.zMerging FSDP weights from z.Successfully merged FSDP weights and saved to z"Removing old checkpoint directory N)r   accelerate.stater   r   r|   existsr}   rb   rc   r   shutilrmtreerx   )
r   r   r   r   r   model_path_existsoptimizer_path_existserrstater   s
             r    merge_fsdp_weightsr   h  sx   ( .)N-D'**JKK   ""+.DDLLN!/-!? G G I#N#33\]!6ZZC+N+;;TUcTddpqqC:CMMC+N+;;YZZC"QQC+N+;;PQQCo NE00@AB=n[mn	DYKPQ KK<^<LMNMM.)	r   r0   devicec                 8  ^ ^	 [        USS 5      nU(       d  T $ 0 m	U HV  nUR                  S5      nSR                  US S 5      US   pTUR                  U5      n[        Xe5      nS T	[	        U5      '   MX     S[
        R                  R                  4U	U 4S jjnU$ )N_tied_weights_keys.modulec                 N  > [        [        5      nU R                  SS9 H3  u  p#[        U5      T;   d  M  U[        U5         R	                  U5        M5     T" U 5      n UR                  5        H2  u  pEU H'  nTU   nUc  [        X5      TU'   M  [        XU5        M)     M4     U $ )NF)recurse)r   r   named_parametersidappenditemsrF   setattr)	r   params_to_tienparamid_key_param_names
param_name_tied_paramsparam_init_fns	          r    param_init_fn_tied_param7ensure_weights_retied.<locals>.param_init_fn_tied_param  s     $D)///>HA%yL(bi(//2 ? v& %2$7$7$9 F*
$V,= ,36+FL(F6 + %: r   )rF   splitr`   get_submoduler   rd   nnModule)
r   r0   r   _tied_namesnamer   modr   r   r   s
   `        @r    ensure_weights_retiedr     s    %!5t<K Lzz#88D"I.Rj!!$'("&RY   2 $#r   full_sdc                 b   SSK Jn  SSKJn  UR	                  5       n0 nS nS nU R
                  (       a  [        UR                  5       UR                  5       5       H  u  u  pnUR                  nU
R                  5       R                  UR                  5      n
UR                  U
SUR                  R                  S9  U" XUR                   5      nU" UU	U
5      u  pU" XU5      nXU	'   M     OUR                  5        H  u  pUR                  n["        R$                  " UR'                  5       UR                  UR(                  S9nUR                  USUR                  R                  S9  U" UXR                   5      nU" UU	U5      u  pU" XU5      nXU	'   M     UR+                  USS	9  U$ )
a  
Loads the full state dict (could be only on rank 0) into the sharded model. This is done by broadcasting the
parameters from rank 0 to all other ranks. This function modifies the model in-place.

Args:
    accelerator (`Accelerator`): The accelerator instance
    model (`torch.nn.Module`):
        The model to load the state dict into, expected to be on meta device or a VRAM spike can occur
    full_sd (`dict`): The full state dict to load, can only be on rank 0
r   N)distribute_tensorc                     U R                  U5      n[        [        S5      nS nU=(       a    UR                  [        R                  :H  n	UR                  R                  (       a  U	(       d  UR                  nUS L=(       a    UR                  5       U4$ ! [         a3    UR                  SS5      u  pEU R                  U5      n[	        Xe5      n Nf = f)Nr   r   float8_e4m3fn)get_parameter_or_bufferAttributeErrorrsplitr   rF   hasattrrd   dtyper   is_floating_pointis_contiguous)
r0   r   empty_param	old_parambase_param_namelocal_param_name	submoduleis_torch_e4m3fn_availablecasting_dtypeis_param_float8_e4m3fns
             r    _infer_parameter_dtype:fsdp2_load_full_state_dict.<locals>._infer_parameter_dtype  s    	=55jAI %,E?$C!!:!g{?P?PTYTgTg?g..7M%OOM$B)@)@)BMQQ  	=0:0A0A#q0I-O++O<I	<I		=s   B :CCc                 X    Ub  U R                  US9n U(       a  U R                  5       n U $ )N)r   )to
contiguous)tensorto_contiguousr   s      r    _cast_and_contiguous8fsdp2_load_full_state_dict.<locals>._cast_and_contiguous  s.    YYUY+F&&(Fr   )srcgroup)r   r   T)assign)torch.distributedrY   torch.distributed.tensorr   r/   r}   zipr   valuesdevice_meshdetachr   device_type	broadcastr   WORLD
placementsrd   emptysizer   r7   )rh   r0   r   distr   meta_sharded_sd
sharded_sdr   r   r   
full_paramsharded_paramr   sharded_tensorr   r   full_tensors                    r    fsdp2_load_full_state_dictr    s    %: &&(OJR$ ""7:7==?OLbLbLd7e3$Zm'33K#**,//0G0GHJNN:1DJJ4D4DNE.zH`H`aN+A,(M
 2.Q^_N%3z" 8f *9)>)>)@%J'33K++m&8&8&:;CZCZbobubuvKNN;ATZZ5E5ENF.{KIaIabN+A,(M
 2.Q^_N%3z" *A 
*T2Lr   r   mappingc                     SSK Jn  0 nSX2'    U R                   H(  nUS    Vs/ s H  oQUR                     PM     snUS'   M*     gs  snf ! [         a    [	        S5      ef = f)a  
Switches the parameters of the optimizer to new ones (sharded parameters in usual case). This function modifies the
optimizer in-place.

Args:
    optimizer (`torch.optim.Optimizer`): Optimizer instance which contains the original model parameters
    mapping (`dict`): Mapping from the original parameter (specified by `data_ptr`) to the sharded parameter

Raises:
    KeyError:
        If a parameter in the optimizer couldn't be switched to its sharded version. This should never happen and
        indicates a bug. If we kept the original params instead of raising, the training wouldn't be numerically
        correct and weights wouldn't get updated.
r   )DTensor_local_tensorparamszA parameter in the optimizer couldn't be switched to its sharded version. This breaks the training. Please raise an issue on GitHub.N)r   r
  param_groupsdata_ptrKeyError)r   r  r
  accessor_mappingparam_groupps         r    !fsdp2_switch_optimizer_parametersr    sy     1 /
$11KBMhBW$XBWQQZZ%8BW$XK! 2$X 
  S
 	

s   A A
A A A"c                 n   SSK Jn  [        U R                  R                  U5      n[        USS9SS  H}  u  pE[        UR                  S5      5      S:  a  UR                  SS5      u  pgOSnUnU(       a  UR                  U5      OUnU" U5      (       d  Me  U" US	S
9nUR                  Xu5        M     U$ )a  
Applies the activation checkpointing to the model.

Args:
    accelerator (`Accelerator`): The accelerator instance
    model (`torch.nn.Module`): The model to apply the activation checkpointing to

Returns:
    `torch.nn.Module`: The model with the activation checkpointing applied
r   )checkpoint_wrapperT)return_fqnsNr   r   r   F)preserve_rng_state);torch.distributed.algorithms._checkpoint.checkpoint_wrapperr  fsdp2_prepare_auto_wrap_policyr   rH   r   r   r   r   r   register_module)	rh   r0   r  auto_wrap_policy_func
layer_namelayerparent_name
child_nameparent_modules	            r    fsdp2_apply_acr!  9  s     ;;;L;L;X;XZ_`:5dSTWUWX
z$%)&0&7&7Q&?#KK#J<G++K8U //&uGE))*< Y Lr   returnc           	      B   SSK JnJnJn  [	        X5      =(       d(    [        U5      =(       a    [	        UR                  U5      nU(       a  U$ U R                  R                  nUR                  U5        UR                  5       n[        U SS5      nUR                  UR                  UR                  =(       d    U" 5       Ub"  U[        U R                   R"                  5         OS[%        UR&                  XR(                  5      S.n	Sn
UR+                  5        H#  u  pUR,                  R.                  S:X  d  M!  Sn
  O   UR0                  (       a  U
(       d  [3        USSS	9n[4        R6                  " UR9                  5        VVs0 s H  u  pX;   d  M  X_M     snn5      nUR;                  [<        R(                  " S
5      5      n[?        US5      (       a  URA                  5         [C        Xa5      nUb@  [E        U5      SS  H.  nU" U5      (       d  M  [	        UU5      (       a  M%  U" U40 U	D6  M0     [	        X5      (       d	  U" U40 U	D6  UR0                  (       a  [G        XU5        UR0                  (       a  U
(       d  WRI                  5        Hc  u  nnUR;                  U R(                  5      nSU;   a'  URK                  SS5      u  nnURM                  U5      nOUnUnURO                  UUSS9  Me     [?        US5      (       a  URA                  5         [        USS5      nU RP                  S:w  a]  Ub  U[<        RR                  :w  aF  UR;                  [<        RR                  5      nU RT                  (       a  [V        RX                  " S5        U$ s  snnf )a
  Prepares the model for FSDP2 in-place. Also returns the model to avoid misuse of the original model.

Args:
    accelerator (`Accelerator`): The accelerator instance
    model (`torch.nn.Module`): The model to prepare

Returns:
    `torch.nn.Module`: Prepared model
r   )
FSDPModuleMixedPrecisionPolicyfully_shardtorch_device_meshN)reshard_after_forwardoffload_policy	mp_policymeshignored_paramsF
Params4bitT)r   fqnsmetatie_weightsr   r   r   )
persistentr   noz~FSDP upcast of low precision parameters to fp32 (since mixed_precision != 'no') may affect the precision of model checkpoints.)-torch.distributed.fsdpr$  r%  r&  
isinstancer   	_orig_modr   rH   set_auto_wrap_policyr/   rF   r(  r@   mixed_precision_policytupleparallelism_configfsdp_dim_namesget_parameters_from_modulesignored_modulesr   r   	__class____name__cpu_ram_efficient_loadingr   copydeepcopynamed_buffersr   rd   r   r0  r  r   r  r   r   r   register_buffermixed_precisionfloat32r}   warningswarn)rh   r0   r$  r%  r&  is_type_fsdpfsdp2_pluginoriginal_sdr+  fsdp2_kwargsmodel_has_params4bitr   r   non_persistent_buffer_fqnskvoriginal_non_persistent_buffersr  r   fqnbuffer_tensor
parent_fqnlocal_buffer_namer   model_dtypes                            r    fsdp2_prepare_modelrV  Z  s4    UTe0 5!Mj*&M  $$00L%%e,""$K; 3T:D ".!C!C&22!88R<P<RNRN^U;99HHIJdh5l6R6RTY[m[mnL !--/ ??##|3#'  0 --6J &@tZ^%_"*.--#113W3daq7VTQT3W+
' f-. 5-((:<O(3E:3B?F$V,,Z
5S5SF3l3 @ e((E*\*-- 	#;{C--6J"A"G"G"IC),,[-?-?@Mcz03

30B-
- % 3 3J ?$'! %))*;]W\)] #J  5-(( %$/K""d*0C{V[VcVcGc '&&MM Q Ls Xs   N
N
c                 .  ^ ^
 SSK JnJn  T R                  n[	        U[
        R                  5      (       a  UR                  nXCL a  [        USS5      nUc  / n[        U5      nT R                  b  T R                  n[        5       m
U H1  n[        X5      nUc  [        SU S35      eT
R                  U5        M3     S[        R                   R"                  S[$        4U U
4S	 jjn	U	$ XBL a*  S[        R                   R"                  S[$        4U 4S
 jjn	U	$ g)a  Prepares the auto wrap policy based on its type, done to mimic the behaviour of FSDP1 auto wrap policy.

Args:
    fsdp2_plugin (`FullyShardedDataParallelPlugin`):
        Instance of `FullyShardedDataParallelPlugin` containing the configuration options
    auto_wrap_policy_type (`str`):
        Either `transformer` or `size`
    model (`torch.nn.Module`):
        The model to wrap

Returns:
    `Callable[[torch.nn.Module], bool]`:
        The auto wrap policy function to be applied to the model
r   )size_based_auto_wrap_policytransformer_auto_wrap_policy_no_split_modulesNz+Could not find the transformer layer class z in the model.r   r"  c                 J   > TR                   c  g[        U [        T5      5      $ )NF)transformer_cls_names_to_wrapr4  r8  )r   rI  transformer_cls_to_wraps    r    policy.fsdp2_prepare_auto_wrap_policy.<locals>.policy  s%    99Afe,C&DEEr   c                 b   > [        S U R                  5        5       5      nUTR                  :  $ )Nc              3   @   #    U  H  oR                  5       v   M     g 7f)N)numel).0r  s     r    	<genexpr>Afsdp2_prepare_auto_wrap_policy.<locals>.policy.<locals>.<genexpr>  s     #K7J!GGII7Js   )sum
parametersmin_num_params)r   module_num_paramsrI  s     r    r^  r_    s.     ##Kv7H7H7J#K K$|'B'BBBr   )torch.distributed.fsdp.wraprX  rY  auto_wrap_policyr4  	functoolspartialfuncrF   r   r\  setr   r|   addrd   r   r   bool)rI  r0   rX  rY  fnno_split_modulesr\  layer_classtransformer_clsr^  r]  s   `         @r    r  r    s    f		&	&B"i''((WW	)"5*=tD#!(,-=(>%55A,8,V,V)"%%8K8LO& #N{m[i!jkk#''8	 9	F588?? 	Ft 	F 	F M 
	*	C588?? 	Ct 	C M r   c                      SSK Jn  U" S0 U D6$ )z
Returns a `GradScaler` for FSDP2, as the current implementation of `get_grad_scaler` doesn't accept other args. We
need this as current `get_grad_scaler` accepts only `distributed_type` as arg, which doesn't differentiate between
FSDP1 and FSDP2
r   )
GradScalerr   )torch.amp.grad_scalerrw  )kwargsrw  s     r    get_fsdp2_grad_scalerrz    s     1r   named_paramsc                    U R                  5        VVs0 s H  u  pUR                  SS5      U_M     n nnU R                  5        VVs0 s H0  u  pUR                  S5      (       a  UR                  SS5      OUU_M2     n nnU R                  5        VVs0 s H  u  pUR                  SS5      U_M     n nnU $ s  snnf s  snnf s  snnf )a  Removes parameter name modifiers in order to map them back to their original names.

See huggingface/accelerate#3554 for more context.

Args:
    named_params (`dict`): The named parameters dictionary to canonicalize.

Returns:
    `dict`: The canonicalized named parameters dictionary
z._checkpoint_wrapped_module z
_orig_mod.z
._orig_mod)r   replace
startswith)r{  rN  rO  s      r    fsdp2_canonicalize_namesr    s     Q]PbPbPdePdAII;R@!CPdLeXdXjXjXlXlPTPQq||L'A'A		,#q!KXl   @L?Q?Q?ST?StqAIIlB/2?SLT f Us   B:	7C Cmodulesc                    U c
  [        5       $ / n[        U [        5      (       am  [        R                  " U 5      n/ nUR                  5        H?  u  pgUR                  U5      (       d  M  UR                  U5        UR                  U5        MA     Un U  H+  nUR                  [        UR                  5       5      5        M-     [        U5      $ )zConverts modules to parameters where modules can be a string or list of torch.nn.Module

Args:
    modules (`Union[Iterable[torch.nn.Module], str]`): List of modules

Returns:
    `set[torch.nn.Parameter]`: List of parameters
)ro  r4  strrecompilenamed_modules	fullmatchr   r   extendr   rg  )r  r0   r   rg  regmapped_modulesr   r   s           r    r;  r;    s     uJ'3jj!!//1LD}}T""		&!%%f- 2 !$v00234 z?r   )FN)r   F)r   )T)TF)Dr@  rl  r   r  r   rF  collectionsr   collections.abcr   
contextlibr   pathlibr   typingr   r   rd   loggingr
   	constantsr   r   r   r   dataclassesr   modelingr   r   otherr   r   r   versionsr   r>  rb   r!   r$   r3   r8   rI   rr   r   r   r   r  rq  r   r   r   r   r   r   dictr  r   	Optimizerr  r!  rV  r  rz  r  ro  	Parameterr;  r   r   r    <module>r     s     	 	   # $ "  "    W W 3 ? J J & 
H	:;"1"76tEP-@`0Xfc c gk : kp44&)4?C4cg4n+$ +$ +$\J588?? JT JZ
1F1F 
QU 
>uxx BoEHHOO o od2 2HV[V^V^VeVeUfhlUlLm 2j 4 D &8EHHOO,c12		r   