
    ;i                     h   S SK r S SKrS SKrS SKJr  S SKJr  S SKrS SKJ	s  J
r  S SKJrJrJr  S SKJr  SSKJr  SSKJr  S	S
KJr  S	SKJrJr  \" 5       (       a  S SKJrJr  S SKJr   S SKJ!r!  S SK"J#r#  S SK$J%r%  S SK&J'r'  S SK(J)r)J*r*  S SK+J,r,  S SK-J.r.  S SK/J0r0J1r1  S SK2J3r3J4r4  S SK5J6r6  S SK7J8r8J9r9J:r:J;r;  S SK<J=r=  S SK>J?r?J@r@JArAJBrB  S SKCJDrDJErEJFrFJGrGJHrH  S SKIJJrJJKrKJLrL  S SKMJNrN  S SKOJPrPJQrQJRrRJSrSJTrTJUrU  S SKVJWrW  S SKXJYrYJZrZJ[r[J\r\J]r]J^r^  S S K_J`r`JaraJbrbJcrc  S=S! jrdS" re " S# S$5      rfS% rgS& rh " S' S(\5      riS) rj " S* S+5      rk " S, S-\5      rlS. rm " S/ S0\5      rn " S1 S2\n5      ro " S3 S4\n5      rp " S5 S6\n5      rqS7 rrS0 4S8 jrs " S9 S:\R                  R                  5      ruS; rvS< rwg)>    N)ABC)partial)BCEWithLogitsLossCrossEntropyLossMSELoss)DistributedDataParallel   )AcceleratedOptimizer)AcceleratedScheduler   )is_megatron_lm_available)recursively_applysend_to_device)mputensor_parallel)finalize_model_grads)	ModelType)get_num_microbatches)get_megatron_optimizer)get_tensor_model_parallel_group"get_tensor_model_parallel_src_rank)get_forward_backward_func)get_model_config)broadcast_int_listbroadcast_tensor)%beam_search_and_return_on_first_stage/generate_tokens_probs_and_return_on_first_stage)build_train_valid_test_datasets)	BertModelFloat16ModuleGPTModelT5Model)Classification)get_argsget_tensorboard_writerget_tokenizerprint_rank_last)_add_data_args_add_validation_args!core_transformer_config_from_args
parse_argsvalidate_args)load_args_from_checkpointload_checkpointsave_checkpoint)set_global_variables)_compile_dependencies_init_autoresume_initialize_distributed_set_random_seedset_jit_fusion_optionswrite_args_to_tensorboard)_vocab_size_with_padding)%build_train_valid_test_data_iteratorsget_optimizer_param_schedulernum_floating_point_operationssetup_model_and_optimizer
train_steptraining_log))average_losses_across_data_parallel_groupcalc_params_l2_normget_ltor_masks_and_position_idsunwrap_modelc           
      H   [        5       nUR                  (       a  SOSnUR                  S:X  a'  [        SUR                   SU S35        [        S5        [        U5      nUR                  S:X  aZ  UR                  (       a/  UR                  (       a  S	OSn[        UUUR                  S
U US9nU$ [        UUR                  S	U US9n U$ UR                  S:X  a  [        USS
U US9nU$ UR                  S:X  a  [        USS
U UUUS9nU$ [        SUR                   35      e)zBuild the model.zpre-trainingzfine-tuningr   z	Building z model in the z mode.zThe Megatron LM model weights are initialized at random in `accelerator.prepare`. Please use `accelerator.load_checkpoint` to load a pre-trained checkpoint matching the distributed setup.bertr	   T)confignum_tokentypesadd_binary_headparallel_outputpre_processpost_process)rD   num_classesrE   rH   rI   gpt)rD   rE   rG   rH   rI   t5)rD   rE   rG   rH   rI   add_encoderadd_decoderUnsupported model type: )r$   pretraining_flagrankprintmodel_type_namer*   bert_binary_headr   r#   
num_labelsr!   r"   
ValueError)	rH   rI   rM   rN   argsmoderD   rE   models	            l/home/dmtnaga/Documents/work/airagagent/rag_env/lib/python3.13/site-packages/accelerate/utils/megatron_lm.pymodel_provider_funcr[   [   sS   :D!22>DyyA~	$../~dV6JKx	
 /t4Fv%  "&"7"7QQN- $ 5 5 $')EH L7 # OO ')E6 L) 
			& #%
& L 
			% #%##
 L 3D4H4H3IJKK    c                    U R                  S5        [        5       nU R                  R                  R                  b  U R                  R                  R
                  c  [        S5      eU R                  R                  R
                  nU R                  R                  R	                  U5      n[        X5      n[        XS S9nO[        R                  nUR                  S:X  a  [        R                  n[        nU R                  R                  R
                  b   U R                  R                  R
                  n[        UUUR                  UR                   UR"                  S9u  p4n[%        U5      Ul        X4U4$ )Nz#Preparing model optimizer schedulerzaYou must provide a `custom_model_provider_function` when using a `custom_prepare_model_function`.)	schedulerrL   )no_wd_decay_condscale_lr_condlr_mult)rR   r$   statemegatron_lm_plugincustom_prepare_model_functioncustom_model_provider_functionrV   prepare_optimizerprepare_schedulerr   encoder_or_decoderrS   encoder_and_decoderr[   r;   r_   r`   ra   len	model_len)acceleratorrW   custom_model_provider_funcrY   	optimizerr^   
model_typemodel_provider_func_s           rZ   !prepare_model_optimizer_schedulerrq      sE   ;<:D++IIU//NNVs  &1%6%6%I%I%h%h"!!44RRSmn%k9	%kM	11
4'"66J2//NNZ#.#4#4#G#G#f#f (A !22,,LL)
%9 ZDNY&&r\   c                   0    \ rS rSrSrS rS rS rS rSr	g)	MegatronLMDummyDataLoader   z
Dummy dataloader presents model parameters or param groups, this is primarily used to follow conventional training

Args:
    **dataset_kwargs: Megatron data arguments.
c                     [         R                  " 5       n[        U5      n[        U5      nUR	                  5       n[        US   5      U l        U R                  R                  U5        SU R                  S'   g )Nr   Tmegatron_dataset_flag)argparseArgumentParserr(   r)   parse_known_argsvarsdataset_argsupdate)selfdataset_kwargsparser	data_argss       rZ   __init__"MegatronLMDummyDataLoader.__init__   sh    ((*'%f-++-	 1.  05912r\   c                     [        5       nU R                  R                  5        H9  u  p#[        XS5      nXC:w  a  [	        SU SU SU SU 35        [        XU5        M;     g )N z<WARNING: MegatronLMDummyDataLoader overriding arguments for : with )r$   r{   itemsgetattrrR   setattr)r}   rW   keyvalue	old_values        rZ   set_megatron_data_args0MegatronLMDummyDataLoader.set_megatron_data_args   sq    z++113JC2.I!RSVRWWXYbXccijminnopuovw Du% 4r\   c                    S nUR                   R                  R                  b   UR                   R                  R                  $  [        5       nUR                  S:X  a  SSKJn  SUl        U$ UR                  S:X  a  SSKJn  SUl        U$ UR                  S:X  a  SSK	Jn  SUl        U$  U$ ! [         a     U$ f = f)Nc                 Z   [        5       n[        UR                  [        [        45      (       a  UR                  OUR                  /UR
                  U UR                  S.nUR                  S:X  a)  UR                  UR                  UR                  S.5        OUR                  S:X  a  UR                  SUR                  05        ORUR                  S:X  a*  UR                  UR                  UR                  SS.5        O[        SUR                   35      e[        S	0 UD6u  p4nX4U4$ )
z&Build train, valid, and test datasets.)data_prefixsplits_stringtrain_valid_test_num_samplesseedrC   )max_seq_lengthbinary_headrK   r   rL   )r   max_seq_length_decdataset_typerO    )r$   
isinstance	data_pathlisttuplesplitr   rS   r|   
seq_lengthrT   encoder_seq_lengthdecoder_seq_lengthrV   r   )train_val_test_num_samplesrW   r{   train_dsvalid_dstest_dss         rZ   "train_valid_test_datasets_providerlMegatronLMDummyDataLoader.get_train_valid_test_datasets_provider.<locals>.train_valid_test_datasets_provider   s   :D1;DNNTSXM1Z1Zt~~aeaoao`p!%0J			L ##v-##*.//'+'<'< %%.##($//
 %%-##*.*A*A.2.E.E(, !#;D<P<P;Q!RSS*I*YL*Y'Hw..r\   rC   r   )r   TrK   rL   )rb   rc   *custom_megatron_datasets_provider_functionr$   rS   pretrain_bertr   is_distributedpretrain_gptpretrain_t5ImportError)r}   rl   r   rW   s       rZ   &get_train_valid_test_datasets_provider@MegatronLMDummyDataLoader.get_train_valid_test_datasets_provider   s    !	/F //ZZf$$77bbb	:D##v-LDH2A99%%.KDH2A99%%-JDH2A99	 . 21  	11	s   (B0 /B0 B0 0
B>=B>c                 x   [        5       nU R                  U5      nUR                  b  / n/ n/ n[        [	        USS5      5       H`  n[
        R                  " U5        [        U5      nUR                  US   5        UR                  US   5        UR                  US   5        Mb     O[        U5      u  pEnXEU4$ )Nrk   r   r   r	   )	r$   r   $virtual_pipeline_model_parallel_sizeranger   r   (set_virtual_pipeline_model_parallel_rankr8   append)	r}   rl   rW   !train_valid_test_dataset_providertrain_data_iteratorvalid_data_iteratortest_data_iteratori	iteratorss	            rZ   r8   ?MegatronLMDummyDataLoader.build_train_valid_test_data_iterators  s    z,0,W,WXc,d)44@"$"$!#74a89<<Q?ABcd	#**9Q<8#**9Q<8")))A,7 : Lq1LH6H #9KKKr\   )r{   N)
__name__
__module____qualname____firstlineno____doc__r   r   r   r8   __static_attributes__r   r\   rZ   rs   rs      s    :&:2xLr\   rs   c                     " S S5      nUS L n[         R                  " U[         R                  U R                  S9n[         R                  R                  U[        5       [        5       S9  U(       d  U(       a  U" 5       $ U$ )Nc                        \ rS rSrS rS rSrg)?_handle_megatron_data_iterator.<locals>.DummyMegatronDataloaderi  c                     U $ Nr   r}   s    rZ   __iter__H_handle_megatron_data_iterator.<locals>.DummyMegatronDataloader.__iter__  s    Kr\   c                     0 $ r   r   r   s    rZ   __next__H_handle_megatron_data_iterator.<locals>.DummyMegatronDataloader.__next__!  s    Ir\   r   N)r   r   r   r   r   r   r   r   r\   rZ   DummyMegatronDataloaderr     s    		r\   r   dtypedevicegroup)torchtensorboolr   distributed	broadcastr   r   )rl   data_iteratorr   is_data_iterator_emptyis_src_data_iterator_emptys        rZ   _handle_megatron_data_iteratorr     sw      +d2!&.DEJJ_j_q_q!r	"$F$HPoPq    &*@&((r\   c                 0   U R                  S5        [        5       nUR                  (       Gd3  SSKJnJn  UR                  UR                  -  nU Vs0 s H  of[        XX6   5      _M     nnUS   cS  [        US   [        R                  R                  R                  5      (       a
  XWS   l        OUS	 US	 US	 XWS   l        OUS	 XWS'   [        R                  R                  R                  " UR                   40 UD6nU" UU R"                  [$        R&                  " 5       [$        R(                  " 5       SS	U R*                  R-                  5       U R.                  S
9$ UR0                  b   UR0                  u  Ul        Ul        Ul        OSu  Ul        Ul        Ul        UR                  UR                  -  Ul        UR9                  U 5      u  nn	n
UR                  UR                  -  Ul        [;        XS9n[;        X	S9n	[;        X
S9n
XU
4$ s  snf )NzPreparing dataloaderr	   )_PYTORCH_DATALOADER_KWARGSprepare_data_loader
batch_sizesamplershufflebatch_samplerFT)num_processesprocess_indexsplit_batchesput_on_device	rng_typesdispatch_batches)r   r   r   )rl   r   )rR   r$   rv   data_loaderr   r   micro_batch_sizenum_micro_batchesr   r   r   utilsdataBatchSamplerr   
DataLoaderdatasetr   r   get_data_parallel_world_sizeget_data_parallel_rankr   copyr   consumed_samplesconsumed_train_samplesconsumed_valid_samplesconsumed_test_samplesr8   r   )rl   
dataloaderrW   r   r   r   kkwargsr   r   r   s              rZ   r   r   .  s   ,-:D%%%Q0043I3IITnoTnqWZ,F,IJJTno,'&+U[[-=-=-J-JKK/?y!,9%9%<(5E'2'#3< [[%%001C1CNvN
 #::<446!++002(99	
 		
   ,
 %%	++* dk`D')DdF` $ 5 58N8N N <<[I		
 $ 5 59O9O O<#
 =#
 <v"9KKKo ps   Hc                   H   ^  \ rS rSrU 4S jrSS jrS r\S 5       rSr	U =r
$ )MegatronLMOptimizerWrapperip  c                 $   > [         TU ]  USS S9  g )NF)device_placementscalersuperr   )r}   rn   	__class__s     rZ   r   #MegatronLMOptimizerWrapper.__init__q  s    U4Hr\   c                     g r   r   )r}   set_to_nones     rZ   	zero_grad$MegatronLMOptimizerWrapper.zero_gradt      r\   c                     g r   r   r   s    rZ   stepMegatronLMOptimizerWrapper.stepw  r  r\   c                 .    U R                   R                  $ )zTWhether or not the optimizer step was done, or skipped because of gradient overflow.)rn   skipped_iterr   s    rZ   step_was_skipped+MegatronLMOptimizerWrapper.step_was_skippedz  s     ~~***r\   r   r   )r   r   r   r   r   r  r
  propertyr  r   __classcell__r  s   @rZ   r   r   p  s'    I + +r\   r   c                     U R                  S5        [        5       n[        XR                  UR                  UR
                  5      $ )NzPreparing optimizer)rR   r$   r   r_   r`   ra   )rl   rY   rW   s      rZ   rf   rf     s:    +,:D!%)>)>@R@RTXT`T`aar\   c                   "    \ rS rSrSrSS jrSrg)MegatronLMDummyScheduleri  a  
Dummy scheduler presents model parameters or param groups, this is primarily used to follow conventional training
loop when scheduler config is specified in the deepspeed config file.

Args:
    optimizer (`torch.optim.optimizer.Optimizer`):
        The optimizer to wrap.
    total_num_steps (int):
        Total number of steps.
    warmup_num_steps (int):
        Number of steps for warmup.
    **kwargs (additional keyword arguments, *optional*):
        Other arguments.
Nc                 4    Xl         X l        X0l        X@l        g r   )rn   total_num_stepswarmup_num_stepsr   )r}   rn   r  r  r   s        rZ   r   !MegatronLMDummyScheduler.__init__  s    ". 0r\   )r   rn   r  r  Nr   )r   r   r   r   r   r   r   r   r\   rZ   r  r    s    r\   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )MegatronLMSchedulerWrapperi  c                 $   > [         TU ]  X5        g r   r   )r}   r^   
optimizersr  s      rZ   r   #MegatronLMSchedulerWrapper.__init__  s    /r\   c                     g r   r   )r}   rW   r   s      rZ   r
  MegatronLMSchedulerWrapper.step  s    r\   r   )r   r   r   r   r   r
  r   r  r  s   @rZ   r  r    s    0 r\   r  c                 >    U R                  S5        [        U5      nU$ )NzPreparing scheduler)rR   r9   )rl   rn   r^   s      rZ   rg   rg     s!    +,-i8Ir\   c                   >   ^  \ rS rSrSrU 4S jrS rS rS rSr	U =r
$ )AbstractTrainStepi  z;Abstract class for batching, forward pass and loss handler.c                 .   > [         TU ]  5         Xl        g r   )r  r   name)r}   r&  r  s     rZ   r   AbstractTrainStep.__init__  s    	r\   c                     g r   r   )r}   rl   rv   s      rZ   get_batch_func AbstractTrainStep.get_batch_func  r  r\   c                     g r   r   r   s    rZ   get_forward_step_func'AbstractTrainStep.get_forward_step_func  r  r\   c                     g r   r   )r}   rl   s     rZ   get_loss_funcAbstractTrainStep.get_loss_func  r  r\   )r&  )r   r   r   r   r   r   r)  r,  r/  r   r  r  s   @rZ   r$  r$    s    E r\   r$  c                   >   ^  \ rS rSrSrU 4S jrS rS rS rSr	U =r
$ )BertTrainStepi  zW
Bert train step class.

Args:
    args (`argparse.Namespace`): Megatron-LM arguments.
c                 Z  > [         TU ]  S5        U R                  XR                  5      U l        U R                  XR                  UR                  5      U l        U R                  UR                  UR                  5      U l        UR                  (       d  S U l        g SSKJn  X0l        g )Nr2  r   )SequenceClassifierOutput)r  r   r)  rv   	get_batchr/  rP   rU   	loss_funcr,  rT   forward_stepmodel_return_dictmodel_output_classtransformers.modeling_outputsr4  )r}   rl   rW   r4  r  s       rZ   r   BertTrainStep.__init__  s    ),,[:T:TU++K9N9NPTP_P_` 66t7L7LdNcNcd%%&*D#N&>#r\   c                     S nS nUR                   R                  R                  b   UR                   R                  R                  $ U(       a	   SSKJn  U$ U$ ! [
         a     U$ f = f)Nc                 h   / SQn[         R                  nU b  [        U 5      nOSn[        R                  " XU5      nUS   R                  5       nUS   R                  5       nUS   R                  5       nUS   R                  5       nUS   R                  5       n	US   R                  5       n
XVXxX4$ )	Build the batch.)texttypeslabels	is_random	loss_maskpadding_maskNr?  r@  rB  rC  rA  rD  r   int64nextr   broadcast_datalongfloat)r   keysdatatyper   data_btokensr@  sentence_orderrC  	lm_labelsrD  s              rZ   get_batch_megatron8BertTrainStep.get_batch_func.<locals>.get_batch_megatron  s     YD{{H (M*$33DIF F^((*F7O((*E#K0557N{+113Ix(--/I!.1668L.YTTr\   c                    [        U 5      n[        U[        R                  R	                  5       5      nUS   R                  5       nUS   R                  5       nSU;   a  US   R                  5       nOSnSU;   a9  US   R                  5       nUS   S:g  R                  [        R                  5      nOSnSnSU;   a  US   R                  5       nOSnX$XvXS4$ )r>  	input_idsattention_masktoken_type_idsNrA  next_sentence_label)rG  r   r   cudacurrent_devicerI  torJ  )r   r   rN  rD  r@  rP  rC  rO  s           rZ   get_batch_transformer;BertTrainStep.get_batch_func.<locals>.get_batch_transformer  s    &D!$

(A(A(CDD +&++-F 01668L4'-.3354 N//1	!(^t377D	 	 	$,!%&;!<!A!A!C!%.YTTr\   r   r5  )rb   rc   custom_get_batch_functionr   r5  r   r}   rl   rv   rQ  r\  r5  s         rZ   r)  BertTrainStep.get_batch_func  ss    	U0	U2 //IIU$$77QQQ 3  
 )(	  %%   A 
A('A(c                    ^ ^ S nUU 4S jnUR                   R                  R                  b   UR                   R                  R                  $ U(       a  U$ U$ )Nc                    Uu  p4UR                  5       nU R                  5       n [        R                  " UR                  S5      U R	                  S5      -  5      U R                  5       -  nUbo  [
        R                  " UR                  SS5      R                  5       UR                  S5      SS9nUR                  5       nXV-   n[        XV/5      nXxS   US   S.4$ Un[        U/5      nUSUS   04$ )Nr	   )ignore_indexr   r   )lm losszsop lossrg  )rJ  r   sumviewreshapeFcross_entropyr>   )	rC  rO  output_tensorlm_loss_
sop_logitslm_losssop_losslossaveraged_lossess	            rZ   loss_func_pretrain7BertTrainStep.get_loss_func.<locals>.loss_func_pretrain  s    #0 H~~'H!)Iiib 1I4E4Eb4I IJY]]_\G%??:??2q+A+G+G+I>K^K^_aKbqst#>>+)"KWL_"`);YZI[\\\ "KWI"Vi);<<<r\   c                   > TS:X  a2  [        5       nU" UR                  S5      U R                  S5      5      nOTR                  S:  aa  U R                  [        R
                  [        R                  4;   a3  [        5       nU" UR                  ST5      U R                  S5      5      nO[        5       nU" X5      n[        U/5      nUSUS   04$ )Nr   re  rr  r   )
r   ri  rU   r   r   rI  intr   r   r>   )rA  logitsloss_fctrr  rs  rU   r}   s        rZ   loss_func_finetune7BertTrainStep.get_loss_func.<locals>.loss_func_finetune%  s    Q"9BRA1$&,,5::uyy:Q*Q+-B
 ;V[[_M,./GOO&/!"4555r\   rb   rc   custom_loss_function)r}   rl   rP   rU   rt  rz  s   `  `  rZ   r/  BertTrainStep.get_loss_func  sN    	=&	6 //DDP$$77LLL%%%%r\   c                    ^ ^^ UUU 4S jnU$ )Nc                    > TR                  U 5      u  p#pEpgT
(       d  SnT(       a  U" X'X6S9nU[        TR                  XT5      4$ U" X'US9n	U	[        TR                  U5      4$ )Forward step.Ntokentype_idsrP  )r  r5  r   r6  )r   rY   rN  r@  rO  rC  rA  rD  rm  rx  rT   rP   r}   s             rZ   r7  9BertTrainStep.get_forward_step_func.<locals>.forward_step;  sh    MQ^^\iMjJF>f# %f% b$gdnni&XXXv5Iwt~~v>>>r\   r   )r}   rP   rT   r7  s   ``` rZ   r,  #BertTrainStep.get_forward_step_func:  s    	? r\   r7  r5  r6  r9  r   r   r   r   r   r   r)  r/  r,  r   r  r  s   @rZ   r2  r2    s#    
?>)@'&R r\   r2  c                   >   ^  \ rS rSrSrU 4S jrS rS rS rSr	U =r
$ )GPTTrainStepiK  zV
GPT train step class.

Args:
    args (`argparse.Namespace`): Megatron-LM arguments.
c                   > [         TU ]  S5        U R                  XR                  5      U l        U R                  U5      U l        U R                  5       U l        UR                  S-
  U l
        UR                  b  [        5       nUR                  U l
        UR                  U l        UR                  U l        UR                   U l        UR"                  (       d  S U l        g SSKJn  X@l        g )Nr  r   r   )!CausalLMOutputWithCrossAttentions)r  r   r)  rv   r5  r/  r6  r,  r7  padded_vocab_size	eod_token
vocab_filer&   eodreset_position_idsreset_attention_maskeod_mask_lossr8  r9  r:  r  )r}   rl   rW   	tokenizerr  r  s        rZ   r   GPTTrainStep.__init__S  s    (,,[:T:TU++K8 668//!3??&%I&]]DN"&"9"9$($=$=!!//%%&*D#W&G#r\   c                    ^  U 4S jnU 4S jnUR                   R                  R                  b   UR                   R                  R                  $ U(       a	   SSKJn  U$ U$ ! [
         a     U$ f = f)Nc                   > S/n[         R                  nU b  [        U 5      nOSn[        R                  " XU5      nUS   R                  5       nUSS2SS24   R                  5       nUSS2SS24   R                  5       n[        UTR                  TR                  TR                  TR                  5      u  pn
XvXU
4$ )zGenerate a batchr?  Nr   re  )r   rF  rG  r   rH  rI  
contiguousr@   r  r  r  r  )r   rK  rL  r   rM  tokens_rA  rN  rU  rC  position_idsr}   s              rZ   rQ  7GPTTrainStep.get_batch_func.<locals>.get_batch_megatrong  s     8D{{H (M*$33DIF Vn))+GQU^..0FQV_//1F 7V(?(?AZAZ\`\n\n73N| 9lJJr\   c                 $  > [        U 5      nSUS   0n[        U[        R                  R	                  5       5      nUS   R                  5       n[        R                  " UR                  S   S4UR                  UR                  S9T	R                  -   n[        R                  " X#/SS9nUS S 2SS 24   R                  5       nUS S 2S S24   R                  5       n[        UT	R                  T	R                  T	R                  S5      u  pgnXTXvU4$ )NrT  r   r   r   dimre  T)rG  r   r   rY  rZ  rI  zerosshaper   r   r  concatr  r@   r  r  )
r   r   r  paddingrA  rN  rU  rC  r  r}   s
            rZ   r\  :GPTTrainStep.get_batch_func.<locals>.get_batch_transformer  s    &Dk!23D!$

(A(A(CDD;',,.Gkk7==#3Q"7w}}U\UcUcdgkguguuGllG#51=GQU^..0FQV_//1F6U(?(?AZAZ\`73N| 9lJJr\   r   r^  )rb   rc   r_  r   r5  r   r`  s   `     rZ   r)  GPTTrainStep.get_batch_funcf  st    	K2	K  //IIU$$77QQQ 2  
 )(	  %%s   A! !
A/.A/c                    ^ [        5       mU4S jnUR                  R                  R                  b   UR                  R                  R                  $ U$ )Nc                   > TR                   (       a  Uu  p#OUnUR                  5       nU R                  S5      R                  5       n TR                  S:  a  [        R
                  " [        R                  " UR                  S5      U -  5      R                  S5      U R                  5       R                  S5      /5      n[        R                  R                  U[        R                  " 5       S9  US   US   -  nO9[        R                  " UR                  S5      U -  5      U R                  5       -  nTR                  (       au  [        R                  R                  5       nUR                  5       (       aB   SU S[        R                  R                  5        S[         R"                  " 5       S    35       e[%        U/5      nSUS   0nTR                   (       a  UR'                  S	W05        XG4$ )
Nre  r   r   r   zRank z7: found NaN in local forward loss calculation. Device: z, node: rg  rx  )return_logitsrJ  ri  context_parallel_sizer   catrh  r   
all_reducer   get_context_parallel_groupcheck_for_nan_in_loss_and_gradget_rankisnanrY  rZ  osunamer>   r|   )	rC  rm  lossesrx  rr  global_rankaveraged_lossoutput_dictrW   s	           rZ   r6  -GPTTrainStep.get_loss_func.<locals>.loss_func  s   !!!.&\\^F!r*002I))A-yy%))FKKOi,G"H"M"Ma"PR[R_R_RaRfRfghRi!jk!!,,T9W9W9Y,ZAwa(yyR9!<=	O 22#//88:::<< K= )$zz88:;8BHHJqM?T' FtfMM$mA&67K!!""Hf#56$$r\   )r$   rb   rc   r}  )r}   rl   r6  rW   s      @rZ   r/  GPTTrainStep.get_loss_func  sG    z	%< //DDP$$77LLLr\   c                    ^  U 4S jnU$ )Nc                 l   > TR                  U 5      u  p#pEnU" X&XSS9nU[        TR                  U5      4$ )r  )rA  r  )	r   rY   rN  rA  rC  rU  r  rm  r}   s	           rZ   r7  8GPTTrainStep.get_forward_step_func.<locals>.forward_step  s?     GKnnUbFcCFI|!&VM '$..)"DDDr\   r   r}   r7  s   ` rZ   r,  "GPTTrainStep.get_forward_step_func  s    	E r\   )r  r  r7  r5  r6  r9  r  r  r  r  s   @rZ   r  r  K  s$    H&6)p#J	 	r\   r  c                   n   ^  \ rS rSrSrU 4S jr\S 5       r\S 5       r\S 5       r	S r
S rS	 rS
rU =r$ )T5TrainStepi  zU
T5 train step class.

Args:
    args (`argparse.Namespace`): Megatron-LM arguments.
c                   > [         TU ]  S5        U R                  XR                  5      U l        U R                  U5      U l        U R                  5       U l        UR                  (       d  S U l
        g SSKJn  X0l
        g )Nr  r   )Seq2SeqLMOutput)r  r   r)  rv   r5  r/  r6  r,  r7  r8  r9  r:  r  )r}   rl   rW   r  r  s       rZ   r   T5TrainStep.__init__  se    ',,[:T:TU++K8 668%%&*D#E&5#r\   c                 \    U R                  S5      nU R                  S5      nX-  nUS:  nU$ )Nr   r	         ?)	unsqueeze)rU  attention_mask_b1sattention_mask_bs1attention_mask_bssextended_attention_masks        rZ   attn_mask_postprocess!T5TrainStep.attn_mask_postprocess  s@     ,55a8+55a8/D"4s":&&r\   c                 f    [         R                  " [         R                  " SX 4US95      nUS:  nU$ Nr   r   r  )r   trilones)r   r   rU  s      rZ   get_decoder_maskT5TrainStep.get_decoder_mask  s1    EJJ:/JSY$Z['#-r\   c                     U R                   u  p4U R                  S5      n[        R                  " X1S4US9nXe-  nUS:  nU$ r  )r  r  r   r  )	rU  dec_seq_lengthr   r   _r  r  r  r  s	            rZ   get_enc_dec_maskT5TrainStep.get_enc_dec_mask  sS    &,,
 ,55a8"ZZQ(GPVW/D"4s":&&r\   c                     S nS nUR                   R                  R                  b   UR                   R                  R                  $ U(       a	   SSKJn  U$ U$ ! [
         a     U$ f = f)Nc                 N   / SQn[         R                  nU b  [        U 5      nOSn[        R                  " XU5      nUS   R                  5       nUS   R                  5       nUS   R                  5       nUS   R                  5       nUS   S:  n	US	   S:  n
US
   S:  nXVXXU4$ )r>  )text_enctext_decrA  rC  enc_maskdec_maskenc_dec_maskNr  r  rA  rC  r  r  r  r  rE  )r   rK  rL  r   rM  
tokens_enc
tokens_decrA  rC  r  r  r  s               rZ   rQ  6T5TrainStep.get_batch_func.<locals>.get_batch_megatron  s     kD{{H (M*$33DIF  
+002J
+002JH%**,F{+113Ij)C/Hj)C/H!.1C7L9hR^^^r\   c                 2   [        U 5      n[        U[        R                  R	                  5       5      nUS   R                  5       nUS   R                  5       nUS:g  R                  [        R                  5      nSU;   a  US   R                  5       nOkUR                  UR                  UR                  [        R
                  S9nUSSS24   R                  5       USS	S24'   S
US'   UR                  US:H  S
5        [        R                  US   R                  5       5      n[        R                  UR                  S	   UR                  5      n[        R!                  US   R                  5       UR                  S	   UR                  5      nX%XCXgU4$ )r>  rT  rA  rW  decoder_input_ids)r   r   .Nre  r   r   ).r   rU  )rG  r   r   rY  rZ  rI  r[  rJ  	new_zerosr  r   clonemasked_fill_r  r  r  r  )	r   r   r  rA  rC  r  r  r  r  s	            rZ   r\  9T5TrainStep.get_batch_func.<locals>.get_batch_transformer  st   &D!$

(A(A(CDDk*//1J(^((*F4++EKK8I"d*!"56;;=
#--fll6==X]XbXb-c
&,S#2#X&6&<&<&>
37#%&
6"''
d(:A>"88>N9O9T9T9VWH"33J4D4DQ4GIZIZ[H&77%&++-z/?/?/BJDUDUL 9hR^^^r\   r   r^  )rb   rc   r_  r   r5  r   r`  s         rZ   r)  T5TrainStep.get_batch_func  ss    	_2	_. //IIU$$77QQQ 1  
 )(	  %%rb  c                     S nUR                   R                  R                  b   UR                   R                  R                  $ U$ )Nc                     UR                  5       n[        R                  " UR                  S5      U R	                  S5      -  5      U R                  5       -  nUn[        U/5      nUSUS   04$ )Nre  rg  r   )rJ  r   rh  ri  rj  r>   )rC  rm  rn  rp  rr  rs  s         rZ   r6  ,T5TrainStep.get_loss_func.<locals>.loss_funcB  sh    $**,Hiib 1I4E4Eb4I IJY]]_\GDG	RO)_Q%7888r\   r|  )r}   rl   r6  s      rZ   r/  T5TrainStep.get_loss_funcA  s?    	9 //DDP$$77LLLr\   c                    ^  U 4S jnU$ )Nc           
      t   > T
R                  U 5      u  p#pEpgnU" X#XgUSUS9n	U	[        T
R                  U5      4$ )r  Nr  r  )r   rY   r  r  rC  rP  r  r  r  rm  r}   s             rZ   r7  7T5TrainStep.get_forward_step_func.<locals>.forward_stepP  sU     ^b]k]k^ZJI(l "LX\hqM !'$..)"DDDr\   r   r  s   ` rZ   r,  !T5TrainStep.get_forward_step_funcO  s    	E r\   r  )r   r   r   r   r   r   staticmethodr  r  r  r)  r/  r,  r   r  r  s   @rZ   r  r    s^    
6 
' 
'  
 	' 	'=)~ r\   r  c                      [        5       n [        5         U R                  S:X  a  [        SU R                   S35        [        U R                  U R                  5        g )Nr   z> setting random seeds to z ...)r$   r3   rQ   rR   r   r4   data_parallel_random_init)rW   s    rZ   finish_mpu_initr  `  sF    :D yyA~*499+T:;TYY > >?r\   c                 N   U R                  S5        [        R                  R                  5       (       d   S5       e[	        USS9nUR                  5        HM  u  pE[        X4S 5      b/  UR                  S:X  a  [        SU S[        X45       SU SU 3SS	9  [        X4U5        MO     UR                  (       d  UR                  S
S5      (       a  UR                  c   S5       e[        U5        [        U5        [        U5        [        5         [!        5         [#        5         [%        5         ['        5       n[        USS 5      c  [)        UR*                  U5      Ul        UR.                  S:X  a)  UR0                  (       a  UR2                  S:X  a  SUl        OSUl        SUl        g )NzInitializing Megatron-LMzMegatron requires CUDA.T)ignore_unknown_argsr   z*WARNING: overriding default arguments for r   r   )flushuse_checkpoint_argsFz/--use-checkpoints-args requires --load argumentr  rC   r	   )rR   r   rY  is_availabler+   r   r   rQ   r   r  getloadr-   r,   r0   r  r2   r1   r5   r$   r7   orig_vocab_sizer  rS   rP   rU   rT   	iteration)rl   extra_args_providerargs_defaultsrW   r   r   s         rZ   
initializer  m  s   01::""$$?&??$ )tDD $))+
4d#/yyA~@QwtGYFZZ`ad`eefglfmn 	5! , =#4#45JE#R#Ryy$W&WW$!$'$     :Dt($/7!9$:N:NPT!Uv%$*?*?DOOWXDX $ %DNr\   c                      ^  \ rS rSrSrU 4S jrS rS rS rS r	S r
S	 rS
 rS rS rS r        SS jrSrU =r$ )MegatronEnginei  z
Megatron-LM model wrapper

Args:
    accelerator (:class:`~accelerate.Accelerator`): The accelerator object to use.
    model: Megatron-LM model
    optimizer: Megatron-LM optimizer
    lr_scheduler: Megatron-LM lr scheduler
c                   > [         TU ]  5         X l        US   U l        X0l        X@l        [        5       nUR                  R                  R                  bK  UR                  R                  R                  " U40 UR                  R                  R                  D6U l        O{UR                  S:X  a  [        X5      U l        OZUR                  S:X  a  [        X5      U l        O9UR                  S:X  a  [        X5      U l        O[!        SUR                   35      eSU R                  l        0 U l        0 U l        SU l        SU l        SU l        S U l        UR0                  b  [3        5         g g )Nr   rC   rK   rL   rO   FT)r  r   module
base_modelrn   r^   r$   rb   rc   custom_train_step_classcustom_train_step_kwargstrain_step_handlerrS   r2  r  r  rV   r  total_loss_dicteval_total_loss_dictr  report_memory_flag$num_floating_point_operations_so_farmodule_configtensorboard_dirr6   )r}   rl   rY   rn   r^   rW   r  s         rZ   r   MegatronEngine.__init__  sD   (""z//GGS&1&7&7&J&J&b&b'#))<<UU'D# !!V+&3K&FD#!!U*&2;&ED#!!T)&1+&DD#78L8L7MNOO&+#  "$&!"&451!+%' ,r\   c                   ^ ^ [        5       n[        T R                  S   5      nT R                  R                  Ul        [        T R                  S   [        5      (       a  UR                  (       a  UR                  b   S5       eT R                   Vs/ s H  o3R                  PM     snUl	        [        T R                  5      S:X  a  UR                  S   Ul	        UR                  (       aX  T R                   Vs/ s H  o3R                  PM     snUl        [        T R                  5      S:X  a  UR                  S   Ul        UR                  (       ax  UR                   (       ag  [#        [        T R                  5      5       V^s/ s H
  mUU 4S jPM     snUl        [        T R                  5      S:X  a  UR$                  S   Ul        [&        Ul        U$ s  snf s  snf s  snf )Nr   zWhen overlap_grad_reduce is True, config.no_sync_func must be None; a custom no_sync_func is not supported when overlapping grad-reducer   c                 <   > TR                   R                  TU 5      $ r   )rn   finish_param_sync)xmodel_indexr}   s    rZ   <lambda>2MegatronEngine.get_module_config.<locals>.<lambda>  s    $..::;Jr\   )r$   r   r  rn   
scale_lossgrad_scale_funcr   LocalDDPoverlap_grad_reduceno_sync_funcno_syncrj   delay_grad_reducestart_grad_syncgrad_sync_funcoverlap_param_gatherdelay_param_gatherr   param_sync_funcr   finalize_model_grads_func)r}   rW   rD   model_chunkr  s   `   `rZ   get_module_config MegatronEngine.get_module_config  s   z!$++a.1!%!:!:dkk!nh//D4L4L&&. V. KO++"V+;#6#6+"VF4;;1$&,&9&9!&<#%%X\XcXc(dXc)D)DXc(d%t{{#q(,2,A,A!,DF)$$)@)@^cdghlhshsdt^u&^u{JJ^u&F" 4;;1$)/)?)?)B&+?( #W )e&s   G>G#G(c                     U R                    H  nUR                  5         M     U R                  c  U R                  5       U l        U R	                  5         g r   )r  trainr  r#  log_eval_resultsr}   model_modules     rZ   r&  MegatronEngine.train  sG     KKL  ( %!%!7!7!9Dr\   c                     U R                    H  nUR                  5         M     U R                  c  U R                  5       U l        g g r   )r  evalr  r#  r(  s     rZ   r,  MegatronEngine.eval  s@     KKL ( %!%!7!7!9D &r\   c                    [        5       n/ n[        U5      S:  a  UR                  S:  aq  [        SUR                  5       HV  nUR	                  UR                  5        VVs0 s H&  u  pVXVXBR                  -  US-   UR                  -   _M(     snn5        MX     OU/n[        U R                  5      S:  ad  [        U5      S:  a:  [        [        U R                  5      5       Vs/ s H  n[        U5      PM     snnU$ S /[        U R                  5      -  nU$ [        U5      S:  a  [        U5      OS nU$ s  snnf s  snf )Nr   r   )	r$   rj   r   r   r   r   r   r  iter)	r}   
batch_datarW   data_chunksr   r   vr  batch_data_iterators	            rZ   get_batch_data_iterator&MegatronEngine.get_batch_data_iterator  sC   zz?Q%%)q$"8"89A&& )3(8(8(:(: %:%:!:a!etG\G\=\]](: :  *lt{{a z?Q& -2#dkk2B,CD,Cqk",CD   #"	 Vc$++..   #" 8;:7J${"3PT""! Es   #-D;(Ec           	          U R                  U5      n[        U R                  R                  UU R                  U R
                  U R                  U R                  S9u  p4pVUS:H  U R
                  l        X4XV4$ )z`
Training step for Megatron-LM

Args:
    batch_data (:obj:`dict`): The batch data to train on.
)forward_step_funcr   rY   rn   opt_param_schedulerrD   r   )	r4  r<   r  r7  r  rn   r^   r  r  )r}   r0  r3  loss_reducedr  	grad_normnum_zeros_in_grads          rZ   r<   MegatronEngine.train_step
  ss     #:::FCM"55BB-++nn $%%D
@I '3a&7#9GGr\   c           
         [        5       nU R                  U5      n[        5       nU" U R                  R                  UU R
                  [        5       UR                  UR                  SS9nUR                  S:  a  [        R                  R                  5         U=R                  [        R                  " 5       UR                  -  [        5       -  -  sl        [        R                   " SS9(       as  0 nUS    Hf  nU Vs/ s H  oU   PM	     n	n[#        U	S   R$                  5      S:X  a  ['        U	5      [#        U	5      -  Xg'   MN  [        R(                  " U	5      Xg'   Mh     U$ 0 $ s  snf )ze
Evaluation step for Megatron-LM

Args:
    batch_data (:obj:`dict`): The batch data to evaluate on.
T)r7  r   rY   num_microbatchesr   r   forward_onlyr   )ignore_virtualr   )r$   r4  r   r  r7  r  r   r   r   empty_unused_memory_levelr   rY  empty_cacher   r   r   is_pipeline_last_stagerj   r  rh  r  )
r}   r0  rW   r3  forward_backward_func
loss_dictsr9  r   r  losses_reduced_for_keys
             rZ   	eval_stepMegatronEngine.eval_step!  sC    z":::F 9 ;*"55BB-++13!22

 ))Q.JJ""$##,,.1F1FFI]I__	
# %%T:L!!}:D)E*QC&*&)E-a06671<(+,B(CcJ`Fa(aL%(-5K(LL% %  	 *Fs   E(c                    [        5       nU R                  S   R                  (       Ga9  U R                  " S
0 UD6u  p4pVU =R                  S-  sl        [
        R                  " 5       UR                  -  [        5       -  nU=R                  U-  sl	        U =R                  [        X'5      -  sl
        UR                  b  U R                  R                  5       R                  5       nS n	UR                   (       a  [#        U R$                  5      n	['        UU R(                  U R                  R*                  S   S   U R                  UU R,                  UUU	U5
      U l        OU R.                  " S
0 UD6nUR                  b  U H  n
U R0                  R3                  U
[4        R6                  R9                  S/5      5      X:   -   U R0                  U
'   U R0                  R3                  U
S-   [4        R6                  R9                  S/5      5      [4        R6                  R9                  S/5      -   U R0                  U
S-   '   M     [4        R:                  " S[4        R6                  R=                  5       S9nU H'  n
[?        X:   R@                  5      S:X  d  M   XU
   -  nM)     S nSU;   a  US   nU RB                  RD                  b  U RB                  RE                  XS	9$ U$ )Nr   r   lr        
_num_iters      ?r  rx  )rr  rx  r   )#r$   r  trainingr<   r  r   r   r   r   r   r
  r:   r  rn   get_loss_scaleitemlog_params_normr?   rY   r=   r  param_groupsr	  rG  r  r  r   rY  FloatTensorr   rZ  rj   r  r  r9  )r}   r0  rW   	loss_dictr  r:  r;  r   
loss_scaleparams_normr   rr  rx  s                rZ   forwardMegatronEngine.forwardI  s    z;;q>"""DHOODaV`DaAIYNNaN99;d>S>SSVjVllJ'':5'559VW[9hh5##/!^^::<AAC
"''"5djj"AK*6((NN//248NN++ %+' 44I##/$C1155c5::;Q;QSVRW;XY\e\jj --c2 EID]D]DaDal*EJJ,B,BC5,IE

..u5E6D--cL.@A	 % ||C

(A(A(CDC9>''(A-#&  y x(F""55A**==4=WWr\   c                 &   [        5       nUR                  b  U R                  S:X  a  g [        5       n[        5       nSU R                   S3nU R                   H  nUR                  S5      (       a  M  U R                  U   U R                  US-      -  nX4 SU S3-  n[        R                  " [        SUR                  5       5      5      nUR                  (       a
  X4 SU S3-  nU(       d  M  UR                  U S3UR                  5       U R                  5        UR                  (       d  M  UR                  U S	3X`R                  5        M     [        U5      S
-   n[        SU-  5        [        U5        [        SU-  5        0 U l        g )Nr   zvalidation loss at iteration z | rL  z value:    z PPL: z validationz validation pplr   -)r$   r  r  r%   r  endswithmathexpminrP  rP   
add_scalarrj   r'   )r}   rW   writerstringr   r   ppllengths           rZ   r'  MegatronEngine.log_eval_results  sa   z'4>>Q+>z')00@D,,C||L))--c2T5N5NsUaOa5bbEXeWC00F((3r5::<01C$$EuC00v!!SE"5uzz|T^^T(((%%_&=sNNS - Vqf%f%$&!r\   c                 :   U R                  5         [        5       nXl        [        R                  R                  5         [        U R                  U R                  U R                  U R                  U R                  S9  [        R                  R                  5         g )N)r
  )r'  r$   saver   r   barrierr/   r  r  rn   r^   r
  )r}   
output_dirrW   s      rZ   r/   MegatronEngine.save_checkpoint  sm    z	!!#NNKKNNNN151Z1Z	
 	!!#r\   c                    [        5       nXl        SUl        SUl        [        R
                  R                  5         [        U R                  U R                  U R                  5      u  p4[        R
                  R                  5         X0l        X@l        UR                  (       a,  U R                  S:X  a  U R                  R                  5         g g g r  )r$   r  r   r   r   r   rh  r.   r  rn   r^   r  r
  fp16reload_model_params)r}   	input_dirrW   r  r
  s        rZ   r.   MegatronEngine.load_checkpoint  s    z	&'#&'#!!#:I$++W[WeWegkgugu:v7	!!#"4X1991,NN..0 -9r\   c
                 
   [        5       nUR                  S:w  a  [        S5      eUR                  S:  a  [	        S5      eUR
                  (       a  [	        S5      eUR                  b  [	        S5      eUR                  c  [	        S5      eUc  Uc  [	        S	5      eUc  S
nOSUs=:  a  S::  d  O  [	        S5      eUc  SnOSUs=::  a  S::  d  O  [	        S5      eUc  SnO1US:  a  US:  a  [	        S5      eSUs=::  a  S
::  d  O  [	        S5      eU
R                  SS5      nSUs=::  a  S
::  d  O  [	        S5      eU
R                  SS5      nSUs=::  a  S
::  d  O  [	        S5      eU
R                  SS5      n[        U[        5      (       d  [	        S5      eUnUbE  [        U[        5      (       d  [	        S5      eUS:  a  [	        S5      eUR                  S   S:  a  g[        5       nU
R                  SUR                  5      nUb   [        U[        5      (       d  [	        S5      eU	c  S
n	SnSnSn[        R                   R#                  5       S:X  Ga  Uc>  [        R$                  R'                  UR                  S   /UR                  S   -  5      nOUR)                  SS 9R%                  5       nUc  X1R                  S   -
  nUS::  a  [	        S!5      eU(       a  XAR                  S   -   S-   nS"[*        R,                  " US"-  5      -  nX1R                  S   S-   -
  n[        R$                  R'                  UR                  /U-  /UR                  S   -  5      n[        R.                  " [        R0                  " USS2S4   SS 9UR%                  5       U/SS 9nOXAR                  S   -   nS"[*        R,                  " US"-  5      -  nX1R                  S   -
  n[        R$                  R'                  UR                  /U-  /UR                  S   -  5      n[        R.                  " UR%                  5       U/SS 9nUR3                  S5      UR3                  S5      /n[5        S#USS$9nUR7                  5       n[9        U[        R:                  USS%9n[9        US   [        R:                  USS%9nU
R                  S&S5      n[        R<                  R?                  U5        [A        U RB                  [D        [F        [H        45      nUb  [K        UUUUUSU	S'9u  nnU$ [M        UUUSUUUUUS(S)9
u  n  nU$ )*a%  
Generate method for GPT2 model. This method is used for inference. Supports both greedy and beam search along
with sampling. Refer the Megatron-LM repo for more details

Args:
    inputs (torch.Tensor): input ids
    attention_mask (torch.Tensor, optional): attention mask. Defaults to None.
    max_length (int, optional): max length of the generated sequence. Defaults to None.
    Either this or max_new_tokens should be provided.
    max_new_tokens (int, optional): max number of tokens to be generated. Defaults to None.
    Either this or max_length should be provided.
    num_beams (int, optional): number of beams to use for beam search. Defaults to None.
    temperature (float, optional): temperature for sampling. Defaults to 1.0.
    top_k (int, optional): top k tokens to consider for sampling. Defaults to 0.0.
    top_p (float, optional): tokens in top p probability are considered for sampling. Defaults to 0.0.
    length_penalty (float, optional): length penalty for beam search. Defaults to None.
    kwargs: additional key-value arguments
rK   z1Generate method is not implemented for this modelr   z1Generate method requires data parallelism to be 1z9Generate method requires sequence parallelism to be FalseNz2Checkpoint activations cannot be set for inferencez$Vocab file is required for inferencez;`max_length` or `max_new_tokens` are required for inferencerM  rK  g      Y@zAtemperature must be a positive number less than or equal to 100.0r   i  z:top_k must be a positive number less than or equal to 1000z/top_p and top_k sampling cannot be set togetherz'top_p must be less than or equal to 1.0top_p_decayz-top_p_decay must be less than or equal to 1.0top_p_boundz-top_p_bound must be less than or equal to 1.0add_BOSFzadd_BOS must be a booleanzbeam_width must be an integerz!beam_width must be greater than 0z,When doing beam_search, batch size must be 1
stop_tokenzstop_token must be an integerre  )axisz%max_new_tokens must be greater than 0   r	   )int_listrQ   )r   rQ   random_seed)rt  num_return_genlength_penaltyT)return_output_log_probstop_ktop_prq  rr  temperature#use_eod_token_for_early_termination)'r$   rS   NotImplementedErrordata_parallel_sizerV   sequence_parallelrecompute_granularityr  r  r   r   rw  r  r&   r  r   r   r  rY  
LongTensorrh  r]  ceilr  r  sizer   tolistr   rF  randommanual_seedrA   r  torchDDPr  r    r   r   )r}   inputsrU  
max_lengthmax_new_tokens	num_beamsr~  r|  r}  rz  r   rW   rq  rr  rs  
beam_widthr  rt  
sizes_listprompts_tokens_tensorprompts_length_tensorr  sizes_tensorsizescontext_tokens_tensorcontext_length_tensorrx  unwrapped_modelrN  r  s                                 rZ   megatron_generate MegatronEngine.megatron_generate  s!   B z5(%&YZZ""Q&PQQ!!XYY%%1QRR??"CDD ."8Z[[K,u,`aa=Eu$$YZZ=ES[US[NOO5'C' !JKKjj4{)c)LMMjj4{)c)LMM**Y.7D))899
!j#.. !@AAA~ !DEE||A"E!O	ZZimm<
!j#.. !@AA! N
 $ $%%'1,%(-

(=(=v||A>ORXR^R^_`Ra>a(b%(6(:(:(:(C(H(H(J%%!+ll1o!=" !HII+ll1o=A
:>!::
!+||A/B!C**//)-->1Q0RU[UaUabcUd0de(-__WQT]<fkkmWU\^)%
 ,ll1o=
:>!::
!+ll1o!=**//)-->1Q0RU[UaUabcUd0de(-fkkmW5MTV(W% &**1-%**1-J *!jqI ##% 0Lahi j 0q5;;Odkl m jj2  -&t8]8[\!=%%% -IFA,  K%%(-'''48LFAq r\   )r  r  r  r  r  r
  rn   r	  r^   r  r  )NNNNNNNN)r   r   r   r   r   r   r#  r&  r,  r4  r<   rG  rW  r'  r/   r.   r  r   r  r  s   @rZ   r   r     sh    (>4 :#2H.&P=~'4$1  p pr\   r   c                     [        U 5      $ )z
Average losses across data parallel group.

Args:
    losses (List[Tensor]): List of losses to average across data parallel group.
)r>   )r  s    rZ   %avg_losses_across_data_parallel_groupr  q  s     5V<<r\   c                     S n[        XSS9$ )z
Recursively gather tensor in a nested list/tuple/dictionary of tensors from data parallel ranks.

Args:
    tensor (nested list/tuple/dictionary of `torch.Tensor`):
        The data to gather across data parallel ranks.

c                    U R                   S:X  a  U R                  5       S    n [        [        R                  R                  [        R                  " 5       S95       Vs/ s H  n[        R                  " U 5      PM     nn[        R                  R                  X [        R                  " 5       S9  [        R                  " USS9$ s  snf )Nr   r   r  )ndimr  r   r   r   get_world_sizer   get_data_parallel_group
empty_like
all_gatherr  )r   r  output_tensorss      rZ   _gpu_gather_one;gather_across_data_parallel_groups.<locals>._gpu_gather_one  s    ;;!\\^D)F 5,,;;#B]B]B_;`a
a V$a 	 
 	$$^3C^C^C`$ayyQ//
s    CT)error_on_other_type)r   )r   r  s     rZ   "gather_across_data_parallel_groupsr  |  s    0 _$OOr\   )TTTT)xrw   r]  r  abcr   	functoolsr   r   torch.nn.functionalnn
functionalrk  torch.nnr   r   r   torch.nn.parallel.distributedr   r  rn   r
   r^   r   importsr   
operationsr   r   megatron.corer   r   megatron.core.distributedr  r   megatron.core.enumsr   )megatron.core.num_microbatches_calculatorr   megatron.core.optimizerr   megatron.core.parallel_stater   r   megatron.core.pipeline_parallelr   megatron.core.utilsr   0megatron.inference.text_generation.communicationr   r   -megatron.inference.text_generation.generationr   r   "megatron.legacy.data.dataset_utilsr   megatron.legacy.modelr   r    r!   r"   $megatron.legacy.model.classificationr#   megatron.trainingr$   r%   r&   r'   megatron.training.argumentsr(   r)   r*   r+   r,   megatron.training.checkpointingr-   r.   r/   megatron.training.global_varsr0   megatron.training.initializer1   r2   r3   r4   r5   r6   %megatron.training.tokenizer.tokenizerr7   megatron.training.trainingr8   r9   r:   r;   r<   r=   megatron.training.utilsr>   r?   r@   rA   r[   rq   rs   r   r   r   rf   r  r  rg   r$  r2  r  r  r  r  Moduler   r  r  r   r\   rZ   <module>r     s{     	      A A M , , - 9 2M>-N>pI4e SQQC   lkB  O  2j'>jL jLZ$>LD+!5 + b .!5  "K% K\A$ AHN# Nb	@ 15B -`PUXX__ Ph=Pr\   