
    9i                         d dl Z d dlZd dlmZ d dlmZ d dlZd dlmc m	Z
 d dlmZmZmZ d dlmZ ddlmZ ddlmZ d	d
lmZmZ d	dlmZmZ  e       r
d dlmZmZmZ  e       rd dl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( d dl)m*Z*m+Z+m,Z,m-Z- d dl.m/Z/m0Z0m1Z1 d dl2m3Z3m4Z4 d dl5m6Z6 d dl7m8Z8m9Z9m:Z:m;Z;m<Z< d dl=m>Z>m?Z?m@Z@mAZAmBZB d dl=mZC d dlDmEZE d dlFmGZG d dlHmIZI d dlJmKZKmLZL d dlMmNZNmOZO d dlPmQZQ d dlRmSZSmTZTmUZU d dlVmWZWmXZXmYZYmZZZ d6dZ[d Z\ G d d      Z]d  Z^ G d! d"e      Z_d# Z` G d$ d%      Za G d& d'e      Zbd( Zc G d) d*e      Zd G d+ d,ed      Ze G d- d.ed      Zf G d/ d0ed      Zgdi fd1Zh G d2 d3ej                  j                        Zjd4 Zkd5 Zly)7    N)ABC)partial)BCEWithLogitsLossCrossEntropyLossMSELoss)DistributedDataParallel   )AcceleratedOptimizer)AcceleratedScheduler   )is_megatron_lm_availableis_transformers_available)recursively_applysend_to_device)!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSequenceClassifierOutput)get_argsget_num_microbatchesget_tensorboard_writer
get_timersget_tokenizermpuprint_rank_0print_rank_last)_add_data_args_add_validation_args
parse_argsvalidate_args)load_args_from_checkpointload_checkpointsave_checkpoint) MegatronPretrainingRandomSamplerMegatronPretrainingSampler)set_global_variables)_compile_dependencies_init_autoresume_set_random_seedset_jit_fusion_optionswrite_args_to_tensorboard)	BertModelFloat16ModuleGPTModel	ModelTypeT5Model)Classification)get_megatron_optimizer)get_forward_backward_func)broadcast_int_listbroadcast_tensor)%beam_search_and_return_on_first_stage/generate_tokens_probs_and_return_on_first_stage)_vocab_size_with_padding)	get_modelget_optimizer_param_schedulertraining_log))average_losses_across_data_parallel_groupcalc_params_l2_normget_ltor_masks_and_position_idsunwrap_modelc                    t               }|j                  rdnd}|j                  dk(  r't        d|j                   d| d       t        d       |j                  dk(  rS|j                  r,|j
                  rd	nd}t        ||j
                  d
| |      }|S t        |j                  d	| |      }|S |j                  dk(  rt        dd
| |      }|S |j                  dk(  rt        dd
| |||      }|S t        d|j                         )zBuild the model.zpre-trainingzfine-tuningr   z	Building z model in the z mode.zThe Megatron LM model weights are initialized at random in `accelerator.prepare`. Please use `accelerator.load_checkpoint` to load a pre-trained checkpoint matching the distributed setup.bertr	   T)num_tokentypesadd_binary_headparallel_outputpre_processpost_process)num_classesrA   rD   rE   gpt)rA   rC   rD   rE   t5)rA   rC   rD   rE   add_encoderadd_decoderUnsupported model type: )r   pretraining_flagrankprintmodel_type_namebert_binary_headr+   r0   
num_labelsr-   r/   
ValueError)rD   rE   rI   rJ   argsmoderA   models           f/var/www/html/backtest/airagagent/rag_env/lib/python3.12/site-packages/accelerate/utils/megatron_lm.pymodel_provider_funcrW   R   s5   :D!22>DyyA~	$../~dV6JKx	
 v%  "&"7"7QQN- $ 5 5 $')E0 L! # OOA;eqE  L 
			&4[gst L 
			% #%##
 L 3D4H4H3IJKK    c                 Z   | j                  d       t               }| j                  j                  j                  r| j                  j                  j
                  t        d      | j                  j                  j
                  }| j                  j                  j	                  |      }|S |j                  dv rt        j                  }nN|j                  dk(  r?t        j                  }|j                  #|j                  dkD  r|j                  dz  |_        t        t              }|S )NzPreparing modelzaYou must provide a `custom_model_provider_function` when using a `custom_prepare_model_function`.)r@   rG   rH   r   r	   )rN   r   statemegatron_lm_plugincustom_prepare_model_functioncustom_model_provider_functionrR   rO   r.   encoder_or_decoderencoder_and_decoder"pipeline_model_parallel_split_rankpipeline_model_parallel_sizer8   rW   )acceleratorrS   custom_model_provider_funcrU   
model_types        rV   prepare_modelre   z   s
   '(:D++IIU//NNVs  &1%6%6%I%I%h%h"!!44RRSmn L ?2"55J!!T)"66J66>4CdCdghCh:>:[:[_`:`7-z:LrX   c                   .    e Zd ZdZd Zd Zd Zd Zd Zy)MegatronLMDummyDataLoaderz
    Dummy dataloader presents model parameters or param groups, this is primarily used to follow conventional training

    Args:
        **dataset_kwargs: Megatron data arguments.
    c                     t        j                         }t        |      }t        |      }|j	                         }t        |d         | _        | j                  j                  |       d| j                  d<   y )Nr   Tmegatron_dataset_flag)argparseArgumentParserr   r   parse_known_argsvarsdataset_argsupdate)selfdataset_kwargsparser	data_argss       rV   __init__z"MegatronLMDummyDataLoader.__init__   sh    ((*'%f-++-	 1.  05912rX   c                 v    t               }| j                  j                         D ]  \  }}t        |||        y N)r   rn   itemssetattr)rp   rS   keyvalues       rV   set_megatron_data_argsz0MegatronLMDummyDataLoader.set_megatron_data_args   s7    z++113 	&JCD#u%	&rX   c                     d }|S )Nc                    t               }|j                  |j                  |j                  | |j                   |j
                  d}|j                  dk(  r?|j                  |j                  |j                  |j                  |j                  d       n|j                  dk(  r|j                  d|j                  i       ng|j                  dk(  r@|j                  |j                  |j                  |j                  |j                  dd       nt        d|j                         |j                  dk(  rd	d
lm} nd	d
lm}  |di |\  }}}|||fS )z&Build train, valid, and test datasets.)data_prefix	data_implsplits_stringtrain_valid_test_num_samplesskip_warmupseedr@   )max_seq_lengthmasked_lm_probshort_seq_probbinary_headrG   
seq_lengthrH   )r   max_seq_length_decr   r   dataset_typerK   r   )build_train_valid_test_datasets )r   	data_pathr   splitmmap_warmupr   rO   ro   r   	mask_probr   rP   encoder_seq_lengthdecoder_seq_lengthrR   megatron.data.gpt_datasetr   megatron.data.dataset_utils)train_val_test_num_samplesrS   rn   r   train_dsvalid_dstest_dss          rV   "train_valid_test_datasets_providerzlMegatronLMDummyDataLoader.get_train_valid_test_datasets_provider.<locals>.train_valid_test_datasets_provider   sC   :D#~~!^^!%0J$($4$4 4		L ##v-##*.//*...*.*=*='+'<'<	 %%.##$doo
 %%-##*.*A*A.2.E.E*...*.*=*=(, !#;D<P<P;Q!RSS##u,UW*I*YL*Y'HhXw..rX   r   )rp   r   s     rV   &get_train_valid_test_datasets_providerz@MegatronLMDummyDataLoader.get_train_valid_test_datasets_provider   s    +	/Z 21rX   c           	      P   |y t               }|j                  |j                  z  }|j                  dk(  r>t	        t        |      ||t        j                         t        j                               }n}|j                  dk(  rJt        |t        |      ||t        j                         t        j                         |j                        }n$t        dj                  |j                              t        j                  j                  j!                  |||j"                  d      S )Nsingle)total_samplesconsumed_samplesmicro_batch_sizedata_parallel_rankdata_parallel_sizecyclic)r   r   r   r   r   data_shardingz${} dataloader type is not supported.T)batch_samplernum_workers
pin_memory)r   r   num_micro_batchesdataloader_typer$   lenr   get_data_parallel_rankget_data_parallel_world_sizer#   r   	Exceptionformattorchutilsdata
DataLoaderr   )rp   datasetr   rS   r   r   s         rV   build_pretraining_data_loaderz7MegatronLMDummyDataLoader.build_pretraining_data_loader   s   ?z0043I3II 8+6!'l!1!1#&#=#=#?#&#C#C#EM !!X-<!'l!1!1#&#=#=#?#&#C#C#E"00M BII$J^J^_`` {{**=d>N>N[_ + 
 	
rX   c                    d }t               }d\  }}}t        d       |j                  dkD  r@|j                  dk(  r1|j                  J d       |j                  |j
                  z  |_        |j                  dkD  rS|j                  dk(  rD|j                  8|j                  |j                  z  |j                  z  |j
                  z  |_        t        j                         dk(  r|j                  r|j                  }n|j                  |j
                  z  }|j                  |j                  z  dz   |j                  z  }|j                  }|||j
                  z  ||j
                  z  g}	t        d       t        dj                  |	d                t        d	j                  |	d                t        d
j                  |	d                | j                         }
 |
|	      \  }}}| j                  ||j                        }| j                  ||j                        }| j                  |d      }|d uxr |j                  dkD  }|d uxr |j                  dkD  }|d uxr |j                  dkD  }t        j                   j#                  t%        |      t%        |      t%        |      g      }n!t        j                   j#                  g d      }t        j&                  j)                  |t        j*                         t        j,                                |d   j/                         |_        |d   j/                         |_        |d   j/                         |_        |j6                  }|dv sJ |"|dk(  rt9        |      nt9         ||            }nd }|"|dk(  rt9        |      nt9         ||            }nd }|"|dk(  rt9        |      nt9         ||            }nd }|||fS )Nc              3   $   K   	 | D ]  }|  wrv   r   )iterxs     rV   cyclic_iterzTMegatronLMDummyDataLoader.build_train_valid_test_data_iterators.<locals>.cyclic_iter   s"      AG s   )NNNz3> building train, validation, and test datasets ...r   z?only backward compatiblity support for iteration-based trainingr   z( > datasets target sizes (minimum size):z    train:      {}z    validation: {}z    test:       {}r	   r   r   r   group)r   r   r   )r   r   	iterationconsumed_train_samplestrain_samplesglobal_batch_sizeconsumed_valid_sampleseval_interval
eval_itersr   get_tensor_model_parallel_ranktrain_itersr   r   r   r   cuda
LongTensorintdistributed	broadcast"get_tensor_model_parallel_src_rankget_tensor_model_parallel_groupitemdo_traindo_validdo_testr   r   )rp   r   rS   train_dataloadervalid_dataloadertest_dataloaderr   r   
test_itersr   r   r   r   r   r   r   r   flagsdl_typetrain_data_iteratorvalid_data_iteratortest_data_iterators                         rV   %build_train_valid_test_data_iteratorsz?MegatronLMDummyDataLoader.build_train_valid_test_data_iterators   s   	
 z@R=	+_JK >>A$"="="B%%-p/pp-*...4;Q;Q*QD'>>A$"="="B!!)^^t'9'99T__LtOeOee +
 --/14!! $ 2 2 $ 0 043I3I I**d.@.@@1DWJJT333T333*&
 CD-445OPQ5RST-445OPQ5RST-445OPQ5RST 261\1\1^.*LMg*h'Hh  $AA(DLgLgh#AA(DLgLgh"@@!LO (t3L8H8H18LH't3K!8KH%T1Idoo6IGJJ))3x=#h-W*VWEJJ)))4E 	##399;3CfCfCh 	$ 	
 aaQx}} &&....'*1X*=%&4TdHeCf   #''*1X*=%&4TdHeCf   #'&:AX:Mo!6SWXcdsXtSu!%"$79KKKrX   N)	__name__
__module____qualname____doc__rt   r{   r   r   r   r   rX   rV   rg   rg      s$    :&
.2`
BZLrX   rg   c           
         | j                  d       t               }|j                  sDddlm}m} t               }|j                  |j                  z  }|D ci c]  }|t        ||||          }}|d   Pt        |d   t        j                  j                  j                        r||d   _        n|d= |d= |d= ||d   _        n|d= ||d<   t        j                  j                  j                  |j                   fi |} ||| j"                  t%        j&                         t%        j(                         | j*                  d| j,                  j/                         | j0                  	      S |j2                   |j2                  \  |_        |_        |_        nd
\  |_        |_        |_        |j;                         \  }}	}
||	|
fS c c}w )NzPreparing dataloaderr	   )_PYTORCH_DATALOADER_KWARGSprepare_data_loader
batch_sizesamplershuffler   T)num_processesprocess_indexsplit_batchesput_on_device	rng_typesdispatch_batchesr   )rN   r   ri   data_loaderr   r   r   r   getattr
isinstancer   r   r   BatchSamplerr   r   r   devicer   r   r   r   r   copyr   r   r   r   consumed_test_samplesr   )rb   
dataloaderrS   r   r   r   kkwargsr   r   r   s              rV   r   r   T  s   ,-:D%%Qz0043I3IITnoq!WZ,Fq,IJJoo,'&+U[[-=-=-J-JK/?y!,9%9%<(5E'2'#3F< [[%%001C1CNvN
"::<446%33!++002(99	
 		
   ,
 %%	++* dk`D')DdF`
 <<>		
"$79KKKM ps   Gc                   <     e Zd Z fdZddZd Zed        Z xZS )MegatronLMOptimizerWrapperc                 *    t         |   |dd        y )NF)device_placementscalersuperrt   )rp   	optimizer	__class__s     rV   rt   z#MegatronLMOptimizerWrapper.__init__  s    U4HrX   c                      y rv   r   )rp   set_to_nones     rV   	zero_gradz$MegatronLMOptimizerWrapper.zero_grad      rX   c                      y rv   r   rp   s    rV   stepzMegatronLMOptimizerWrapper.step  r   rX   c                 .    | j                   j                  S )zTWhether or not the optimizer step was done, or skipped because of gradient overflow.)r   skipped_iterr  s    rV   step_was_skippedz+MegatronLMOptimizerWrapper.step_was_skipped  s     ~~***rX   rv   )	r   r   r   rt   r   r  propertyr  __classcell__r   s   @rV   r   r     s'    I + +rX   r   c                     | j                  d       t               }t        ||j                  |j                  |j
                        }|S )NzPreparing optimizer)rN   r   r1   no_wd_decay_condscale_lr_condlr_mult)rb   rU   rS   r   s       rV   prepare_optimizerr    sA    +,:D&ud.C.CTEWEWY]YeYefIrX   c                       e Zd ZdZddZy)MegatronLMDummySchedulera  
    Dummy scheduler presents model parameters or param groups, this is primarily used to follow conventional training
    loop when scheduler config is specified in the deepspeed config file.

    Args:
        optimizer (`torch.optim.optimizer.Optimizer`):
            The optimizer to wrap.
        total_num_steps (int):
            Total number of steps.
        warmup_num_steps (int):
            Number of steps for warmup.
        **kwargs:
            Other arguments.
    Nc                 <    || _         || _        || _        || _        y rv   )r   total_num_stepswarmup_num_stepsr   )rp   r   r  r  r   s        rV   rt   z!MegatronLMDummyScheduler.__init__  s     ". 0rX   Nr   )r   r   r   r   rt   r   rX   rV   r  r    s    rX   r  c                   $     e Zd Z fdZd Z xZS )MegatronLMSchedulerWrapperc                 &    t         |   ||       y rv   r   )rp   	scheduler
optimizersr   s      rV   rt   z#MegatronLMSchedulerWrapper.__init__  s    J/rX   c                      y rv   r   )rp   rS   r   s      rV   r  zMegatronLMSchedulerWrapper.step  s    rX   )r   r   r   rt   r  r  r	  s   @rV   r  r    s    0rX   r  c                 >    | j                  d       t        |      }|S )NzPreparing scheduler)rN   r9   )rb   r   r  s      rV   prepare_schedulerr    s!    +,-i8IrX   c                   4     e Zd ZdZ fdZd Zd Zd Z xZS )AbstractTrainStepz;Abstract class for batching, forward pass and loss handler.c                 0    t         |           || _        y rv   )r   rt   name)rp   r   r   s     rV   rt   zAbstractTrainStep.__init__  s    	rX   c                      y rv   r   r  s    rV   get_batch_funcz AbstractTrainStep.get_batch_func  r   rX   c                      y rv   r   r  s    rV   get_forward_step_funcz'AbstractTrainStep.get_forward_step_func  r   rX   c                      y rv   r   r  s    rV   get_loss_funczAbstractTrainStep.get_loss_func  r   rX   )	r   r   r   r   rt   r"  r$  r&  r  r	  s   @rV   r  r    s    ErX   r  c                   4     e Zd ZdZ fdZd Zd Zd Z xZS )BertTrainStepzg
    Bert train step class.

    Args:
        args (`argparse.Namespace`): Megatron-LM arguments.
    c                 N   t         |   d       | j                  |j                        | _        | j                  |j                  |j                        | _        | j                  |j                  |j                        | _        |j                  sd | _        y t        | _        y )Nr(  )r   rt   r"  ri   	get_batchr&  rL   rQ   	loss_funcr$  rP   forward_stepmodel_return_dictmodel_output_classr   rp   rS   r   s     rV   rt   zBertTrainStep.__init__  s    ),,T-G-GH++D,A,A4??S 66t7L7LdNcNcd%%&*D#&>D#rX   c                     d }d }|r|S |S )Nc                 l   g d}t         j                  }| t        |       }nd}t        j                  |||      }|d   j                         }|d   j                         }|d   j                         }|d   j                         }|d   j                         }	|d   j                         }
|||||	|
fS )	Build the batch.)texttypeslabels	is_random	loss_maskpadding_maskNr3  r4  r6  r7  r5  r8  r   int64nextr   broadcast_datalongfloat)data_iteratorkeysdatatyper   data_btokensr4  sentence_orderr7  	lm_labelsr8  s              rV   get_batch_megatronz8BertTrainStep.get_batch_func.<locals>.get_batch_megatron  s     YD{{H (M*''dH=F F^((*F7O((*E#K0557N{+113Ix(--/I!.1668L5.)YTTrX   c                    t        |       }t        |t        j                  j	                               }|d   j                         }|d   j                         }d|v r|d   j                         }nd}d|v r9|d   j                         }|d   dk7  j                  t        j                        }nd}d}d|v r|d   j                         }nd}||||||fS )r2  	input_idsattention_masktoken_type_idsNr5  next_sentence_label)r;  r   r   r   current_devicer=  tor>  )r?  r   rC  r8  r4  rE  r7  rD  s           rV   get_batch_transformerz;BertTrainStep.get_batch_func.<locals>.get_batch_transformer  s    &D!$

(A(A(CDD +&++-F 01668L4'-.3354 N//1	!(^t377D	 	 	$,!%&;!<!A!A!C!%5.)YTTrX   r   rp   ri   rF  rO  s       rV   r"  zBertTrainStep.get_batch_func  s    	U0	U2 !%%((rX   c                 $     d } fd}|r|S |S )Nc                    |\  }}|j                         }| j                         } t        j                  |j                  d      | j	                  d      z        | j                         z  }|tt        j                  |j                  dd      j                         |j                  d      d      }|j                         }||z   }t        ||g      }||d   |d   dfS |}t        |g      }|d|d   ifS )Nr	   )ignore_indexr   r   )lm losszsop lossrU  )r>  r   sumviewreshapeFcross_entropyr;   )	r7  rD  output_tensorlm_loss_
sop_logitslm_losssop_losslossaveraged_lossess	            rV   loss_func_pretrainz7BertTrainStep.get_loss_func.<locals>.loss_func_pretrain  s    #0 Hj~~'H!)Iiib 1I4E4Eb4I IJY]]_\G%??:??2q+A+G+G+I>K^K^_aKbqst#>>+)"KWV^L_"`);YZI[\\\ "KWI"Vi);<<<rX   c                    dk(  r2t               } ||j                  d      | j                  d            }nj                  dkD  r_| j                  t        j
                  t        j                  fv r3t               } ||j                  d      | j                  d            }nt               } |||       }t        |g      }|d|d   ifS )Nr   rS  r`  r   )
r   rW  rQ   dtyper   r=  r   r   r   r;   )r5  logitsloss_fctr`  ra  rQ   rp   s        rV   loss_func_finetunez7BertTrainStep.get_loss_func.<locals>.loss_func_finetune1  s    Q"9BRA1$&,,5::uyy:Q*Q+-B
 ;V[[_M,./GOO&/!"4555rX   r   )rp   rL   rQ   rb  rg  s   ` `  rV   r&  zBertTrainStep.get_loss_func  s    	=&	6 %%%%rX   c                       fd}|S )Nc                     j                  |       \  }}}}}}
sd}r% |||||      }|t        j                  ||      fS  ||||      }	|	t        j                  |      fS )Forward step.Ntokentype_idsrE  )rl  r*  r   r+  )r?  rU   rC  r4  rD  r7  r5  r8  r[  re  rP   rL   rp   s             rV   r,  z9BertTrainStep.get_forward_step_func.<locals>.forward_stepE  sw    MQ^^\iMjJFE>9fl# %fl%[a b$gdnni&XXXv|5Iwt~~v>>>rX   r   )rp   rL   rP   r,  s   ``` rV   r$  z#BertTrainStep.get_forward_step_funcD  s    	? rX   	r   r   r   r   rt   r"  r&  r$  r  r	  s   @rV   r(  r(    s    ?5)n%&NrX   r(  c                   4     e Zd ZdZ fdZd Zd Zd Z xZS )GPTTrainStepzf
    GPT train step class.

    Args:
        args (`argparse.Namespace`): Megatron-LM arguments.
    c                    t         |   d       | j                  |j                        | _        | j                         | _        | j                         | _        |j                  dz
  | _
        |j                  t               }|j                  | _
        |j                  | _        |j                  | _        |j                   | _        |j"                  sd | _        y t&        | _        y )Nrp  r   )r   rt   r"  ri   r*  r&  r+  r$  r,  padded_vocab_size	eod_token
vocab_filer   eodreset_position_idsreset_attention_maskeod_mask_lossr-  r.  r   )rp   rS   	tokenizerr   s      rV   rt   zGPTTrainStep.__init__]  s    (,,T-G-GH++- 668//!3??&%I&]]DN"&"9"9$($=$=!!//%%&*D#&GD#rX   c                 $      fd} fd}|r|S |S )Nc                    dg}t         j                  }| t        |       }nd}t        j                  |||      }|d   j                         }|ddddf   j                         }|ddddf   j                         }t        |j                  j                  j                  j                        \  }}	}
|||	||
fS )zGenerate a batchr3  Nr   rS  )r   r:  r;  r   r<  r=  
contiguousr=   rs  rv  rw  rx  )r?  r@  rA  r   rB  tokens_r5  rC  rI  r7  position_idsrp   s              rV   rF  z7GPTTrainStep.get_batch_func.<locals>.get_batch_megatrono  s     8D{{H (M*''dH=F Vn))+GQU^..0FQV_//1F 7V(?(?AZAZ\`\n\n73NI| 69nlJJrX   c                 4   t        |       }d|d   i}t        |t        j                  j	                               }|d   j                         }t        j                  |j                  d   df|j                  |j                        	j                  z   }t        j                  ||gd      }|d d dd f   j                         }|d d d df   j                         }t        |	j                  	j                  	j                  d      \  }}}|||||fS )NrH  r   r   )rd  r   dimrS  T)r;  r   r   r   rM  r=  zerosshaperd  r   rs  concatr|  r=   rv  rw  )
r?  r   r}  paddingr5  rC  rI  r7  r~  rp   s
            rV   rO  z:GPTTrainStep.get_batch_func.<locals>.get_batch_transformer  s   &Dk!23D!$

(A(A(CDD;',,.Gkk7==#3Q"7w}}U\UcUcdgkguguuGllGW#51=GQU^..0FQV_//1F6U(?(?AZAZ\`73NI| 69nlJJrX   r   rP  s   `   rV   r"  zGPTTrainStep.get_batch_funcn  s     	K2	K  !%%((rX   c                 &    t               fd}|S )Nc                 f   j                   r|\  }}n|}|j                         }| j                  d      j                         } t        j                  |j                  d      | z        | j	                         z  }t        |g      }d|d   i}j                   r|j                  di       ||fS )NrS  rU  r   re  )return_logitsr>  rW  r   rV  r;   ro   )r7  r[  lossesre  r`  averaged_lossoutput_dictrS   s          rV   r+  z-GPTTrainStep.get_loss_func.<locals>.loss_func  s    !!!.&\\^F!r*002I99V[[_y89IMMOKD FtfMM$mA&67K!!""Hf#56$$rX   )r   )rp   r+  rS   s     @rV   r&  zGPTTrainStep.get_loss_func  s    z	%" rX   c                       fd}|S )Nc                 z    j                  |       \  }}}}} |||||      }|t        j                  |      fS )rj  )r5  rm  )	r?  rU   rC  r5  r7  rI  r~  r[  rp   s	           rV   r,  z8GPTTrainStep.get_forward_step_func.<locals>.forward_step  sG     GKnnUbFcCFFI~|!&,vVM '$..)"DDDrX   r   rp   r,  s   ` rV   r$  z"GPTTrainStep.get_forward_step_func  s    	E rX   rn  r	  s   @rV   rp  rp  U  s    H"-)^,	rX   rp  c                   d     e Zd ZdZ fdZed        Zed        Zed        Zd Z	d Z
d Z xZS )	T5TrainStepze
    T5 train step class.

    Args:
        args (`argparse.Namespace`): Megatron-LM arguments.
    c                     t         |   d       | j                  |j                        | _        | j                         | _        | j                         | _        |j                  sd | _
        y t        | _
        y )Nr  )r   rt   r"  ri   r*  r&  r+  r$  r,  r-  r.  r   r/  s     rV   rt   zT5TrainStep.__init__  sb    ',,T-G-GH++- 668%%&*D#&5D#rX   c                 ^    | j                  d      }| j                  d      }||z  }|dk  }|S )Nr   r	         ?)	unsqueeze)rI  attention_mask_b1sattention_mask_bs1attention_mask_bssextended_attention_masks        rV   attn_mask_postprocessz!T5TrainStep.attn_mask_postprocess  sC     ,55a8+55a8/2DD"4s":&&rX   c                 j    t        j                  t        j                  d| | f|            }|dk  }|S Nr   r   r  )r   trilones)r   r   rI  s      rV   get_decoder_maskzT5TrainStep.get_decoder_mask  s3    EJJ:z/JSY$Z['#-rX   c                     | j                   \  }}| j                  d      }t        j                  ||df|      }||z  }|dk  }|S r  )r  r  r   r  )	rI  dec_seq_lengthr   r   _r  r  r  r  s	            rV   get_enc_dec_maskzT5TrainStep.get_enc_dec_mask  sZ    &,,
A ,55a8"ZZ^Q(GPVW/2DD"4s":&&rX   c                     d }d }|r|S |S )Nc                 R   g d}t         j                  }| t        |       }nd}t        j                  |||      }|d   j                         }|d   j                         }|d   j                         }|d   j                         }|d   dk  }	|d	   dk  }
|d
   dk  }|||||	|
|fS )r2  )text_enctext_decr5  r7  enc_maskdec_maskenc_dec_maskNr  r  r5  r7  r  r  r  r  r9  )r?  r@  rA  r   rB  
tokens_enc
tokens_decr5  r7  r  r  r  s               rV   rF  z6T5TrainStep.get_batch_func.<locals>.get_batch_megatron  s     kD{{H (M*''dH=F  
+002J
+002JH%**,F{+113Ij)C/Hj)C/H!.1C7Lz9fhR^^^rX   c                 :   t        |       }t        |t        j                  j	                               }|d   j                         }|d   j                         }|dk7  j                  t        j                        }d|v r|d   j                         }nn|j                  |j                  |j                  t        j
                        }|dddf   j                         |dd	df<   d
|d<   |j                  |dk(  d
       t        j                  |d   j                               }t        j                  |j                  d	   |j                        }t        j!                  |d   j                         |j                  d	   |j                        }|||||||fS )r2  rH  r5  rK  decoder_input_ids)r   rd  .NrS  r   r   ).r   rI  )r;  r   r   r   rM  r=  rN  r>  	new_zerosr  r   clonemasked_fill_r  r  r  r  )	r?  r   r  r5  r7  r  r  r  r  s	            rV   rO  z9T5TrainStep.get_batch_func.<locals>.get_batch_transformer
  sz   &D!$

(A(A(CDDk*//1J(^((*F4++EKK8I"d*!"56;;=
#--fll6==X]XbXb-c
&,S#2#X&6&<&<&>
37#%&
6"''
d(:A>"88>N9O9T9T9VWH"33J4D4DQ4GIZIZ[H&77%&++-z/?/?/BJDUDUL z9fhR^^^rX   r   rP  s       rV   r"  zT5TrainStep.get_batch_func  s    	_2	_. !%%((rX   c                     d }|S )Nc                     |j                         }t        j                  |j                  d      | j	                  d      z        | j                         z  }|}t        |g      }|d|d   ifS )NrS  rU  r   )r>  r   rV  rW  rX  r;   )r7  r[  r\  r^  r`  ra  s         rV   r+  z,T5TrainStep.get_loss_func.<locals>.loss_func'  sh    $**,Hiib 1I4E4Eb4I IJY]]_\GDG	RO)_Q%7888rX   r   )rp   r+  s     rV   r&  zT5TrainStep.get_loss_func&  s    	9 rX   c                       fd}|S )Nc           	          
j                  |       \  }}}}}}} ||||||d|      }	|	t        
j                  |      fS )rj  Nrk  rm  )r?  rU   r  r  r7  rE  r  r  r  r[  rp   s             rV   r,  z7T5TrainStep.get_forward_step_func.<locals>.forward_step3  s_     ^b]k]k^ZJ
Iy(Hl "J(LX\hqM !'$..)"DDDrX   r   r  s   ` rV   r$  z!T5TrainStep.get_forward_step_func2  s    	E rX   )r   r   r   r   rt   staticmethodr  r  r  r"  r&  r$  r  r	  s   @rV   r  r    sY    6 
' 
'  
 	' 	'4)l
rX   r  c           
      >   | j                  d       t        j                  j                         sJ d       t	        |d      }|j                         D ]W  \  }}t        ||d       8|j                  dk(  r)t        dj                  |t        ||      |      d       t        |||       Y |j                  s|j                  d	d
      r|j                  J d       t        |       t        |       t        |       d }t!               } |        t#                t%                t'                t!               }t)        |j*                  |      |_        |j.                  dk(  r*|j0                  r|j2                  dk(  rd|_        d|_        y d
|_        d|_        y )NzInitializing Megatron-LMzMegatron requires CUDA.T)ignore_unknown_argsr   z[WARNING: overriding default arguments for {key}:{v}                         with {key}:{v2})ry   vv2)flushuse_checkpoint_argsFz/--use-checkpoints-args requires --load argumentc                     t               } t        j                  j                         }t        j                  j                         | _        t        j                  j                         | _        |dkD  r| j                  |z  }| j                  | j                  |k(  sJ d       || _	        t        j                         rt        d       n@t        j                  | j                  | j                  | j                   | j"                         | j                  dk(  r$t        dj%                  | j&                               t)        | j&                  | j*                         y )Nr   z:expected local-rank to be the same as rank % device-count.z%model parallel is already initializedz > setting random seeds to {} ...)r   r   r   device_countr   get_rankrM   get_world_size
world_size
local_rankr   model_parallel_is_initializedrN   initialize_model_paralleltensor_model_parallel_sizera   $virtual_pipeline_model_parallel_sizer`   r   r   r(   data_parallel_random_init)rS   r  r   s      rV   finish_mpu_initz#initialize.<locals>.finish_mpu_initc  s   zzz..0%%..0	++::<!YY-F*&0n2nn0"( 002=>--3355==;;	 99>4;;DIIFGD$B$BCrX   r@   r	   )rN   r   r   is_availabler   rw   r   rM   r   rx   r  getloadr    r   r%   r   r'   r&   r)   r7   orig_vocab_sizerr  rO   rL   rQ   rP   r   )rb   extra_args_providerargs_defaultsrS   ry   rz   r  s          rV   
initializer  D  s   01::""$?&??$ )tDD $))+ 
"
U4d#/yyA~))/74#5% *0 *  	c5!
" =#4#45JE#Ryy$W&WW$!$'$ D< :D    :D5d6J6JDQDv%$*?*?DOOWXDX $ DN !&DNrX   c                   j     e Zd ZdZ fdZd Zd Zd Zd Zd Z	d Z
d	 Zd
 Z	 	 	 	 	 	 	 	 ddZ xZS )MegatronEnginez
    Megatron-LM model wrapper

    Args:
        accelerator (:class:`~accelerate.Accelerator`): The accelerator object to use.
        model: Megatron-LM model
        optimizer: Megatron-LM optimizer
        lr_scheduler: Megatron-LM lr scheduler
    c                    t         t        |           || _        |d   | _        || _        || _        t               }|j                  j                  j                  K |j                  j                  j                  |fi |j                  j                  j                  | _        nx|j                  dk(  rt        |      | _        nX|j                  dk(  rt        |      | _        n8|j                  dk(  rt!        |      | _        nt#        d|j                         d| j
                  _        i | _        i | _        d| _        d| _        |j.                  t1                y y )Nr   r@   rG   rH   rK   FT)r   r  rt   module
base_modelr   r  r   rZ   r[   custom_train_step_classcustom_train_step_kwargstrain_step_handlerrO   r(  rp  r  rR   r  total_loss_dicteval_total_loss_dictr   report_memory_flagtensorboard_dirr*   )rp   rb   rU   r   r  rS   r   s         rV   rt   zMegatronEngine.__init__  s>   nd,.(""z//GGS&bk&7&7&J&J&b&b'#))<<UU'D# !!V+&3D&9D#!!U*&24&8D#!!T)&1$&7D#78L8L7MNOO&+#  "$&!"&+%' ,rX   c                 f    | j                   D ]  }|j                           | j                          y rv   )r  trainlog_eval_resultsrp   model_modules     rV   r  zMegatronEngine.train  s-     KK 	!L 	!rX   c                 F    | j                   D ]  }|j                           y rv   )r  evalr  s     rV   r  zMegatronEngine.eval  s!     KK 	 L	 rX   c                    t               }t               }t        |      dkD  rg }|j                  dkD  rot	        d|j                        D ]U  }|j                  |j                         D ci c](  \  }}||||j                  z  |dz   |j                  z   * c}}       W n|g}t        | j                        dkD  r]t        |      dkD  r5t	        t        | j                              D cg c]  }t               c}ndgt        | j                        z  }	nt        |      dkD  rt              nd}	|j                  dk(  r-|j                  r!| j                  D ]  }
|
j                           | j                  j                          t               } || j                   j"                  |	| j                  | j                  dd      }|j$                  dk\  rt&        j(                  j+                           |d      j-                          | j                  j/                  ||        |d      j1                           |d      j-                          | j                  j3                  ||      \  }}} |d      j1                          |rH |d	      j-                          | j                  j5                  ||        |d	      j1                          |rO| j6                  @t9               |j                  z  |j:                  z  }| j6                  j3                  |
       d}nd}| | j                  _        |j$                  dk\  rt&        j(                  j+                          |xj>                  tA        jB                         |j                  z  t9               z  z  c_        tA        jD                  d      rri }|d   D ]b  }|D cg c]  }||   	 }}t        |d   jF                        dk(  rtI        |      t        |      z  ||<   Kt'        jJ                  |      ||<   d ||||fS i |||fS c c}}w c c}w c c}w )z
        Training step for Megatron-LM

        Args:
            batch_data (:obj:`dict`): The batch data to train on.
        r   r   NlocalF)forward_onlyzbackward-reduce-model-gradsr   zbackward-gather-model-params)	incrementr	   Tignore_virtual)&r   r   r   r   rangeappendrw   r   r  r   DDP_impl#use_contiguous_buffers_in_local_ddpzero_grad_bufferr   r   r2   r  r,  empty_unused_memory_levelr   r   empty_cachestartreduce_model_gradsstopr  gather_model_paramsr  r   r   r  r   r   r   is_pipeline_last_stager  rV  r  )rp   
batch_datarS   timersdata_chunksir   r  r  batch_data_iterator	partitionforward_backward_funclosses_reducedupdate_successful	grad_normnum_zeros_in_gradr  r  loss_reducedry   r   losses_reduced_for_keys                         rV   
train_stepzMegatronEngine.train_step  s    zz?QK%%)q$"8"89 A&& )3(8(8(: $1 qT%:%:!:a!etG\G\=\]]  *lt{{a z?Q& -2#dkk2B,CDqk"DVc$++..   8;:7J${"3PT ==G#(P(P![[ -	**,-  " !: ;.##00KKNN
 ))Q.JJ""$ 	,-335))$7,-224 	{!!#:>..:M:MdTZ:[79&7{  " 1288:NN..tV<12779 ~~)02T5J5JJTMdMdd	##i#8LL*;&;# ))Q.JJ""$##,,.1F1FFI]I__	
# %%T:L%a( M:H)IQ!C&)I&)I-a06671<(+,B(CcJ`Fa(aL%(-5K(LL%M  y:KKK<,===g EF *Js   +-P6,P<Qc                 <   t               }g }|j                  dkD  rot        d|j                        D ]U  }|j                  |j	                         D ci c](  \  }}||||j
                  z  |dz   |j
                  z   * c}}       W n|g}t        | j                        dkD  r5t        t        | j                              D cg c]  }t        |       }}nt        |      }t               }	 |	| j                  j                  || j                  ddd      }
|j                  dk\  rt        j                  j                          |xj                   t#        j$                         |j
                  z  t'               z  z  c_        t#        j(                  d      rni }|
d   D ]b  }|
D cg c]  }||   	 }}t        |d   j*                        dk(  rt-        |      t        |      z  ||<   Kt        j.                  |      ||<   d |S i S c c}}w c c}w c c}w )z
        Evaluation step for Megatron-LM

        Args:
            batch_data (:obj:`dict`): The batch data to evaluate on.
        r   r   NT)r   r  r  r  )r   r   r  r  rw   r   r   r  r   r2   r  r,  r  r   r   r  r   r   r   r   r  r  rV  r  )rp   r  rS   r  r  r   r  r  r  r  
loss_dictsr   ry   r   r  s                  rV   	eval_stepzMegatronEngine.eval_step-  s    z!!A%1d445 ""cmcscscuv[_[\^_Q!d333q1u@U@U6UVVv
 &,Kt{{a>CCDT>U"V4#4"V"V"&{"3 9 ;*##00KK

 ))Q.JJ""$##,,.1F1FFI]I__	
# %%T:L!!} M:D)EQ!C&)E&)E-a06671<(+,B(CcJ`Fa(aL%(-5K(LL%M  IK w #W0 *Fs   -HH,Hc                    t               }| j                  d   j                  r | j                  d
i |\  }}}}| xj                  dz  c_        |j
                  }| j                  j                         j                         }d }|j                  rt        | j                        }t        || j                  | j                  j                  d   d   | j                  || j                  ||||
      | _        n | j                   d
i |}|j
                  |D ]  }	| j"                  j%                  |	t&        j(                  j+                  dg            ||	   z   | j"                  |	<   | j"                  j%                  |	dz   t&        j(                  j+                  dg            t&        j(                  j+                  dg      z   | j"                  |	dz   <    t'        j,                  d|j.                        }
|D ]&  }	t1        ||	   j2                        dk(  s|
||	   z  }
( d }d|v r|d   }| j4                  j6                  | j4                  j7                  |
|	      S |
S )Nr   r   lr        
_num_iters      ?r  re  )r`  re  r   )r   r  trainingr  r   r  r   get_loss_scaler   log_params_normr<   rU   r:   r  param_groupsr  r  r  r  r   r   FloatTensortensorr  r   r  r  r.  )rp   r  rS   	loss_dictr  r  r  
loss_scaleparams_normry   r`  re  s               rV   forwardzMegatronEngine.forwarda  sF    z;;q>""DSDOODaV`DaAI|Y0ANNaN##/!^^::<AAC
"''"5djj"AK*6((NN//248NN++ %+' '44I##/$ 6C1155c5::;Q;QSVRW;XY\efi\jj --c2 EID]D]DaDal*EJJ,B,BC5,IE

..u5E6D--cL.@A	6 ||C8 	'C9S>''(A-	#&	' y x(F""55A**==4PV=WWrX   c                    t               }|j                  | j                  dk(  ry t               }t               }d| j                   d}| j                  D ]  }|j                  d      r| j                  |   | j                  |dz      z  }|| d| dz  }t        j                  t        d|j                                     }|j                  r|| d| dz  }|s|j                  | d|j                         | j                         |j                  s|j                  | d	|| j                          t        |      d
z   }t        d|z         t        |       t        d|z         i | _        y )Nr   zvalidation loss at iteration z | r	  z value:    z PPL: z validationz validation pplr   -)r   r  r   r   r  endswithmathexpminr   rL   
add_scalarr   r   )rp   rS   writerstringry   rz   ppllengths           rV   r  zMegatronEngine.log_eval_results  sf   z'4>>Q+>z')00@D,, 	TC||L)--c2T5N5NsUaOa5bbEXeWC00F((3r5::<01C$$SEuC00!!SE"5uzz|T^^T((%%_&=sDNNS	T Vqf%f%$&!rX   c                 *   | j                          t               }||_        t        j                  j                          t        | j                  | j                  | j                  | j                         t        j                  j                          y rv   )r  r   saver   r   barrierr"   r   r  r   r  )rp   
output_dirrS   s      rV   r"   zMegatronEngine.save_checkpoint  s^    z	!!#T^^T^^T!!#rX   c                    t               }||_        d|_        d|_        t        j
                  j                          t        | j                  | j                  | j                        }t        j
                  j                          || _        |j                  r+| j                  dk(  r| j                  j                          y y y r  )r   r  r   r   r   r   r#  r!   r  r   r  r   fp16reload_model_params)rp   	input_dirrS   r   s       rV   r!   zMegatronEngine.load_checkpoint  s    z	&'#&'#!!##DKKP	!!#"991,NN..0 -9rX   c
                 
   t               }|j                  dk7  rt        d      |j                  dkD  rt	        d      |j
                  rt	        d      |j                  t	        d      |j                  t	        d      ||t	        d	      |d
}n"d|cxk  rdk  st	        d       t	        d      |d}n"d|cxk  rdk  st	        d       t	        d      |d}n7|dkD  r|dkD  rt	        d      d|cxk  rd
k  st	        d       t	        d      |
j                  dd      }d|cxk  rd
k  st	        d       t	        d      |
j                  dd      }d|cxk  rd
k  st	        d       t	        d      |
j                  dd      }t        |t              st	        d      |}|>t        |t              st	        d      |dk  rt	        d      |j                  d   dkD  ryt               }|
j                  d|j                        }|t        |t              st	        d      |	d
}	d}d}d}t        j                   j#                         dk(  r|>t        j$                  j'                  |j                  d   g|j                  d   z        }n |j)                  d       j%                         }|||j                  d   z
  }|dk  rt	        d!      |r||j                  d   z   dz   }d"t+        j,                  |d"z        z  }||j                  d   dz   z
  }t        j$                  j'                  |j                  g|z  g|j                  d   z        }t        j.                  t        j0                  |dddf   d       |j%                         |gd       }n||j                  d   z   }d"t+        j,                  |d"z        z  }||j                  d   z
  }t        j$                  j'                  |j                  g|z  g|j                  d   z        }t        j.                  |j%                         |gd       }|j3                  d      |j3                  d      g}t5        d#|d$      }|j7                         }t9        |t        j:                  |d%      }t9        |d   t        j:                  |d%      }|
j                  d&d      }t        j<                  j?                  |       tA        | jB                  tD        tF        tH        f      }|tK        |||||d|	'      \  }}|S tM        |||d|||||d()
      \  }}}|S )*a  
        Generate method for GPT2 model. This method is used for inference. Supports both greedy and beam search along
        with sampling. Refer the Megatron-LM repo for more details

        Args:
            inputs (torch.Tensor): input ids
            attention_mask (torch.Tensor, optional): attention mask. Defaults to None.
            max_length (int, optional): max length of the generated sequence. Defaults to None.
            Either this or max_new_tokens should be provided.
            max_new_tokens (int, optional): max number of tokens to be generated. Defaults to None.
            Either this or max_length should be provided.
            num_beams (int, optional): number of beams to use for beam search. Defaults to None.
            temperature (float, optional): temperature for sampling. Defaults to 1.0.
            top_k (int, optional): top k tokens to consider for sampling. Defaults to 0.0.
            top_p (float, optional): tokens in top p probability are considered for sampling. Defaults to 0.0.
            length_penalty (float, optional): length penalty for beam search. Defaults to None.
            kwargs: additional key-value arguments
        rG   z1Generate method is not implemented for this modelr   z1Generate method requires data parallelism to be 1z9Generate method requires sequence parallelism to be FalseNz2Checkpoint activations cannot be set for inferencez$Vocab file is required for inferencez;`max_length` or `max_new_tokens` are required for inferencer
  r  g      Y@zAtemperature must be a positive number less than or equal to 100.0r   i  z:top_k must be a positive number less than or equal to 1000z/top_p and top_k sampling cannot be set togetherz'top_p must be less than or equal to 1.0top_p_decayz-top_p_decay must be less than or equal to 1.0top_p_boundz-top_p_bound must be less than or equal to 1.0add_BOSFzadd_BOS must be a booleanzbeam_width must be an integerz!beam_width must be greater than 0z,When doing beam_search, batch size must be 1
stop_tokenzstop_token must be an integerrS  )axisz%max_new_tokens must be greater than 0   r	   )int_listrM   )r  rM   random_seed)r-  num_return_genlength_penaltyT)return_output_log_probstop_ktop_pr*  r+  temperature#use_eod_token_for_early_termination)'r   rO   NotImplementedErrorr   rR   sequence_parallelrecompute_granularityrt  r  r   boolr   r  r   ru  r   r   r  r   r   rV  r  ceilr  r  sizer3   tolistr4   r:  randommanual_seedr>   r  torchDDPLocalDDPr,   r5   r6   )rp   inputsrI  
max_lengthmax_new_tokens	num_beamsr7  r5  r6  r3  r   rS   r*  r+  r,  
beam_widthry  r-  
sizes_listprompts_tokens_tensorprompts_length_tensorr  sizes_tensorsizescontext_tokens_tensorcontext_length_tensorr1  unwrapped_modelrC  r  s                                 rV   megatron_generatez MegatronEngine.megatron_generate  sw   B z5(%&YZZ""Q&PQQ!!XYY%%1QRR??"CDD ."8Z[[K,u,`aa -`aa=Eu$$YZZ %YZZ=ES[US[NOO5'C' !JKK ( !JKKjj4{)c)LMM *LMMjj4{)c)LMM *LMM**Y.7D)899
!j#. !@AAA~ !DEE||A"E!O	ZZimm<
!j#. !@AA! N
 $ $%%'1,%(-

(=(=v||A>ORXR^R^_`Ra>a(b%(6(:(:(:(C(H(H(J%%!+fll1o!=" !HII+fll1o=A
:>!::
!+v||A/B!C**//)-->1Q0RU[UaUabcUd0de(-__WQT]<fkkmWU\^)%
 ,fll1o=
:>!::
!+fll1o!=**//)-->1Q0RU[UaUabcUd0de(-fkkmW5MTV(W% &**1-%**1-J *!jqI ##% 0Lahi j 0q5;;Odkl m jj2  -&t8]8[\!=%%% -IFA,  K%%(-'''48LFAq rX   )NNNNNNNN)r   r   r   r   rt   r  r  r  r  r  r  r"   r!   rQ  r  r	  s   @rV   r  r    sY    (: 
 c>J2h;z'4$
1 prX   r  c                     t        |       S )z
    Average losses across data parallel group.

    Args:
        losses (List[Tensor]): List of losses to average across data parallel group.
    )r;   )r  s    rV   %avg_losses_across_data_parallel_grouprS    s     5V<<rX   c                 $    d }t        || d      S )z
    Recursively gather tensor in a nested list/tuple/dictionary of tensors from data parallel ranks.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to gather across data parallel ranks.

    c                    | j                   dk(  r| j                         d    } t        t        j                  j                  t        j                                     D cg c]  }t        j                  |        }}t        j                  j                  || t        j                                t        j                  |d      S c c}w )Nr   r   r  )ndimr  r  r   r   r  r   get_data_parallel_group
empty_like
all_gathercat)r  r  output_tensorss      rV   _gpu_gather_onez;gather_across_data_parallel_groups.<locals>._gpu_gather_one  s    ;;!\\^D)F 5,,;;#B]B]B_;`a
 V$
 
 	$$^V3C^C^C`$ayyQ//
s    C	T)error_on_other_type)r   )r  r\  s     rV   "gather_across_data_parallel_groupsr^    s    0 _f$OOrX   )TTTT)mrj   r  abcr   	functoolsr   r   torch.nn.functionalnn
functionalrY  torch.nnr   r   r   torch.nn.parallel.distributedr   rB  r   r
   r  r   importsr   r   
operationsr   r   transformers.modeling_outputsr   r   r   megatronr   r   r   r   r   r   r   r   megatron.argumentsr   r   r   r   megatron.checkpointingr    r!   r"   megatron.data.data_samplersr#   r$   megatron.global_varsr%   megatron.initializer&   r'   r(   r)   r*   megatron.modelr+   r,   r-   r.   r/   rC  megatron.model.classificationr0   megatron.optimizerr1   megatron.schedulesr2   &megatron.text_generation.communicationr3   r4   #megatron.text_generation.generationr5   r6   megatron.tokenizer.tokenizerr7   megatron.trainingr8   r9   r:   megatron.utilsr;   r<   r=   r>   rW   re   rg   r   r   r  r  r  r  r  r(  rp  r  r  Moduler  rS  r^  r   rX   rV   <module>ry     sg          A A M , , H 9   	 	 	 cbbbh9  VUB<9<[ FXX %P,AL ALH.Ld+!5 +  .!5  "~% ~Bg$ gTA# AJ 15B PfeUXX__ eR=PrX   