@@ -116,6 +116,9 @@ def add_parser_arguments(self):
116116 self ._parser .add_argument ('--data_home' , type = str , default = '/tmp' , help = 'Data home.' )
117117 self ._parser .add_argument ('--vocab_path' , type = str , default = '/tmp/gpt2-vocab.json' , help = 'Vocab path.' )
118118 self ._parser .add_argument ('--merge_path' , type = str , default = '/tmp/gpt2-merges.txt' , help = 'Merge path.' )
119+ self ._parser .add_argument (
120+ '--split' , type = str , default = '949,50,1' , help = 'Split dataset ratio for train/val/test.'
121+ )
119122 self ._parser .add_argument ('--prescale_grad' , action = 'store_true' , help = 'Prescale grad.' )
120123 self ._parser .add_argument (
121124 '--hostfile' , type = str , default = None , help = 'Hostfile to run the mutli-node benchmark.'
@@ -128,6 +131,13 @@ def add_parser_arguments(self):
128131 def _preprocess (self ):
129132 if not super ()._preprocess ():
130133 return False
134+ if not self ._args .code_base :
135+ if self ._args .deepspeed :
136+ self ._args .code_base = os .path .join (
137+ os .getenv ('SB_MICRO_PATH' ), 'third_party/Megatron/Megatron-DeepSpeed/'
138+ )
139+ else :
140+ self ._args .code_base = os .path .join (os .getenv ('SB_MICRO_PATH' ), 'third_party/Megatron/Megatron-LM' )
131141
132142 if not os .path .exists (self ._args .code_base ) or \
133143 not os .path .exists (os .path .join (self ._args .code_base , 'pretrain_gpt.py' )):
@@ -156,35 +166,35 @@ def _preprocess(self):
156166
157167 def _parse_log (self , output ):
158168 """Parse log output and get the performance."""
159- tflops_pattern = re .compile (r'TFLOPs: (\d+\.\d+)' )
169+ tflops_pattern = re .compile (r'( TFLOPs|TFLOP/s/GPU\)) : (\d+\.\d+)' )
160170 elapsed_time_pattern = re .compile (r'elapsed time per iteration \(ms\): (\d+\.\d+)' )
161- mem_allocated_pattern = re .compile (r'MemAllocated=([\d.]+)[KMGTPEZY]?B ' )
162- max_mem_allocated_pattern = re .compile (r'MaxMemAllocated=([\d.]+)[KMGTPEZY]?B ' )
171+ mem_allocated_pattern = re .compile (r'allocated: (\d+\.\d+) ' )
172+ max_mem_allocated_pattern = re .compile (r'max allocated: (\d+\.\d+) ' )
163173 lines = output .splitlines ()
164174 tflops = []
165175 mem_allocated = []
166176 max_mem_allocated = []
167177 iteration_times = []
168178 for line in lines :
169- if 'TFLOPs ' in line :
179+ if 'elapsed time per iteration ' in line :
170180 tflops_matches = tflops_pattern .search (line )
171181 elapsed_time_match = elapsed_time_pattern .search (line )
172182 if tflops_matches :
173- tflops_values = float (tflops_matches .group (1 ))
183+ tflops_values = float (tflops_matches .group (2 ))
174184 tflops .append (tflops_values )
175185 if elapsed_time_match :
176186 elapsed_time_value = float (elapsed_time_match .group (1 ))
177187 iteration_times .append (elapsed_time_value )
178188
179- if 'MaxMemAllocated ' in line :
189+ if 'max allocated ' in line :
180190 mem_allocated_match = mem_allocated_pattern .search (line )
181191 max_mem_allocated_match = max_mem_allocated_pattern .search (line )
182192 if mem_allocated_match :
183- mem_allocated_value = float (mem_allocated_match .group (1 ))
193+ mem_allocated_value = float (mem_allocated_match .group (1 )) / 1024
184194 mem_allocated .append (mem_allocated_value )
185195
186196 if max_mem_allocated_match :
187- max_mem_allocated_value = float (max_mem_allocated_match .group (1 ))
197+ max_mem_allocated_value = float (max_mem_allocated_match .group (1 )) / 1024
188198 max_mem_allocated .append (max_mem_allocated_value )
189199
190200 return iteration_times , tflops , mem_allocated , max_mem_allocated
@@ -224,7 +234,9 @@ def __prepare_deespeed_config(self, precision_megatron):
224234 --deepspeed \
225235 --deepspeed_config { self ._config_json_path } \
226236 --zero-stage { self ._args .zero_stage } \
227- --pipeline-model-parallel-size { self ._args .pipeline_model_parallel_size } '
237+ --pipeline-model-parallel-size { self ._args .pipeline_model_parallel_size } \
238+ --train-tokens { self ._args .train_tokens } \
239+ --data-impl { self ._args .data_impl } '
228240
229241 if self ._args .pipeline_model_parallel_size <= 1 :
230242 deepspeed_options = f'{ deepspeed_options } --no-pipeline-parallel'
@@ -255,11 +267,10 @@ def _megatron_command(self, precision): # noqa: C901
255267 --num-attention-heads { self ._args .num_attn_heads } \
256268 --seq-length { self ._args .seq_len } \
257269 --max-position-embeddings { self ._args .seq_len } \
258- --train-tokens { self ._args .train_tokens } \
259270 --train-samples { self ._args .num_steps * self ._args .batch_size } \
260271 --lr { self ._args .lr } \
261272 --min-lr { self ._args .min_lr } \
262- --split 949,50,1 \
273+ --split { self . _args . split } \
263274 --log-interval { self ._args .log_interval } \
264275 --eval-interval { self ._args .eval_interval } \
265276 --eval-iters { self ._args .eval_iters } \
@@ -273,7 +284,8 @@ def _megatron_command(self, precision): # noqa: C901
273284 --optimizer adam \
274285 --use-distributed-optimizer \
275286 { precision_megatron } \
276- --seed { self ._args .seed } '
287+ --seed { self ._args .seed } \
288+ --log-throughput'
277289
278290 if self ._args .sequence_parallel :
279291 megatron_options = f'{ megatron_options } --sequence-parallel'
@@ -298,6 +310,8 @@ def _megatron_command(self, precision): # noqa: C901
298310 script_path = os .path .join (self ._args .code_base , 'pretrain_gpt.py' )
299311 if self ._args .deepspeed :
300312 deepspeed_option = self .__prepare_deespeed_config (precision_megatron .lstrip ('--' ))
313+ # No --log-throughput in Megatron-DeepSpeed by 20231219
314+ megatron_options = megatron_options .replace ('--log-throughput' , '' ).strip ()
301315 if self ._num_nodes > 1 :
302316 command = f'torchrun { self ._distributed_args } ' + \
303317 f'{ script_path } { megatron_options } { self ._data_options } { deepspeed_option } '
@@ -379,6 +393,7 @@ def _init_distributed_setting(self):
379393
380394 return False
381395 self ._num_nodes = int (os .getenv ('OMPI_COMM_WORLD_SIZE' )) // int (os .getenv ('OMPI_COMM_WORLD_LOCAL_SIZE' ))
396+ master_addr = 'localhost'
382397 if self ._num_nodes > 1 :
383398 if not self ._args .hostfile :
384399 sb_hostfile = os .path .join (os .environ .get ('SB_WORKSPACE' , '.' ), 'hostfile' )
@@ -395,12 +410,13 @@ def _init_distributed_setting(self):
395410 if self ._num_nodes != len (hosts ):
396411 logger .error ('MPI init failed since hostfile not match the MPI setting.' )
397412 return False
413+ master_addr = hosts [0 ].split ()[0 ]
398414
399- addr = os .getenv ('MASTER_ADDR' , hosts [ 0 ]. split ()[ 0 ] )
400- port = os .getenv ('MASTER_PORT' , '29500' )
401- node_rank = int (os .environ ['OMPI_COMM_WORLD_RANK' ]) // int (os .environ ['OMPI_COMM_WORLD_LOCAL_SIZE' ])
402- self ._distributed_args = f'--nproc_per_node { self ._args .num_gpus } --nnodes { self ._num_nodes } ' + \
403- f'--node_rank { node_rank } --master_addr { addr } --master_port { port } '
415+ addr = os .getenv ('MASTER_ADDR' , master_addr )
416+ port = os .getenv ('MASTER_PORT' , '29500' )
417+ node_rank = int (os .environ ['OMPI_COMM_WORLD_RANK' ]) // int (os .environ ['OMPI_COMM_WORLD_LOCAL_SIZE' ])
418+ self ._distributed_args = f'--nproc_per_node { self ._args .num_gpus } --nnodes { self ._num_nodes } ' + \
419+ f'--node_rank { node_rank } --master_addr { addr } --master_port { port } '
404420 return True
405421
406422 def _generate_dataset (self ):
@@ -448,8 +464,7 @@ def _generate_dataset(self):
448464 self ._data_options = f'\
449465 --vocab-file { self ._vocab_path } \
450466 --merge-file { self ._merges_path } \
451- --data-path { self ._data_path } \
452- --data-impl { self ._args .data_impl } '
467+ --data-path { self ._data_path } '
453468
454469 logger .info ('Dataset preparation successfully.' )
455470 return True
0 commit comments