Skip to content

Commit ce1860b

Browse files
authored
Bug Fix - Bug fix for latest megatron-lm benchmark (#600)
**Description** Bug fix to keep the benchmark in sync with the latest Megatron-LM code.
1 parent c2e7a54 commit ce1860b

12 files changed

Lines changed: 99 additions & 89 deletions

File tree

.gitmodules

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,9 @@
2424
[submodule "third_party/msccl"]
2525
path = third_party/msccl
2626
url = https://github.com/Azure/msccl
27+
[submodule "third_party/Megatron/Megatron-LM"]
28+
path = third_party/Megatron/Megatron-LM
29+
url = https://github.com/NVIDIA/Megatron-LM.git
30+
[submodule "third_party/Megatron/Megatron-DeepSpeed"]
31+
path = third_party/Megatron/Megatron-DeepSpeed
32+
url = https://github.com/microsoft/Megatron-DeepSpeed.git

dockerfile/directx12.dockerfile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ RUN curl -s -L https://dist.nuget.org/win-x86-commandline/latest/nuget.exe -o "%
5454
# Run the setup script to install the visual studio components
5555
RUN "%SB_HOME%\\dockerfile\\directx\\install-components.bat"
5656

57+
RUN powershell -Command "Set-ItemProperty -Path HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem -Name LongPathsEnabled -Value 1;"
58+
RUN git config --system core.longpaths true
5759
# Install Superbench
5860
RUN python -m pip install setuptools==65.0.0 && \
5961
python -m pip install --no-cache-dir .[amdworker] && \

dockerfile/rocm5.7.x.dockerfile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942\ngfx10
109109

110110
# Install OpenMPI
111111
ENV OPENMPI_VERSION=4.1.x
112+
ENV MPI_HOME=/usr/local/mpi
112113
# Check if Open MPI is installed
113114
RUN cd /tmp && \
114115
git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \
@@ -145,9 +146,9 @@ RUN cd /opt/ && \
145146
RUN cd /opt/rocm/share/amd_smi && \
146147
python3 -m pip install --user .
147148

148-
ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
149+
ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
149150
LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
150-
LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
151+
LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
151152
SB_HOME=/opt/superbench \
152153
SB_MICRO_PATH=/opt/superbench \
153154
ANSIBLE_DEPRECATION_WARNINGS=FALSE \

dockerfile/rocm6.0.x.dockerfile

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ FROM ${BASE_IMAGE}
1010
# Lib:
1111
# - torch: 2.0.1
1212
# - rccl: 2.18.3+hip6.0 develop:7e1cbb4
13-
# - hipblaslt: 950ca43
13+
# - hipblaslt: release/rocm-rel-6.0
1414
# - openmpi: 4.1.x
1515
# - apex: 1.0.0
1616
# Intel:
@@ -115,6 +115,7 @@ RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942:sramec
115115

116116
# Install OpenMPI
117117
ENV OPENMPI_VERSION=4.1.x
118+
ENV MPI_HOME=/usr/local/mpi
118119
# Check if Open MPI is installed
119120
RUN cd /tmp && \
120121
git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \
@@ -147,9 +148,9 @@ RUN cd /opt/ && \
147148
.. && \
148149
make -j${NUM_MAKE_JOBS}
149150

150-
ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
151+
ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
151152
LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
152-
LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
153+
LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
153154
SB_HOME=/opt/superbench \
154155
SB_MICRO_PATH=/opt/superbench \
155156
ANSIBLE_DEPRECATION_WARNINGS=FALSE \

superbench/benchmarks/model_benchmarks/megatron_gpt3.py

Lines changed: 34 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,9 @@ def add_parser_arguments(self):
116116
self._parser.add_argument('--data_home', type=str, default='/tmp', help='Data home.')
117117
self._parser.add_argument('--vocab_path', type=str, default='/tmp/gpt2-vocab.json', help='Vocab path.')
118118
self._parser.add_argument('--merge_path', type=str, default='/tmp/gpt2-merges.txt', help='Merge path.')
119+
self._parser.add_argument(
120+
'--split', type=str, default='949,50,1', help='Split dataset ratio for train/val/test.'
121+
)
119122
self._parser.add_argument('--prescale_grad', action='store_true', help='Prescale grad.')
120123
self._parser.add_argument(
121124
'--hostfile', type=str, default=None, help='Hostfile to run the mutli-node benchmark.'
@@ -128,6 +131,13 @@ def add_parser_arguments(self):
128131
def _preprocess(self):
129132
if not super()._preprocess():
130133
return False
134+
if not self._args.code_base:
135+
if self._args.deepspeed:
136+
self._args.code_base = os.path.join(
137+
os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-DeepSpeed/'
138+
)
139+
else:
140+
self._args.code_base = os.path.join(os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-LM')
131141

132142
if not os.path.exists(self._args.code_base) or \
133143
not os.path.exists(os.path.join(self._args.code_base, 'pretrain_gpt.py')):
@@ -156,35 +166,35 @@ def _preprocess(self):
156166

157167
def _parse_log(self, output):
158168
"""Parse log output and get the performance."""
159-
tflops_pattern = re.compile(r'TFLOPs: (\d+\.\d+)')
169+
tflops_pattern = re.compile(r'(TFLOPs|TFLOP/s/GPU\)): (\d+\.\d+)')
160170
elapsed_time_pattern = re.compile(r'elapsed time per iteration \(ms\): (\d+\.\d+)')
161-
mem_allocated_pattern = re.compile(r'MemAllocated=([\d.]+)[KMGTPEZY]?B')
162-
max_mem_allocated_pattern = re.compile(r'MaxMemAllocated=([\d.]+)[KMGTPEZY]?B')
171+
mem_allocated_pattern = re.compile(r'allocated: (\d+\.\d+)')
172+
max_mem_allocated_pattern = re.compile(r'max allocated: (\d+\.\d+)')
163173
lines = output.splitlines()
164174
tflops = []
165175
mem_allocated = []
166176
max_mem_allocated = []
167177
iteration_times = []
168178
for line in lines:
169-
if 'TFLOPs' in line:
179+
if 'elapsed time per iteration' in line:
170180
tflops_matches = tflops_pattern.search(line)
171181
elapsed_time_match = elapsed_time_pattern.search(line)
172182
if tflops_matches:
173-
tflops_values = float(tflops_matches.group(1))
183+
tflops_values = float(tflops_matches.group(2))
174184
tflops.append(tflops_values)
175185
if elapsed_time_match:
176186
elapsed_time_value = float(elapsed_time_match.group(1))
177187
iteration_times.append(elapsed_time_value)
178188

179-
if 'MaxMemAllocated' in line:
189+
if 'max allocated' in line:
180190
mem_allocated_match = mem_allocated_pattern.search(line)
181191
max_mem_allocated_match = max_mem_allocated_pattern.search(line)
182192
if mem_allocated_match:
183-
mem_allocated_value = float(mem_allocated_match.group(1))
193+
mem_allocated_value = float(mem_allocated_match.group(1)) / 1024
184194
mem_allocated.append(mem_allocated_value)
185195

186196
if max_mem_allocated_match:
187-
max_mem_allocated_value = float(max_mem_allocated_match.group(1))
197+
max_mem_allocated_value = float(max_mem_allocated_match.group(1)) / 1024
188198
max_mem_allocated.append(max_mem_allocated_value)
189199

190200
return iteration_times, tflops, mem_allocated, max_mem_allocated
@@ -224,7 +234,9 @@ def __prepare_deespeed_config(self, precision_megatron):
224234
--deepspeed \
225235
--deepspeed_config {self._config_json_path} \
226236
--zero-stage {self._args.zero_stage} \
227-
--pipeline-model-parallel-size {self._args.pipeline_model_parallel_size}'
237+
--pipeline-model-parallel-size {self._args.pipeline_model_parallel_size} \
238+
--train-tokens {self._args.train_tokens} \
239+
--data-impl {self._args.data_impl}'
228240

229241
if self._args.pipeline_model_parallel_size <= 1:
230242
deepspeed_options = f'{deepspeed_options} --no-pipeline-parallel'
@@ -255,11 +267,10 @@ def _megatron_command(self, precision): # noqa: C901
255267
--num-attention-heads {self._args.num_attn_heads} \
256268
--seq-length {self._args.seq_len} \
257269
--max-position-embeddings {self._args.seq_len} \
258-
--train-tokens {self._args.train_tokens} \
259270
--train-samples {self._args.num_steps * self._args.batch_size} \
260271
--lr {self._args.lr} \
261272
--min-lr {self._args.min_lr} \
262-
--split 949,50,1 \
273+
--split {self._args.split} \
263274
--log-interval {self._args.log_interval} \
264275
--eval-interval {self._args.eval_interval} \
265276
--eval-iters {self._args.eval_iters} \
@@ -273,7 +284,8 @@ def _megatron_command(self, precision): # noqa: C901
273284
--optimizer adam \
274285
--use-distributed-optimizer \
275286
{precision_megatron} \
276-
--seed {self._args.seed}'
287+
--seed {self._args.seed} \
288+
--log-throughput'
277289

278290
if self._args.sequence_parallel:
279291
megatron_options = f'{megatron_options} --sequence-parallel'
@@ -298,6 +310,8 @@ def _megatron_command(self, precision): # noqa: C901
298310
script_path = os.path.join(self._args.code_base, 'pretrain_gpt.py')
299311
if self._args.deepspeed:
300312
deepspeed_option = self.__prepare_deespeed_config(precision_megatron.lstrip('--'))
313+
# Megatron-DeepSpeed does not support --log-throughput as of 2023-12-19
314+
megatron_options = megatron_options.replace('--log-throughput', '').strip()
301315
if self._num_nodes > 1:
302316
command = f'torchrun {self._distributed_args} ' + \
303317
f'{script_path} {megatron_options} {self._data_options} {deepspeed_option}'
@@ -379,6 +393,7 @@ def _init_distributed_setting(self):
379393

380394
return False
381395
self._num_nodes = int(os.getenv('OMPI_COMM_WORLD_SIZE')) // int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))
396+
master_addr = 'localhost'
382397
if self._num_nodes > 1:
383398
if not self._args.hostfile:
384399
sb_hostfile = os.path.join(os.environ.get('SB_WORKSPACE', '.'), 'hostfile')
@@ -395,12 +410,13 @@ def _init_distributed_setting(self):
395410
if self._num_nodes != len(hosts):
396411
logger.error('MPI init failed since hostfile not match the MPI setting.')
397412
return False
413+
master_addr = hosts[0].split()[0]
398414

399-
addr = os.getenv('MASTER_ADDR', hosts[0].split()[0])
400-
port = os.getenv('MASTER_PORT', '29500')
401-
node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])
402-
self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \
403-
f'--node_rank {node_rank} --master_addr {addr} --master_port {port}'
415+
addr = os.getenv('MASTER_ADDR', master_addr)
416+
port = os.getenv('MASTER_PORT', '29500')
417+
node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])
418+
self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \
419+
f'--node_rank {node_rank} --master_addr {addr} --master_port {port}'
404420
return True
405421

406422
def _generate_dataset(self):
@@ -448,8 +464,7 @@ def _generate_dataset(self):
448464
self._data_options = f'\
449465
--vocab-file {self._vocab_path} \
450466
--merge-file {self._merges_path} \
451-
--data-path {self._data_path} \
452-
--data-impl {self._args.data_impl}'
467+
--data-path {self._data_path}'
453468

454469
logger.info('Dataset preparation successfully.')
455470
return True

superbench/benchmarks/model_benchmarks/model_base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,8 +265,8 @@ def __train(self, precision):
265265
# The unit of step time should be millisecond.
266266
step_times = self._train_step(precision)
267267
if isinstance(step_times, tuple):
268-
step_times = step_times[0]
269268
info = step_times[1]
269+
step_times = step_times[0]
270270
self._process_info(ModelAction.TRAIN, precision, info)
271271
step_times = self.__process_model_result(ModelAction.TRAIN, precision, step_times)
272272
if not step_times:

tests/benchmarks/model_benchmarks/test_megatron_gpt.py

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -177,8 +177,7 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
177177
benchmark._data_options = f'\
178178
--vocab-file {self._tmp_dir}/gpt2-vocab.json \
179179
--merge-file {self._tmp_dir}/gpt2-merges.txt \
180-
--data-path {self._tmp_dir}/dataset_text_document \
181-
--data-impl mmap'
180+
--data-path {self._tmp_dir}/dataset_text_document'
182181

183182
script_path = str(Path(self._tmp_dir) / 'pretrain_gpt.py')
184183
expected_command = 'torchrun {distributed_args} {script_path} \
@@ -197,7 +196,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
197196
--num-attention-heads 32 \
198197
--seq-length 2048 \
199198
--max-position-embeddings 2048 \
200-
--train-tokens 300000000000 \
201199
--train-samples 20480 \
202200
--lr 0.00012 \
203201
--min-lr 1e-06 \
@@ -215,7 +213,8 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
215213
--optimizer adam \
216214
--use-distributed-optimizer \
217215
{precision} \
218-
--seed 1234 {data_options}'
216+
--seed 1234 \
217+
--log-throughput {data_options}'
219218

220219
precision = Precision.FLOAT32
221220
command = benchmark._megatron_command(precision)
@@ -262,12 +261,10 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
262261
benchmark._data_options = f'\
263262
--vocab-file {self._tmp_dir}/gpt2-vocab.json \
264263
--merge-file {self._tmp_dir}/gpt2-merges.txt \
265-
--data-path {self._tmp_dir}/dataset_text_document \
266-
--data-impl mmap'
264+
--data-path {self._tmp_dir}/dataset_text_document'
267265

268266
command = benchmark._megatron_command(Precision.BFLOAT16)
269-
expected_command = 'deepspeed {script_path} \
270-
--override-opt_param-scheduler \
267+
expected_command = 'deepspeed {script_path} --override-opt_param-scheduler \
271268
--adam-beta1 0.9 \
272269
--adam-beta2 0.95 \
273270
--tensor-model-parallel-size 1 \
@@ -282,7 +279,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
282279
--num-attention-heads 32 \
283280
--seq-length 2048 \
284281
--max-position-embeddings 2048 \
285-
--train-tokens 300000000000 \
286282
--train-samples 20480 \
287283
--lr 0.00012 \
288284
--min-lr 1e-06 \
@@ -306,7 +302,9 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
306302
--deepspeed \
307303
--deepspeed_config {benchmark._config_json_path} \
308304
--zero-stage 1 \
309-
--pipeline-model-parallel-size 1 --no-pipeline-parallel'
305+
--pipeline-model-parallel-size 1 \
306+
--train-tokens 300000000000 \
307+
--data-impl mmap --no-pipeline-parallel'
310308

311309
self.assertEqual(
312310
command,
@@ -346,12 +344,12 @@ def test_megatron_parse_log(self, raw_output, mock_generate_dataset):
346344
iteration_times, tflops, mem_allocated, max_mem_allocated = benchmark._parse_log(raw_output)
347345
assert (statistics.mean(iteration_times) == 75239.24)
348346
assert (statistics.mean(tflops) == 149.136)
349-
assert (statistics.mean(mem_allocated) == 17.54)
350-
assert (statistics.mean(max_mem_allocated) == 66.97)
347+
assert (statistics.mean(mem_allocated) == 17.535637855529785)
348+
assert (statistics.mean(max_mem_allocated) == 66.9744234085083)
351349

352350
info = {'tflops': tflops, 'mem_allocated': mem_allocated, 'max_mem_allocated': max_mem_allocated}
353351
benchmark._process_info(ModelAction.TRAIN, Precision.FLOAT16, info)
354352
assert (benchmark.result is not None)
355353
assert (benchmark.result['fp16_train_tflops'][0] == 149.136)
356-
assert (benchmark.result['fp16_train_mem_allocated'][0] == 17.54)
357-
assert (benchmark.result['fp16_train_max_mem_allocated'][0] == 66.97)
354+
assert (benchmark.result['fp16_train_mem_allocated'][0] == 17.535637855529785)
355+
assert (benchmark.result['fp16_train_max_mem_allocated'][0] == 66.9744234085083)

third_party/Makefile

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -177,21 +177,17 @@ directx_amf_encoding_latency:
177177
"C:\temp\BuildTools\MSBuild\Current\Bin\MSBuild.exe" "AMF\amf\public\samples\CPPSamples_vs2019.sln" /t:EncoderLatency /p:Platform=x64 /p:Configuration=Release /p:OutDir="%SB_MICRO_PATH%\bin" \
178178
)
179179

180-
# Install Megatron-LM
180+
# Install requirements for Megatron-LM
181181
megatron_lm:
182-
if [ ! -d "Megatron/Megatron-LM" ]; then \
183-
git clone "https://github.com/NVIDIA/Megatron-LM.git" "Megatron/Megatron-LM"; \
184-
fi
185182
cd Megatron && \
186-
python -m pip install -r requirements.txt
183+
apt install -y python3-mpi4py && \
184+
python -m pip install --no-cache-dir -r requirements.txt
187185

188-
# Install Megatron-DeepSpeed
186+
# Install requirements for Megatron-DeepSpeed
189187
megatron_deepspeed:
190-
if [ ! -d "Megatron/Megatron-DeepSpeed" ]; then \
191-
git clone "https://github.com/microsoft/Megatron-DeepSpeed.git" "Megatron/Megatron-DeepSpeed"; \
192-
fi
193188
cd Megatron && \
194-
python -m pip install -r requirements.txt && \
189+
apt install -y python3-mpi4py && \
190+
python -m pip install --no-cache-dir -r requirements.txt && \
195191
python -m pip install DeepSpeed
196192

197193
# Install apex for ROCm, which Megatron depends on
Submodule Megatron-DeepSpeed added at 71e8407

third_party/Megatron/Megatron-LM

Submodule Megatron-LM added at 52b7a18

0 commit comments

Comments
 (0)