Bug Fix - Bug fix for latest megatron-lm benchmark (#600)

yukirora · web-flow · commit ce1860b9b640 · 2023-12-27T21:31:41.000Z
**Description**
Fix the Megatron-LM benchmark so that it stays in sync with the latest Megatron-LM code.
diff --git a/.gitmodules b/.gitmodules
@@ -24,3 +24,9 @@
 [submodule "third_party/msccl"]
 	path = third_party/msccl
 	url = https://github.com/Azure/msccl
+[submodule "third_party/Megatron/Megatron-LM"]
+	path = third_party/Megatron/Megatron-LM
+	url = https://github.com/NVIDIA/Megatron-LM.git
+[submodule "third_party/Megatron/Megatron-DeepSpeed"]
+	path = third_party/Megatron/Megatron-DeepSpeed
+	url = https://github.com/microsoft/Megatron-DeepSpeed.git
diff --git a/dockerfile/directx12.dockerfile b/dockerfile/directx12.dockerfile
@@ -54,6 +54,8 @@ RUN curl -s -L https://dist.nuget.org/win-x86-commandline/latest/nuget.exe -o "%
 # Run the setup script to install the visual studio components
 RUN "%SB_HOME%\\dockerfile\\directx\\install-components.bat"
 
+RUN powershell -Command "Set-ItemProperty -Path HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem -Name LongPathsEnabled -Value 1;"
+RUN git config --system core.longpaths true
 # Install Superbench
 RUN python -m pip install setuptools==65.0.0 && \
     python -m pip install --no-cache-dir .[amdworker] && \
diff --git a/dockerfile/rocm5.7.x.dockerfile b/dockerfile/rocm5.7.x.dockerfile
@@ -109,6 +109,7 @@ RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942\ngfx10
 
 # Install OpenMPI
 ENV OPENMPI_VERSION=4.1.x
+ENV MPI_HOME=/usr/local/mpi
 # Check if Open MPI is installed
 RUN cd /tmp && \
     git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION}  && \
@@ -145,9 +146,9 @@ RUN cd /opt/ &&  \
 RUN cd /opt/rocm/share/amd_smi && \
     python3 -m pip install --user .
 
-ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
+ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
     LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
-    LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
+    LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
     SB_HOME=/opt/superbench \
     SB_MICRO_PATH=/opt/superbench \
     ANSIBLE_DEPRECATION_WARNINGS=FALSE \
diff --git a/dockerfile/rocm6.0.x.dockerfile b/dockerfile/rocm6.0.x.dockerfile
@@ -10,7 +10,7 @@ FROM ${BASE_IMAGE}
 # Lib:
 #   - torch: 2.0.1
 #   - rccl: 2.18.3+hip6.0 develop:7e1cbb4
-#   - hipblaslt: 950ca43
+#   - hipblaslt: release/rocm-rel-6.0
 #   - openmpi: 4.1.x
 #   - apex: 1.0.0
 # Intel:
@@ -115,6 +115,7 @@ RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942:sramec
 
 # Install OpenMPI
 ENV OPENMPI_VERSION=4.1.x
+ENV MPI_HOME=/usr/local/mpi
 # Check if Open MPI is installed
 RUN cd /tmp && \
     git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION}  && \
@@ -147,9 +148,9 @@ RUN cd /opt/ &&  \
     .. && \
     make -j${NUM_MAKE_JOBS}
 
-ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
+ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
     LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
-    LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
+    LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
     SB_HOME=/opt/superbench \
     SB_MICRO_PATH=/opt/superbench \
     ANSIBLE_DEPRECATION_WARNINGS=FALSE \
diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
@@ -116,6 +116,9 @@ def add_parser_arguments(self):
         self._parser.add_argument('--data_home', type=str, default='/tmp', help='Data home.')
         self._parser.add_argument('--vocab_path', type=str, default='/tmp/gpt2-vocab.json', help='Vocab path.')
         self._parser.add_argument('--merge_path', type=str, default='/tmp/gpt2-merges.txt', help='Merge path.')
+        self._parser.add_argument(
+            '--split', type=str, default='949,50,1', help='Split dataset ratio for train/val/test.'
+        )
         self._parser.add_argument('--prescale_grad', action='store_true', help='Prescale grad.')
         self._parser.add_argument(
             '--hostfile', type=str, default=None, help='Hostfile to run the mutli-node benchmark.'
@@ -128,6 +131,13 @@ def add_parser_arguments(self):
     def _preprocess(self):
         if not super()._preprocess():
             return False
+        if not self._args.code_base:
+            if self._args.deepspeed:
+                self._args.code_base = os.path.join(
+                    os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-DeepSpeed/'
+                )
+            else:
+                self._args.code_base = os.path.join(os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-LM')
 
         if not os.path.exists(self._args.code_base) or \
                 not os.path.exists(os.path.join(self._args.code_base, 'pretrain_gpt.py')):
@@ -156,35 +166,35 @@ def _preprocess(self):
 
     def _parse_log(self, output):
         """Parse log output and get the performance."""
-        tflops_pattern = re.compile(r'TFLOPs: (\d+\.\d+)')
+        tflops_pattern = re.compile(r'(TFLOPs|TFLOP/s/GPU\)): (\d+\.\d+)')
         elapsed_time_pattern = re.compile(r'elapsed time per iteration \(ms\): (\d+\.\d+)')
-        mem_allocated_pattern = re.compile(r'MemAllocated=([\d.]+)[KMGTPEZY]?B')
-        max_mem_allocated_pattern = re.compile(r'MaxMemAllocated=([\d.]+)[KMGTPEZY]?B')
+        mem_allocated_pattern = re.compile(r'allocated: (\d+\.\d+)')
+        max_mem_allocated_pattern = re.compile(r'max allocated: (\d+\.\d+)')
         lines = output.splitlines()
         tflops = []
         mem_allocated = []
         max_mem_allocated = []
         iteration_times = []
         for line in lines:
-            if 'TFLOPs' in line:
+            if 'elapsed time per iteration' in line:
                 tflops_matches = tflops_pattern.search(line)
                 elapsed_time_match = elapsed_time_pattern.search(line)
                 if tflops_matches:
-                    tflops_values = float(tflops_matches.group(1))
+                    tflops_values = float(tflops_matches.group(2))
                     tflops.append(tflops_values)
                 if elapsed_time_match:
                     elapsed_time_value = float(elapsed_time_match.group(1))
                     iteration_times.append(elapsed_time_value)
 
-            if 'MaxMemAllocated' in line:
+            if 'max allocated' in line:
                 mem_allocated_match = mem_allocated_pattern.search(line)
                 max_mem_allocated_match = max_mem_allocated_pattern.search(line)
                 if mem_allocated_match:
-                    mem_allocated_value = float(mem_allocated_match.group(1))
+                    mem_allocated_value = float(mem_allocated_match.group(1)) / 1024
                     mem_allocated.append(mem_allocated_value)
 
                 if max_mem_allocated_match:
-                    max_mem_allocated_value = float(max_mem_allocated_match.group(1))
+                    max_mem_allocated_value = float(max_mem_allocated_match.group(1)) / 1024
                     max_mem_allocated.append(max_mem_allocated_value)
 
         return iteration_times, tflops, mem_allocated, max_mem_allocated
@@ -224,7 +234,9 @@ def __prepare_deespeed_config(self, precision_megatron):
             --deepspeed \
             --deepspeed_config {self._config_json_path} \
             --zero-stage {self._args.zero_stage} \
-            --pipeline-model-parallel-size {self._args.pipeline_model_parallel_size}'
+            --pipeline-model-parallel-size {self._args.pipeline_model_parallel_size} \
+            --train-tokens {self._args.train_tokens} \
+            --data-impl {self._args.data_impl}'
 
         if self._args.pipeline_model_parallel_size <= 1:
             deepspeed_options = f'{deepspeed_options} --no-pipeline-parallel'
@@ -255,11 +267,10 @@ def _megatron_command(self, precision):    # noqa: C901
             --num-attention-heads {self._args.num_attn_heads} \
             --seq-length {self._args.seq_len} \
             --max-position-embeddings {self._args.seq_len} \
-            --train-tokens {self._args.train_tokens} \
             --train-samples {self._args.num_steps * self._args.batch_size} \
             --lr {self._args.lr} \
             --min-lr {self._args.min_lr} \
-            --split 949,50,1 \
+            --split {self._args.split} \
             --log-interval {self._args.log_interval} \
             --eval-interval {self._args.eval_interval} \
             --eval-iters {self._args.eval_iters} \
@@ -273,7 +284,8 @@ def _megatron_command(self, precision):    # noqa: C901
             --optimizer adam \
             --use-distributed-optimizer \
             {precision_megatron} \
-            --seed {self._args.seed}'
+            --seed {self._args.seed} \
+            --log-throughput'
 
         if self._args.sequence_parallel:
             megatron_options = f'{megatron_options} --sequence-parallel'
@@ -298,6 +310,8 @@ def _megatron_command(self, precision):    # noqa: C901
         script_path = os.path.join(self._args.code_base, 'pretrain_gpt.py')
         if self._args.deepspeed:
             deepspeed_option = self.__prepare_deespeed_config(precision_megatron.lstrip('--'))
+            # Megatron-DeepSpeed has no --log-throughput option as of 2023-12-19, so drop it.
+            megatron_options = megatron_options.replace('--log-throughput', '').strip()
             if self._num_nodes > 1:
                 command = f'torchrun {self._distributed_args} ' + \
                     f'{script_path} {megatron_options} {self._data_options} {deepspeed_option}'
@@ -379,6 +393,7 @@ def _init_distributed_setting(self):
 
             return False
         self._num_nodes = int(os.getenv('OMPI_COMM_WORLD_SIZE')) // int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))
+        master_addr = 'localhost'
         if self._num_nodes > 1:
             if not self._args.hostfile:
                 sb_hostfile = os.path.join(os.environ.get('SB_WORKSPACE', '.'), 'hostfile')
@@ -395,12 +410,13 @@ def _init_distributed_setting(self):
             if self._num_nodes != len(hosts):
                 logger.error('MPI init failed since hostfile not match the MPI setting.')
                 return False
+            master_addr = hosts[0].split()[0]
 
-            addr = os.getenv('MASTER_ADDR', hosts[0].split()[0])
-            port = os.getenv('MASTER_PORT', '29500')
-            node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])
-            self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \
-                f'--node_rank {node_rank} --master_addr {addr} --master_port {port}'
+        addr = os.getenv('MASTER_ADDR', master_addr)
+        port = os.getenv('MASTER_PORT', '29500')
+        node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])
+        self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \
+            f'--node_rank {node_rank} --master_addr {addr} --master_port {port}'
         return True
 
     def _generate_dataset(self):
@@ -448,8 +464,7 @@ def _generate_dataset(self):
         self._data_options = f'\
             --vocab-file {self._vocab_path} \
             --merge-file {self._merges_path} \
-            --data-path {self._data_path} \
-            --data-impl {self._args.data_impl}'
+            --data-path {self._data_path}'
 
         logger.info('Dataset preparation successfully.')
         return True
diff --git a/superbench/benchmarks/model_benchmarks/model_base.py b/superbench/benchmarks/model_benchmarks/model_base.py
@@ -265,8 +265,8 @@ def __train(self, precision):
         # The unit of step time should be millisecond.
         step_times = self._train_step(precision)
         if isinstance(step_times, tuple):
-            step_times = step_times[0]
             info = step_times[1]
+            step_times = step_times[0]
             self._process_info(ModelAction.TRAIN, precision, info)
         step_times = self.__process_model_result(ModelAction.TRAIN, precision, step_times)
         if not step_times:
diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
@@ -177,8 +177,7 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
         benchmark._data_options = f'\
             --vocab-file {self._tmp_dir}/gpt2-vocab.json \
             --merge-file {self._tmp_dir}/gpt2-merges.txt \
-            --data-path {self._tmp_dir}/dataset_text_document \
-            --data-impl mmap'
+            --data-path {self._tmp_dir}/dataset_text_document'
 
         script_path = str(Path(self._tmp_dir) / 'pretrain_gpt.py')
         expected_command = 'torchrun {distributed_args} {script_path} \
@@ -197,7 +196,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
             --num-attention-heads 32 \
             --seq-length 2048 \
             --max-position-embeddings 2048 \
-            --train-tokens 300000000000 \
             --train-samples 20480 \
             --lr 0.00012 \
             --min-lr 1e-06 \
@@ -215,7 +213,8 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
             --optimizer adam \
             --use-distributed-optimizer \
             {precision} \
-            --seed 1234 {data_options}'
+            --seed 1234 \
+            --log-throughput {data_options}'
 
         precision = Precision.FLOAT32
         command = benchmark._megatron_command(precision)
@@ -262,12 +261,10 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
         benchmark._data_options = f'\
             --vocab-file {self._tmp_dir}/gpt2-vocab.json \
             --merge-file {self._tmp_dir}/gpt2-merges.txt \
-            --data-path {self._tmp_dir}/dataset_text_document \
-            --data-impl mmap'
+            --data-path {self._tmp_dir}/dataset_text_document'
 
         command = benchmark._megatron_command(Precision.BFLOAT16)
-        expected_command = 'deepspeed {script_path} \
-            --override-opt_param-scheduler \
+        expected_command = 'deepspeed {script_path} --override-opt_param-scheduler \
             --adam-beta1 0.9 \
             --adam-beta2 0.95 \
             --tensor-model-parallel-size 1 \
@@ -282,7 +279,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
             --num-attention-heads 32 \
             --seq-length 2048 \
             --max-position-embeddings 2048 \
-            --train-tokens 300000000000 \
             --train-samples 20480 \
             --lr 0.00012 \
             --min-lr 1e-06 \
@@ -306,7 +302,9 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
             --deepspeed \
             --deepspeed_config {benchmark._config_json_path} \
             --zero-stage 1 \
-            --pipeline-model-parallel-size 1 --no-pipeline-parallel'
+            --pipeline-model-parallel-size 1 \
+            --train-tokens 300000000000 \
+            --data-impl mmap --no-pipeline-parallel'
 
         self.assertEqual(
             command,
@@ -346,12 +344,12 @@ def test_megatron_parse_log(self, raw_output, mock_generate_dataset):
         iteration_times, tflops, mem_allocated, max_mem_allocated = benchmark._parse_log(raw_output)
         assert (statistics.mean(iteration_times) == 75239.24)
         assert (statistics.mean(tflops) == 149.136)
-        assert (statistics.mean(mem_allocated) == 17.54)
-        assert (statistics.mean(max_mem_allocated) == 66.97)
+        assert (statistics.mean(mem_allocated) == 17.535637855529785)
+        assert (statistics.mean(max_mem_allocated) == 66.9744234085083)
 
         info = {'tflops': tflops, 'mem_allocated': mem_allocated, 'max_mem_allocated': max_mem_allocated}
         benchmark._process_info(ModelAction.TRAIN, Precision.FLOAT16, info)
         assert (benchmark.result is not None)
         assert (benchmark.result['fp16_train_tflops'][0] == 149.136)
-        assert (benchmark.result['fp16_train_mem_allocated'][0] == 17.54)
-        assert (benchmark.result['fp16_train_max_mem_allocated'][0] == 66.97)
+        assert (benchmark.result['fp16_train_mem_allocated'][0] == 17.535637855529785)
+        assert (benchmark.result['fp16_train_max_mem_allocated'][0] == 66.9744234085083)
diff --git a/third_party/Makefile b/third_party/Makefile
@@ -177,21 +177,17 @@ directx_amf_encoding_latency:
 		"C:\temp\BuildTools\MSBuild\Current\Bin\MSBuild.exe" "AMF\amf\public\samples\CPPSamples_vs2019.sln" /t:EncoderLatency /p:Platform=x64 /p:Configuration=Release /p:OutDir="%SB_MICRO_PATH%\bin" \
 	)
 
-# Install Megatron-LM
+# Install requirements for Megatron-LM
 megatron_lm:
-	if [ ! -d "Megatron/Megatron-LM" ]; then \
-        git clone "https://github.com/NVIDIA/Megatron-LM.git" "Megatron/Megatron-LM"; \
-    fi
 	cd Megatron && \
-	python -m pip install -r requirements.txt
+	apt install -y python3-mpi4py && \
+	python -m pip install --no-cache-dir -r requirements.txt
 
-# Install Megatron-DeepSpeed
+# Install requirements for Megatron-DeepSpeed
 megatron_deepspeed:
-	if [ ! -d "Megatron/Megatron-DeepSpeed" ]; then \
-        git clone "https://github.com/microsoft/Megatron-DeepSpeed.git" "Megatron/Megatron-DeepSpeed"; \
-    fi
 	cd Megatron && \
-	python -m pip install -r requirements.txt && \
+	apt install -y python3-mpi4py && \
+	python -m pip install --no-cache-dir -r requirements.txt && \
 	python -m pip install DeepSpeed
 
 # Instal apex of ROCm due to dependency of Megatron
diff --git a/third_party/Megatron/Megatron-DeepSpeed b/third_party/Megatron/Megatron-DeepSpeed
@@ -0,0 +1 @@
+Subproject commit 71e8407c98bacacb002823ea587c321fe58b28a6
diff --git a/third_party/Megatron/Megatron-LM b/third_party/Megatron/Megatron-LM
@@ -0,0 +1 @@
+Subproject commit 52b7a18a00bced8b3670eededfd58ee0c4bd7d06
diff --git a/third_party/Megatron/megatron_deepspeed_rocm6.patch b/third_party/Megatron/megatron_deepspeed_rocm6.patch
diff --git a/third_party/Megatron/requirements.txt b/third_party/Megatron/requirements.txt