From 7086eb4d0eb36f0751975d38aaca926adec6bc0f Mon Sep 17 00:00:00 2001 From: yukirora Date: Fri, 23 May 2025 06:09:45 +0000 Subject: [PATCH 1/6] init --- setup.py | 8 +- .../model_benchmarks/megatron_gpt3.py | 634 ++++++++++++------ .../model_benchmarks/pytorch_base.py | 2 +- third_party/Makefile | 20 +- 4 files changed, 461 insertions(+), 203 deletions(-) diff --git a/setup.py b/setup.py index 29688a5a5..120329c60 100644 --- a/setup.py +++ b/setup.py @@ -189,11 +189,11 @@ def run(self): 'requests>=2.27.1', 'seaborn>=0.11.2', 'tcping>=0.1.1rc1', - 'urllib3>=1.26.9', 'xlrd>=2.0.1', 'xlsxwriter>=1.3.8', 'xmltodict>=0.12.0', - 'types-requests', + 'types-requests==0.1.13', + 'urllib3==1.26.20' ], extras_require=( lambda x: { @@ -225,10 +225,10 @@ def run(self): ], 'torch': [ 'safetensors==0.4.5', - 'tokenizers<=0.20.3', + 'tokenizers>=0.20.3', 'torch>=1.7.0a0', 'torchvision>=0.8.0a0', - 'transformers>=4.28.0', + 'transformers>=4.48.1', ], 'ort': [ 'onnx>=1.10.2', diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py index 5c3350e95..ac4f2ffc6 100644 --- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py +++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py @@ -22,13 +22,14 @@ def download_file(url, path): """Download file from url to path.""" response = requests.get(url) - with open(path, 'wb') as file: + with open(path, "wb") as file: file.write(response.content) class MegatronGPT(ModelBenchmark): """The Megatron DeepSpeed GPT pretrain benchmark class.""" - def __init__(self, name, parameters=''): + + def __init__(self, name, parameters=""): """Constructor. Args: @@ -36,97 +37,257 @@ def __init__(self, name, parameters=''): parameters (str): parameters of the benchmark. """ super().__init__(name, parameters) - self._supported_precision = [Precision.FLOAT32, Precision.FLOAT16, Precision.BFLOAT16] + self._supported_precision = [ + Precision.FLOAT32, + Precision.FLOAT16, + Precision.BFLOAT16, + ] def add_parser_arguments(self): """Add the specified arguments.""" super().add_parser_arguments() - self._parser.add_argument('--code_base', type=str, required=False, default='', help='Code base.') - self._parser.add_argument('--dataset_url', type=str, required=False, default=None, help='Dataset URL.') self._parser.add_argument( - '--vocab_url', + "--code_base", type=str, required=False, default="", help="Code base." + ) + self._parser.add_argument( + "--dataset_url", type=str, required=False, default=None, help="Dataset URL." + ) + self._parser.add_argument( + "--vocab_url", + type=str, + required=False, + default="https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", + help="Vocab URL.", + ) + self._parser.add_argument( + "--merges_url", type=str, required=False, - default='https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json', - help='Vocab URL.' + default="https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", + help="Merges URL.", ) self._parser.add_argument( - '--merges_url', + "--tokenizer_type", type=str, required=False, - default='https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt', - help='Merges URL.' + default="GPT2BPETokenizer", + help="Tokenizer type.", + ) + self._parser.add_argument( + "--model_size", type=int, required=False, default=6.7, help="Model size." ) self._parser.add_argument( - '--tokenizer_type', type=str, required=False, default='GPT2BPETokenizer', help='Tokenizer type.' + "--num_layers", + type=int, + required=False, + default=32, + help="Number of layers.", ) - self._parser.add_argument('--model_size', type=int, required=False, default=6.7, help='Model size.') - self._parser.add_argument('--num_layers', type=int, required=False, default=32, help='Number of layers.') - self._parser.add_argument('--hidden_size', type=int, required=False, default=4096, help='Hidden size.') self._parser.add_argument( - '--num_attn_heads', type=int, required=False, default=32, help='Number of attention heads.' + "--hidden_size", type=int, required=False, default=4096, help="Hidden size." ) - self._parser.add_argument('--micro_batch_size', type=int, required=False, default=2, help='micro batch size.') - self._parser.add_argument('--lr', type=float, required=False, default=1.2e-4, help='Learning rate.') - self._parser.add_argument('--min_lr', type=float, required=False, default=1.0e-6, help='Minimum learning rate.') - self._parser.add_argument('--init_std', type=float, required=False, default=0.009, help='Init std.') - self._parser.add_argument('--seq_len', type=int, required=False, default=2048, help='Sequence length.') self._parser.add_argument( - '--tensor_model_parallel_size', type=int, required=False, default=1, help='Tensor model parallel size.' + "--num_attn_heads", + type=int, + required=False, + default=32, + help="Number of attention heads.", ) self._parser.add_argument( - '--pipeline_model_parallel_size', type=int, required=False, default=1, help='Pipeline model parallel size.' + "--micro_batch_size", + type=int, + required=False, + default=2, + help="micro batch size.", ) self._parser.add_argument( - '--num_gpus', type=int, required=False, default=8, help='Number of GPUs per node to run the benchmark.' + "--lr", type=float, required=False, default=1.2e-4, help="Learning rate." ) self._parser.add_argument( - '--num_nodes', type=int, required=False, default=1, help='Number of nodes to run the benchmark.' + "--min_lr", + type=float, + required=False, + default=1.0e-6, + help="Minimum learning rate.", + ) + self._parser.add_argument( + "--init_std", type=float, required=False, default=0.009, help="Init std." + ) + self._parser.add_argument( + "--seq_len", type=int, required=False, default=2048, help="Sequence length." + ) + self._parser.add_argument( + "--tensor_model_parallel_size", + type=int, + required=False, + default=1, + help="Tensor model parallel size.", + ) + self._parser.add_argument( + "--pipeline_model_parallel_size", + type=int, + required=False, + default=1, + help="Pipeline model parallel size.", + ) + self._parser.add_argument( + "--num_gpus", + type=int, + required=False, + default=8, + help="Number of GPUs per node to run the benchmark.", + ) + self._parser.add_argument( + "--num_nodes", + type=int, + required=False, + default=1, + help="Number of nodes to run the benchmark.", ) - self._parser.add_argument('--sequence_parallel', action='store_true', help='Enable Sequence parallel.') self._parser.add_argument( - '--no_async_tensor_model_parallel_allreduce', - action='store_true', - help='No async tensor model parallel allreduce.' + "--sequence_parallel", action="store_true", help="Enable Sequence parallel." ) self._parser.add_argument( - '--use_rotary_position_embeddings', action='store_true', help='Use rotary position embeddings.' + "--no_async_tensor_model_parallel_allreduce", + action="store_true", + help="No async tensor model parallel allreduce.", ) self._parser.add_argument( - '--no_gradient_accumulation_fusion', action='store_true', help='No gradient accumulation fusion.' + "--use_rotary_position_embeddings", + action="store_true", + help="Use rotary position embeddings.", ) - self._parser.add_argument('--use_flash_attn', action='store_true', help='Use flash attention.') - self._parser.add_argument('--no_masked_softmax_fusion', action='store_true', help='No masked softmax fusion.') - self._parser.add_argument('--no_bias_gelu_fusion', action='store_true', help='No bias gelu fusion.') - self._parser.add_argument('--no_bias_dropout_fusion', action='store_true', help='No bias dropout fusion.') self._parser.add_argument( - '--train_tokens', type=int, required=False, default=300000000000, help='Train tokens.' + "--no_gradient_accumulation_fusion", + action="store_true", + help="No gradient accumulation fusion.", + ) + self._parser.add_argument( + "--use_flash_attn", action="store_true", help="Use flash attention." + ) + self._parser.add_argument( + "--no_masked_softmax_fusion", + action="store_true", + help="No masked softmax fusion.", + ) + self._parser.add_argument( + "--no_bias_gelu_fusion", action="store_true", help="No bias gelu fusion." + ) + self._parser.add_argument( + "--no_bias_dropout_fusion", + action="store_true", + help="No bias dropout fusion.", + ) + self._parser.add_argument( + "--train_tokens", + type=int, + required=False, + default=300000000000, + help="Train tokens.", ) # lr configs # Parallelism configs - self._parser.add_argument('--zero_stage', type=int, default=1, help='Zero stage.') + self._parser.add_argument( + "--zero_stage", type=int, default=1, help="Zero stage." + ) # Misc configs - self._parser.add_argument('--log-interval', type=int, required=False, default=1, help='Log interval.') - self._parser.add_argument('--eval_iters', type=int, default=0, help='Eval iters.') - self._parser.add_argument('--eval_interval', type=int, default=10, help='Eval interval.') - self._parser.add_argument('--num_save', type=int, default=10000, help='Num save.') - self._parser.add_argument('--save_interval', type=int, default=10000, help='Save interval.') + self._parser.add_argument( + "--log-interval", type=int, required=False, default=1, help="Log interval." + ) + self._parser.add_argument( + "--eval_iters", type=int, default=0, help="Eval iters." + ) + self._parser.add_argument( + "--eval_interval", type=int, default=10, help="Eval interval." + ) + self._parser.add_argument( + "--num_save", type=int, default=10000, help="Num save." + ) + self._parser.add_argument( + "--save_interval", type=int, default=10000, help="Save interval." + ) # Output and data configs - self._parser.add_argument('--seed', type=int, default=1234, help='Seed.') - self._parser.add_argument('--data_home', type=str, default='/tmp', help='Data home.') - self._parser.add_argument('--vocab_path', type=str, default='/tmp/gpt2-vocab.json', help='Vocab path.') - self._parser.add_argument('--merge_path', type=str, default='/tmp/gpt2-merges.txt', help='Merge path.') + self._parser.add_argument("--seed", type=int, default=1234, help="Seed.") self._parser.add_argument( - '--split', type=str, default='949,50,1', help='Split dataset ratio for train/val/test.' + "--data_home", type=str, default="/tmp", help="Data home." + ) + self._parser.add_argument( + "--vocab_path", type=str, default="/tmp/gpt2-vocab.json", help="Vocab path." + ) + self._parser.add_argument( + "--merge_path", type=str, default="/tmp/gpt2-merges.txt", help="Merge path." + ) + self._parser.add_argument( + "--split", + type=str, + default=None, + help="Split dataset ratio for train/val/test.", ) - self._parser.add_argument('--prescale_grad', action='store_true', help='Prescale grad.') self._parser.add_argument( - '--hostfile', type=str, default=None, help='Hostfile to run the mutli-node benchmark.' + "--prescale_grad", action="store_true", help="Prescale grad." ) - self._parser.add_argument('--data_impl', type=str, default='mmap', help='Data impl.') - self._parser.add_argument('--data_prefix', type=str, default='dataset_text_document', help='Data prefix.') - self._parser.add_argument('--deepspeed', action='store_true', help='Use deepspeed.') - self._parser.add_argument('--extra', type=str, default=None, help='Extra options for Megatron.') + self._parser.add_argument( + "--hostfile", + type=str, + default=None, + help="Hostfile to run the mutli-node benchmark.", + ) + self._parser.add_argument( + "--data_impl", type=str, default="mmap", help="Data impl." + ) + self._parser.add_argument( + "--data_prefix", + type=str, + default="dataset_text_document", + help="Data prefix.", + ) + self._parser.add_argument( + "--deepspeed", action="store_true", help="Use deepspeed." + ) + # list of extra options + self._parser.add_argument( + "--extra", type=str, default="", help="Extra options." + ) + self._parser.add_argument( + "--mock_data", action="store_true", help="Use mock data." + ) + self._parser.add_argument( + "--model", + type=str, + default="gpt", + help='Model to run. Current supported: "gpt" and "deepseek".', + ) + self._parser.add_argument( + "--dataloader_type", + type=str, + default=None, + help="Data loader type to load data.", + ) + self._parser.add_argument( + "--max_padding_length", + type=int, + default=None, + help="Max padding legth to embedding.", + ) + self._parser.add_argument( + "--expert_model_parallel_size", + type=int, + default=None, + help="Expert model parallel size.", + ) + self._parser.add_argument( + "--num_experts", + type=int, + default=None, + help="Number of experts.", + ) + self._parser.add_argument( + "--transformer_impl", + type=str, + default=None, + help="Transformer implementation.", + ) + def _preprocess(self): if not super()._preprocess(): @@ -134,28 +295,39 @@ def _preprocess(self): if not self._args.code_base: if self._args.deepspeed: self._args.code_base = os.path.join( - os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-DeepSpeed/' + os.getenv("SB_MICRO_PATH"), + "third_party/Megatron/Megatron-DeepSpeed/", ) else: - self._args.code_base = os.path.join(os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-LM') + self._args.code_base = os.path.join( + os.getenv("SB_MICRO_PATH"), "third_party/Megatron/Megatron-LM" + ) - if not os.path.exists(self._args.code_base) or \ - not os.path.exists(os.path.join(self._args.code_base, 'pretrain_gpt.py')): - logger.error('Code base is not valid.') + if not os.path.exists(self._args.code_base) or not os.path.exists( + os.path.join(self._args.code_base, f"pretrain_{self._args.model}.py") + ): + logger.error("Code base is not valid.") self._result.set_return_code(ReturnCode.INVALID_ARGUMENT) return False - data_parallel_size = self._args.num_gpus * self._num_nodes \ - // self._args.pipeline_model_parallel_size // self._args.tensor_model_parallel_size - if self._args.micro_batch_size < 1 or \ - self._args.micro_batch_size > (self._args.batch_size // data_parallel_size): - logger.error('Micro Batch size * data parallel size is larger than global batch size.') + data_parallel_size = ( + self._args.num_gpus + * self._num_nodes + // self._args.pipeline_model_parallel_size + // self._args.tensor_model_parallel_size + ) + if self._args.micro_batch_size < 1 or self._args.micro_batch_size > ( + self._args.batch_size // data_parallel_size + ): + logger.error( + "Micro Batch size * data parallel size is larger than global batch size." + ) self._result.set_return_code(ReturnCode.INVALID_ARGUMENT) return False for precision in self._args.precision: if precision not in self._supported_precision: - logger.error('Precision %s is not supported.' % precision) + logger.error("Precision %s is not supported." % precision) self._result.set_return_code(ReturnCode.INVALID_ARGUMENT) return False @@ -166,17 +338,19 @@ def _preprocess(self): def _parse_log(self, output): """Parse log output and get the performance.""" - tflops_pattern = re.compile(r'(TFLOPs|TFLOP/s/GPU\)): (\d+\.\d+)') - elapsed_time_pattern = re.compile(r'elapsed time per iteration \(ms\): (\d+\.\d+)') - mem_allocated_pattern = re.compile(r'allocated: (\d+\.\d+)') - max_mem_allocated_pattern = re.compile(r'max allocated: (\d+\.\d+)') + tflops_pattern = re.compile(r"(TFLOPs|TFLOP/s/GPU\)): (\d+\.\d+)") + elapsed_time_pattern = re.compile( + r"elapsed time per iteration \(ms\): (\d+\.\d+)" + ) + mem_allocated_pattern = re.compile(r"allocated: (\d+\.\d+)") + max_mem_allocated_pattern = re.compile(r"max allocated: (\d+\.\d+)") lines = output.splitlines() tflops = [] mem_allocated = [] max_mem_allocated = [] iteration_times = [] for line in lines: - if 'elapsed time per iteration' in line: + if "elapsed time per iteration" in line: tflops_matches = tflops_pattern.search(line) elapsed_time_match = elapsed_time_pattern.search(line) if tflops_matches: @@ -186,7 +360,7 @@ def _parse_log(self, output): elapsed_time_value = float(elapsed_time_match.group(1)) iteration_times.append(elapsed_time_value) - if 'max allocated' in line: + if "max allocated" in line: mem_allocated_match = mem_allocated_pattern.search(line) max_mem_allocated_match = max_mem_allocated_pattern.search(line) if mem_allocated_match: @@ -194,70 +368,73 @@ def _parse_log(self, output): mem_allocated.append(mem_allocated_value) if max_mem_allocated_match: - max_mem_allocated_value = float(max_mem_allocated_match.group(1)) / 1024 + max_mem_allocated_value = ( + float(max_mem_allocated_match.group(1)) / 1024 + ) max_mem_allocated.append(max_mem_allocated_value) return iteration_times, tflops, mem_allocated, max_mem_allocated def __prepare_deespeed_config(self, precision_megatron): """Prepare deepspeed configs.""" - self._config_json_path = os.path.join(self._args.data_home, 'ds_config_gpt.json') + self._config_json_path = os.path.join( + self._args.data_home, "ds_config_gpt.json" + ) # Load deepspeed config template json file precision_template = { - 'enabled': True, - 'loss_scale': 0, - 'loss_scale_window': 500, - 'hysteresis': 2, - 'min_loss_scale': 1, - 'initial_scale_power': 11 + "enabled": True, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 11, } ds_config_template = { - 'train_batch_size': self._args.batch_size, - 'train_micro_batch_size_per_gpu': self._args.micro_batch_size, - 'steps_per_print': self._args.log_interval, - 'zero_optimization': { - 'stage': self._args.zero_stage - }, - 'gradient_clipping': 1.0, - 'prescale_gradients': self._args.prescale_grad, + "train_batch_size": self._args.batch_size, + "train_micro_batch_size_per_gpu": self._args.micro_batch_size, + "steps_per_print": self._args.log_interval, + "zero_optimization": {"stage": self._args.zero_stage}, + "gradient_clipping": 1.0, + "prescale_gradients": self._args.prescale_grad, } if len(precision_megatron) > 0: ds_config_template[precision_megatron] = precision_template # Write to config json file - with open(self._config_json_path, 'w') as file: + with open(self._config_json_path, "w") as file: json.dump(ds_config_template, file, indent=4) - deepspeed_options = f'\ + deepspeed_options = f"\ --deepspeed \ --deepspeed_config {self._config_json_path} \ --zero-stage {self._args.zero_stage} \ --pipeline-model-parallel-size {self._args.pipeline_model_parallel_size} \ --train-tokens {self._args.train_tokens} \ - --data-impl {self._args.data_impl}' + --data-impl {self._args.data_impl}" if self._args.pipeline_model_parallel_size <= 1: - deepspeed_options = f'{deepspeed_options} --no-pipeline-parallel' + deepspeed_options = f"{deepspeed_options} --no-pipeline-parallel" return deepspeed_options - def _megatron_command(self, precision): # noqa: C901 + def _megatron_command(self, precision): # noqa: C901 """Generate megatron command.""" if precision == Precision.FLOAT32: - precision_megatron = '' + precision_megatron = "" elif precision == Precision.FLOAT16: - precision_megatron = '--fp16' + precision_megatron = "--fp16" elif precision == Precision.BFLOAT16: - precision_megatron = '--bf16' + precision_megatron = "--bf16" - megatron_options = f'\ - --override-opt_param-scheduler \ + megatron_options = f"\ --adam-beta1 0.9 \ --adam-beta2 0.95 \ + --tokenizer-type {self._args.tokenizer_type} \ --tensor-model-parallel-size {self._args.tensor_model_parallel_size} \ + --pipeline-model-parallel-size {self._args.pipeline_model_parallel_size} \ --init-method-std {self._args.init_std} \ - --lr-decay-samples 43945312 \ + --lr-decay-samples 38400 \ --lr-warmup-samples {self._args.num_warmup * self._args.batch_size} \ --lr-decay-style cosine \ --micro-batch-size {self._args.micro_batch_size} \ @@ -270,80 +447,104 @@ def _megatron_command(self, precision): # noqa: C901 --train-samples {self._args.num_steps * self._args.batch_size} \ --lr {self._args.lr} \ --min-lr {self._args.min_lr} \ - --split {self._args.split} \ --log-interval {self._args.log_interval} \ --eval-interval {self._args.eval_interval} \ --eval-iters {self._args.eval_iters} \ --save-interval {self._args.save_interval} \ --weight-decay 0.1 \ --clip-grad 1.0 \ - --hysteresis 2 \ --num-workers {self._args.num_workers} \ --attention-dropout 0.0 \ --hidden-dropout 0.0 \ - --optimizer adam \ --use-distributed-optimizer \ {precision_megatron} \ --seed {self._args.seed} \ - --log-throughput' + --log-throughput" if self._args.sequence_parallel: - megatron_options = f'{megatron_options} --sequence-parallel' + megatron_options = f"{megatron_options} --sequence-parallel" if self._args.no_async_tensor_model_parallel_allreduce: - megatron_options = f'{megatron_options} --no-async-tensor-model-parallel-allreduce' + megatron_options = ( + f"{megatron_options} --no-async-tensor-model-parallel-allreduce" + ) if self._args.use_rotary_position_embeddings: - megatron_options = f'{megatron_options} --use-rotary-position-embeddings' + megatron_options = f"{megatron_options} --use-rotary-position-embeddings" if self._args.no_gradient_accumulation_fusion: - megatron_options = f'{megatron_options} --no-gradient-accumulation-fusion' + megatron_options = f"{megatron_options} --no-gradient-accumulation-fusion" if self._args.use_flash_attn: - megatron_options = f'{megatron_options} --use-flash-attn' + megatron_options = f"{megatron_options} --use-flash-attn" if self._args.no_masked_softmax_fusion: - megatron_options = f'{megatron_options} --no-masked-softmax-fusion' + megatron_options = f"{megatron_options} --no-masked-softmax-fusion" if self._args.no_bias_gelu_fusion: - megatron_options = f'{megatron_options} --no-bias-gelu-fusion' + megatron_options = f"{megatron_options} --no-bias-gelu-fusion" if self._args.no_bias_dropout_fusion: - megatron_options = f'{megatron_options} --no-bias-dropout-fusion' - if self._args.extra: - megatron_options = f'{megatron_options} {self._args.extra}' - - command = '' - script_path = os.path.join(self._args.code_base, 'pretrain_gpt.py') + megatron_options = f"{megatron_options} --no-bias-dropout-fusion" + if self._args.split: + megatron_options = f"{megatron_options} --split {self._args.split}" + if self._args.extra and len(self._args.extra) > 0: + #self._args.extra = self._args.extra.split(",") + print( + f"Extra options: {self._args.extra}, {type(self._args.extra)}" + ) + megatron_options = f"{megatron_options} {self._args.extra}" + if self._args.expert_model_parallel_size: + megatron_options = f"{megatron_options} --expert-model-parallel-size {self._args.expert_model_parallel_size}" + if self._args.num_experts: + megatron_options = f"{megatron_options} --num-experts {self._args.num_experts}" + if self._args.max_padding_length: + megatron_options = f"{megatron_options} --max-padding-length {self._args.max_padding_length}" + if self._args.transformer_impl: + megatron_options = f"{megatron_options} --transformer-impl {self._args.transformer_impl}" + + command = "" + script_path = os.path.join( + self._args.code_base, f"pretrain_{self._args.model}.py" + ) if self._args.deepspeed: - deepspeed_option = self.__prepare_deespeed_config(precision_megatron.lstrip('--')) + deepspeed_option = self.__prepare_deespeed_config( + precision_megatron.lstrip("--") + ) # No --log-throughput in Megatron-DeepSpeed by 20231219 - megatron_options = megatron_options.replace('--log-throughput', '').strip() + megatron_options = megatron_options.replace("--log-throughput", "").strip() if self._num_nodes > 1: - command = f'torchrun {self._distributed_args} ' + \ - f'{script_path} {megatron_options} {self._data_options} {deepspeed_option}' + command = ( + f"torchrun {self._distributed_args} " + + f"{script_path} {megatron_options} {self._data_options} {deepspeed_option}" + ) else: - command = f'deepspeed {script_path} {megatron_options} {self._data_options} {deepspeed_option}' + command = f"deepspeed {script_path} {megatron_options} {self._data_options} {deepspeed_option}" else: - command = f'torchrun {self._distributed_args} {script_path} {megatron_options} {self._data_options}' + command = f"torchrun {self._distributed_args} {script_path} {megatron_options} {self._data_options}" return command - def _train_step(self, precision): # noqa: E501 + def _train_step(self, precision): # noqa: E501 """Train the model and get the performance.""" command = self._megatron_command(precision) - local_rank = os.environ.pop('OMPI_COMM_WORLD_LOCAL_RANK', None) - logger.info('Running command: {}.'.format(command)) + local_rank = os.environ.pop("OMPI_COMM_WORLD_LOCAL_RANK", None) + logger.info("Running command: {}.".format(command)) output = run_command(command, flush_output=True) - os.environ['OMPI_COMM_WORLD_LOCAL_RANK'] = local_rank + os.environ["OMPI_COMM_WORLD_LOCAL_RANK"] = local_rank iteration_times = [] info = {} # last rank will print the result, first rank will print the memory usage - if self._num_nodes == 1 or \ - int(os.environ['OMPI_COMM_WORLD_RANK']) == int(os.environ['OMPI_COMM_WORLD_SIZE']) - 1 \ - or int(os.environ['OMPI_COMM_WORLD_RANK']) == 0: - iteration_times, tflops, mem_allocated, max_mem_allocated = self._parse_log(output.stdout) + if ( + self._num_nodes == 1 + or int(os.environ["OMPI_COMM_WORLD_RANK"]) + == int(os.environ["OMPI_COMM_WORLD_SIZE"]) - 1 + or int(os.environ["OMPI_COMM_WORLD_RANK"]) == 0 + ): + iteration_times, tflops, mem_allocated, max_mem_allocated = self._parse_log( + output.stdout + ) if len(tflops) > 0: - info['tflops'] = tflops + info["tflops"] = tflops if len(mem_allocated) > 0: - info['mem_allocated'] = mem_allocated + info["mem_allocated"] = mem_allocated if len(max_mem_allocated) > 0: - info['max_mem_allocated'] = max_mem_allocated + info["max_mem_allocated"] = max_mem_allocated if not iteration_times: iteration_times = [-1 for i in range(self._args.num_steps)] @@ -356,6 +557,7 @@ def _sync_result(self, data): data (list): the data to be reduced. """ from mpi4py import MPI + comm = MPI.COMM_WORLD data = np.array(data, dtype=np.float64) # Reduce the data to a single value on rank 0 @@ -365,16 +567,20 @@ def _sync_result(self, data): def _process_info(self, model_action, precision, info): """Process the result of model benchmarking.""" - precision_metric = {'float16': 'fp16', 'float32': 'fp32', 'bfloat16': 'bf16'} + precision_metric = {"float16": "fp16", "float32": "fp32", "bfloat16": "bf16"} if precision.value in precision_metric.keys(): precision = precision_metric[precision.value] for key, values in info.items(): - metric = '{}_{}_{}'.format(precision, model_action, key) + metric = "{}_{}_{}".format(precision, model_action, key) self._result.add_raw_data(metric, values, self._args.log_raw_data) self._result.add_result(metric, statistics.mean(values)) logger.info( - 'Average {} - round: {}, model: {}, precision: {}, value: {:.6f}.'.format( - key, self._curr_run_index, self._name, precision, statistics.mean(values) + "Average {} - round: {}, model: {}, precision: {}, value: {:.6f}.".format( + key, + self._curr_run_index, + self._name, + precision, + statistics.mean(values), ) ) @@ -388,35 +594,49 @@ def _init_distributed_setting(self): Return: True if distributed library is initialized successfully. """ - if not os.getenv('OMPI_COMM_WORLD_SIZE'): - logger.error('MPI is not enabled.') + if not os.getenv("OMPI_COMM_WORLD_SIZE"): + logger.error("MPI is not enabled.") return False - self._num_nodes = int(os.getenv('OMPI_COMM_WORLD_SIZE')) // int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE')) - master_addr = 'localhost' + self._num_nodes = int(os.getenv("OMPI_COMM_WORLD_SIZE")) // int( + os.getenv("OMPI_COMM_WORLD_LOCAL_SIZE") + ) + master_addr = "localhost" if self._num_nodes > 1: if not self._args.hostfile: - sb_hostfile = os.path.join(os.environ.get('SB_WORKSPACE', '.'), 'hostfile') + sb_hostfile = os.path.join( + os.environ.get("SB_WORKSPACE", "."), "hostfile" + ) if os.path.exists(sb_hostfile): - hosts = open(sb_hostfile).read().split('\n') - hosts = [f'{host} slots={self._args.num_gpus}' for host in hosts if host != ''] - self._args.hostfile = os.path.join(self._args.data_home, 'hostfile') - with open(self._args.hostfile, 'w') as file: - file.write('\n'.join(hosts)) + hosts = open(sb_hostfile).read().split("\n") + hosts = [ + f"{host} slots={self._args.num_gpus}" + for host in hosts + if host != "" + ] + self._args.hostfile = os.path.join(self._args.data_home, "hostfile") + with open(self._args.hostfile, "w") as file: + file.write("\n".join(hosts)) if not os.path.exists(self._args.hostfile): - logger.error('Hostfile not found.') + logger.error("Hostfile not found.") return False - hosts = open(self._args.hostfile, 'r').readlines() + hosts = open(self._args.hostfile, "r").readlines() if self._num_nodes != len(hosts): - logger.error('MPI init failed since hostfile not match the MPI setting.') + logger.error( + "MPI init failed since hostfile not match the MPI setting." + ) return False master_addr = hosts[0].split()[0] - addr = os.getenv('MASTER_ADDR', master_addr) - port = os.getenv('MASTER_PORT', '29500') - node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE']) - self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \ - f'--node_rank {node_rank} --master_addr {addr} --master_port {port}' + addr = os.getenv("MASTER_ADDR", master_addr) + port = os.getenv("MASTER_PORT", "29500") + node_rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) // int( + os.environ["OMPI_COMM_WORLD_LOCAL_SIZE"] + ) + self._distributed_args = ( + f"--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} " + + f"--node_rank {node_rank} --master_addr {addr} --master_port {port}" + ) return True def _generate_dataset(self): @@ -425,48 +645,70 @@ def _generate_dataset(self): Return: True if dataset is created successfully. """ - self._vocab_path = str(Path(self._args.data_home) / 'gpt2-vocab.json') - download_file(self._args.vocab_url, self._vocab_path) - self._merges_path = str(Path(self._args.data_home) / 'gpt2-merges.txt') - download_file(self._args.merges_url, self._merges_path) - - if not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.bin')) \ - or not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.idx')): - if self._args.dataset_url: - self._raw_data_path = str(Path(self._args.data_home) / 'data.json') - download_file(self._args.dataset_url, self._raw_data_path) - command = ( - 'python3 ' - f'{os.path.join(self._args.code_base, "tools/preprocess_data.py")} ' - f'--input {self._raw_data_path} ' - f'--tokenizer-type {self._args.tokenizer_type} ' - f'--output-prefix {os.path.join(self._args.data_home, "dataset")} ' - f'--workers {str(self._args.num_workers)} ' - f'--vocab-file {self._vocab_path} ' - f'--merge-file {self._merges_path}' - ) + self._data_options = "" + if self._args.mock_data: + logger.info(f"Using mock data.") + self._data_options = "--mock-data" + else: - # split documents - run_command(command, flush_output=True) - # binarize dataset - run_command(command, flush_output=True) - if not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.bin')) \ - or not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.idx')): - logger.error('Dataset failed to generate.') + self._vocab_path = str(Path(self._args.data_home) / "gpt2-vocab.json") + download_file(self._args.vocab_url, self._vocab_path) + self._merges_path = str(Path(self._args.data_home) / "gpt2-merges.txt") + download_file(self._args.merges_url, self._merges_path) + + if not os.path.exists( + os.path.join(self._args.data_home, f"{self._args.data_prefix}.bin") + ) or not os.path.exists( + os.path.join(self._args.data_home, f"{self._args.data_prefix}.idx") + ): + if self._args.dataset_url: + self._raw_data_path = str(Path(self._args.data_home) / "data.json") + download_file(self._args.dataset_url, self._raw_data_path) + command = ( + "python3 " + f'{os.path.join(self._args.code_base, "tools/preprocess_data.py")} ' + f"--input {self._raw_data_path} " + f"--tokenizer-type {self._args.tokenizer_type} " + f'--output-prefix {os.path.join(self._args.data_home, "dataset")} ' + f"--workers {str(self._args.num_workers)} " + f"--vocab-file {self._vocab_path} " + f"--merge-file {self._merges_path}" + ) + + # split documents + run_command(command, flush_output=True) + # binarize dataset + run_command(command, flush_output=True) + if not os.path.exists( + os.path.join( + self._args.data_home, f"{self._args.data_prefix}.bin" + ) + ) or not os.path.exists( + os.path.join( + self._args.data_home, f"{self._args.data_prefix}.idx" + ) + ): + logger.error("Dataset failed to generate.") + self._result.set_return_code( + ReturnCode.DATASET_GENERATION_FAILURE + ) + return False + + else: + logger.error("No dataset or dataset url provided.") self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE) return False - else: - logger.error('No dataset or dataset url provided.') - self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE) - return False - - self._data_path = os.path.join(self._args.data_home, f'{self._args.data_prefix}') - self._data_options = f'\ - --vocab-file {self._vocab_path} \ - --merge-file {self._merges_path} \ - --data-path {self._data_path}' - logger.info('Dataset preparation successfully.') + self._data_path = os.path.join( + self._args.data_home, f"{self._args.data_prefix}" + ) + self._data_options = f"\ + --vocab-file {self._vocab_path} \ + --merge-file {self._merges_path} \ + --data-path {self._data_path}" + if self._args.dataloader_type: + self._data_options += f" --dataloader-type {self._args.dataloader_type}" + logger.info("Dataset preparation successfully.") return True def _set_force_fp32(self): @@ -519,5 +761,11 @@ def _cal_params_count(self): # Register GPT3 benchmark. -BenchmarkRegistry.register_benchmark('megatron-gpt', MegatronGPT, parameters='', platform=Platform.CUDA) -BenchmarkRegistry.register_benchmark('megatron-gpt', MegatronGPT, parameters='', platform=Platform.ROCM) +BenchmarkRegistry.register_benchmark( + "megatron-gpt", MegatronGPT, parameters="", platform=Platform.CUDA +) +BenchmarkRegistry.register_benchmark( + "megatron-gpt", MegatronGPT, parameters="", platform=Platform.ROCM +) + + diff --git a/superbench/benchmarks/model_benchmarks/pytorch_base.py b/superbench/benchmarks/model_benchmarks/pytorch_base.py index f0cb52319..d824fafe7 100644 --- a/superbench/benchmarks/model_benchmarks/pytorch_base.py +++ b/superbench/benchmarks/model_benchmarks/pytorch_base.py @@ -207,7 +207,7 @@ def _create_optimizer(self): elif self._optimizer_type == Optimizer.ADAM: self._optimizer = torch.optim.Adam(self._model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08) elif self._optimizer_type == Optimizer.ADAMW: - self._optimizer = transformers.AdamW(self._model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08) + self._optimizer = torch.optim.AdamW(self._model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08) else: self._optimizer = None diff --git a/third_party/Makefile b/third_party/Makefile index e8149afbe..b6bec4be7 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -16,13 +16,13 @@ ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0") NUM_MAKE_JOBS ?= $(shell nproc --ignore=2) -.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm nvbandwidth +.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm nvbandwidth rocm_megatron_lm # Build targets. all: cuda rocm cuda_with_msccl: cuda cuda_msccl cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth -rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm +rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm_megatron_lm cpu: common cpu_perftest cpu_stream common: fio @@ -134,9 +134,7 @@ rocm_rocblas: sb_micro_path # Since it takes several hours to build, avoid to build again if hipblaslt-bench exsists. rocm_hipblaslt: sb_micro_path @if [ ! -e $(SB_MICRO_PATH)/bin/hipblaslt-bench ] && [ -z `which hipblaslt-bench` ]; then \ - if [ -d hipBLASLt ]; then rm -rf hipBLASLt; fi; \ - git clone -b ${HIPBLASLT_BRANCH} https://github.com/ROCmSoftwarePlatform/hipBLASLt.git ./hipBLASLt; \ - cd ./hipBLASLt && ./install.sh -dc; \ + cd ./hipBLASLt && python3 -m pip install -r tensilelite/requirements.txt && ./install.sh -dc; \ cp -v $(SB_MICRO_PATH)/third_party/hipBLASLt/build/release/clients/staging/hipblaslt-bench $(SB_MICRO_PATH)/bin/; \ fi @@ -210,6 +208,18 @@ megatron_deepspeed: python -m pip install --no-cache-dir -r requirements.txt && \ python -m pip install DeepSpeed +rocm_megatron_lm: + cd Megatron && mkdir -p rocm && cd rocm && \ + if [ ! -d "Megatron-LM" ]; then \ + git clone -b rocm_dev https://github.com/ROCm/Megatron-LM.git ; \ + fi + cp Megatron/rocm/Megatron-LM/examples/deepseek_v2/pretrain_deepseek.py Megatron/rocm/Megatron-LM/ + git clone https://github.com/caaatch22/grouped_gemm.git &&\ + cd grouped_gemm &&\ + git checkout 8a9b438 &&\ + git submodule update --init --recursive &&\ + pip install . + # Instal apex of ROCm due to dependency of Megatron apex_rocm: $(eval TORCH_VERSION ?= $(shell python -c "import torch; print(torch.__version__)")) From f30d25597f6c793f4ed86389cfb0fcbb7cf8d8a2 Mon Sep 17 00:00:00 2001 From: yukirora Date: Fri, 23 May 2025 11:39:42 +0000 Subject: [PATCH 2/6] add test --- setup.py | 2 +- .../model_benchmarks/megatron_gpt3.py | 819 +++++++++--------- .../model_benchmarks/test_megatron_gpt.py | 267 +++++- third_party/Makefile | 6 +- 4 files changed, 645 insertions(+), 449 deletions(-) diff --git a/setup.py b/setup.py index 120329c60..fe3d7d462 100644 --- a/setup.py +++ b/setup.py @@ -193,7 +193,7 @@ def run(self): 'xlsxwriter>=1.3.8', 'xmltodict>=0.12.0', 'types-requests==0.1.13', - 'urllib3==1.26.20' + 'urllib3>=1.26.5' ], extras_require=( lambda x: { diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py index ac4f2ffc6..41b64ee60 100644 --- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py +++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py @@ -22,14 +22,13 @@ def download_file(url, path): """Download file from url to path.""" response = requests.get(url) - with open(path, "wb") as file: + with open(path, 'wb') as file: file.write(response.content) class MegatronGPT(ModelBenchmark): """The Megatron DeepSpeed GPT pretrain benchmark class.""" - - def __init__(self, name, parameters=""): + def __init__(self, name, parameters=''): """Constructor. Args: @@ -37,297 +36,230 @@ def __init__(self, name, parameters=""): parameters (str): parameters of the benchmark. """ super().__init__(name, parameters) - self._supported_precision = [ - Precision.FLOAT32, - Precision.FLOAT16, - Precision.BFLOAT16, - ] + self._supported_precision = [Precision.FLOAT32, Precision.FLOAT16, Precision.BFLOAT16] def add_parser_arguments(self): """Add the specified arguments.""" super().add_parser_arguments() + # Model configs + self._parser.add_argument('--model_size', type=int, required=False, default=6.7, help='Model size.') + self._parser.add_argument('--num_layers', type=int, required=False, default=32, help='Number of layers.') + self._parser.add_argument('--hidden_size', type=int, required=False, default=4096, help='Hidden size.') self._parser.add_argument( - "--code_base", type=str, required=False, default="", help="Code base." - ) - self._parser.add_argument( - "--dataset_url", type=str, required=False, default=None, help="Dataset URL." - ) - self._parser.add_argument( - "--vocab_url", - type=str, - required=False, - default="https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", - help="Vocab URL.", - ) - self._parser.add_argument( - "--merges_url", - type=str, - required=False, - default="https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", - help="Merges URL.", - ) - self._parser.add_argument( - "--tokenizer_type", - type=str, - required=False, - default="GPT2BPETokenizer", - help="Tokenizer type.", - ) - self._parser.add_argument( - "--model_size", type=int, required=False, default=6.7, help="Model size." - ) - self._parser.add_argument( - "--num_layers", - type=int, - required=False, - default=32, - help="Number of layers.", - ) - self._parser.add_argument( - "--hidden_size", type=int, required=False, default=4096, help="Hidden size." - ) - self._parser.add_argument( - "--num_attn_heads", - type=int, - required=False, - default=32, - help="Number of attention heads.", - ) - self._parser.add_argument( - "--micro_batch_size", - type=int, - required=False, - default=2, - help="micro batch size.", + '--num_attn_heads', type=int, required=False, default=32, help='Number of attention heads.' ) + self._parser.add_argument('--micro_batch_size', type=int, required=False, default=2, help='micro batch size.') + self._parser.add_argument('--lr', type=float, required=False, default=1.2e-4, help='Learning rate.') + self._parser.add_argument('--min_lr', type=float, required=False, default=1.0e-6, help='Minimum learning rate.') + self._parser.add_argument('--init_std', type=float, required=False, default=0.009, help='Init std.') + self._parser.add_argument('--seq_len', type=int, required=False, default=2048, help='Sequence length.') self._parser.add_argument( - "--lr", type=float, required=False, default=1.2e-4, help="Learning rate." + '--tensor_model_parallel_size', type=int, required=False, default=1, help='Tensor model parallel size.' ) self._parser.add_argument( - "--min_lr", - type=float, - required=False, - default=1.0e-6, - help="Minimum learning rate.", + '--pipeline_model_parallel_size', type=int, required=False, default=1, help='Pipeline model parallel size.' ) self._parser.add_argument( - "--init_std", type=float, required=False, default=0.009, help="Init std." + '--num_gpus', type=int, required=False, default=8, help='Number of GPUs per node to run the benchmark.' ) self._parser.add_argument( - "--seq_len", type=int, required=False, default=2048, help="Sequence length." + '--num_nodes', type=int, required=False, default=1, help='Number of nodes to run the benchmark.' ) + self._parser.add_argument('--sequence_parallel', action='store_true', help='Enable Sequence parallel.') self._parser.add_argument( - "--tensor_model_parallel_size", - type=int, - required=False, - default=1, - help="Tensor model parallel size.", + '--no_async_tensor_model_parallel_allreduce', + action='store_true', + help='No async tensor model parallel allreduce.' ) self._parser.add_argument( - "--pipeline_model_parallel_size", - type=int, - required=False, - default=1, - help="Pipeline model parallel size.", + '--use_rotary_position_embeddings', action='store_true', help='Use rotary position embeddings.' ) self._parser.add_argument( - "--num_gpus", - type=int, - required=False, - default=8, - help="Number of GPUs per node to run the benchmark.", + '--no_gradient_accumulation_fusion', action='store_true', help='No gradient accumulation fusion.' ) + self._parser.add_argument('--use_flash_attn', action='store_true', help='Use flash attention.') + self._parser.add_argument('--no_masked_softmax_fusion', action='store_true', help='No masked softmax fusion.') + self._parser.add_argument('--no_bias_gelu_fusion', action='store_true', help='No bias gelu fusion.') + self._parser.add_argument('--no_bias_dropout_fusion', action='store_true', help='No bias dropout fusion.') self._parser.add_argument( - "--num_nodes", - type=int, - required=False, - default=1, - help="Number of nodes to run the benchmark.", - ) - self._parser.add_argument( - "--sequence_parallel", action="store_true", help="Enable Sequence parallel." - ) - self._parser.add_argument( - "--no_async_tensor_model_parallel_allreduce", - action="store_true", - help="No async tensor model parallel allreduce.", - ) - self._parser.add_argument( - "--use_rotary_position_embeddings", - action="store_true", - help="Use rotary position embeddings.", - ) - self._parser.add_argument( - "--no_gradient_accumulation_fusion", - action="store_true", - help="No gradient accumulation fusion.", - ) - self._parser.add_argument( - "--use_flash_attn", action="store_true", help="Use flash attention." - ) - self._parser.add_argument( - "--no_masked_softmax_fusion", - action="store_true", - help="No masked softmax fusion.", - ) - self._parser.add_argument( - "--no_bias_gelu_fusion", action="store_true", help="No bias gelu fusion." - ) - self._parser.add_argument( - "--no_bias_dropout_fusion", - action="store_true", - help="No bias dropout fusion.", - ) - self._parser.add_argument( - "--train_tokens", - type=int, - required=False, - default=300000000000, - help="Train tokens.", + '--train_tokens', type=int, required=False, default=300000000000, help='Train tokens.' ) + self._parser.add_argument('--lr_decay_samples', type=int, default=43945312, help='Use lr decay samples.') + self._parser.add_argument('--prescale_grad', action='store_true', help='Prescale grad.') # lr configs # Parallelism configs - self._parser.add_argument( - "--zero_stage", type=int, default=1, help="Zero stage." - ) + self._parser.add_argument('--zero_stage', type=int, default=1, help='Zero stage.') # Misc configs - self._parser.add_argument( - "--log-interval", type=int, required=False, default=1, help="Log interval." - ) - self._parser.add_argument( - "--eval_iters", type=int, default=0, help="Eval iters." - ) - self._parser.add_argument( - "--eval_interval", type=int, default=10, help="Eval interval." - ) - self._parser.add_argument( - "--num_save", type=int, default=10000, help="Num save." - ) - self._parser.add_argument( - "--save_interval", type=int, default=10000, help="Save interval." - ) + self._parser.add_argument('--log-interval', type=int, required=False, default=1, help='Log interval.') + self._parser.add_argument('--eval_iters', type=int, default=0, help='Eval iters.') + self._parser.add_argument('--eval_interval', type=int, default=10, help='Eval interval.') + self._parser.add_argument('--num_save', type=int, default=10000, help='Num save.') + self._parser.add_argument('--save_interval', type=int, default=10000, help='Save interval.') # Output and data configs - self._parser.add_argument("--seed", type=int, default=1234, help="Seed.") + self._parser.add_argument('--seed', type=int, default=1234, help='Seed.') + self._parser.add_argument('--data_home', type=str, default='/tmp', help='Data home.') + self._parser.add_argument('--vocab_path', type=str, default='/tmp/gpt2-vocab.json', help='Vocab path.') + self._parser.add_argument('--merge_path', type=str, default='/tmp/gpt2-merges.txt', help='Merge path.') self._parser.add_argument( - "--data_home", type=str, default="/tmp", help="Data home." + '--split', type=str, default='949,50,1', help='Split dataset ratio for train/val/test.' ) + self._parser.add_argument('--dataset_url', type=str, required=False, default=None, help='Dataset URL.') self._parser.add_argument( - "--vocab_path", type=str, default="/tmp/gpt2-vocab.json", help="Vocab path." - ) - self._parser.add_argument( - "--merge_path", type=str, default="/tmp/gpt2-merges.txt", help="Merge path." - ) - self._parser.add_argument( - "--split", + '--vocab_url', type=str, - default=None, - help="Split dataset ratio for train/val/test.", - ) - self._parser.add_argument( - "--prescale_grad", action="store_true", help="Prescale grad." + required=False, + default='https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json', + help='Vocab URL.' ) self._parser.add_argument( - "--hostfile", + '--merges_url', type=str, - default=None, - help="Hostfile to run the mutli-node benchmark.", - ) - self._parser.add_argument( - "--data_impl", type=str, default="mmap", help="Data impl." + required=False, + default='https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt', + help='Merges URL.' ) + self._parser.add_argument('--data_impl', type=str, default='mmap', help='Data impl.') + self._parser.add_argument('--data_prefix', type=str, default='dataset_text_document', help='Data prefix.') + self._parser.add_argument('--mock_data', action='store_true', help='Use mock data.') self._parser.add_argument( - "--data_prefix", + '--dataloader_type', type=str, - default="dataset_text_document", - help="Data prefix.", - ) - self._parser.add_argument( - "--deepspeed", action="store_true", help="Use deepspeed." - ) - # list of extra options - self._parser.add_argument( - "--extra", type=str, default="", help="Extra options." + default=None, + help='Data loader type to load data.', ) self._parser.add_argument( - "--mock_data", action="store_true", help="Use mock data." + '--max_padding_length', + type=int, + default=None, + help='Max padding legth to embedding.', ) self._parser.add_argument( - "--model", + '--data_cache_path', type=str, - default="gpt", - help='Model to run. Current supported: "gpt" and "deepseek".', + default=None, + help='Data cache path.', ) self._parser.add_argument( - "--dataloader_type", + '--dataset', type=str, default=None, - help="Data loader type to load data.", + help='Dataset to use.', + ) + # Model architecture + self._parser.add_argument('--ffn_hidden_size', type=int, help='FFN hidden layer size.') + self._parser.add_argument('--swiglu', action='store_true', help='Enable SwiGLU activation.') + self._parser.add_argument('--no_bias_swiglu_fusion', action='store_true', help='Disable bias SwiGLU fusion.') + self._parser.add_argument('--disable_bias_linear', action='store_true', help='Disable bias in linear layers.') + self._parser.add_argument('--normalization', type=str, help='Normalization method.') + self._parser.add_argument('--norm_epsilon', type=float, help='Normalization epsilon.') + self._parser.add_argument( + '--untie_embeddings_and_output_weights', action='store_true', help='Untie embeddings and output weights.' + ) + self._parser.add_argument('--extra_vocab_size', type=int, help='Extra vocabulary size.') + self._parser.add_argument('--transformer_impl', type=str, default=None, help='Transformer implementation.') + # Loss settings + self._parser.add_argument('--eod_mask_loss', action='store_true', help='Enable EOD mask loss.') + self._parser.add_argument('--hysteresis', type=int, default=2, help='Hysteresis for loss scale.') + # Optimizer + self._parser.add_argument( + '--optimizer', + type=str, + default='adam', + help='Optimizer to use. Current supported: "adam" and "fused_adam".', ) self._parser.add_argument( - "--max_padding_length", - type=int, - default=None, - help="Max padding legth to embedding.", + '--override_opt_param_scheduler', action='store_true', help='Enable the opt_param scheduler.' ) + # LoRA settings + self._parser.add_argument('--kv_lora_rank', type=int, help='KV LoRA rank.') + # MoE configuration self._parser.add_argument( - "--expert_model_parallel_size", + '--expert_model_parallel_size', type=int, default=None, - help="Expert model parallel size.", + help='Expert model parallel size.', ) self._parser.add_argument( - "--num_experts", + '--num_experts', type=int, default=None, - help="Number of experts.", + help='Number of experts.', + ) + self._parser.add_argument('--moe_ffn_hidden_size', type=int, help='MoE FFN hidden size.') + self._parser.add_argument('--enable_shared_expert', action='store_true', help='Enable shared expert in MoE.') + self._parser.add_argument('--moe_layer_freq', type=int, help='MoE layer frequency.') + self._parser.add_argument('--num_shared_experts', type=int, help='Number of shared experts.') + self._parser.add_argument('--moe_router_topk', type=int, help='Top-k routing for MoE.') + self._parser.add_argument('--moe_aux_loss_coeff', type=float, help='Auxiliary loss coefficient.') + self._parser.add_argument( + '--moe_router_load_balancing_type', type=str, help='Load balancing type for MoE router.' + ) + # Tokenizer & Position Encoding + self._parser.add_argument( + '--tokenizer_type', type=str, required=False, default='GPT2BPETokenizer', help='Tokenizer type.' + ) + self._parser.add_argument('--patch_tokenizer_type', type=str, help='Tokenizer type.') + self._parser.add_argument('--position_embedding_type', type=str, help='Position embedding type.') + self._parser.add_argument('--no_rope_fusion', action='store_true', help='Disable RoPE fusion.') + self._parser.add_argument('--rotary_base', type=int, help='Rotary base value.') + self._parser.add_argument('--rotary_scaling_factor', type=int, help='Rotary scaling factor.') + self._parser.add_argument('--qk_nope_head_dim', type=int, help='QK NoPE head dimension.') + self._parser.add_argument('--qk_rope_head_dim', type=int, help='QK RoPE head dimension.') + self._parser.add_argument('--v_head_dim', type=int, help='V head dimension.') + # Checkpoint and loading + self._parser.add_argument('--load', type=str, help='Model to load.') + self._parser.add_argument('--no_load_optim', action='store_true', help='Disable optimizer loading.') + self._parser.add_argument('--no_load_rng', action='store_true', help='Disable RNG loading.') + self._parser.add_argument('--ckpt_format', type=str, help='Checkpoint format.') + # Other settings + self._parser.add_argument('--code_base', type=str, required=False, default='', help='Code base.') + self._parser.add_argument( + '--hostfile', type=str, default=None, help='Hostfile to run the mutli-node benchmark.' + ) + self._parser.add_argument('--deepspeed', action='store_true', help='Use deepspeed.') + self._parser.add_argument('--extra', type=str, default=None, help='Extra options for Megatron.') + self._parser.add_argument( + '--model', + type=str, + default='gpt', + help='Model to run. Current supported: "gpt" and "deepseek".', ) self._parser.add_argument( - "--transformer_impl", + '--train_mode', type=str, default=None, - help="Transformer implementation.", + help='Train mode to run. Current supported: "pretrain" and "finetune".', ) - def _preprocess(self): if not super()._preprocess(): return False if not self._args.code_base: if self._args.deepspeed: self._args.code_base = os.path.join( - os.getenv("SB_MICRO_PATH"), - "third_party/Megatron/Megatron-DeepSpeed/", + os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-DeepSpeed/' ) else: - self._args.code_base = os.path.join( - os.getenv("SB_MICRO_PATH"), "third_party/Megatron/Megatron-LM" - ) + self._args.code_base = os.path.join(os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-LM') if not os.path.exists(self._args.code_base) or not os.path.exists( - os.path.join(self._args.code_base, f"pretrain_{self._args.model}.py") + os.path.join(self._args.code_base, f'pretrain_{self._args.model}.py') ): - logger.error("Code base is not valid.") + logger.error('Code base is not valid.') self._result.set_return_code(ReturnCode.INVALID_ARGUMENT) return False - data_parallel_size = ( - self._args.num_gpus - * self._num_nodes - // self._args.pipeline_model_parallel_size - // self._args.tensor_model_parallel_size - ) - if self._args.micro_batch_size < 1 or self._args.micro_batch_size > ( - self._args.batch_size // data_parallel_size - ): - logger.error( - "Micro Batch size * data parallel size is larger than global batch size." - ) + data_parallel_size = self._args.num_gpus * self._num_nodes \ + // self._args.pipeline_model_parallel_size // self._args.tensor_model_parallel_size + if self._args.micro_batch_size < 1 or \ + self._args.micro_batch_size > (self._args.batch_size // data_parallel_size): + logger.error('Micro Batch size * data parallel size is larger than global batch size.') self._result.set_return_code(ReturnCode.INVALID_ARGUMENT) return False for precision in self._args.precision: if precision not in self._supported_precision: - logger.error("Precision %s is not supported." % precision) + logger.error('Precision %s is not supported.' % precision) self._result.set_return_code(ReturnCode.INVALID_ARGUMENT) return False @@ -338,19 +270,17 @@ def _preprocess(self): def _parse_log(self, output): """Parse log output and get the performance.""" - tflops_pattern = re.compile(r"(TFLOPs|TFLOP/s/GPU\)): (\d+\.\d+)") - elapsed_time_pattern = re.compile( - r"elapsed time per iteration \(ms\): (\d+\.\d+)" - ) - mem_allocated_pattern = re.compile(r"allocated: (\d+\.\d+)") - max_mem_allocated_pattern = re.compile(r"max allocated: (\d+\.\d+)") + tflops_pattern = re.compile(r'(TFLOPs|TFLOP/s/GPU\)): (\d+\.\d+)') + elapsed_time_pattern = re.compile(r'elapsed time per iteration \(ms\): (\d+\.\d+)') + mem_allocated_pattern = re.compile(r'allocated: (\d+\.\d+)') + max_mem_allocated_pattern = re.compile(r'max allocated: (\d+\.\d+)') lines = output.splitlines() tflops = [] mem_allocated = [] max_mem_allocated = [] iteration_times = [] for line in lines: - if "elapsed time per iteration" in line: + if 'elapsed time per iteration' in line: tflops_matches = tflops_pattern.search(line) elapsed_time_match = elapsed_time_pattern.search(line) if tflops_matches: @@ -360,7 +290,7 @@ def _parse_log(self, output): elapsed_time_value = float(elapsed_time_match.group(1)) iteration_times.append(elapsed_time_value) - if "max allocated" in line: + if 'max allocated' in line: mem_allocated_match = mem_allocated_pattern.search(line) max_mem_allocated_match = max_mem_allocated_pattern.search(line) if mem_allocated_match: @@ -368,73 +298,186 @@ def _parse_log(self, output): mem_allocated.append(mem_allocated_value) if max_mem_allocated_match: - max_mem_allocated_value = ( - float(max_mem_allocated_match.group(1)) / 1024 - ) + max_mem_allocated_value = float(max_mem_allocated_match.group(1)) / 1024 max_mem_allocated.append(max_mem_allocated_value) return iteration_times, tflops, mem_allocated, max_mem_allocated def __prepare_deespeed_config(self, precision_megatron): """Prepare deepspeed configs.""" - self._config_json_path = os.path.join( - self._args.data_home, "ds_config_gpt.json" - ) + self._config_json_path = os.path.join(self._args.data_home, 'ds_config_gpt.json') # Load deepspeed config template json file precision_template = { - "enabled": True, - "loss_scale": 0, - "loss_scale_window": 500, - "hysteresis": 2, - "min_loss_scale": 1, - "initial_scale_power": 11, + 'enabled': True, + 'loss_scale': 0, + 'loss_scale_window': 500, + 'min_loss_scale': 1, + 'initial_scale_power': 11 } + if self._args.hysteresis is not None: + precision_template['hysteresis'] = self._args.hysteresis ds_config_template = { - "train_batch_size": self._args.batch_size, - "train_micro_batch_size_per_gpu": self._args.micro_batch_size, - "steps_per_print": self._args.log_interval, - "zero_optimization": {"stage": self._args.zero_stage}, - "gradient_clipping": 1.0, - "prescale_gradients": self._args.prescale_grad, + 'train_batch_size': self._args.batch_size, + 'train_micro_batch_size_per_gpu': self._args.micro_batch_size, + 'steps_per_print': self._args.log_interval, + 'zero_optimization': { + 'stage': self._args.zero_stage + }, + 'gradient_clipping': 1.0, + 'prescale_gradients': self._args.prescale_grad, } if len(precision_megatron) > 0: ds_config_template[precision_megatron] = precision_template # Write to config json file - with open(self._config_json_path, "w") as file: + with open(self._config_json_path, 'w') as file: json.dump(ds_config_template, file, indent=4) - deepspeed_options = f"\ + deepspeed_options = f'\ --deepspeed \ --deepspeed_config {self._config_json_path} \ --zero-stage {self._args.zero_stage} \ --pipeline-model-parallel-size {self._args.pipeline_model_parallel_size} \ --train-tokens {self._args.train_tokens} \ - --data-impl {self._args.data_impl}" + --data-impl {self._args.data_impl}' if self._args.pipeline_model_parallel_size <= 1: - deepspeed_options = f"{deepspeed_options} --no-pipeline-parallel" + deepspeed_options = f'{deepspeed_options} --no-pipeline-parallel' return deepspeed_options - def _megatron_command(self, precision): # noqa: C901 + def _append_parallel_flags(self, opts): + if self._args.sequence_parallel: + opts += ' --sequence-parallel' + if self._args.no_async_tensor_model_parallel_allreduce: + opts += ' --no-async-tensor-model-parallel-allreduce' + if self._args.pipeline_model_parallel_size > 1: + opts += f' --pipeline-model-parallel-size {self._args.pipeline_model_parallel_size}' + + return opts + + def _append_architecture_flags(self, opts): + if self._args.swiglu: + opts += ' --swiglu' + if self._args.no_bias_swiglu_fusion: + opts += ' --no-bias-swiglu-fusion' + if self._args.disable_bias_linear: + opts += ' --disable-bias-linear' + if self._args.normalization: + opts += f' --normalization {self._args.normalization}' + if self._args.norm_epsilon: + opts += f' --norm-epsilon {self._args.norm_epsilon}' + if self._args.untie_embeddings_and_output_weights: + opts += ' --untie-embeddings-and-output-weights' + if self._args.transformer_impl: + opts += f' --transformer-impl {self._args.transformer_impl}' + if self._args.extra_vocab_size: + opts += f' --extra-vocab-size {self._args.extra_vocab_size}' + if self._args.ffn_hidden_size: + opts += f' --ffn-hidden-size {self._args.ffn_hidden_size}' + return opts + + def _append_moe_flags(self, opts): + if self._args.moe_ffn_hidden_size: + opts += f' --moe-ffn-hidden-size {self._args.moe_ffn_hidden_size}' + if self._args.enable_shared_expert: + opts += ' --enable-shared-expert' + if self._args.moe_layer_freq: + opts += f' --moe-layer-freq {self._args.moe_layer_freq}' + if self._args.num_shared_experts: + opts += f' --num-shared-experts {self._args.num_shared_experts}' + if self._args.moe_router_topk: + opts += f' --moe-router-topk {self._args.moe_router_topk}' + if self._args.moe_aux_loss_coeff: + opts += f' --moe-aux-loss-coeff {self._args.moe_aux_loss_coeff}' + if self._args.moe_router_load_balancing_type: + opts += f' --moe-router-load-balancing-type {self._args.moe_router_load_balancing_type}' + if self._args.expert_model_parallel_size: + opts += f' --expert-model-parallel-size {self._args.expert_model_parallel_size}' + if self._args.num_experts: + opts += f' --num-experts {self._args.num_experts}' + return opts + + def _append_optimizer_flags(self, opts): + if self._args.optimizer: + opts += f' --optimizer {self._args.optimizer}' + if getattr(self._args, 'override_opt_param_scheduler', True): + opts += ' --override-opt_param-scheduler' + if self._args.hysteresis is not None: + opts += f' --hysteresis {self._args.hysteresis}' + return opts + + def _append_checkpoint_flags(self, opts): + if self._args.load: + opts += f' --load {self._args.load}' + if self._args.no_load_optim: + opts += ' --no-load-optim' + if self._args.no_load_rng: + opts += ' --no-load-rng' + if self._args.ckpt_format: + opts += f' --ckpt-format {self._args.ckpt_format}' + return opts + + def _append_tokenizer_flags(self, opts): + if self._args.tokenizer_type: + opts += f' --tokenizer-type {self._args.tokenizer_type}' + if self._args.patch_tokenizer_type: + opts += f' --patch-tokenizer-type {self._args.patch_tokenizer_type}' + if self._args.position_embedding_type: + opts += f' --position-embedding-type {self._args.position_embedding_type}' + if self._args.no_rope_fusion: + opts += ' --no-rope-fusion' + if self._args.rotary_base: + opts += f' --rotary-base {self._args.rotary_base}' + if self._args.rotary_scaling_factor: + opts += f' --rotary-scaling-factor {self._args.rotary_scaling_factor}' + if self._args.qk_nope_head_dim: + opts += f' --qk-nope-head-dim {self._args.qk_nope_head_dim}' + if self._args.qk_rope_head_dim: + opts += f' --qk-rope-head-dim {self._args.qk_rope_head_dim}' + if self._args.v_head_dim: + opts += f' --v-head-dim {self._args.v_head_dim}' + if self._args.kv_lora_rank: + opts += f' --kv-lora-rank {self._args.kv_lora_rank}' + return opts + + def _append_misc_flags(self, opts): + if self._args.eod_mask_loss: + opts += ' --eod-mask-loss' + if self._args.use_rotary_position_embeddings: + opts += ' --use-rotary-position-embeddings' + if self._args.no_gradient_accumulation_fusion: + opts += ' --no-gradient-accumulation-fusion' + if self._args.use_flash_attn: + opts += ' --use-flash-attn' + if self._args.no_masked_softmax_fusion: + opts += ' --no-masked-softmax-fusion' + if self._args.no_bias_gelu_fusion: + opts += ' --no-bias-gelu-fusion' + if self._args.no_bias_dropout_fusion: + opts += ' --no-bias-dropout-fusion' + if self._args.train_mode: + opts += f' --train-mode {self._args.train_mode}' + if self._args.max_padding_length: + opts += f' --max-padding-length {self._args.max_padding_length}' + return opts + + def _megatron_command(self, precision): # noqa: C901 """Generate megatron command.""" if precision == Precision.FLOAT32: - precision_megatron = "" + precision_megatron = '' elif precision == Precision.FLOAT16: - precision_megatron = "--fp16" + precision_megatron = '--fp16' elif precision == Precision.BFLOAT16: - precision_megatron = "--bf16" + precision_megatron = '--bf16' - megatron_options = f"\ + megatron_options = f'\ --adam-beta1 0.9 \ --adam-beta2 0.95 \ - --tokenizer-type {self._args.tokenizer_type} \ --tensor-model-parallel-size {self._args.tensor_model_parallel_size} \ - --pipeline-model-parallel-size {self._args.pipeline_model_parallel_size} \ --init-method-std {self._args.init_std} \ - --lr-decay-samples 38400 \ + --lr-decay-samples {self._args.lr_decay_samples} \ --lr-warmup-samples {self._args.num_warmup * self._args.batch_size} \ --lr-decay-style cosine \ --micro-batch-size {self._args.micro_batch_size} \ @@ -459,92 +502,50 @@ def _megatron_command(self, precision): # noqa: C901 --use-distributed-optimizer \ {precision_megatron} \ --seed {self._args.seed} \ - --log-throughput" + --log-throughput' - if self._args.sequence_parallel: - megatron_options = f"{megatron_options} --sequence-parallel" - if self._args.no_async_tensor_model_parallel_allreduce: - megatron_options = ( - f"{megatron_options} --no-async-tensor-model-parallel-allreduce" - ) - if self._args.use_rotary_position_embeddings: - megatron_options = f"{megatron_options} --use-rotary-position-embeddings" - if self._args.no_gradient_accumulation_fusion: - megatron_options = f"{megatron_options} --no-gradient-accumulation-fusion" - if self._args.use_flash_attn: - megatron_options = f"{megatron_options} --use-flash-attn" - if self._args.no_masked_softmax_fusion: - megatron_options = f"{megatron_options} --no-masked-softmax-fusion" - if self._args.no_bias_gelu_fusion: - megatron_options = f"{megatron_options} --no-bias-gelu-fusion" - if self._args.no_bias_dropout_fusion: - megatron_options = f"{megatron_options} --no-bias-dropout-fusion" - if self._args.split: - megatron_options = f"{megatron_options} --split {self._args.split}" - if self._args.extra and len(self._args.extra) > 0: - #self._args.extra = self._args.extra.split(",") - print( - f"Extra options: {self._args.extra}, {type(self._args.extra)}" - ) - megatron_options = f"{megatron_options} {self._args.extra}" - if self._args.expert_model_parallel_size: - megatron_options = f"{megatron_options} --expert-model-parallel-size {self._args.expert_model_parallel_size}" - if self._args.num_experts: - megatron_options = f"{megatron_options} --num-experts {self._args.num_experts}" - if self._args.max_padding_length: - megatron_options = f"{megatron_options} --max-padding-length {self._args.max_padding_length}" - if self._args.transformer_impl: - megatron_options = f"{megatron_options} --transformer-impl {self._args.transformer_impl}" + megatron_options = self._append_parallel_flags(megatron_options) + megatron_options = self._append_architecture_flags(megatron_options) + megatron_options = self._append_moe_flags(megatron_options) + megatron_options = self._append_optimizer_flags(megatron_options) + megatron_options = self._append_checkpoint_flags(megatron_options) + megatron_options = self._append_tokenizer_flags(megatron_options) + megatron_options = self._append_misc_flags(megatron_options) - command = "" - script_path = os.path.join( - self._args.code_base, f"pretrain_{self._args.model}.py" - ) + script_path = os.path.join(self._args.code_base, f'pretrain_{self._args.model}.py') if self._args.deepspeed: - deepspeed_option = self.__prepare_deespeed_config( - precision_megatron.lstrip("--") - ) - # No --log-throughput in Megatron-DeepSpeed by 20231219 - megatron_options = megatron_options.replace("--log-throughput", "").strip() + deepspeed_option = self.__prepare_deespeed_config(precision_megatron.lstrip('--')) + megatron_options = megatron_options.replace('--log-throughput', '').strip() if self._num_nodes > 1: - command = ( - f"torchrun {self._distributed_args} " - + f"{script_path} {megatron_options} {self._data_options} {deepspeed_option}" - ) + command = f'torchrun {self._distributed_args} {script_path} {megatron_options} {self._data_options} {deepspeed_option}' else: - command = f"deepspeed {script_path} {megatron_options} {self._data_options} {deepspeed_option}" - + command = f'deepspeed {script_path} {megatron_options} {self._data_options} {deepspeed_option}' else: - command = f"torchrun {self._distributed_args} {script_path} {megatron_options} {self._data_options}" + command = f'torchrun {self._distributed_args} {script_path} {megatron_options} {self._data_options}' return command - def _train_step(self, precision): # noqa: E501 + def _train_step(self, precision): # noqa: E501 """Train the model and get the performance.""" command = self._megatron_command(precision) - local_rank = os.environ.pop("OMPI_COMM_WORLD_LOCAL_RANK", None) - logger.info("Running command: {}.".format(command)) + local_rank = os.environ.pop('OMPI_COMM_WORLD_LOCAL_RANK', None) + logger.info('Running command: {}.'.format(command)) output = run_command(command, flush_output=True) - os.environ["OMPI_COMM_WORLD_LOCAL_RANK"] = local_rank + os.environ['OMPI_COMM_WORLD_LOCAL_RANK'] = local_rank iteration_times = [] info = {} # last rank will print the result, first rank will print the memory usage - if ( - self._num_nodes == 1 - or int(os.environ["OMPI_COMM_WORLD_RANK"]) - == int(os.environ["OMPI_COMM_WORLD_SIZE"]) - 1 - or int(os.environ["OMPI_COMM_WORLD_RANK"]) == 0 - ): - iteration_times, tflops, mem_allocated, max_mem_allocated = self._parse_log( - output.stdout - ) + if self._num_nodes == 1 or \ + int(os.environ['OMPI_COMM_WORLD_RANK']) == int(os.environ['OMPI_COMM_WORLD_SIZE']) - 1 \ + or int(os.environ['OMPI_COMM_WORLD_RANK']) == 0: + iteration_times, tflops, mem_allocated, max_mem_allocated = self._parse_log(output.stdout) if len(tflops) > 0: - info["tflops"] = tflops + info['tflops'] = tflops if len(mem_allocated) > 0: - info["mem_allocated"] = mem_allocated + info['mem_allocated'] = mem_allocated if len(max_mem_allocated) > 0: - info["max_mem_allocated"] = max_mem_allocated + info['max_mem_allocated'] = max_mem_allocated if not iteration_times: iteration_times = [-1 for i in range(self._args.num_steps)] @@ -557,7 +558,6 @@ def _sync_result(self, data): data (list): the data to be reduced. """ from mpi4py import MPI - comm = MPI.COMM_WORLD data = np.array(data, dtype=np.float64) # Reduce the data to a single value on rank 0 @@ -567,20 +567,16 @@ def _sync_result(self, data): def _process_info(self, model_action, precision, info): """Process the result of model benchmarking.""" - precision_metric = {"float16": "fp16", "float32": "fp32", "bfloat16": "bf16"} + precision_metric = {'float16': 'fp16', 'float32': 'fp32', 'bfloat16': 'bf16'} if precision.value in precision_metric.keys(): precision = precision_metric[precision.value] for key, values in info.items(): - metric = "{}_{}_{}".format(precision, model_action, key) + metric = '{}_{}_{}'.format(precision, model_action, key) self._result.add_raw_data(metric, values, self._args.log_raw_data) self._result.add_result(metric, statistics.mean(values)) logger.info( - "Average {} - round: {}, model: {}, precision: {}, value: {:.6f}.".format( - key, - self._curr_run_index, - self._name, - precision, - statistics.mean(values), + 'Average {} - round: {}, model: {}, precision: {}, value: {:.6f}.'.format( + key, self._curr_run_index, self._name, precision, statistics.mean(values) ) ) @@ -594,49 +590,35 @@ def _init_distributed_setting(self): Return: True if distributed library is initialized successfully. """ - if not os.getenv("OMPI_COMM_WORLD_SIZE"): - logger.error("MPI is not enabled.") + if not os.getenv('OMPI_COMM_WORLD_SIZE'): + logger.error('MPI is not enabled.') return False - self._num_nodes = int(os.getenv("OMPI_COMM_WORLD_SIZE")) // int( - os.getenv("OMPI_COMM_WORLD_LOCAL_SIZE") - ) - master_addr = "localhost" + self._num_nodes = int(os.getenv('OMPI_COMM_WORLD_SIZE')) // int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE')) + master_addr = 'localhost' if self._num_nodes > 1: if not self._args.hostfile: - sb_hostfile = os.path.join( - os.environ.get("SB_WORKSPACE", "."), "hostfile" - ) + sb_hostfile = os.path.join(os.environ.get('SB_WORKSPACE', '.'), 'hostfile') if os.path.exists(sb_hostfile): - hosts = open(sb_hostfile).read().split("\n") - hosts = [ - f"{host} slots={self._args.num_gpus}" - for host in hosts - if host != "" - ] - self._args.hostfile = os.path.join(self._args.data_home, "hostfile") - with open(self._args.hostfile, "w") as file: - file.write("\n".join(hosts)) + hosts = open(sb_hostfile).read().split('\n') + hosts = [f'{host} slots={self._args.num_gpus}' for host in hosts if host != ''] + self._args.hostfile = os.path.join(self._args.data_home, 'hostfile') + with open(self._args.hostfile, 'w') as file: + file.write('\n'.join(hosts)) if not os.path.exists(self._args.hostfile): - logger.error("Hostfile not found.") + logger.error('Hostfile not found.') return False - hosts = open(self._args.hostfile, "r").readlines() + hosts = open(self._args.hostfile, 'r').readlines() if self._num_nodes != len(hosts): - logger.error( - "MPI init failed since hostfile not match the MPI setting." - ) + logger.error('MPI init failed since hostfile not match the MPI setting.') return False master_addr = hosts[0].split()[0] - addr = os.getenv("MASTER_ADDR", master_addr) - port = os.getenv("MASTER_PORT", "29500") - node_rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) // int( - os.environ["OMPI_COMM_WORLD_LOCAL_SIZE"] - ) - self._distributed_args = ( - f"--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} " - + f"--node_rank {node_rank} --master_addr {addr} --master_port {port}" - ) + addr = os.getenv('MASTER_ADDR', master_addr) + port = os.getenv('MASTER_PORT', '29500') + node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE']) + self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \ + f'--node_rank {node_rank} --master_addr {addr} --master_port {port}' return True def _generate_dataset(self): @@ -650,66 +632,55 @@ def _generate_dataset(self): logger.info(f"Using mock data.") self._data_options = "--mock-data" else: - - self._vocab_path = str(Path(self._args.data_home) / "gpt2-vocab.json") + self._vocab_path = str(Path(self._args.data_home) / 'gpt2-vocab.json') download_file(self._args.vocab_url, self._vocab_path) - self._merges_path = str(Path(self._args.data_home) / "gpt2-merges.txt") + self._merges_path = str(Path(self._args.data_home) / 'gpt2-merges.txt') download_file(self._args.merges_url, self._merges_path) - if not os.path.exists( - os.path.join(self._args.data_home, f"{self._args.data_prefix}.bin") - ) or not os.path.exists( - os.path.join(self._args.data_home, f"{self._args.data_prefix}.idx") - ): + if not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.bin')) \ + or not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.idx')): if self._args.dataset_url: - self._raw_data_path = str(Path(self._args.data_home) / "data.json") + self._raw_data_path = str(Path(self._args.data_home) / 'data.json') download_file(self._args.dataset_url, self._raw_data_path) command = ( - "python3 " + 'python3 ' f'{os.path.join(self._args.code_base, "tools/preprocess_data.py")} ' - f"--input {self._raw_data_path} " - f"--tokenizer-type {self._args.tokenizer_type} " + f'--input {self._raw_data_path} ' + f'--tokenizer-type {self._args.tokenizer_type} ' f'--output-prefix {os.path.join(self._args.data_home, "dataset")} ' - f"--workers {str(self._args.num_workers)} " - f"--vocab-file {self._vocab_path} " - f"--merge-file {self._merges_path}" + f'--workers {str(self._args.num_workers)} ' + f'--vocab-file {self._vocab_path} ' + f'--merge-file {self._merges_path}' ) # split documents run_command(command, flush_output=True) # binarize dataset run_command(command, flush_output=True) - if not os.path.exists( - os.path.join( - self._args.data_home, f"{self._args.data_prefix}.bin" - ) - ) or not os.path.exists( - os.path.join( - self._args.data_home, f"{self._args.data_prefix}.idx" - ) - ): - logger.error("Dataset failed to generate.") - self._result.set_return_code( - ReturnCode.DATASET_GENERATION_FAILURE - ) + if not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.bin')) \ + or not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.idx')): + logger.error('Dataset failed to generate.') + self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE) return False - else: - logger.error("No dataset or dataset url provided.") + logger.error('No dataset or dataset url provided.') self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE) return False - self._data_path = os.path.join( - self._args.data_home, f"{self._args.data_prefix}" - ) - self._data_options = f"\ + self._data_path = os.path.join(self._args.data_home, f'{self._args.data_prefix}') + self._data_options = f'\ --vocab-file {self._vocab_path} \ --merge-file {self._merges_path} \ - --data-path {self._data_path}" + --data-path {self._data_path}' + if self._args.dataloader_type: self._data_options += f" --dataloader-type {self._args.dataloader_type}" - logger.info("Dataset preparation successfully.") - return True + if self._args.split: + self._data_options += f" --split {self._args.split}" + if self._args.data_cache_path: + self._data_options += f' --data-cache-path {self._args.data_cache_path}' + if self._args.dataset: + self._data_options += f' --dataset {self._args.dataset}' def _set_force_fp32(self): """Set force FP32.""" @@ -761,11 +732,53 @@ def _cal_params_count(self): # Register GPT3 benchmark. +BenchmarkRegistry.register_benchmark('megatron-gpt', MegatronGPT, parameters='', platform=Platform.CUDA) +BenchmarkRegistry.register_benchmark('megatron-gpt', MegatronGPT, parameters='', platform=Platform.ROCM) BenchmarkRegistry.register_benchmark( - "megatron-gpt", MegatronGPT, parameters="", platform=Platform.CUDA + 'megatron-deepseek', + MegatronGPT, + parameters=( + '--model=deepseek ' + '--tokenizer_type=DeepSeekV2Tokenizer ' + '--transformer_impl=transformer_engine ' + '--num_layers=27 ' + '--hidden_size=1024 ' + '--seq_len=4096 ' + '--num_attn_heads=16 ' + '--moe_ffn_hidden_size=1408 ' + '--enable_shared_expert ' + '--moe_layer_freq=1 ' + '--num_shared_experts=2 ' + '--moe_router_topk=6 ' + '--moe_aux_loss_coeff=1e-2 ' + '--moe_router_load_balancing_type=aux_loss ' + '--num_experts=64 ' + '--patch_tokenizer_type=DeepSeekV2Tokenizer ' + '--position_embedding_type=rope ' + '--no_rope_fusion ' + '--rotary_base=10000 ' + '--rotary_scaling_factor=40 ' + '--qk_nope_head_dim=128 ' + '--qk_rope_head_dim=64 ' + '--v_head_dim=128 ' + '--ffn_hidden_size=10944 ' + '--swiglu ' + '--normalization=RMSNorm ' + '--norm_epsilon=1e-06 ' + '--no_bias_swiglu_fusion ' + '--disable_bias_linear ' + '--untie_embeddings_and_output_weights ' + '--extra_vocab_size=2400 ' + '--load=deepseek-ai/DeepSeek-V2-Lite ' + '--no_load_optim ' + '--no_load_rng ' + '--ckpt_format=torch ' + '--eod_mask_loss ' + '--train_mode=pretrain ' + '--data_cache_path=/root/cache ' + '--max_padding_length=4096 ' + '--kv_lora_rank=512 ' + '--dataloader_type=cyclic' + ), + platform=Platform.ROCM ) -BenchmarkRegistry.register_benchmark( - "megatron-gpt", MegatronGPT, parameters="", platform=Platform.ROCM -) - - diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py index 2f1d076f5..e13174b85 100644 --- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py +++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py @@ -5,6 +5,7 @@ import os from pathlib import Path +import shlex import statistics from unittest import mock import unittest @@ -15,6 +16,26 @@ from tests.helper.testcase import BenchmarkTestCase +def normalize_command(cmd): + """Convert a CLI string into a list of meaningful argument units (key-value or flag).""" + tokens = shlex.split(cmd) + units = [] + i = 0 + while i < len(tokens): + if tokens[i].startswith('--'): + if i + 1 >= len(tokens) or tokens[i + 1].startswith('--'): + units.append(tokens[i]) # flag-only + i += 1 + else: + units.append(f'{tokens[i]} {tokens[i + 1]}') # key-value pair + i += 2 + else: + # Include positional args like torchrun, script path, etc. + units.append(tokens[i]) + i += 1 + return sorted(units) + + class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase): """Tests for IBBenchmark benchmark.""" @classmethod @@ -170,17 +191,19 @@ def test_megatron_gpt_command(self, mock_generate_dataset): benchmark = benchmark_cls( self.benchmark_name, parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \ - --num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document', + --num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document --override_opt_param_scheduler', ) mock_generate_dataset.return_value = True benchmark._preprocess() benchmark._data_options = f'\ --vocab-file {self._tmp_dir}/gpt2-vocab.json \ --merge-file {self._tmp_dir}/gpt2-merges.txt \ - --data-path {self._tmp_dir}/dataset_text_document' + --data-path {self._tmp_dir}/dataset_text_document \ + --split 949,50,1' script_path = str(Path(self._tmp_dir) / 'pretrain_gpt.py') - expected_command = 'torchrun {distributed_args} {script_path} \ + expected_command_template = 'torchrun {distributed_args} {script_path} \ + --tokenizer-type GPT2BPETokenizer \ --override-opt_param-scheduler \ --adam-beta1 0.9 \ --adam-beta2 0.95 \ @@ -199,7 +222,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset): --train-samples 20480 \ --lr 0.00012 \ --min-lr 1e-06 \ - --split 949,50,1 \ --log-interval 1 \ --eval-interval 10 \ --eval-iters 0 \ @@ -217,54 +239,57 @@ def test_megatron_gpt_command(self, mock_generate_dataset): --log-throughput {data_options}' precision = Precision.FLOAT32 - command = benchmark._megatron_command(precision) - self.assertEqual( - command, - expected_command.format( - precision='', - data_options=benchmark._data_options, - distributed_args=benchmark._distributed_args, - script_path=script_path - ) + expected_command = expected_command_template.format( + precision='', + data_options=benchmark._data_options, + distributed_args=benchmark._distributed_args, + script_path=script_path ) - precision = Precision.FLOAT16 command = benchmark._megatron_command(precision) - self.assertEqual( - command, - expected_command.format( - precision='--fp16', - data_options=benchmark._data_options, - distributed_args=benchmark._distributed_args, - script_path=script_path - ) + actual_units = normalize_command(command) + expected_units = normalize_command(expected_command) + self.assertEqual(actual_units, expected_units) + + precision = Precision.FLOAT16 + expected_command = expected_command_template.format( + precision='--fp16', + data_options=benchmark._data_options, + distributed_args=benchmark._distributed_args, + script_path=script_path ) - precision = Precision.BFLOAT16 command = benchmark._megatron_command(precision) - self.assertEqual( - command, - expected_command.format( - precision='--bf16', - data_options=benchmark._data_options, - distributed_args=benchmark._distributed_args, - script_path=script_path - ) + actual_units = normalize_command(command) + expected_units = normalize_command(expected_command) + self.assertEqual(actual_units, expected_units) + + precision = Precision.BFLOAT16 + expected_command = expected_command_template.format( + precision='--bf16', + data_options=benchmark._data_options, + distributed_args=benchmark._distributed_args, + script_path=script_path ) + command = benchmark._megatron_command(precision) + actual_units = normalize_command(command) + expected_units = normalize_command(expected_command) + self.assertEqual(actual_units, expected_units) os.environ['OMPI_COMM_WORLD_SIZE'] = '1' benchmark = benchmark_cls( self.benchmark_name, parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \ - --num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document --deepspeed', + --num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document --deepspeed --override_opt_param_scheduler', ) - mock_generate_dataset.return_value = True benchmark._preprocess() benchmark._data_options = f'\ --vocab-file {self._tmp_dir}/gpt2-vocab.json \ --merge-file {self._tmp_dir}/gpt2-merges.txt \ - --data-path {self._tmp_dir}/dataset_text_document' + --data-path {self._tmp_dir}/dataset_text_document \ + --split 949,50,1' command = benchmark._megatron_command(Precision.BFLOAT16) expected_command = 'deepspeed {script_path} --override-opt_param-scheduler \ + --tokenizer-type GPT2BPETokenizer \ --adam-beta1 0.9 \ --adam-beta2 0.95 \ --tensor-model-parallel-size 1 \ @@ -282,7 +307,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset): --train-samples 20480 \ --lr 0.00012 \ --min-lr 1e-06 \ - --split 949,50,1 \ --log-interval 1 \ --eval-interval 10 \ --eval-iters 0 \ @@ -306,16 +330,173 @@ def test_megatron_gpt_command(self, mock_generate_dataset): --train-tokens 300000000000 \ --data-impl mmap --no-pipeline-parallel' - self.assertEqual( - command, - expected_command.format( - precision='--bf16', - data_options=benchmark._data_options, - script_path=script_path, - deepseed_options=expect_ds_options - ) + expected_command = expected_command.format( + precision='--bf16', + data_options=benchmark._data_options, + deepseed_options=expect_ds_options, + script_path=script_path + ) + command = benchmark._megatron_command(Precision.BFLOAT16) + actual_units = normalize_command(command) + expected_units = normalize_command(expected_command) + self.assertEqual(actual_units, expected_units) + + def test_deepseek_v2_command(self): + # test deepspeed with megatron + os.environ['OMPI_COMM_WORLD_SIZE'] = '1' + os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1' + os.environ['OMPI_COMM_WORLD_RANK'] = '0' + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '12345' + with open(self.hostfile_path, 'w') as f: + f.write('host1\n') + + benchmark_name = 'megatron-deepseek' + (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.ROCM) + assert (benchmark_cls) + benchmark = benchmark_cls( + benchmark_name, + parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} ' + '--num_warmup 0 ' + '--num_steps 10 ' + '--batch_size 256 ' + '--expert_model_parallel_size 8 ' + '--micro_batch_size 2 ' + '--mock_data ' + '--model=deepseek ' + '--tokenizer_type=DeepSeekV2Tokenizer ' + '--transformer_impl=transformer_engine ' + '--num_layers=27 ' + '--hidden_size=1024 ' + '--seq_len=4096 ' + '--ffn_hidden_size=10944 ' + '--num_attn_heads=16 ' + '--moe_ffn_hidden_size=1408 ' + '--enable_shared_expert ' + '--moe_layer_freq=1 ' + '--num_shared_experts=2 ' + '--moe_router_topk=6 ' + '--moe_aux_loss_coeff=0.01 ' + '--moe_router_load_balancing_type=aux_loss ' + '--num_experts=64 ' + '--patch_tokenizer_type=DeepSeekV2Tokenizer ' + '--position_embedding_type=rope ' + '--no_rope_fusion ' + '--rotary_base=10000 ' + '--rotary_scaling_factor=40 ' + '--qk_nope_head_dim=128 ' + '--qk_rope_head_dim=64 ' + '--v_head_dim=128 ' + '--ffn_hidden_size=10944 ' + '--swiglu ' + '--normalization=RMSNorm ' + '--norm_epsilon=1e-06 ' + '--no_bias_swiglu_fusion ' + '--disable_bias_linear ' + '--untie_embeddings_and_output_weights ' + '--extra_vocab_size=2400 ' + '--load=deepseek-ai/DeepSeek-V2-Lite ' + '--no_load_optim ' + '--no_load_rng ' + '--ckpt_format=torch ' + '--eod_mask_loss ' + '--train_mode=pretrain ' + '--data_cache_path=/root/cache ' + '--max_padding_length=4096 ' + '--kv_lora_rank=512 ' + '--dataloader_type=cyclic ' ) + benchmark._preprocess() + benchmark._data_options = f'\ + --mock-data \ + --dataloader-type cyclic \ + --data-cache-path /root/cache \ + --dataset LLama-Pretrain-Idxmap' + + precision = Precision.BFLOAT16 + command = benchmark._megatron_command(precision) + + expected_command = ( + 'torchrun {script_path} --bf16 \ + --init-method-std 0.009 \ + --adam-beta1 0.9 \ + --hidden-dropout 0.0 \ + --min-lr 1e-06 \ + --lr 0.00012 \ + --optimizer adam \ + --log-interval 1 \ + --eval-interval 10 \ + --seed 1234 \ + --eval-iters 0 \ + --max-position-embeddings 4096 \ + --hysteresis 2 \ + --lr-decay-style cosine \ + --lr-decay-samples 43945312 \ + --clip-grad 1.0 \ + --save-interval 10000 \ + --adam-beta2 0.95 \ + --moe-aux-loss-coeff 0.01 \ + --log-throughput \ + --num-workers 8 \ + --use-distributed-optimizer \ + --attention-dropout 0.0 \ + --tensor-model-parallel-size 1 \ + --lr-warmup-samples 0 \ + --weight-decay 0.1 \ + --train-samples 2560 \ + --no-load-optim \ + --load deepseek-ai/DeepSeek-V2-Lite \ + --no-load-rng \ + --ffn-hidden-size 10944 \ + --patch-tokenizer-type DeepSeekV2Tokenizer \ + --swiglu \ + --normalization RMSNorm \ + --norm-epsilon 1e-06 \ + --no-bias-swiglu-fusion \ + --no-rope-fusion \ + --position-embedding-type rope \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --ckpt-format torch \ + --rotary-base 10000 \ + --rotary-scaling-factor 40 \ + --eod-mask-loss \ + --moe-ffn-hidden-size 1408 \ + --enable-shared-expert \ + --moe-layer-freq 1 \ + --num-shared-experts 2 \ + --moe-router-topk 6 \ + --kv-lora-rank 512 \ + --qk-nope-head-dim 128 \ + --qk-rope-head-dim 64 \ + --v-head-dim 128 \ + --moe-router-load-balancing-type aux_loss \ + --train-mode pretrain \ + --extra-vocab-size 2400 \ + --global-batch-size 256 \ + --micro-batch-size 2 \ + --num-layers 27 \ + --hidden-size 1024 \ + --seq-length 4096 \ + --num-attention-heads 16 \ + --tokenizer-type DeepSeekV2Tokenizer \ + --transformer-impl transformer_engine \ + --num-experts 64 \ + --expert-model-parallel-size 8 \ + --max-padding-length 4096 \ + {data_options} \ + {disitributed_args}' + ).format( + script_path=str(Path(self._tmp_dir) / 'pretrain_deepseek.py'), + data_options=benchmark._data_options, + disitributed_args=benchmark._distributed_args + ) + actual_units = normalize_command(command) + expected_units = normalize_command(expected_command) + + self.assertEqual(actual_units, expected_units) + @decorator.load_data('tests/data/megatron_deepspeed.log') @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset') def test_megatron_parse_log(self, raw_output, mock_generate_dataset): diff --git a/third_party/Makefile b/third_party/Makefile index b6bec4be7..a4afbf8ed 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -134,7 +134,9 @@ rocm_rocblas: sb_micro_path # Since it takes several hours to build, avoid to build again if hipblaslt-bench exsists. rocm_hipblaslt: sb_micro_path @if [ ! -e $(SB_MICRO_PATH)/bin/hipblaslt-bench ] && [ -z `which hipblaslt-bench` ]; then \ - cd ./hipBLASLt && python3 -m pip install -r tensilelite/requirements.txt && ./install.sh -dc; \ + if [ -d hipBLASLt ]; then rm -rf hipBLASLt; fi; \ + git clone -b ${HIPBLASLT_BRANCH} https://github.com/ROCmSoftwarePlatform/hipBLASLt.git ./hipBLASLt; \ + cd ./hipBLASLt && ./install.sh -dc; \\ cp -v $(SB_MICRO_PATH)/third_party/hipBLASLt/build/release/clients/staging/hipblaslt-bench $(SB_MICRO_PATH)/bin/; \ fi @@ -212,7 +214,7 @@ rocm_megatron_lm: cd Megatron && mkdir -p rocm && cd rocm && \ if [ ! -d "Megatron-LM" ]; then \ git clone -b rocm_dev https://github.com/ROCm/Megatron-LM.git ; \ - fi + fi cp Megatron/rocm/Megatron-LM/examples/deepseek_v2/pretrain_deepseek.py Megatron/rocm/Megatron-LM/ git clone https://github.com/caaatch22/grouped_gemm.git &&\ cd grouped_gemm &&\ From eaa1811b520c1bbaa1c4ea4eb30b4c7f6acc5e1c Mon Sep 17 00:00:00 2001 From: yukirora Date: Fri, 23 May 2025 11:42:13 +0000 Subject: [PATCH 3/6] fix bug --- .vscode/settings.json | 9 +++++++-- setup.py | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 6a0ee151a..bf9e34c59 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,9 +1,14 @@ { - "editor.formatOnSave": true, + "editor.formatOnSave": false, "editor.wordWrap": "on", "python.autoComplete.addBrackets": true, "python.formatting.provider": "yapf", "python.linting.flake8Enabled": true, "python.linting.mypyEnabled": true, - "python.analysis.completeFunctionParens": true + "python.analysis.completeFunctionParens": true, + "python.testing.pytestArgs": [ + "tests" + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true } diff --git a/setup.py b/setup.py index fe3d7d462..6e37b1ca1 100644 --- a/setup.py +++ b/setup.py @@ -192,8 +192,8 @@ def run(self): 'xlrd>=2.0.1', 'xlsxwriter>=1.3.8', 'xmltodict>=0.12.0', - 'types-requests==0.1.13', - 'urllib3>=1.26.5' + 'types-requests', + 'urllib3>=1.26.9' ], extras_require=( lambda x: { From fd784ae598eb3f1bca91d49b430db08e921664b3 Mon Sep 17 00:00:00 2001 From: yukirora Date: Fri, 23 May 2025 11:46:16 +0000 Subject: [PATCH 4/6] restore --- .vscode/settings.json | 9 ++------- setup.py | 2 +- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index bf9e34c59..6a0ee151a 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,14 +1,9 @@ { - "editor.formatOnSave": false, + "editor.formatOnSave": true, "editor.wordWrap": "on", "python.autoComplete.addBrackets": true, "python.formatting.provider": "yapf", "python.linting.flake8Enabled": true, "python.linting.mypyEnabled": true, - "python.analysis.completeFunctionParens": true, - "python.testing.pytestArgs": [ - "tests" - ], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true + "python.analysis.completeFunctionParens": true } diff --git a/setup.py b/setup.py index 6e37b1ca1..588687e6e 100644 --- a/setup.py +++ b/setup.py @@ -189,11 +189,11 @@ def run(self): 'requests>=2.27.1', 'seaborn>=0.11.2', 'tcping>=0.1.1rc1', + 'urllib3>=1.26.9', 'xlrd>=2.0.1', 'xlsxwriter>=1.3.8', 'xmltodict>=0.12.0', 'types-requests', - 'urllib3>=1.26.9' ], extras_require=( lambda x: { From f6b4c46d1602c5e9990eb87da20879f0cf0e5d18 Mon Sep 17 00:00:00 2001 From: yukirora Date: Fri, 23 May 2025 11:56:58 +0000 Subject: [PATCH 5/6] fix bug --- superbench/benchmarks/model_benchmarks/megatron_gpt3.py | 5 ++++- tests/benchmarks/model_benchmarks/test_megatron_gpt.py | 2 +- third_party/Makefile | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py index 41b64ee60..d023fb68d 100644 --- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py +++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py @@ -735,7 +735,10 @@ def _cal_params_count(self): BenchmarkRegistry.register_benchmark('megatron-gpt', MegatronGPT, parameters='', platform=Platform.CUDA) BenchmarkRegistry.register_benchmark('megatron-gpt', MegatronGPT, parameters='', platform=Platform.ROCM) BenchmarkRegistry.register_benchmark( - 'megatron-deepseek', + 'megatron-deepseek-v2', MegatronGPT, parameters='--model=deepseek', platform=Platform.ROCM +) +BenchmarkRegistry.register_benchmark( + 'megatron-deepseek-v2', MegatronGPT, parameters=( '--model=deepseek ' diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py index e13174b85..2966703d8 100644 --- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py +++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py @@ -351,7 +351,7 @@ def test_deepseek_v2_command(self): with open(self.hostfile_path, 'w') as f: f.write('host1\n') - benchmark_name = 'megatron-deepseek' + benchmark_name = 'megatron-deepseek-v2' (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.ROCM) assert (benchmark_cls) benchmark = benchmark_cls( diff --git a/third_party/Makefile b/third_party/Makefile index a4afbf8ed..c2b3806ed 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -136,7 +136,7 @@ rocm_hipblaslt: sb_micro_path @if [ ! -e $(SB_MICRO_PATH)/bin/hipblaslt-bench ] && [ -z `which hipblaslt-bench` ]; then \ if [ -d hipBLASLt ]; then rm -rf hipBLASLt; fi; \ git clone -b ${HIPBLASLT_BRANCH} https://github.com/ROCmSoftwarePlatform/hipBLASLt.git ./hipBLASLt; \ - cd ./hipBLASLt && ./install.sh -dc; \\ + cd ./hipBLASLt && ./install.sh -dc; \ cp -v $(SB_MICRO_PATH)/third_party/hipBLASLt/build/release/clients/staging/hipblaslt-bench $(SB_MICRO_PATH)/bin/; \ fi From 55ffb8b36b4470baaeedb72c4b80590e239e2884 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Thu, 26 Jun 2025 05:38:16 +0000 Subject: [PATCH 6/6] Fix lint and unittest issues. --- .../model_benchmarks/megatron_gpt3.py | 63 +++++++++++-------- .../model_benchmarks/test_megatron_gpt.py | 9 ++- 2 files changed, 43 insertions(+), 29 deletions(-) diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py index d023fb68d..379832b30 100644 --- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py +++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py @@ -420,26 +420,33 @@ def _append_checkpoint_flags(self, opts): return opts def _append_tokenizer_flags(self, opts): - if self._args.tokenizer_type: - opts += f' --tokenizer-type {self._args.tokenizer_type}' - if self._args.patch_tokenizer_type: - opts += f' --patch-tokenizer-type {self._args.patch_tokenizer_type}' - if self._args.position_embedding_type: - opts += f' --position-embedding-type {self._args.position_embedding_type}' - if self._args.no_rope_fusion: - opts += ' --no-rope-fusion' - if self._args.rotary_base: - opts += f' --rotary-base {self._args.rotary_base}' - if self._args.rotary_scaling_factor: - opts += f' --rotary-scaling-factor {self._args.rotary_scaling_factor}' - if self._args.qk_nope_head_dim: - opts += f' --qk-nope-head-dim {self._args.qk_nope_head_dim}' - if self._args.qk_rope_head_dim: - opts += f' --qk-rope-head-dim {self._args.qk_rope_head_dim}' - if self._args.v_head_dim: - opts += f' --v-head-dim {self._args.v_head_dim}' - if self._args.kv_lora_rank: - opts += f' --kv-lora-rank {self._args.kv_lora_rank}' + args = self._args + + # map of arg-attribute → flag string + flag_map = { + 'tokenizer_type': '--tokenizer-type', + 'patch_tokenizer_type': '--patch-tokenizer-type', + 'position_embedding_type': '--position-embedding-type', + 'rotary_base': '--rotary-base', + 'rotary_scaling_factor': '--rotary-scaling-factor', + 'qk_nope_head_dim': '--qk-nope-head-dim', + 'qk_rope_head_dim': '--qk-rope-head-dim', + 'v_head_dim': '--v-head-dim', + 'kv_lora_rank': '--kv-lora-rank', + 'no_rope_fusion': '--no-rope-fusion', + } + + for attr, flag in flag_map.items(): + val = getattr(args, attr, None) + if not val: + continue + + # boolean flags get no value + if isinstance(val, bool): + opts += f' {flag}' + else: + opts += f' {flag} {val}' + return opts def _append_misc_flags(self, opts): @@ -517,7 +524,9 @@ def _megatron_command(self, precision): # noqa: C901 deepspeed_option = self.__prepare_deespeed_config(precision_megatron.lstrip('--')) megatron_options = megatron_options.replace('--log-throughput', '').strip() if self._num_nodes > 1: - command = f'torchrun {self._distributed_args} {script_path} {megatron_options} {self._data_options} {deepspeed_option}' + command = f'torchrun {self._distributed_args} {script_path} \ + {megatron_options} {self._data_options} {deepspeed_option}' + else: command = f'deepspeed {script_path} {megatron_options} {self._data_options} {deepspeed_option}' else: @@ -627,10 +636,10 @@ def _generate_dataset(self): Return: True if dataset is created successfully. """ - self._data_options = "" + self._data_options = '' if self._args.mock_data: - logger.info(f"Using mock data.") - self._data_options = "--mock-data" + logger.info('Using mock data.') + self._data_options = '--mock-data' else: self._vocab_path = str(Path(self._args.data_home) / 'gpt2-vocab.json') download_file(self._args.vocab_url, self._vocab_path) @@ -674,14 +683,16 @@ def _generate_dataset(self): --data-path {self._data_path}' if self._args.dataloader_type: - self._data_options += f" --dataloader-type {self._args.dataloader_type}" + self._data_options += f' --dataloader-type {self._args.dataloader_type}' if self._args.split: - self._data_options += f" --split {self._args.split}" + self._data_options += f' --split {self._args.split}' if self._args.data_cache_path: self._data_options += f' --data-cache-path {self._args.data_cache_path}' if self._args.dataset: self._data_options += f' --dataset {self._args.dataset}' + return True + def _set_force_fp32(self): """Set force FP32.""" pass diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py index 2966703d8..b7c588677 100644 --- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py +++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py @@ -191,7 +191,8 @@ def test_megatron_gpt_command(self, mock_generate_dataset): benchmark = benchmark_cls( self.benchmark_name, parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \ - --num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document --override_opt_param_scheduler', + --num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document \ + --override_opt_param_scheduler', ) mock_generate_dataset.return_value = True benchmark._preprocess() @@ -278,7 +279,8 @@ def test_megatron_gpt_command(self, mock_generate_dataset): benchmark = benchmark_cls( self.benchmark_name, parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \ - --num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document --deepspeed --override_opt_param_scheduler', + --num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document \ + --deepspeed --override_opt_param_scheduler', ) benchmark._preprocess() benchmark._data_options = f'\ @@ -342,6 +344,7 @@ def test_megatron_gpt_command(self, mock_generate_dataset): self.assertEqual(actual_units, expected_units) def test_deepseek_v2_command(self): + """Test v2 command.""" # test deepspeed with megatron os.environ['OMPI_COMM_WORLD_SIZE'] = '1' os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1' @@ -408,7 +411,7 @@ def test_deepseek_v2_command(self): ) benchmark._preprocess() - benchmark._data_options = f'\ + benchmark._data_options = '\ --mock-data \ --dataloader-type cyclic \ --data-cache-path /root/cache \