diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py index 5c3350e95..379832b30 100644 --- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py +++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py @@ -41,25 +41,7 @@ def __init__(self, name, parameters=''): def add_parser_arguments(self): """Add the specified arguments.""" super().add_parser_arguments() - self._parser.add_argument('--code_base', type=str, required=False, default='', help='Code base.') - self._parser.add_argument('--dataset_url', type=str, required=False, default=None, help='Dataset URL.') - self._parser.add_argument( - '--vocab_url', - type=str, - required=False, - default='https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json', - help='Vocab URL.' - ) - self._parser.add_argument( - '--merges_url', - type=str, - required=False, - default='https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt', - help='Merges URL.' - ) - self._parser.add_argument( - '--tokenizer_type', type=str, required=False, default='GPT2BPETokenizer', help='Tokenizer type.' - ) + # Model configs self._parser.add_argument('--model_size', type=int, required=False, default=6.7, help='Model size.') self._parser.add_argument('--num_layers', type=int, required=False, default=32, help='Number of layers.') self._parser.add_argument('--hidden_size', type=int, required=False, default=4096, help='Hidden size.') @@ -102,6 +84,8 @@ def add_parser_arguments(self): self._parser.add_argument( '--train_tokens', type=int, required=False, default=300000000000, help='Train tokens.' 
) + self._parser.add_argument('--lr_decay_samples', type=int, default=43945312, help='Use lr decay samples.') + self._parser.add_argument('--prescale_grad', action='store_true', help='Prescale grad.') # lr configs # Parallelism configs self._parser.add_argument('--zero_stage', type=int, default=1, help='Zero stage.') @@ -119,14 +103,133 @@ def add_parser_arguments(self): self._parser.add_argument( '--split', type=str, default='949,50,1', help='Split dataset ratio for train/val/test.' ) - self._parser.add_argument('--prescale_grad', action='store_true', help='Prescale grad.') + self._parser.add_argument('--dataset_url', type=str, required=False, default=None, help='Dataset URL.') self._parser.add_argument( - '--hostfile', type=str, default=None, help='Hostfile to run the mutli-node benchmark.' + '--vocab_url', + type=str, + required=False, + default='https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json', + help='Vocab URL.' + ) + self._parser.add_argument( + '--merges_url', + type=str, + required=False, + default='https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt', + help='Merges URL.' 
) self._parser.add_argument('--data_impl', type=str, default='mmap', help='Data impl.') self._parser.add_argument('--data_prefix', type=str, default='dataset_text_document', help='Data prefix.') + self._parser.add_argument('--mock_data', action='store_true', help='Use mock data.') + self._parser.add_argument( + '--dataloader_type', + type=str, + default=None, + help='Data loader type to load data.', + ) + self._parser.add_argument( + '--max_padding_length', + type=int, + default=None, + help='Max padding legth to embedding.', + ) + self._parser.add_argument( + '--data_cache_path', + type=str, + default=None, + help='Data cache path.', + ) + self._parser.add_argument( + '--dataset', + type=str, + default=None, + help='Dataset to use.', + ) + # Model architecture + self._parser.add_argument('--ffn_hidden_size', type=int, help='FFN hidden layer size.') + self._parser.add_argument('--swiglu', action='store_true', help='Enable SwiGLU activation.') + self._parser.add_argument('--no_bias_swiglu_fusion', action='store_true', help='Disable bias SwiGLU fusion.') + self._parser.add_argument('--disable_bias_linear', action='store_true', help='Disable bias in linear layers.') + self._parser.add_argument('--normalization', type=str, help='Normalization method.') + self._parser.add_argument('--norm_epsilon', type=float, help='Normalization epsilon.') + self._parser.add_argument( + '--untie_embeddings_and_output_weights', action='store_true', help='Untie embeddings and output weights.' 
+ ) + self._parser.add_argument('--extra_vocab_size', type=int, help='Extra vocabulary size.') + self._parser.add_argument('--transformer_impl', type=str, default=None, help='Transformer implementation.') + # Loss settings + self._parser.add_argument('--eod_mask_loss', action='store_true', help='Enable EOD mask loss.') + self._parser.add_argument('--hysteresis', type=int, default=2, help='Hysteresis for loss scale.') + # Optimizer + self._parser.add_argument( + '--optimizer', + type=str, + default='adam', + help='Optimizer to use. Current supported: "adam" and "fused_adam".', + ) + self._parser.add_argument( + '--override_opt_param_scheduler', action='store_true', help='Enable the opt_param scheduler.' + ) + # LoRA settings + self._parser.add_argument('--kv_lora_rank', type=int, help='KV LoRA rank.') + # MoE configuration + self._parser.add_argument( + '--expert_model_parallel_size', + type=int, + default=None, + help='Expert model parallel size.', + ) + self._parser.add_argument( + '--num_experts', + type=int, + default=None, + help='Number of experts.', + ) + self._parser.add_argument('--moe_ffn_hidden_size', type=int, help='MoE FFN hidden size.') + self._parser.add_argument('--enable_shared_expert', action='store_true', help='Enable shared expert in MoE.') + self._parser.add_argument('--moe_layer_freq', type=int, help='MoE layer frequency.') + self._parser.add_argument('--num_shared_experts', type=int, help='Number of shared experts.') + self._parser.add_argument('--moe_router_topk', type=int, help='Top-k routing for MoE.') + self._parser.add_argument('--moe_aux_loss_coeff', type=float, help='Auxiliary loss coefficient.') + self._parser.add_argument( + '--moe_router_load_balancing_type', type=str, help='Load balancing type for MoE router.' + ) + # Tokenizer & Position Encoding + self._parser.add_argument( + '--tokenizer_type', type=str, required=False, default='GPT2BPETokenizer', help='Tokenizer type.' 
+ ) + self._parser.add_argument('--patch_tokenizer_type', type=str, help='Tokenizer type.') + self._parser.add_argument('--position_embedding_type', type=str, help='Position embedding type.') + self._parser.add_argument('--no_rope_fusion', action='store_true', help='Disable RoPE fusion.') + self._parser.add_argument('--rotary_base', type=int, help='Rotary base value.') + self._parser.add_argument('--rotary_scaling_factor', type=int, help='Rotary scaling factor.') + self._parser.add_argument('--qk_nope_head_dim', type=int, help='QK NoPE head dimension.') + self._parser.add_argument('--qk_rope_head_dim', type=int, help='QK RoPE head dimension.') + self._parser.add_argument('--v_head_dim', type=int, help='V head dimension.') + # Checkpoint and loading + self._parser.add_argument('--load', type=str, help='Model to load.') + self._parser.add_argument('--no_load_optim', action='store_true', help='Disable optimizer loading.') + self._parser.add_argument('--no_load_rng', action='store_true', help='Disable RNG loading.') + self._parser.add_argument('--ckpt_format', type=str, help='Checkpoint format.') + # Other settings + self._parser.add_argument('--code_base', type=str, required=False, default='', help='Code base.') + self._parser.add_argument( + '--hostfile', type=str, default=None, help='Hostfile to run the mutli-node benchmark.' + ) self._parser.add_argument('--deepspeed', action='store_true', help='Use deepspeed.') self._parser.add_argument('--extra', type=str, default=None, help='Extra options for Megatron.') + self._parser.add_argument( + '--model', + type=str, + default='gpt', + help='Model to run. Current supported: "gpt" and "deepseek".', + ) + self._parser.add_argument( + '--train_mode', + type=str, + default=None, + help='Train mode to run. 
Current supported: "pretrain" and "finetune".', + ) def _preprocess(self): if not super()._preprocess(): @@ -139,8 +242,9 @@ def _preprocess(self): else: self._args.code_base = os.path.join(os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-LM') - if not os.path.exists(self._args.code_base) or \ - not os.path.exists(os.path.join(self._args.code_base, 'pretrain_gpt.py')): + if not os.path.exists(self._args.code_base) or not os.path.exists( + os.path.join(self._args.code_base, f'pretrain_{self._args.model}.py') + ): logger.error('Code base is not valid.') self._result.set_return_code(ReturnCode.INVALID_ARGUMENT) return False @@ -207,10 +311,11 @@ def __prepare_deespeed_config(self, precision_megatron): 'enabled': True, 'loss_scale': 0, 'loss_scale_window': 500, - 'hysteresis': 2, 'min_loss_scale': 1, 'initial_scale_power': 11 } + if self._args.hysteresis is not None: + precision_template['hysteresis'] = self._args.hysteresis ds_config_template = { 'train_batch_size': self._args.batch_size, @@ -242,6 +347,129 @@ def __prepare_deespeed_config(self, precision_megatron): deepspeed_options = f'{deepspeed_options} --no-pipeline-parallel' return deepspeed_options + def _append_parallel_flags(self, opts): + if self._args.sequence_parallel: + opts += ' --sequence-parallel' + if self._args.no_async_tensor_model_parallel_allreduce: + opts += ' --no-async-tensor-model-parallel-allreduce' + if self._args.pipeline_model_parallel_size > 1: + opts += f' --pipeline-model-parallel-size {self._args.pipeline_model_parallel_size}' + + return opts + + def _append_architecture_flags(self, opts): + if self._args.swiglu: + opts += ' --swiglu' + if self._args.no_bias_swiglu_fusion: + opts += ' --no-bias-swiglu-fusion' + if self._args.disable_bias_linear: + opts += ' --disable-bias-linear' + if self._args.normalization: + opts += f' --normalization {self._args.normalization}' + if self._args.norm_epsilon: + opts += f' --norm-epsilon {self._args.norm_epsilon}' + if 
self._args.untie_embeddings_and_output_weights: + opts += ' --untie-embeddings-and-output-weights' + if self._args.transformer_impl: + opts += f' --transformer-impl {self._args.transformer_impl}' + if self._args.extra_vocab_size: + opts += f' --extra-vocab-size {self._args.extra_vocab_size}' + if self._args.ffn_hidden_size: + opts += f' --ffn-hidden-size {self._args.ffn_hidden_size}' + return opts + + def _append_moe_flags(self, opts): + if self._args.moe_ffn_hidden_size: + opts += f' --moe-ffn-hidden-size {self._args.moe_ffn_hidden_size}' + if self._args.enable_shared_expert: + opts += ' --enable-shared-expert' + if self._args.moe_layer_freq: + opts += f' --moe-layer-freq {self._args.moe_layer_freq}' + if self._args.num_shared_experts: + opts += f' --num-shared-experts {self._args.num_shared_experts}' + if self._args.moe_router_topk: + opts += f' --moe-router-topk {self._args.moe_router_topk}' + if self._args.moe_aux_loss_coeff is not None: # forward an explicit 0.0 as well + opts += f' --moe-aux-loss-coeff {self._args.moe_aux_loss_coeff}' + if self._args.moe_router_load_balancing_type: + opts += f' --moe-router-load-balancing-type {self._args.moe_router_load_balancing_type}' + if self._args.expert_model_parallel_size: + opts += f' --expert-model-parallel-size {self._args.expert_model_parallel_size}' + if self._args.num_experts: + opts += f' --num-experts {self._args.num_experts}' + return opts + + def _append_optimizer_flags(self, opts): + if self._args.optimizer: + opts += f' --optimizer {self._args.optimizer}' + if getattr(self._args, 'override_opt_param_scheduler', False): # opt-in: never on by default + opts += ' --override-opt_param-scheduler' + if self._args.hysteresis is not None: + opts += f' --hysteresis {self._args.hysteresis}' + return opts + + def _append_checkpoint_flags(self, opts): + if self._args.load: + opts += f' --load {self._args.load}' + if self._args.no_load_optim: + opts += ' --no-load-optim' + if self._args.no_load_rng: + opts += ' --no-load-rng' + if self._args.ckpt_format: + opts += f' --ckpt-format 
{self._args.ckpt_format}' + return opts + + def _append_tokenizer_flags(self, opts): + args = self._args + + # map of arg-attribute → flag string + flag_map = { + 'tokenizer_type': '--tokenizer-type', + 'patch_tokenizer_type': '--patch-tokenizer-type', + 'position_embedding_type': '--position-embedding-type', + 'rotary_base': '--rotary-base', + 'rotary_scaling_factor': '--rotary-scaling-factor', + 'qk_nope_head_dim': '--qk-nope-head-dim', + 'qk_rope_head_dim': '--qk-rope-head-dim', + 'v_head_dim': '--v-head-dim', + 'kv_lora_rank': '--kv-lora-rank', + 'no_rope_fusion': '--no-rope-fusion', + } + + for attr, flag in flag_map.items(): + val = getattr(args, attr, None) + if not val: + continue + + # boolean flags get no value + if isinstance(val, bool): + opts += f' {flag}' + else: + opts += f' {flag} {val}' + + return opts + + def _append_misc_flags(self, opts): + if self._args.eod_mask_loss: + opts += ' --eod-mask-loss' + if self._args.use_rotary_position_embeddings: + opts += ' --use-rotary-position-embeddings' + if self._args.no_gradient_accumulation_fusion: + opts += ' --no-gradient-accumulation-fusion' + if self._args.use_flash_attn: + opts += ' --use-flash-attn' + if self._args.no_masked_softmax_fusion: + opts += ' --no-masked-softmax-fusion' + if self._args.no_bias_gelu_fusion: + opts += ' --no-bias-gelu-fusion' + if self._args.no_bias_dropout_fusion: + opts += ' --no-bias-dropout-fusion' + if self._args.train_mode: + opts += f' --train-mode {self._args.train_mode}' + if self._args.max_padding_length: + opts += f' --max-padding-length {self._args.max_padding_length}' + return opts + def _megatron_command(self, precision): # noqa: C901 """Generate megatron command.""" if precision == Precision.FLOAT32: @@ -252,12 +480,11 @@ def _megatron_command(self, precision): # noqa: C901 precision_megatron = '--bf16' megatron_options = f'\ - --override-opt_param-scheduler \ --adam-beta1 0.9 \ --adam-beta2 0.95 \ --tensor-model-parallel-size 
{self._args.tensor_model_parallel_size} \ --init-method-std {self._args.init_std} \ - --lr-decay-samples 43945312 \ + --lr-decay-samples {self._args.lr_decay_samples} \ --lr-warmup-samples {self._args.num_warmup * self._args.batch_size} \ --lr-decay-style cosine \ --micro-batch-size {self._args.micro_batch_size} \ @@ -270,54 +497,41 @@ def _megatron_command(self, precision): # noqa: C901 --train-samples {self._args.num_steps * self._args.batch_size} \ --lr {self._args.lr} \ --min-lr {self._args.min_lr} \ - --split {self._args.split} \ --log-interval {self._args.log_interval} \ --eval-interval {self._args.eval_interval} \ --eval-iters {self._args.eval_iters} \ --save-interval {self._args.save_interval} \ --weight-decay 0.1 \ --clip-grad 1.0 \ - --hysteresis 2 \ --num-workers {self._args.num_workers} \ --attention-dropout 0.0 \ --hidden-dropout 0.0 \ - --optimizer adam \ --use-distributed-optimizer \ {precision_megatron} \ --seed {self._args.seed} \ --log-throughput' - if self._args.sequence_parallel: - megatron_options = f'{megatron_options} --sequence-parallel' - if self._args.no_async_tensor_model_parallel_allreduce: - megatron_options = f'{megatron_options} --no-async-tensor-model-parallel-allreduce' - if self._args.use_rotary_position_embeddings: - megatron_options = f'{megatron_options} --use-rotary-position-embeddings' - if self._args.no_gradient_accumulation_fusion: - megatron_options = f'{megatron_options} --no-gradient-accumulation-fusion' - if self._args.use_flash_attn: - megatron_options = f'{megatron_options} --use-flash-attn' - if self._args.no_masked_softmax_fusion: - megatron_options = f'{megatron_options} --no-masked-softmax-fusion' - if self._args.no_bias_gelu_fusion: - megatron_options = f'{megatron_options} --no-bias-gelu-fusion' - if self._args.no_bias_dropout_fusion: - megatron_options = f'{megatron_options} --no-bias-dropout-fusion' - if self._args.extra: - megatron_options = f'{megatron_options} {self._args.extra}' + megatron_options = 
self._append_parallel_flags(megatron_options) + megatron_options = self._append_architecture_flags(megatron_options) + megatron_options = self._append_moe_flags(megatron_options) + megatron_options = self._append_optimizer_flags(megatron_options) + megatron_options = self._append_checkpoint_flags(megatron_options) + megatron_options = self._append_tokenizer_flags(megatron_options) + megatron_options = self._append_misc_flags(megatron_options) + # Keep forwarding --extra, placed after all flags generated by the helpers above. + if self._args.extra: + megatron_options = f'{megatron_options} {self._args.extra}' - command = '' - script_path = os.path.join(self._args.code_base, 'pretrain_gpt.py') + script_path = os.path.join(self._args.code_base, f'pretrain_{self._args.model}.py') if self._args.deepspeed: deepspeed_option = self.__prepare_deespeed_config(precision_megatron.lstrip('--')) - # No --log-throughput in Megatron-DeepSpeed by 20231219 megatron_options = megatron_options.replace('--log-throughput', '').strip() if self._num_nodes > 1: - command = f'torchrun {self._distributed_args} ' + \ - f'{script_path} {megatron_options} {self._data_options} {deepspeed_option}' + command = f'torchrun {self._distributed_args} {script_path} \ + {megatron_options} {self._data_options} {deepspeed_option}' + else: command = f'deepspeed {script_path} {megatron_options} {self._data_options} {deepspeed_option}' - else: command = f'torchrun {self._distributed_args} {script_path} {megatron_options} {self._data_options}' @@ -425,48 +639,61 @@ def _generate_dataset(self): Return: True if dataset is created successfully. 
""" - self._vocab_path = str(Path(self._args.data_home) / 'gpt2-vocab.json') - download_file(self._args.vocab_url, self._vocab_path) - self._merges_path = str(Path(self._args.data_home) / 'gpt2-merges.txt') - download_file(self._args.merges_url, self._merges_path) - - if not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.bin')) \ - or not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.idx')): - if self._args.dataset_url: - self._raw_data_path = str(Path(self._args.data_home) / 'data.json') - download_file(self._args.dataset_url, self._raw_data_path) - command = ( - 'python3 ' - f'{os.path.join(self._args.code_base, "tools/preprocess_data.py")} ' - f'--input {self._raw_data_path} ' - f'--tokenizer-type {self._args.tokenizer_type} ' - f'--output-prefix {os.path.join(self._args.data_home, "dataset")} ' - f'--workers {str(self._args.num_workers)} ' - f'--vocab-file {self._vocab_path} ' - f'--merge-file {self._merges_path}' - ) - - # split documents - run_command(command, flush_output=True) - # binarize dataset - run_command(command, flush_output=True) - if not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.bin')) \ - or not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.idx')): - logger.error('Dataset failed to generate.') + self._data_options = '' + if self._args.mock_data: + logger.info('Using mock data.') + self._data_options = '--mock-data' + else: + self._vocab_path = str(Path(self._args.data_home) / 'gpt2-vocab.json') + download_file(self._args.vocab_url, self._vocab_path) + self._merges_path = str(Path(self._args.data_home) / 'gpt2-merges.txt') + download_file(self._args.merges_url, self._merges_path) + + if not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.bin')) \ + or not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.idx')): + if self._args.dataset_url: + self._raw_data_path = 
str(Path(self._args.data_home) / 'data.json') + download_file(self._args.dataset_url, self._raw_data_path) + command = ( + 'python3 ' + f'{os.path.join(self._args.code_base, "tools/preprocess_data.py")} ' + f'--input {self._raw_data_path} ' + f'--tokenizer-type {self._args.tokenizer_type} ' + f'--output-prefix {os.path.join(self._args.data_home, "dataset")} ' + f'--workers {str(self._args.num_workers)} ' + f'--vocab-file {self._vocab_path} ' + f'--merge-file {self._merges_path}' + ) + + # split documents + run_command(command, flush_output=True) + # binarize dataset + run_command(command, flush_output=True) + if not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.bin')) \ + or not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.idx')): + logger.error('Dataset failed to generate.') + self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE) + return False + else: + logger.error('No dataset or dataset url provided.') self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE) return False - else: - logger.error('No dataset or dataset url provided.') - self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE) - return False - self._data_path = os.path.join(self._args.data_home, f'{self._args.data_prefix}') - self._data_options = f'\ - --vocab-file {self._vocab_path} \ - --merge-file {self._merges_path} \ - --data-path {self._data_path}' + self._data_path = os.path.join(self._args.data_home, f'{self._args.data_prefix}') + self._data_options = f'\ + --vocab-file {self._vocab_path} \ + --merge-file {self._merges_path} \ + --data-path {self._data_path}' + + if self._args.dataloader_type: + self._data_options += f' --dataloader-type {self._args.dataloader_type}' + if self._args.split: + self._data_options += f' --split {self._args.split}' + if self._args.data_cache_path: + self._data_options += f' --data-cache-path {self._args.data_cache_path}' + if self._args.dataset: + 
self._data_options += f' --dataset {self._args.dataset}' - logger.info('Dataset preparation successfully.') return True def _set_force_fp32(self): @@ -521,3 +745,54 @@ def _cal_params_count(self): # Register GPT3 benchmark. BenchmarkRegistry.register_benchmark('megatron-gpt', MegatronGPT, parameters='', platform=Platform.CUDA) BenchmarkRegistry.register_benchmark('megatron-gpt', MegatronGPT, parameters='', platform=Platform.ROCM) +BenchmarkRegistry.register_benchmark( + 'megatron-deepseek-v2', MegatronGPT, parameters='--model=deepseek', platform=Platform.ROCM +) +BenchmarkRegistry.register_benchmark( + 'megatron-deepseek-v2', + MegatronGPT, + parameters=( + '--model=deepseek ' + '--tokenizer_type=DeepSeekV2Tokenizer ' + '--transformer_impl=transformer_engine ' + '--num_layers=27 ' + '--hidden_size=1024 ' + '--seq_len=4096 ' + '--num_attn_heads=16 ' + '--moe_ffn_hidden_size=1408 ' + '--enable_shared_expert ' + '--moe_layer_freq=1 ' + '--num_shared_experts=2 ' + '--moe_router_topk=6 ' + '--moe_aux_loss_coeff=1e-2 ' + '--moe_router_load_balancing_type=aux_loss ' + '--num_experts=64 ' + '--patch_tokenizer_type=DeepSeekV2Tokenizer ' + '--position_embedding_type=rope ' + '--no_rope_fusion ' + '--rotary_base=10000 ' + '--rotary_scaling_factor=40 ' + '--qk_nope_head_dim=128 ' + '--qk_rope_head_dim=64 ' + '--v_head_dim=128 ' + '--ffn_hidden_size=10944 ' + '--swiglu ' + '--normalization=RMSNorm ' + '--norm_epsilon=1e-06 ' + '--no_bias_swiglu_fusion ' + '--disable_bias_linear ' + '--untie_embeddings_and_output_weights ' + '--extra_vocab_size=2400 ' + '--load=deepseek-ai/DeepSeek-V2-Lite ' + '--no_load_optim ' + '--no_load_rng ' + '--ckpt_format=torch ' + '--eod_mask_loss ' + '--train_mode=pretrain ' + '--data_cache_path=/root/cache ' + '--max_padding_length=4096 ' + '--kv_lora_rank=512 ' + '--dataloader_type=cyclic' + ), + platform=Platform.ROCM +) diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py 
index 2f1d076f5..b7c588677 100644 --- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py +++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py @@ -5,6 +5,7 @@ import os from pathlib import Path +import shlex import statistics from unittest import mock import unittest @@ -15,6 +16,26 @@ from tests.helper.testcase import BenchmarkTestCase +def normalize_command(cmd): + """Convert a CLI string into a list of meaningful argument units (key-value or flag).""" + tokens = shlex.split(cmd) + units = [] + i = 0 + while i < len(tokens): + if tokens[i].startswith('--'): + if i + 1 >= len(tokens) or tokens[i + 1].startswith('--'): + units.append(tokens[i]) # flag-only + i += 1 + else: + units.append(f'{tokens[i]} {tokens[i + 1]}') # key-value pair + i += 2 + else: + # Include positional args like torchrun, script path, etc. + units.append(tokens[i]) + i += 1 + return sorted(units) + + class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase): """Tests for IBBenchmark benchmark.""" @classmethod @@ -170,17 +191,20 @@ def test_megatron_gpt_command(self, mock_generate_dataset): benchmark = benchmark_cls( self.benchmark_name, parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \ - --num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document', + --num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document \ + --override_opt_param_scheduler', ) mock_generate_dataset.return_value = True benchmark._preprocess() benchmark._data_options = f'\ --vocab-file {self._tmp_dir}/gpt2-vocab.json \ --merge-file {self._tmp_dir}/gpt2-merges.txt \ - --data-path {self._tmp_dir}/dataset_text_document' + --data-path {self._tmp_dir}/dataset_text_document \ + --split 949,50,1' script_path = str(Path(self._tmp_dir) / 'pretrain_gpt.py') - expected_command = 'torchrun {distributed_args} {script_path} \ + expected_command_template = 'torchrun {distributed_args} {script_path} \ + --tokenizer-type GPT2BPETokenizer \ 
--override-opt_param-scheduler \ --adam-beta1 0.9 \ --adam-beta2 0.95 \ @@ -199,7 +223,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset): --train-samples 20480 \ --lr 0.00012 \ --min-lr 1e-06 \ - --split 949,50,1 \ --log-interval 1 \ --eval-interval 10 \ --eval-iters 0 \ @@ -217,54 +240,58 @@ def test_megatron_gpt_command(self, mock_generate_dataset): --log-throughput {data_options}' precision = Precision.FLOAT32 - command = benchmark._megatron_command(precision) - self.assertEqual( - command, - expected_command.format( - precision='', - data_options=benchmark._data_options, - distributed_args=benchmark._distributed_args, - script_path=script_path - ) + expected_command = expected_command_template.format( + precision='', + data_options=benchmark._data_options, + distributed_args=benchmark._distributed_args, + script_path=script_path ) - precision = Precision.FLOAT16 command = benchmark._megatron_command(precision) - self.assertEqual( - command, - expected_command.format( - precision='--fp16', - data_options=benchmark._data_options, - distributed_args=benchmark._distributed_args, - script_path=script_path - ) + actual_units = normalize_command(command) + expected_units = normalize_command(expected_command) + self.assertEqual(actual_units, expected_units) + + precision = Precision.FLOAT16 + expected_command = expected_command_template.format( + precision='--fp16', + data_options=benchmark._data_options, + distributed_args=benchmark._distributed_args, + script_path=script_path ) - precision = Precision.BFLOAT16 command = benchmark._megatron_command(precision) - self.assertEqual( - command, - expected_command.format( - precision='--bf16', - data_options=benchmark._data_options, - distributed_args=benchmark._distributed_args, - script_path=script_path - ) + actual_units = normalize_command(command) + expected_units = normalize_command(expected_command) + self.assertEqual(actual_units, expected_units) + + precision = Precision.BFLOAT16 + expected_command = 
expected_command_template.format( + precision='--bf16', + data_options=benchmark._data_options, + distributed_args=benchmark._distributed_args, + script_path=script_path ) + command = benchmark._megatron_command(precision) + actual_units = normalize_command(command) + expected_units = normalize_command(expected_command) + self.assertEqual(actual_units, expected_units) os.environ['OMPI_COMM_WORLD_SIZE'] = '1' benchmark = benchmark_cls( self.benchmark_name, parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \ - --num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document --deepspeed', + --num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document \ + --deepspeed --override_opt_param_scheduler', ) - mock_generate_dataset.return_value = True benchmark._preprocess() benchmark._data_options = f'\ --vocab-file {self._tmp_dir}/gpt2-vocab.json \ --merge-file {self._tmp_dir}/gpt2-merges.txt \ - --data-path {self._tmp_dir}/dataset_text_document' + --data-path {self._tmp_dir}/dataset_text_document \ + --split 949,50,1' command = benchmark._megatron_command(Precision.BFLOAT16) expected_command = 'deepspeed {script_path} --override-opt_param-scheduler \ + --tokenizer-type GPT2BPETokenizer \ --adam-beta1 0.9 \ --adam-beta2 0.95 \ --tensor-model-parallel-size 1 \ @@ -282,7 +309,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset): --train-samples 20480 \ --lr 0.00012 \ --min-lr 1e-06 \ - --split 949,50,1 \ --log-interval 1 \ --eval-interval 10 \ --eval-iters 0 \ @@ -306,15 +332,173 @@ def test_megatron_gpt_command(self, mock_generate_dataset): --train-tokens 300000000000 \ --data-impl mmap --no-pipeline-parallel' - self.assertEqual( - command, - expected_command.format( - precision='--bf16', - data_options=benchmark._data_options, - script_path=script_path, - deepseed_options=expect_ds_options - ) + expected_command = expected_command.format( + precision='--bf16', + data_options=benchmark._data_options, 
+ deepseed_options=expect_ds_options, + script_path=script_path ) + command = benchmark._megatron_command(Precision.BFLOAT16) + actual_units = normalize_command(command) + expected_units = normalize_command(expected_command) + self.assertEqual(actual_units, expected_units) + + def test_deepseek_v2_command(self): + """Test megatron deepseek-v2 command generation.""" + # test the torchrun command built for the deepseek model with mock data + os.environ['OMPI_COMM_WORLD_SIZE'] = '1' + os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1' + os.environ['OMPI_COMM_WORLD_RANK'] = '0' + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '12345' + with open(self.hostfile_path, 'w') as f: + f.write('host1\n') + + benchmark_name = 'megatron-deepseek-v2' + (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.ROCM) + assert (benchmark_cls) + benchmark = benchmark_cls( + benchmark_name, + parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} ' + '--num_warmup 0 ' + '--num_steps 10 ' + '--batch_size 256 ' + '--expert_model_parallel_size 8 ' + '--micro_batch_size 2 ' + '--mock_data ' + '--model=deepseek ' + '--tokenizer_type=DeepSeekV2Tokenizer ' + '--transformer_impl=transformer_engine ' + '--num_layers=27 ' + '--hidden_size=1024 ' + '--seq_len=4096 ' + '--ffn_hidden_size=10944 ' + '--num_attn_heads=16 ' + '--moe_ffn_hidden_size=1408 ' + '--enable_shared_expert ' + '--moe_layer_freq=1 ' + '--num_shared_experts=2 ' + '--moe_router_topk=6 ' + '--moe_aux_loss_coeff=0.01 ' + '--moe_router_load_balancing_type=aux_loss ' + '--num_experts=64 ' + '--patch_tokenizer_type=DeepSeekV2Tokenizer ' + '--position_embedding_type=rope ' + '--no_rope_fusion ' + '--rotary_base=10000 ' + '--rotary_scaling_factor=40 ' + '--qk_nope_head_dim=128 ' + '--qk_rope_head_dim=64 ' + '--v_head_dim=128 ' + '--ffn_hidden_size=10944 ' + '--swiglu ' + '--normalization=RMSNorm ' + '--norm_epsilon=1e-06 ' + '--no_bias_swiglu_fusion ' + '--disable_bias_linear ' + '--untie_embeddings_and_output_weights ' 
+ '--extra_vocab_size=2400 ' + '--load=deepseek-ai/DeepSeek-V2-Lite ' + '--no_load_optim ' + '--no_load_rng ' + '--ckpt_format=torch ' + '--eod_mask_loss ' + '--train_mode=pretrain ' + '--data_cache_path=/root/cache ' + '--max_padding_length=4096 ' + '--kv_lora_rank=512 ' + '--dataloader_type=cyclic ' + ) + + benchmark._preprocess() + benchmark._data_options = '\ + --mock-data \ + --dataloader-type cyclic \ + --data-cache-path /root/cache \ + --dataset LLama-Pretrain-Idxmap' + + precision = Precision.BFLOAT16 + command = benchmark._megatron_command(precision) + + expected_command = ( + 'torchrun {script_path} --bf16 \ + --init-method-std 0.009 \ + --adam-beta1 0.9 \ + --hidden-dropout 0.0 \ + --min-lr 1e-06 \ + --lr 0.00012 \ + --optimizer adam \ + --log-interval 1 \ + --eval-interval 10 \ + --seed 1234 \ + --eval-iters 0 \ + --max-position-embeddings 4096 \ + --hysteresis 2 \ + --lr-decay-style cosine \ + --lr-decay-samples 43945312 \ + --clip-grad 1.0 \ + --save-interval 10000 \ + --adam-beta2 0.95 \ + --moe-aux-loss-coeff 0.01 \ + --log-throughput \ + --num-workers 8 \ + --use-distributed-optimizer \ + --attention-dropout 0.0 \ + --tensor-model-parallel-size 1 \ + --lr-warmup-samples 0 \ + --weight-decay 0.1 \ + --train-samples 2560 \ + --no-load-optim \ + --load deepseek-ai/DeepSeek-V2-Lite \ + --no-load-rng \ + --ffn-hidden-size 10944 \ + --patch-tokenizer-type DeepSeekV2Tokenizer \ + --swiglu \ + --normalization RMSNorm \ + --norm-epsilon 1e-06 \ + --no-bias-swiglu-fusion \ + --no-rope-fusion \ + --position-embedding-type rope \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --ckpt-format torch \ + --rotary-base 10000 \ + --rotary-scaling-factor 40 \ + --eod-mask-loss \ + --moe-ffn-hidden-size 1408 \ + --enable-shared-expert \ + --moe-layer-freq 1 \ + --num-shared-experts 2 \ + --moe-router-topk 6 \ + --kv-lora-rank 512 \ + --qk-nope-head-dim 128 \ + --qk-rope-head-dim 64 \ + --v-head-dim 128 \ + --moe-router-load-balancing-type 
aux_loss \ + --train-mode pretrain \ + --extra-vocab-size 2400 \ + --global-batch-size 256 \ + --micro-batch-size 2 \ + --num-layers 27 \ + --hidden-size 1024 \ + --seq-length 4096 \ + --num-attention-heads 16 \ + --tokenizer-type DeepSeekV2Tokenizer \ + --transformer-impl transformer_engine \ + --num-experts 64 \ + --expert-model-parallel-size 8 \ + --max-padding-length 4096 \ + {data_options} \ + {disitributed_args}' + ).format( + script_path=str(Path(self._tmp_dir) / 'pretrain_deepseek.py'), + data_options=benchmark._data_options, + disitributed_args=benchmark._distributed_args + ) + actual_units = normalize_command(command) + expected_units = normalize_command(expected_command) + + self.assertEqual(actual_units, expected_units) @decorator.load_data('tests/data/megatron_deepspeed.log') @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset') diff --git a/third_party/Makefile b/third_party/Makefile index a8360bb85..667a46a47 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -16,13 +16,13 @@ ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0") NUM_MAKE_JOBS ?= $(shell nproc --ignore=2) -.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm nvbandwidth +.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm nvbandwidth rocm_megatron_lm # Build targets. 
all: cuda rocm cuda_with_msccl: cuda cuda_msccl cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth -rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm +rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm_megatron_lm cpu: common cpu_perftest common: fio cpu_stream @@ -230,6 +230,18 @@ megatron_deepspeed: python -m pip install --no-cache-dir -r requirements.txt && \ python -m pip install DeepSpeed +rocm_megatron_lm: + cd Megatron && mkdir -p rocm && cd rocm && \ + if [ ! -d "Megatron-LM" ]; then \ + git clone -b rocm_dev https://github.com/ROCm/Megatron-LM.git ; \ + fi + cp Megatron/rocm/Megatron-LM/examples/deepseek_v2/pretrain_deepseek.py Megatron/rocm/Megatron-LM/ + git clone https://github.com/caaatch22/grouped_gemm.git &&\ + cd grouped_gemm &&\ + git checkout 8a9b438 &&\ + git submodule update --init --recursive &&\ + pip install . + # Instal apex of ROCm due to dependency of Megatron apex_rocm: $(eval TORCH_VERSION ?= $(shell python -c "import torch; print(torch.__version__)"))