@@ -174,6 +174,115 @@ def test_megatron_gpt_dataset(self):
174174 ret = benchmark ._generate_dataset ()
175175 assert (ret is True )
176176
@mock.patch('superbench.benchmarks.model_benchmarks.megatron_gpt3.run_command')
@mock.patch('superbench.benchmarks.model_benchmarks.megatron_gpt3.download_file')
def test_megatron_gpt_dataset_generate_command(self, mock_download_file, mock_run_command):
    """Verify _generate_dataset clamps --workers to >=1 and derives --output-prefix from data_prefix.

    Both ``run_command`` and ``download_file`` are mocked, so no external process is
    launched and nothing is downloaded. Five scenarios are driven through
    ``_preprocess()``: two valid ones whose generated command line is inspected, and
    three invalid ones that must fail with ``ReturnCode.INVALID_ARGUMENT`` before
    ``run_command`` is ever called.
    """
    (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
    assert (benchmark_cls)

    # Scope the launcher environment to this test: patch.dict restores os.environ when
    # the cleanup runs, so these values do not leak into tests executed afterwards.
    env_patcher = mock.patch.dict(
        os.environ,
        {
            'OMPI_COMM_WORLD_SIZE': '1',
            'OMPI_COMM_WORLD_LOCAL_SIZE': '1',
            'OMPI_COMM_WORLD_RANK': '0',
            'MASTER_ADDR': 'localhost',
            'MASTER_PORT': '12345',
        },
    )
    env_patcher.start()
    self.addCleanup(env_patcher.stop)

    # Use a real, valid code_base so _preprocess() can validate it (avoid hardcoded /root path).
    # Clean up after this test so the alphabetically-later test_megatron_gpt_preprocess
    # (which expects pretrain_gpt.py to NOT exist initially) is not affected by leaked state.
    self.createMockFiles(['pretrain_gpt.py'])
    pretrain_path = Path(self._tmp_dir) / 'pretrain_gpt.py'

    # Helper: make run_command's side_effect create the expected .bin/.idx files
    # so _generate_dataset() (invoked from within _preprocess()) succeeds.
    created_files = []

    def _make_dataset_files(prefix):
        """Return a run_command side effect that touches '<prefix>.bin' and '<prefix>.idx' in the tmp dir."""
        def _side_effect(*_args, **_kwargs):
            for ext in ('.bin', '.idx'):
                p = Path(self._tmp_dir) / f'{prefix}{ext}'
                p.touch()
                # Remember every touched path so the cleanup hook can remove it.
                created_files.append(p)

        return _side_effect

    def _cleanup_created_files():
        """Delete the dataset files touched by the side effect plus the mock pretrain_gpt.py."""
        for p in created_files + [pretrain_path]:
            if p.is_file():
                p.unlink()

    self.addCleanup(_cleanup_created_files)

    def _build_benchmark(extra_params):
        """Instantiate the benchmark with the shared base parameters followed by extra_params."""
        return benchmark_cls(
            self.benchmark_name,
            parameters=(
                f'--code_base {self._tmp_dir} --data_home {self._tmp_dir} '
                '--batch_size 2048 --dataset_url http://example.com/data.json '
                f'{extra_params}'
            ),
        )

    def _run_case(extra_params, expected_workers, expected_prefix_basename, expected_data_prefix):
        """Run a valid scenario and check --workers / --output-prefix in the first run_command call."""
        mock_run_command.reset_mock()
        mock_run_command.side_effect = _make_dataset_files(expected_data_prefix)
        benchmark = _build_benchmark(extra_params)
        assert benchmark._preprocess() is True
        assert mock_run_command.call_count >= 1
        # Use tuple indexing instead of `.args` for Python 3.7 compatibility
        # (mock.call.args was added in Python 3.8).
        cmd = mock_run_command.call_args_list[0][0][0]
        units = normalize_command(cmd)
        assert f'--workers {expected_workers}' in units, units
        expected_output_prefix = os.path.join(self._tmp_dir, expected_prefix_basename)
        assert f'--output-prefix {expected_output_prefix}' in units, units

    def _run_invalid_case(extra_params, expected_downloads):
        """Assert _preprocess() fails fast with INVALID_ARGUMENT and no run_command call.

        expected_downloads is the number of download_file calls before validation fails:
        negative num_workers is rejected before any download (0), while an invalid
        data_prefix is rejected only after the vocab + merges downloads (2).
        """
        mock_run_command.reset_mock()
        mock_run_command.side_effect = None
        mock_download_file.reset_mock()
        benchmark = _build_benchmark(extra_params)
        assert benchmark._preprocess() is False
        assert mock_run_command.call_count == 0
        assert mock_download_file.call_count == expected_downloads
        assert benchmark.return_code == ReturnCode.INVALID_ARGUMENT

    # Case 1: num_workers=0 with default data_prefix should produce '--workers 1' (clamped)
    # and '--output-prefix <data_home>/dataset' (default 'dataset_text_document' suffix stripped).
    _run_case(
        extra_params='--num_workers 0',
        expected_workers=1,
        expected_prefix_basename='dataset',
        expected_data_prefix='dataset_text_document',
    )

    # Case 2: num_workers=4 with custom data_prefix='custom_text_document' should produce
    # '--workers 4' and '--output-prefix <data_home>/custom'.
    _run_case(
        extra_params='--num_workers 4 --data_prefix custom_text_document',
        expected_workers=4,
        expected_prefix_basename='custom',
        expected_data_prefix='custom_text_document',
    )

    # Case 3: data_prefix without the '_text_document' suffix is invalid for generation
    # because preprocess_data.py would produce 'mydata_text_document.bin/.idx' but the
    # existence check looks for 'mydata.bin/.idx'. _preprocess() must fail fast (after the
    # vocab + merges downloads).
    _run_invalid_case(extra_params='--num_workers 2 --data_prefix mydata', expected_downloads=2)

    # Case 4: data_prefix == '_text_document' has an empty stem after stripping the suffix,
    # which would produce a malformed '--output-prefix <data_home>/'. Must fail fast.
    _run_invalid_case(extra_params='--num_workers 1 --data_prefix _text_document', expected_downloads=2)

    # Case 5: negative num_workers is invalid input and is rejected before any downloads.
    _run_invalid_case(extra_params='--num_workers -1 --data_prefix negative_text_document', expected_downloads=0)
285+
177286 @mock .patch ('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset' )
178287 def test_megatron_gpt_command (self , mock_generate_dataset ):
179288 """Test command generation."""
0 commit comments