huggingface · NouamaneTazi · Apr 14, 2025 · Apr 14, 2025 · Apr 16, 2025 · Apr 16, 2025
diff --git a/examples/config_qwen.py b/examples/config_qwen.py
@@ -30,7 +30,7 @@
     "410m": (24, 1024, 16, 16, 4096),  # ~410M params
     # Small to medium models
     "1b": (16, 2048, 16, 16, 5632),  # ~1B params
-    "3b": (28, 2048, 16, 2, 11008),  # ~3B params
+    "3b": (36, 2048, 16, 4, 11008),  # ~3B params
     # Standard sizes
     "7b": (32, 4096, 32, 32, 11008),  # ~7B params
     "13b": (40, 5120, 40, 40, 13824),  # ~13B params
@@ -47,7 +47,7 @@ def get_args():
     parser.add_argument(
         "--model",
         choices=MODEL_SIZES.keys(),
-        default="custom",
+        default="3b",
         help="Model size to generate config for (e.g., 7b, 13b)",
     )
     parser.add_argument(
@@ -76,6 +76,10 @@ def get_args():
     tokens_group.add_argument("--mbs", type=int, default=3, help="Micro batch size")
     tokens_group.add_argument("--acc", type=int, default=1, help="Batch accumulation per replica")
 
+    # checkpoints
+    checkpoints_group = parser.add_argument_group("checkpoints")
+    checkpoints_group.add_argument("--ckpt-save", type=int, default=10, help="Checkpoint save interval")
+
     args = parser.parse_args()
     return args
 
@@ -108,7 +112,7 @@ def get_model_config(model_size: str) -> Qwen2Config:
         is_qwen2_config=True,
         pad_token_id=None,
         _attn_implementation="flash_attention_2",
-        # sliding_window_size=20,
+        _use_doc_masking=True,
     )
 
 
@@ -154,7 +158,7 @@ def calculate_parameters(model_config: Qwen2Config) -> str:
 
 def create_config(model_config: Qwen2Config, args: argparse.Namespace) -> Config:
     learning_rate = LRSchedulerArgs(
-        learning_rate=3e-4, lr_warmup_steps=2, lr_warmup_style="linear", lr_decay_style="cosine", min_decay_lr=1e-5
+        learning_rate=3e-4, lr_warmup_steps=2000, lr_warmup_style="linear", lr_decay_style="cosine", min_decay_lr=0
     )
     parallelism = ParallelismArgs(
         dp=args.dp,
@@ -175,7 +179,7 @@ def create_config(model_config: Qwen2Config, args: argparse.Namespace) -> Config
     )
     optimizer = OptimizerArgs(
         zero_stage=args.zero,
-        weight_decay=0.01,
+        weight_decay=0.1,
         clip_grad=1.0,
         accumulate_grad_in_fp32=True,
         learning_rate_scheduler=learning_rate,
@@ -192,7 +196,7 @@ def create_config(model_config: Qwen2Config, args: argparse.Namespace) -> Config
 
     return Config(
         general=GeneralArgs(project="debug", run=args.run, seed=seed, ignore_sanity_checks=args.no_sanity),
-        checkpoints=CheckpointsArgs(checkpoints_path=checkpoints_path, checkpoint_interval=10),
+        checkpoints=CheckpointsArgs(checkpoints_path=checkpoints_path, checkpoint_interval=args.ckpt_save),
         parallelism=parallelism,
         model=ModelArgs(init_method=RandomInit(std=0.025), model_config=model_config),
         # tokenizer=TokenizerArgs("HuggingFaceTB/cosmo2-tokenizer"),
@@ -219,7 +223,11 @@ def create_config(model_config: Qwen2Config, args: argparse.Namespace) -> Config
     world_size = args.dp * args.tp * args.pp * args.cp
     if world_size <= 8:
         print(
-            f"CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node={world_size} run_train.py --config-file {args.out}"
+            f"ENABLE_TIMERS=1 DEBUG_CPU=1 STATS_SAMPLING_INTERVAL_IN_SEC=1 CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node={world_size} run_train.py --config-file {args.out}"
         )
+        print("You can also use environment variables for more debugging:")
+        print("  - ENABLE_TIMERS=1: Enable detailed timing information")
+        print("  - DEBUG_CPU=1: Log CPU and memory usage statistics")
+        print("  - STATS_SAMPLING_INTERVAL_IN_SEC=1: Set sampling interval for metrics collection")
     else:
         print("Checkout slurm_launcher.py to launch a multi-node job")
diff --git a/examples/config_qwen.yaml b/examples/config_qwen.yaml
@@ -1,5 +1,5 @@
 checkpoints:
-  checkpoint_interval: 10
+  checkpoint_interval: 100000
   checkpoints_path: checkpoints
   checkpoints_path_is_shared_file_system: false
   load_lr_scheduler: true
@@ -30,9 +30,9 @@ data_stages:
 general:
   benchmark_csv_path: null
   consumed_train_samples: null
-  ignore_sanity_checks: false
+  ignore_sanity_checks: true
   project: debug
-  run: qwen_20250423_201000_16423158
+  run: qwen_20250424_120835_16423158
   seed: 42
   step: null
 lighteval: null
@@ -50,24 +50,24 @@ model:
   make_vocab_size_divisible_by: 1
   model_config:
     _attn_implementation: flash_attention_2
-    _fused_rms_norm: false
-    _fused_rotary_emb: false
-    _use_doc_masking: false
-    _use_qkv_packed: false
+    _fused_rms_norm: true
+    _fused_rotary_emb: true
+    _use_doc_masking: true
+    _use_qkv_packed: true
     attention_bias: false
     bos_token_id: 1
     eos_token_id: 2
     flex_attention_mask: null
     hidden_act: silu
-    hidden_size: 256
+    hidden_size: 2048
     initializer_range: 0.02
-    intermediate_size: 768
+    intermediate_size: 11008
     is_qwen2_config: true
     max_position_embeddings: 4096
     moe_config: null
     no_rope_layer: null
-    num_attention_heads: 4
-    num_hidden_layers: 12
+    num_attention_heads: 16
+    num_hidden_layers: 36
     num_key_value_heads: 4
     pad_token_id: null
     pretraining_tp: 1
@@ -108,7 +108,7 @@ parallelism:
   pp: 1
   pp_engine: 1f1b
   recompute_layer: false
-  tp: 1
+  tp: 2
   tp_linear_async_communication: true
   tp_mode: REDUCE_SCATTER
   tp_recompute_allgather: true

diff --git a/run_generate.py b/run_generate.py
@@ -5,6 +5,14 @@
 ```
 export CUDA_DEVICE_MAX_CONNECTIONS=1 # important for some distributed operations
 torchrun --nproc_per_node=1 run_generate.py --ckpt-path checkpoints/10
+torchrun --nproc_per_node=2 run_generate.py --ckpt-path /scratch/1044000
+
+torchrun --rdzv_endpoint=127.0.0.1:12357 --nproc_per_node=2 run_generate.py --ckpt-path /scratch/1044000 --use-cache --max-micro-batch-size 2
+export CUDA_VISIBLE_DEVICES=2,3
+torchrun --rdzv_endpoint=127.0.0.1:12356 --nproc_per_node=2 run_generate.py --ckpt-path /scratch/1044000 --use-cache
+export CUDA_VISIBLE_DEVICES=4,5
+torchrun --rdzv_endpoint=127.0.0.1:12355 --nproc_per_node=2 run_generate.py --ckpt-path /scratch/1044000 --max-micro-batch-size 2 --use-decode-tokenized
+torchrun --rdzv_endpoint=127.0.0.1:12355 --nproc_per_node=2 run_generate.py --ckpt-path /scratch/1044000 --use-decode-tokenized
 ```
 """
 
@@ -45,10 +53,7 @@
 from nanotron.serialize import load_weights
 from nanotron.trainer import CONFIG_TO_MODEL_CLASS, mark_tied_parameters
 
-try:
-    from transformers import AutoTokenizer
-except ImportError:
-    AutoTokenizer = None
+from transformers import AutoTokenizer
 
 # import lovely_tensors as lt
 
@@ -65,6 +70,10 @@ def get_args():
     parser.add_argument("--tp", type=int, default=0)
     parser.add_argument("--max-new-tokens", type=int, default=128, help="Maximum number of new tokens to generate")
     parser.add_argument("--use-cache", action="store_true", help="Use KV cache to speed up generation")
+    parser.add_argument(
+        "--max-micro-batch-size", type=int, default=1, help="Maximum number of micro batches to generate"
+    )
+    parser.add_argument("--use-decode-tokenized", action="store_true", help="Use decode_tokenized to generate text")
     return parser.parse_args()
 
 
@@ -73,6 +82,17 @@ def main():
 
     assert args.ckpt_path.exists(), f"Checkpoint path {args.ckpt_path} does not exist"
 
+    dummy_inputs = [
+        # "The future of AI is",
+        # "Passage: Daniel went back to the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?\nAnswer:",
+        "def fib(n)",
+        # 'Here is an extract from a webpage: "Have you ever experienced heel pain after a heavy physical activity, or even right after a long period of standing? If you regard this as something usual and normal, then think again. Miscalled as heel pain, plantar fasciitis causes these frequent mild pains experienced in the soles of the feet. It is the inflammation and enlargement the plantar fascia tissue that is located in the heels of the feet, stretching to the base of the toes. This tissue is responsible for absorbing shock in the feet and for supporting the arches. It also plays a vital role in foot movements during walking and standing. Many factors such as excessive walking, standing, and running trigger heel pain and plantar fasciitis. A sudden increase in intensity of activities, increase in weight, and abrupt change of footwear also cause the swelling of the ligament. Non-supportive footwear lacking arch cushions and improper and worn out running or training can also lead to the problem. It is also most evident among those". Write an extensive and detailed course unit suitable for a textbook targeted at college students, related to the given extract, within the context of "Medicine". Do not just list concepts, but develop each one in detail before moving to the next, as we prioritize depth of understanding and comprehensive exploration of the subject matter over breadth. Focus on: - Rigor: Ensure in-depth coverage of the concepts/sections. - Engagement: Write with an academic, professional and engaging tone that captivates interest. - Application: Incorporate specific, practical examples, such as proofs in calculus or critical dates and figures in history. Do not include a title or an introduction, simply write the content without headlines and introductory phrases. Do not use images.',
+        # "Advancements in technology will lead to",
+        # "Tomorrow's world is shaped by",
+        # "What is the meaning of the word chutzpah?\nThe word chutzpah means",
+    ]
+
+
     config = get_config_from_file((args.ckpt_path / "config.yaml").as_posix())
     model_config = config.model.model_config
     tokenizer_path = config.tokenizer.tokenizer_name_or_path
@@ -154,36 +174,29 @@ def main():
     load_weights(model=model, parallel_context=parallel_context, root_folder=checkpoint_path)
 
     model.eval()
-    if AutoTokenizer is not None:
-        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
-        # tokenizer.pad_token_id = tokenizer.eos_token_id
-        if tokenizer.pad_token_id is None:
-            if tokenizer.eos_token_id is not None:
-                tokenizer.pad_token_id = tokenizer.eos_token_id
-            elif getattr(model.config, "pad_token_id", None) is not None:
-                tokenizer.pad_token_id = int(model.config.pad_token_id)
-            elif getattr(model.config, "eos_token_id", None) is not None:
-                tokenizer.pad_token_id = int(model.config.eos_token_id)
-            else:
-                tokenizer.add_special_tokens({"pad_token": "[PAD]"})
-        tokenizer.padding_side = "left"
-        tokenizer.truncation_side = "left"  # TODO @nouamane: do we want this?
-        dummy_inputs = [
-            # "The future of AI is",
-            "Passage: Daniel went back to the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?\nAnswer:",
-            "def fib(n)",
-            # 'Here is an extract from a webpage: "Have you ever experienced heel pain after a heavy physical activity, or even right after a long period of standing? If you regard this as something usual and normal, then think again. Miscalled as heel pain, plantar fasciitis causes these frequent mild pains experienced in the soles of the feet. It is the inflammation and enlargement the plantar fascia tissue that is located in the heels of the feet, stretching to the base of the toes. This tissue is responsible for absorbing shock in the feet and for supporting the arches. It also plays a vital role in foot movements during walking and standing. Many factors such as excessive walking, standing, and running trigger heel pain and plantar fasciitis. A sudden increase in intensity of activities, increase in weight, and abrupt change of footwear also cause the swelling of the ligament. Non-supportive footwear lacking arch cushions and improper and worn out running or training can also lead to the problem. It is also most evident among those". Write an extensive and detailed course unit suitable for a textbook targeted at college students, related to the given extract, within the context of "Medicine". Do not just list concepts, but develop each one in detail before moving to the next, as we prioritize depth of understanding and comprehensive exploration of the subject matter over breadth. Focus on: - Rigor: Ensure in-depth coverage of the concepts/sections. - Engagement: Write with an academic, professional and engaging tone that captivates interest. - Application: Incorporate specific, practical examples, such as proofs in calculus or critical dates and figures in history. Do not include a title or an introduction, simply write the content without headlines and introductory phrases. Do not use images.',
-            # "Advancements in technology will lead to",
-            # "Tomorrow's world is shaped by",
-        ]
 
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+    # tokenizer.pad_token_id = tokenizer.eos_token_id
+    if tokenizer.pad_token_id is None:
+        if tokenizer.eos_token_id is not None:
+            tokenizer.pad_token_id = tokenizer.eos_token_id
+        elif getattr(model.config, "pad_token_id", None) is not None:
+            tokenizer.pad_token_id = int(model.config.pad_token_id)
+        elif getattr(model.config, "eos_token_id", None) is not None:
+            tokenizer.pad_token_id = int(model.config.eos_token_id)
+        else:
+            tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+    tokenizer.padding_side = "left"
+    tokenizer.truncation_side = "left"  # TODO @nouamane: do we want this?
+
+    if not args.use_decode_tokenized:    
         outputs = decode_text(
             input_iter=(GenerationInput(text=text) for text in dummy_inputs),
             tokenizer=tokenizer,
             model=model.model,
             parallel_context=parallel_context,
             max_new_tokens=args.max_new_tokens,
-            max_micro_batch_size=2,
+            max_micro_batch_size=args.max_micro_batch_size,
             generation_config=GenerationArgs(sampler="greedy", use_cache=args.use_cache),
             tokenizer_config=TokenizerConfig(max_input_length=None),
             is_bench=os.environ.get("USE_BENCH", "0") == "1",
@@ -217,15 +230,27 @@ def main():
                 rank=0,
             )
     else:
+        # Tokenize dummy inputs
+        tokenized_inputs = tokenizer(
+            dummy_inputs,
+            padding=True,
+            truncation=True,
+            return_tensors="pt",
+            add_special_tokens=False, # TODO: this is important to avoid adding bos token to the input
+        )
+        input_ids = tokenized_inputs["input_ids"].to(device="cuda")
+        attention_mask = tokenized_inputs["attention_mask"].to(device="cuda")
+
         outputs = decode_tokenized(
-            input_ids=torch.zeros(1, 1).to(dtype=torch.int64, device="cuda"),
-            input_mask=torch.ones(1, 1).to(dtype=torch.bool, device="cuda"),
+            input_ids=input_ids,
+            input_mask=attention_mask,
             model=model.model,
             parallel_context=parallel_context,
-            generation_config=GenerationArgs(sampler="greedy", use_cache=True),
-            max_micro_batch_size=1,
-            max_new_tokens=12,
+            generation_config=GenerationArgs(sampler="greedy", use_cache=args.use_cache),
+            max_micro_batch_size=args.max_micro_batch_size,
+            max_new_tokens=args.max_new_tokens,
             returns_logits=False,
+            bos_token_id=tokenizer.bos_token_id,
         )
         for output in outputs:
             input_ids = output.input_ids
@@ -234,8 +259,16 @@ def main():
                 assert isinstance(generated_ids, TensorPointer)
                 continue
             assert isinstance(generated_ids, torch.Tensor)
+
+            log_rank(
+                f"input: {tokenizer.decode(input_ids, clean_up_tokenization_spaces=False)[:1000]}",
+                logger=logger,
+                level=logging.INFO,
+                rank=0,
+            )
+
             log_rank(
-                f"generation: {generated_ids[len(input_ids) :]}",
+                f"generation: {tokenizer.decode(generated_ids[len(input_ids):], clean_up_tokenization_spaces=False)}",
                 logger=logger,
                 level=logging.INFO,
                 rank=0,

diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py
@@ -632,7 +632,7 @@ def get_config_from_dict(
                 InitScalingMethod: lambda x: InitScalingMethod[x.upper()],
                 SamplerType: lambda x: SamplerType[x.upper()],
             },
-            # strict_unions_match=True,
+            strict_unions_match=True,
             strict=True,
         ),
     )

diff --git a/src/nanotron/config/lighteval_config.py b/src/nanotron/config/lighteval_config.py
@@ -79,11 +79,11 @@ class LightEvalSlurm:
 
     gpus_per_node: int = 8
     partition: str = "hopper-prod"
-    hf_cache: str = "~/.cache/huggingface"
+    hf_cache: Optional[str] = None
     cpus_per_task: int = 88
     qos: str = "low"
     time: str = "24:00:00"
-    reservation: Optional[str] = "smollm"
+    reservation: Optional[str] = None
 
     def __post_init__(self):
         self.hf_cache = str(Path(self.hf_cache).expanduser())
@@ -109,11 +109,15 @@ class LightEvalConfig:
     logging: Optional[LightEvalLoggingArgs] = None
     wandb: Optional[LightEvalWandbLoggerConfig] = None
     slurm: Optional[LightEvalSlurm] = None
-    s3_save_path: Optional[str] = None # should not be dependent of the run_name
-    output_dir: Optional[str] = None # we should sanity check that it's the same as the one in the eval_config_override
+    s3_save_path: Optional[str] = None  # should not depend on the run_name
+    upload_to_wandb: Optional[bool] = False
+    wandb_project: Optional[str] = None
+    wandb_entity: Optional[str] = None
+    output_dir: Optional[
+        str
+    ] = None  # we should sanity check that it's the same as the one in the file at lighteval_config_path (formerly eval_config_override)
     nanotron_path: Optional[str] = "./"
-    eval_config_override: str = None
-    eval_config_override: Path = None  # Previously hardcoded in run_slurm_one_job
+    lighteval_config_path: Path = None  # Previously hardcoded in run_slurm_one_job
     eval_interval: Optional[
         int
     ] = None  # Must be multiple of checkpoint_interval. If None, eval will be done after each checkpoint upload to s3
@@ -127,6 +131,12 @@ def __post_init__(self):
         if self.slurm is None:
             self.slurm = LightEvalSlurm()
         self.local_checkpoint_dir = str(Path(self.local_checkpoint_dir).expanduser())
+        if self.upload_to_wandb:
+            assert (
+                self.s3_save_path is not None
+            ), " We should have a s3_save_path if we want to upload to wandb"  # todo: add the option to read from local folder i guess
+            assert self.wandb_project is not None, "wandb_project must be specified if upload_to_wandb is True"
+            assert self.wandb_entity is not None, "wandb_entity must be specified if upload_to_wandb is True"
         if self.eval_interval_file is not None and Path(self.eval_interval_file).exists():
             logger.warning(
                 f"Eval interval file {self.eval_interval_file} exists. `eval_interval` will be replaced by the value in the file upon the next evaluation. You should probably delete this file if that's not what you want."

diff --git a/src/nanotron/config/models_config.py b/src/nanotron/config/models_config.py
@@ -279,4 +279,4 @@ def n_inner(self):
         return self.intermediate_size
 
 
-NanotronConfigs = Union[LlamaConfig, Starcoder2Config, Qwen2Config, Any]
+NanotronConfigs = Union[LlamaConfig, Starcoder2Config, Qwen2Config]
diff --git a/src/nanotron/config/parallelism_config.py b/src/nanotron/config/parallelism_config.py
@@ -35,6 +35,8 @@ class ParallelismArgs:
     recompute_layer: bool = False
     tp_recompute_allgather: bool = True
 
+    moe_layer_recompute: bool = False  # TODO: legacy config for smollm
+
     expert_parallel_size: int = 1
     context_parallel_size: int = 1
diff --git a/src/nanotron/config/models_config.py b/src/nanotron/config/models_config.py
@@ -279,4 +279,4 @@ def n_inner(self):
         return self.intermediate_size
 
 
-NanotronConfigs = Union[LlamaConfig, Starcoder2Config, Qwen2Config, Any]
+NanotronConfigs = Union[LlamaConfig, Starcoder2Config, Qwen2Config]