Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
d4e9daf
InitScalingMethod
NouamaneTazi Apr 14, 2025
6e7f0fa
InitScalingMethod
NouamaneTazi Apr 14, 2025
24d07e5
eval
NouamaneTazi Apr 16, 2025
438257a
try adding lightevalrunner to trainer
NouamaneTazi Apr 16, 2025
4f8a350
amend
NouamaneTazi Apr 16, 2025
c9c479d
amend
NouamaneTazi Apr 16, 2025
190a6b9
amend
NouamaneTazi Apr 17, 2025
004a89c
amend
NouamaneTazi Apr 17, 2025
b4cbb55
amend
NouamaneTazi Apr 17, 2025
d39872b
amend
NouamaneTazi Apr 17, 2025
feb818a
.
NouamaneTazi Apr 17, 2025
025f314
amend
NouamaneTazi Apr 17, 2025
abe75af
amend
NouamaneTazi Apr 17, 2025
bd50c66
.
NouamaneTazi Apr 17, 2025
2227432
qos to low
eliebak Apr 17, 2025
b62cacd
add nanotron_path
eliebak Apr 17, 2025
802fad6
some fix: logs, and config
eliebak Apr 17, 2025
895354a
cp instead of sync
eliebak Apr 17, 2025
55a5d3e
eval_interval
NouamaneTazi Apr 17, 2025
298492e
serialize sanity checks
NouamaneTazi Apr 17, 2025
4219ec8
add output dir and s3_save path in the config
eliebak Apr 17, 2025
f1780ec
add output dir and s3_save path in the config
eliebak Apr 17, 2025
016760e
fix s3 only if define
eliebak Apr 17, 2025
85138ca
fixes
NouamaneTazi Apr 17, 2025
0390de2
Merge branch 'nouamane/lighteval' of https://github.com/huggingface/n…
NouamaneTazi Apr 17, 2025
fefb560
add requeue
eliebak Apr 17, 2025
4558036
add wandb with lighteval and fix eval interval
eliebak Apr 18, 2025
17b5284
Merge branch 'nouamane/lighteval' of github.com:huggingface/nanotron …
eliebak Apr 18, 2025
b5ea942
fix this little space :(
eliebak Apr 20, 2025
561ca6b
folder_path should always have s3 when using s3 (fix consumed tokens …
NouamaneTazi Apr 23, 2025
dc6edaa
Merge branch 'dev' of https://github.com/huggingface/nanotron into no…
NouamaneTazi Apr 24, 2025
7724cf1
config qwen
NouamaneTazi Apr 24, 2025
46949b6
.
NouamaneTazi Apr 24, 2025
1078404
fix makefile, sync with datatrove, update lighteval config (#363)
hynky1999 May 8, 2025
bb33a52
Nouamane/lighteval fix (#360)
NouamaneTazi Jun 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 15 additions & 7 deletions examples/config_qwen.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
"410m": (24, 1024, 16, 16, 4096), # ~410M params
# Small to medium models
"1b": (16, 2048, 16, 16, 5632), # ~1B params
"3b": (28, 2048, 16, 2, 11008), # ~3B params
"3b": (36, 2048, 16, 4, 11008), # ~3B params
# Standard sizes
"7b": (32, 4096, 32, 32, 11008), # ~7B params
"13b": (40, 5120, 40, 40, 13824), # ~13B params
Expand All @@ -47,7 +47,7 @@ def get_args():
parser.add_argument(
"--model",
choices=MODEL_SIZES.keys(),
default="custom",
default="3b",
help="Model size to generate config for (e.g., 7b, 13b)",
)
parser.add_argument(
Expand Down Expand Up @@ -76,6 +76,10 @@ def get_args():
tokens_group.add_argument("--mbs", type=int, default=3, help="Micro batch size")
tokens_group.add_argument("--acc", type=int, default=1, help="Batch accumulation per replica")

# checkpoints
checkpoints_group = parser.add_argument_group("checkpoints")
checkpoints_group.add_argument("--ckpt-save", type=int, default=10, help="Checkpoint save interval")

args = parser.parse_args()
return args

Expand Down Expand Up @@ -108,7 +112,7 @@ def get_model_config(model_size: str) -> Qwen2Config:
is_qwen2_config=True,
pad_token_id=None,
_attn_implementation="flash_attention_2",
# sliding_window_size=20,
_use_doc_masking=True,
)


Expand Down Expand Up @@ -154,7 +158,7 @@ def calculate_parameters(model_config: Qwen2Config) -> str:

def create_config(model_config: Qwen2Config, args: argparse.Namespace) -> Config:
learning_rate = LRSchedulerArgs(
learning_rate=3e-4, lr_warmup_steps=2, lr_warmup_style="linear", lr_decay_style="cosine", min_decay_lr=1e-5
learning_rate=3e-4, lr_warmup_steps=2000, lr_warmup_style="linear", lr_decay_style="cosine", min_decay_lr=0
)
parallelism = ParallelismArgs(
dp=args.dp,
Expand All @@ -175,7 +179,7 @@ def create_config(model_config: Qwen2Config, args: argparse.Namespace) -> Config
)
optimizer = OptimizerArgs(
zero_stage=args.zero,
weight_decay=0.01,
weight_decay=0.1,
clip_grad=1.0,
accumulate_grad_in_fp32=True,
learning_rate_scheduler=learning_rate,
Expand All @@ -192,7 +196,7 @@ def create_config(model_config: Qwen2Config, args: argparse.Namespace) -> Config

return Config(
general=GeneralArgs(project="debug", run=args.run, seed=seed, ignore_sanity_checks=args.no_sanity),
checkpoints=CheckpointsArgs(checkpoints_path=checkpoints_path, checkpoint_interval=10),
checkpoints=CheckpointsArgs(checkpoints_path=checkpoints_path, checkpoint_interval=args.ckpt_save),
parallelism=parallelism,
model=ModelArgs(init_method=RandomInit(std=0.025), model_config=model_config),
# tokenizer=TokenizerArgs("HuggingFaceTB/cosmo2-tokenizer"),
Expand All @@ -219,7 +223,11 @@ def create_config(model_config: Qwen2Config, args: argparse.Namespace) -> Config
world_size = args.dp * args.tp * args.pp * args.cp
if world_size <= 8:
print(
f"CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node={world_size} run_train.py --config-file {args.out}"
f"ENABLE_TIMERS=1 DEBUG_CPU=1 STATS_SAMPLING_INTERVAL_IN_SEC=1 CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node={world_size} run_train.py --config-file {args.out}"
)
print("You can also use environment variables for more debugging:")
print(" - ENABLE_TIMERS=1: Enable detailed timing information")
print(" - DEBUG_CPU=1: Log CPU and memory usage statistics")
print(" - STATS_SAMPLING_INTERVAL_IN_SEC=1: Set sampling interval for metrics collection")
else:
print("Checkout slurm_launcher.py to launch a multi-node job")
24 changes: 12 additions & 12 deletions examples/config_qwen.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
checkpoints:
checkpoint_interval: 10
checkpoint_interval: 100000
checkpoints_path: checkpoints
checkpoints_path_is_shared_file_system: false
load_lr_scheduler: true
Expand Down Expand Up @@ -30,9 +30,9 @@ data_stages:
general:
benchmark_csv_path: null
consumed_train_samples: null
ignore_sanity_checks: false
ignore_sanity_checks: true
project: debug
run: qwen_20250423_201000_16423158
run: qwen_20250424_120835_16423158
seed: 42
step: null
lighteval: null
Expand All @@ -50,24 +50,24 @@ model:
make_vocab_size_divisible_by: 1
model_config:
_attn_implementation: flash_attention_2
_fused_rms_norm: false
_fused_rotary_emb: false
_use_doc_masking: false
_use_qkv_packed: false
_fused_rms_norm: true
_fused_rotary_emb: true
_use_doc_masking: true
_use_qkv_packed: true
attention_bias: false
bos_token_id: 1
eos_token_id: 2
flex_attention_mask: null
hidden_act: silu
hidden_size: 256
hidden_size: 2048
initializer_range: 0.02
intermediate_size: 768
intermediate_size: 11008
is_qwen2_config: true
max_position_embeddings: 4096
moe_config: null
no_rope_layer: null
num_attention_heads: 4
num_hidden_layers: 12
num_attention_heads: 16
num_hidden_layers: 36
num_key_value_heads: 4
pad_token_id: null
pretraining_tp: 1
Expand Down Expand Up @@ -108,7 +108,7 @@ parallelism:
pp: 1
pp_engine: 1f1b
recompute_layer: false
tp: 1
tp: 2
tp_linear_async_communication: true
tp_mode: REDUCE_SCATTER
tp_recompute_allgather: true
Expand Down
99 changes: 66 additions & 33 deletions run_generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@
```
export CUDA_DEVICE_MAX_CONNECTIONS=1 # important for some distributed operations
torchrun --nproc_per_node=1 run_generate.py --ckpt-path checkpoints/10
torchrun --nproc_per_node=2 run_generate.py --ckpt-path /scratch/1044000

torchrun --rdzv_endpoint=127.0.0.1:12357 --nproc_per_node=2 run_generate.py --ckpt-path /scratch/1044000 --use-cache --max-micro-batch-size 2
export CUDA_VISIBLE_DEVICES=2,3
torchrun --rdzv_endpoint=127.0.0.1:12356 --nproc_per_node=2 run_generate.py --ckpt-path /scratch/1044000 --use-cache
export CUDA_VISIBLE_DEVICES=4,5
torchrun --rdzv_endpoint=127.0.0.1:12355 --nproc_per_node=2 run_generate.py --ckpt-path /scratch/1044000 --max-micro-batch-size 2 --use-decode-tokenized
torchrun --rdzv_endpoint=127.0.0.1:12355 --nproc_per_node=2 run_generate.py --ckpt-path /scratch/1044000 --use-decode-tokenized
```
"""

Expand Down Expand Up @@ -45,10 +53,7 @@
from nanotron.serialize import load_weights
from nanotron.trainer import CONFIG_TO_MODEL_CLASS, mark_tied_parameters

try:
from transformers import AutoTokenizer
except ImportError:
AutoTokenizer = None
from transformers import AutoTokenizer

# import lovely_tensors as lt

Expand All @@ -65,6 +70,10 @@ def get_args():
parser.add_argument("--tp", type=int, default=0)
parser.add_argument("--max-new-tokens", type=int, default=128, help="Maximum number of new tokens to generate")
parser.add_argument("--use-cache", action="store_true", help="Use KV cache to speed up generation")
parser.add_argument(
"--max-micro-batch-size", type=int, default=1, help="Maximum number of micro batches to generate"
)
parser.add_argument("--use-decode-tokenized", action="store_true", help="Use decode_tokenized to generate text")
return parser.parse_args()


Expand All @@ -73,6 +82,17 @@ def main():

assert args.ckpt_path.exists(), f"Checkpoint path {args.ckpt_path} does not exist"

dummy_inputs = [
# "The future of AI is",
# "Passage: Daniel went back to the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?\nAnswer:",
"def fib(n)",
# 'Here is an extract from a webpage: "Have you ever experienced heel pain after a heavy physical activity, or even right after a long period of standing? If you regard this as something usual and normal, then think again. Miscalled as heel pain, plantar fasciitis causes these frequent mild pains experienced in the soles of the feet. It is the inflammation and enlargement the plantar fascia tissue that is located in the heels of the feet, stretching to the base of the toes. This tissue is responsible for absorbing shock in the feet and for supporting the arches. It also plays a vital role in foot movements during walking and standing. Many factors such as excessive walking, standing, and running trigger heel pain and plantar fasciitis. A sudden increase in intensity of activities, increase in weight, and abrupt change of footwear also cause the swelling of the ligament. Non-supportive footwear lacking arch cushions and improper and worn out running or training can also lead to the problem. It is also most evident among those". Write an extensive and detailed course unit suitable for a textbook targeted at college students, related to the given extract, within the context of "Medicine". Do not just list concepts, but develop each one in detail before moving to the next, as we prioritize depth of understanding and comprehensive exploration of the subject matter over breadth. Focus on: - Rigor: Ensure in-depth coverage of the concepts/sections. - Engagement: Write with an academic, professional and engaging tone that captivates interest. - Application: Incorporate specific, practical examples, such as proofs in calculus or critical dates and figures in history. Do not include a title or an introduction, simply write the content without headlines and introductory phrases. Do not use images.',
# "Advancements in technology will lead to",
# "Tomorrow's world is shaped by",
# "What is the meaning of the word chutzpah?\nThe word chutzpah means",
]


config = get_config_from_file((args.ckpt_path / "config.yaml").as_posix())
model_config = config.model.model_config
tokenizer_path = config.tokenizer.tokenizer_name_or_path
Expand Down Expand Up @@ -154,36 +174,29 @@ def main():
load_weights(model=model, parallel_context=parallel_context, root_folder=checkpoint_path)

model.eval()
if AutoTokenizer is not None:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
# tokenizer.pad_token_id = tokenizer.eos_token_id
if tokenizer.pad_token_id is None:
if tokenizer.eos_token_id is not None:
tokenizer.pad_token_id = tokenizer.eos_token_id
elif getattr(model.config, "pad_token_id", None) is not None:
tokenizer.pad_token_id = int(model.config.pad_token_id)
elif getattr(model.config, "eos_token_id", None) is not None:
tokenizer.pad_token_id = int(model.config.eos_token_id)
else:
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left" # TODO @nouamane: do we want this?
dummy_inputs = [
# "The future of AI is",
"Passage: Daniel went back to the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?\nAnswer:",
"def fib(n)",
# 'Here is an extract from a webpage: "Have you ever experienced heel pain after a heavy physical activity, or even right after a long period of standing? If you regard this as something usual and normal, then think again. Miscalled as heel pain, plantar fasciitis causes these frequent mild pains experienced in the soles of the feet. It is the inflammation and enlargement the plantar fascia tissue that is located in the heels of the feet, stretching to the base of the toes. This tissue is responsible for absorbing shock in the feet and for supporting the arches. It also plays a vital role in foot movements during walking and standing. Many factors such as excessive walking, standing, and running trigger heel pain and plantar fasciitis. A sudden increase in intensity of activities, increase in weight, and abrupt change of footwear also cause the swelling of the ligament. Non-supportive footwear lacking arch cushions and improper and worn out running or training can also lead to the problem. It is also most evident among those". Write an extensive and detailed course unit suitable for a textbook targeted at college students, related to the given extract, within the context of "Medicine". Do not just list concepts, but develop each one in detail before moving to the next, as we prioritize depth of understanding and comprehensive exploration of the subject matter over breadth. Focus on: - Rigor: Ensure in-depth coverage of the concepts/sections. - Engagement: Write with an academic, professional and engaging tone that captivates interest. - Application: Incorporate specific, practical examples, such as proofs in calculus or critical dates and figures in history. Do not include a title or an introduction, simply write the content without headlines and introductory phrases. Do not use images.',
# "Advancements in technology will lead to",
# "Tomorrow's world is shaped by",
]

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
# tokenizer.pad_token_id = tokenizer.eos_token_id
if tokenizer.pad_token_id is None:
if tokenizer.eos_token_id is not None:
tokenizer.pad_token_id = tokenizer.eos_token_id
elif getattr(model.config, "pad_token_id", None) is not None:
tokenizer.pad_token_id = int(model.config.pad_token_id)
elif getattr(model.config, "eos_token_id", None) is not None:
tokenizer.pad_token_id = int(model.config.eos_token_id)
else:
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left" # TODO @nouamane: do we want this?

if not args.use_decode_tokenized:
outputs = decode_text(
input_iter=(GenerationInput(text=text) for text in dummy_inputs),
tokenizer=tokenizer,
model=model.model,
parallel_context=parallel_context,
max_new_tokens=args.max_new_tokens,
max_micro_batch_size=2,
max_micro_batch_size=args.max_micro_batch_size,
generation_config=GenerationArgs(sampler="greedy", use_cache=args.use_cache),
tokenizer_config=TokenizerConfig(max_input_length=None),
is_bench=os.environ.get("USE_BENCH", "0") == "1",
Expand Down Expand Up @@ -217,15 +230,27 @@ def main():
rank=0,
)
else:
# Tokenize dummy inputs
tokenized_inputs = tokenizer(
dummy_inputs,
padding=True,
truncation=True,
return_tensors="pt",
add_special_tokens=False, # NOTE: important, this avoids adding a BOS token to the input
)
input_ids = tokenized_inputs["input_ids"].to(device="cuda")
attention_mask = tokenized_inputs["attention_mask"].to(device="cuda")

outputs = decode_tokenized(
input_ids=torch.zeros(1, 1).to(dtype=torch.int64, device="cuda"),
input_mask=torch.ones(1, 1).to(dtype=torch.bool, device="cuda"),
input_ids=input_ids,
input_mask=attention_mask,
model=model.model,
parallel_context=parallel_context,
generation_config=GenerationArgs(sampler="greedy", use_cache=True),
max_micro_batch_size=1,
max_new_tokens=12,
generation_config=GenerationArgs(sampler="greedy", use_cache=args.use_cache),
max_micro_batch_size=args.max_micro_batch_size,
max_new_tokens=args.max_new_tokens,
returns_logits=False,
bos_token_id=tokenizer.bos_token_id,
)
for output in outputs:
input_ids = output.input_ids
Expand All @@ -234,8 +259,16 @@ def main():
assert isinstance(generated_ids, TensorPointer)
continue
assert isinstance(generated_ids, torch.Tensor)

log_rank(
f"input: {tokenizer.decode(input_ids, clean_up_tokenization_spaces=False)[:1000]}",
logger=logger,
level=logging.INFO,
rank=0,
)

log_rank(
f"generation: {generated_ids[len(input_ids) :]}",
f"generation: {tokenizer.decode(generated_ids[len(input_ids):], clean_up_tokenization_spaces=False)}",
logger=logger,
level=logging.INFO,
rank=0,
Expand Down
2 changes: 1 addition & 1 deletion src/nanotron/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -632,7 +632,7 @@ def get_config_from_dict(
InitScalingMethod: lambda x: InitScalingMethod[x.upper()],
SamplerType: lambda x: SamplerType[x.upper()],
},
# strict_unions_match=True,
strict_unions_match=True,
strict=True,
),
)
Expand Down
22 changes: 16 additions & 6 deletions src/nanotron/config/lighteval_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,11 @@ class LightEvalSlurm:

gpus_per_node: int = 8
partition: str = "hopper-prod"
hf_cache: str = "~/.cache/huggingface"
hf_cache: Optional[str] = None
cpus_per_task: int = 88
qos: str = "low"
time: str = "24:00:00"
reservation: Optional[str] = "smollm"
reservation: Optional[str] = None

def __post_init__(self):
self.hf_cache = str(Path(self.hf_cache).expanduser())
Expand All @@ -109,11 +109,15 @@ class LightEvalConfig:
logging: Optional[LightEvalLoggingArgs] = None
wandb: Optional[LightEvalWandbLoggerConfig] = None
slurm: Optional[LightEvalSlurm] = None
s3_save_path: Optional[str] = None # should not be dependent of the run_name
output_dir: Optional[str] = None # we should sanity check that it's the same as the one in the eval_config_override
s3_save_path: Optional[str] = None # should not depend on the run_name
upload_to_wandb: Optional[bool] = False
wandb_project: Optional[str] = None
wandb_entity: Optional[str] = None
output_dir: Optional[
str
] = None # we should sanity check that it's the same as the one in the eval_config_override
nanotron_path: Optional[str] = "./"
eval_config_override: str = None
eval_config_override: Path = None # Previously hardcoded in run_slurm_one_job
lighteval_config_path: Path = None # Previously hardcoded in run_slurm_one_job
eval_interval: Optional[
int
] = None # Must be a multiple of checkpoint_interval. If None, eval will be done after each checkpoint upload to s3
Expand All @@ -127,6 +131,12 @@ def __post_init__(self):
if self.slurm is None:
self.slurm = LightEvalSlurm()
self.local_checkpoint_dir = str(Path(self.local_checkpoint_dir).expanduser())
if self.upload_to_wandb:
assert (
self.s3_save_path is not None
), " We should have a s3_save_path if we want to upload to wandb" # TODO: add an option to read from a local folder
assert self.wandb_project is not None, "wandb_project must be specified if upload_to_wandb is True"
assert self.wandb_entity is not None, "wandb_entity must be specified if upload_to_wandb is True"
if self.eval_interval_file is not None and Path(self.eval_interval_file).exists():
logger.warning(
f"Eval interval file {self.eval_interval_file} exists. `eval_interval` will be replaced by the value in the file upon the next evaluation. You should probably delete this file if that's not what you want."
Expand Down
2 changes: 1 addition & 1 deletion src/nanotron/config/models_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,4 +279,4 @@ def n_inner(self):
return self.intermediate_size


NanotronConfigs = Union[LlamaConfig, Starcoder2Config, Qwen2Config, Any]
NanotronConfigs = Union[LlamaConfig, Starcoder2Config, Qwen2Config]
2 changes: 2 additions & 0 deletions src/nanotron/config/parallelism_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ class ParallelismArgs:
recompute_layer: bool = False
tp_recompute_allgather: bool = True

moe_layer_recompute: bool = False # TODO: legacy config for smollm

expert_parallel_size: int = 1
context_parallel_size: int = 1

Expand Down
Loading