Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 111 additions & 82 deletions examples/config_qwen.yaml
Original file line number Diff line number Diff line change
@@ -1,79 +1,112 @@
checkpoints:
checkpoint_interval: 100000
checkpoints_path: checkpoints
checkpoint_interval: 10
checkpoints_path: checkpoints/smollm3-test-tps-48nn-elie-config
checkpoints_path_is_shared_file_system: false
load_lr_scheduler: true
load_optimizer: true
resume_checkpoint_path: null
save_final_state: false
resume_checkpoint_path: s3://smollm3/pre-training-final/tests/smollm3-test-tps-48nn-elie-config
save_final_state: true
save_initial_state: false
data_stages:
- data:
# dataset: null
dataset:
dataset_folder:
# - /fsx/loubna/datasets/llama_tokenized/fineweb-edu/merged
# - /fsx/loubna/datasets/llama_tokenized/other_sources/dclm/
# - /fsx/loubna/datasets/llama_tokenized/pes2o/standard
- /fsx/loubna/datasets/llama_tokenized/fineweb-edu/merged
- /fsx/loubna/datasets/llama_tokenized/dclm_merged/
- /fsx/loubna/datasets/llama_tokenized/pes2o/standard
- /fsx/loubna/datasets/llama_tokenized/other_sources/wiki
# - /fsx/loubna/datasets/llama_tokenized/other_sources/fw2-hq-fra_Latn/
# - /fsx/loubna/datasets/llama_tokenized/other_sources/fw2-hq-spa_Latn/
# - /fsx/loubna/datasets/llama_tokenized/other_sources/fw2-hq-deu_Latn/
# - /fsx/loubna/datasets/llama_tokenized/fw2-hq-ita_Latn/standard
# - /fsx/loubna/datasets/llama_tokenized/other_sources/fw2-hq-por_Latn/
# - /fsx/loubna/datasets/llama_tokenized/fw2-hq-cmn_Hani/standard
# - /fsx/loubna/datasets/llama_tokenized/fw2-hq-rus_Cyrl/standard
# - /fsx/loubna/datasets/llama_tokenized/other_sources/fw2-hq-fas_Arab/
# - /fsx/loubna/datasets/llama_tokenized/other_sources/fw2-hq-jpn_Jpan/
# - /fsx/loubna/datasets/llama_tokenized/fw2-kor_Hang/standard
# - /fsx/loubna/datasets/llama_tokenized/other_sources/fw2-hin_Deva/
# - /fsx/loubna/datasets/llama_tokenized/other_sources/fw2-tha_Thai/
# - /fsx/loubna/datasets/llama_tokenized/other_sources/fw2-hq-vie_Latn/
# - /fsx/loubna/datasets/llama_tokenized/other_sources/fw2-hq-ell_Grek/
# - /fsx/loubna/datasets/llama_tokenized/other_sources/infiwebmath-3plus/
# - /fsx/loubna/datasets/llama_tokenized/other_sources/finemath-3plus/
# - /fsx/loubna/datasets/llama_tokenized/other_sources/stack-edu-Python/
# - /fsx/loubna/datasets/llama_tokenized/other_sources/stack-edu-Java/
# - /fsx/loubna/datasets/llama_tokenized/other_sources/stack-edu-JavaScript/
# - /fsx/loubna/datasets/llama_tokenized/kaggle/standard
# dataset_weights:
# - 0.307
# - 0.307
# - 0.024
# - 0.002
# - 0.018
# - 0.018
# - 0.018
# - 0.012
# - 0.012
# - 0.013
# - 0.012
# - 0.003
# - 0.0026
# - 0.0026
# - 0.0026
# - 0.0026
# - 0.0026
# - 0.0026
# - 0.02
# - 0.02
# - 0.069
# - 0.069
# - 0.059
# - 0.003
- /fsx/loubna/datasets/llama_tokenized/stackexchange/standard
- /fsx/loubna/datasets/llama_tokenized/other_sources/fw2-hq-fra_Latn/
- /fsx/loubna/datasets/llama_tokenized/other_sources/fw2-hq-spa_Latn/
- /fsx/loubna/datasets/llama_tokenized/other_sources/fw2-hq-deu_Latn/
- /fsx/loubna/datasets/llama_tokenized/fw2-hq-ita_Latn/standard
- /fsx/loubna/datasets/llama_tokenized/other_sources/fw2-hq-por_Latn/
- /fsx/loubna/datasets/llama_tokenized/fw2-hq-cmn_Hani/standard
- /fsx/loubna/datasets/llama_tokenized/fw2-hq-rus_Cyrl/standard
- /fsx/loubna/datasets/llama_tokenized/other_sources/fw2-hq-fas_Arab/
- /fsx/loubna/datasets/llama_tokenized/other_sources/fw2-hq-jpn_Jpan/
- /fsx/loubna/datasets/llama_tokenized/fw2-kor_Hang/standard
- /fsx/loubna/datasets/llama_tokenized/other_sources/fw2-hin_Deva/
- /fsx/loubna/datasets/llama_tokenized/other_sources/fw2-tha_Thai/
- /fsx/loubna/datasets/llama_tokenized/other_sources/fw2-hq-vie_Latn/
- /fsx/loubna/datasets/llama_tokenized/other_sources/fw2-hq-ell_Grek/
- /fsx/loubna/datasets/llama_tokenized/other_sources/infiwebmath-3plus/
- /fsx/loubna/datasets/llama_tokenized/other_sources/finemath-3plus/
- /fsx/loubna/datasets/llama_tokenized/other_sources/stack-edu-Python/
- /fsx/loubna/datasets/llama_tokenized/other_sources/stack-edu-Java/
- /fsx/loubna/datasets/llama_tokenized/other_sources/stack-edu-JavaScript/
- /fsx/loubna/datasets/llama_tokenized/other_sources/stack-edu-C/
- /fsx/loubna/datasets/llama_tokenized/stack-edu-Cpp/standard
- /fsx/loubna/datasets/llama_tokenized/other_sources/stack-edu-C-Sharp/
- /fsx/loubna/datasets/llama_tokenized/other_sources/stack-edu-PHP/
- /fsx/loubna/datasets/llama_tokenized/other_sources/stack-edu-TypeScript/
- /fsx/loubna/datasets/llama_tokenized/other_sources/stack-edu-Swift/
- /fsx/loubna/datasets/llama_tokenized/other_sources/stack-edu-SQL/
- /fsx/loubna/datasets/llama_tokenized/other_sources/stack-edu-Ruby/
- /fsx/loubna/datasets/llama_tokenized/stack-edu-Markdown/standard
- /fsx/loubna/datasets/llama_tokenized/stack-edu-HTML/standard
- /fsx/loubna/datasets/llama_tokenized/other_sources/stack-edu-Rust/
- /fsx/loubna/datasets/llama_tokenized/other_sources/stack-edu-Go/
- /fsx/loubna/datasets/llama_tokenized/other_sources/stack-edu-Shell/
- /fsx/loubna/datasets/llama_tokenized/pull-requests/standard
- /fsx/loubna/datasets/llama_tokenized/kaggle/standard
- /fsx/loubna/datasets/llama_tokenized/jupyter-scripts/standard
- /fsx/loubna/datasets/llama_tokenized/github-issues/standard
dataset_weights:
- 0.333
- 0.38
- 0.02
- 0.001
- 0.004
- 0.016
- 0.02
- 0.022
- 0.0105
- 0.01
- 0.01
- 0.01
- 0.003
- 0.00325
- 0.00325
- 0.00325
- 0.00325
- 0.00325
- 0.00225
- 0.008
- 0.014
- 0.022
- 0.013
- 0.013
- 0.007
- 0.016
- 0.006
- 0.006
- 0.003
- 0.001
- 0.004
- 0.0008
- 0.005
- 0.006
- 0.0008
- 0.0005
- 0.0007
- 0.006
- 0.0005
- 0.0055
- 0.0032
token_size_in_bytes: 4
tokenizer_name: meta-llama/Llama-3.2-1B
vocab_size: 128256
num_loading_workers: 8
seed: 42
name: Training Stage
num_loading_workers: 0
seed: 6
name: training stage
start_training_step: 1
general:
# benchmark_csv_path: benchmark.csv
benchmark_csv_path: null
consumed_train_samples: null
ignore_sanity_checks: true
project: smollm3-benchmarks
run: qwen-3B-nn8-mbs3-tp2-not-fused
project: smollm3-training
run: smollm3-test-tps-48nn-elie-config
seed: 6
step: null
lighteval: null
Expand All @@ -82,7 +115,7 @@ logging:
log_level: info
log_level_replica: info
model:
ddp_bucket_cap_mb: 25
ddp_bucket_cap_mb: 128
dtype: bfloat16
init_method:
std: 0.02
Expand All @@ -105,26 +138,26 @@ model:
moe_config: null
num_attention_heads: 16
num_hidden_layers: 36
num_key_value_heads: 2
num_key_value_heads: 4
pad_token_id: null
pretraining_tp: 2
rms_norm_eps: 1.0e-06
rope_interleaved: false
rope_scaling: null
rope_theta: 10000.0
rope_theta: 50000.0
sliding_window_size: null
tie_word_embeddings: true
use_cache: true
vocab_size: 128256
z_loss_coefficient: 0.0001
z_loss_coefficient: 1.0e-05
z_loss_enabled: false
optimizer:
accumulate_grad_in_fp32: true
clip_grad: 1.0
learning_rate_scheduler:
learning_rate: 0.0002
lr_decay_starting_step: 26000
lr_decay_steps: 6000
lr_decay_starting_step: 2600000
lr_decay_steps: 600000
lr_decay_style: linear
lr_warmup_steps: 2000
lr_warmup_style: linear
Expand All @@ -135,12 +168,13 @@ optimizer:
adam_eps: 1.0e-08
name: adamW
torch_adam_is_fused: true
weight_decay: 0.01
weight_decay_exclude_named_params: []
weight_decay: 0.1
weight_decay_exclude_named_params:
- .*token_embedding.*
zero_stage: 0
parallelism:
context_parallel_size: 1
dp: 4
dp: 1
expert_parallel_size: 1
moe_layer_recompute: false
pp: 1
Expand All @@ -150,27 +184,22 @@ parallelism:
tp_linear_async_communication: true
tp_mode: REDUCE_SCATTER
tp_recompute_allgather: true
# profiler:
# active: 1
# export_chrome_trace: false
# profile_memory: false
# profiler_export_path: tb_logs
# record_shapes: false
# repeat: 1
# skip_first: 3
# wait: 1
# warmup: 1
# with_stack: true
profiler: null
s3_upload: null
# remove_after_upload: true
# s5cmd_concurrency: 5
# s5cmd_numworkers: 16
# s5cmd_path: /fsx/elie_bakouch/smollm3_training/0304-begin-nanotron/cu124-0304/bin/s5cmd
# upload_s3_path: tests/smollm3-test-tps-48nn-elie-config
tokenizer:
tokenizer_max_length: null
tokenizer_max_length: 4096
tokenizer_name_or_path: meta-llama/Llama-3.2-1B
tokenizer_revision: null
tokens:
batch_accumulation_per_replica: 8
batch_accumulation_per_replica: 1
limit_test_batches: 0
limit_val_batches: 0
micro_batch_size: 3
sequence_length: 4096
train_steps: 32000
val_check_interval: -1
val_check_interval: 100
Loading