Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ All notable changes to fairseq2 are documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

## [0.8.1] - Unreleased
- Qwen 3.5 model family (0.8B, 2B, 9B, 27B dense and 35B-A3B MoE) with base and instruction-tuned variants. Features hybrid GatedDeltaNet linear attention (75%) + full attention (25%) architecture, partial RoPE, QK-norm, output gating, Top-K MoE routing with shared experts, RMSNorm 1+w convention, bidirectional HuggingFace state dict conversion, and SFT/pretraining recipe configs.
- Gemma 4 model family (E4B, 31B, 26B-A4B) with base and instruction-tuned variants. Includes decoder with Per-Layer Embeddings (PLE), partial RoPE, KV sharing across sliding/global attention layers, Mixture-of-Experts (26B-A4B), QK/V-norm, logit soft-capping, audio tower (Conformer encoder for multimodal E4B), bidirectional HuggingFace state dict conversion, FSDP/activation checkpointing/tensor parallel support, and SFT recipe configs.
- Bump transformers~=v5.5 and loosen huggingface_hub upper bound. (#1508)
- Fixed typo in WerMetric: use `hyp_seqs` instead of `ref_seqs` for `hyp_seqs_list`. (#1506)
Expand Down
81 changes: 81 additions & 0 deletions recipes/lm/sft/configs/qwen35_0.8b_gsm8k.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Qwen 3.5 0.8B GSM8K SFT Fine-tuning Config
#
# Validates training recipe integration and loss convergence for the Qwen 3.5
# model on the GSM8K math reasoning dataset.
#
# Usage:
# torchrun --standalone --nproc_per_node=8 -m recipes.lm.sft \
# --config-file recipes/lm/sft/configs/qwen35_0.8b_gsm8k.yaml \
# /path/to/output_dir

model:
name: "qwen35_0.8b"
dtype: bfloat16
config_overrides:
pad_idx: 248044

tokenizer:
name: "qwen35_0.8b"
config_overrides:
use_im_end: true

dataset:
max_seq_len: 4096
max_num_tokens: 8192
valid_split: "sft_test"
chat_mode: false
config_overrides:
sources:
train:
- path: "hg://facebook/fairseq2-lm-gsm8k"
split: "sft_train"
weight: 1.0
sft_test:
- path: "hg://facebook/fairseq2-lm-gsm8k"
split: "sft_test"
weight: 1.0

trainer:
data_parallelism: fsdp
max_grad_norm: 1.0
mixed_precision:
mode: static
dtype: bfloat16

optimizer:
name: adamw
config:
lr: 2.0e-5
betas: [0.9, 0.95]
weight_decay: 0.1
impl: fused

lr_scheduler:
name: cosine_annealing
config:
final_lr_scale: 0.1
num_warmup_steps: 100

regime:
num_steps: 100000
checkpoint_every_n_steps: 100
validate_every_n_steps: 100
keep_last_n_checkpoints: 10
publish_metrics_every_n_steps: 1
save_model_only: false

common:
seed: 0
metric_recorders:
wandb:
enabled: true
# NOTE(review): "yunchaoyang1" looks like a personal Weights & Biases entity
# checked into a shared recipe config. Presumably other users must override
# it (or disable wandb) before running; confirm the intended team entity.
entity: "yunchaoyang1"
project: "fairseq2"
tensorboard:
enabled: false
75 changes: 75 additions & 0 deletions src/fairseq2/assets/cards/models/qwen35.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Asset card for the Qwen 3.5 0.8B model.
# The checkpoint is resolved from the Hugging Face Hub (same repository as the
# tokenizer below), matching every other card in this file, so the card does
# not depend on a cluster-local filesystem path.
name: qwen35_0.8b
model_family: qwen3_5
model_arch: qwen35_0.8b
checkpoint: "hg://Qwen/Qwen3.5-0.8B"
tokenizer: "hg://Qwen/Qwen3.5-0.8B"
tokenizer_family: qwen

---

name: qwen35_2b
model_family: qwen3_5
model_arch: qwen35_2b
checkpoint: "hg://Qwen/Qwen3.5-2B"
tokenizer: "hg://Qwen/Qwen3.5-2B"
tokenizer_family: qwen

---

name: qwen35_2b_base
model_family: qwen3_5
model_arch: qwen35_2b
checkpoint: "hg://Qwen/Qwen3.5-2B-Base"
tokenizer: "hg://Qwen/Qwen3.5-2B-Base"
tokenizer_family: qwen

---

name: qwen35_9b
model_family: qwen3_5
model_arch: qwen35_9b
checkpoint: "hg://Qwen/Qwen3.5-9B"
tokenizer: "hg://Qwen/Qwen3.5-9B"
tokenizer_family: qwen

---

name: qwen35_9b_base
model_family: qwen3_5
model_arch: qwen35_9b
checkpoint: "hg://Qwen/Qwen3.5-9B-Base"
tokenizer: "hg://Qwen/Qwen3.5-9B-Base"
tokenizer_family: qwen

---

name: qwen35_27b
model_family: qwen3_5
model_arch: qwen35_27b
checkpoint: "hg://Qwen/Qwen3.5-27B"
tokenizer: "hg://Qwen/Qwen3.5-27B"
tokenizer_family: qwen

---

name: qwen35_moe_35b_a3b
model_family: qwen3_5_moe
model_arch: qwen35_moe_35b_a3b
checkpoint: "hg://Qwen/Qwen3.5-35B-A3B"
tokenizer: "hg://Qwen/Qwen3.5-35B-A3B"
tokenizer_family: qwen

---

name: qwen35_moe_35b_a3b_base
model_family: qwen3_5_moe
model_arch: qwen35_moe_35b_a3b
checkpoint: "hg://Qwen/Qwen3.5-35B-A3B-Base"
tokenizer: "hg://Qwen/Qwen3.5-35B-A3B-Base"
tokenizer_family: qwen
50 changes: 50 additions & 0 deletions src/fairseq2/composition/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,11 +107,23 @@
register_olmo_configs,
)
from fairseq2.models.qwen import (
QWEN35_FAMILY,
QWEN35_MOE_FAMILY,
QWEN_FAMILY,
Qwen35Config,
Qwen35MoeConfig,
QwenConfig,
_Qwen35HuggingFaceConverter,
_Qwen35MoeHuggingFaceConverter,
_QwenHuggingFaceConverter,
convert_qwen35_moe_state_dict,
convert_qwen35_state_dict,
convert_qwen_state_dict,
create_qwen35_model,
create_qwen35_moe_model,
create_qwen_model,
register_qwen35_configs,
register_qwen35_moe_configs,
register_qwen_configs,
)
from fairseq2.models.s2t_conformer import (
Expand Down Expand Up @@ -417,6 +429,44 @@ def _register_model_families(container: DependencyContainer) -> None:
HuggingFaceConverter, _QwenHuggingFaceConverter, key=QWEN_FAMILY
)

# Qwen 3.5
register_model_family(
container,
QWEN35_FAMILY,
kls=TransformerLM,
config_kls=Qwen35Config,
factory=create_qwen35_model,
state_dict_converter=convert_qwen35_state_dict,
compiler=compile_transformer_lm,
fsdp_applier=apply_fsdp_to_transformer_lm,
layerwise_ac_applier=apply_ac_to_transformer_lm,
)

register_qwen35_configs(container)

container.register_type(
HuggingFaceConverter, _Qwen35HuggingFaceConverter, key=QWEN35_FAMILY
)

# Qwen 3.5 MoE
register_model_family(
container,
QWEN35_MOE_FAMILY,
kls=TransformerLM,
config_kls=Qwen35MoeConfig,
factory=create_qwen35_moe_model,
state_dict_converter=convert_qwen35_moe_state_dict,
compiler=compile_transformer_lm,
fsdp_applier=apply_fsdp_to_transformer_lm,
layerwise_ac_applier=apply_ac_to_transformer_lm,
)

register_qwen35_moe_configs(container)

container.register_type(
HuggingFaceConverter, _Qwen35MoeHuggingFaceConverter, key=QWEN35_MOE_FAMILY
)

# S2T Conformer
register_model_family(
container,
Expand Down
39 changes: 38 additions & 1 deletion src/fairseq2/models/qwen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,57 @@

from __future__ import annotations

from fairseq2.models.qwen.config import QWEN35_FAMILY as QWEN35_FAMILY
from fairseq2.models.qwen.config import QWEN35_MOE_FAMILY as QWEN35_MOE_FAMILY
from fairseq2.models.qwen.config import QWEN_FAMILY as QWEN_FAMILY
from fairseq2.models.qwen.config import Qwen35Config as Qwen35Config
from fairseq2.models.qwen.config import Qwen35MoeConfig as Qwen35MoeConfig
from fairseq2.models.qwen.config import QwenConfig as QwenConfig
from fairseq2.models.qwen.config import (
register_qwen35_configs as register_qwen35_configs,
)
from fairseq2.models.qwen.config import (
register_qwen35_moe_configs as register_qwen35_moe_configs,
)
from fairseq2.models.qwen.config import register_qwen_configs as register_qwen_configs
from fairseq2.models.qwen.factory import Qwen35Factory as Qwen35Factory
from fairseq2.models.qwen.factory import Qwen35MoeFactory as Qwen35MoeFactory
from fairseq2.models.qwen.factory import QwenFactory as QwenFactory
from fairseq2.models.qwen.factory import create_qwen35_model as create_qwen35_model
from fairseq2.models.qwen.factory import (
create_qwen35_moe_model as create_qwen35_moe_model,
)
from fairseq2.models.qwen.factory import create_qwen_model as create_qwen_model
from fairseq2.models.qwen.hub import get_qwen35_model_hub as get_qwen35_model_hub
from fairseq2.models.qwen.hub import (
get_qwen35_moe_model_hub as get_qwen35_moe_model_hub,
)
from fairseq2.models.qwen.hub import (
get_qwen35_moe_tokenizer_hub as get_qwen35_moe_tokenizer_hub,
)
from fairseq2.models.qwen.hub import (
get_qwen35_tokenizer_hub as get_qwen35_tokenizer_hub,
)
from fairseq2.models.qwen.hub import get_qwen_model_hub as get_qwen_model_hub
from fairseq2.models.qwen.hub import get_qwen_tokenizer_hub as get_qwen_tokenizer_hub
from fairseq2.models.qwen.interop import (
_Qwen35HuggingFaceConverter as _Qwen35HuggingFaceConverter,
)
from fairseq2.models.qwen.interop import (
_Qwen35MoeHuggingFaceConverter as _Qwen35MoeHuggingFaceConverter,
)
from fairseq2.models.qwen.interop import (
_QwenHuggingFaceConverter as _QwenHuggingFaceConverter,
)
from fairseq2.models.qwen.interop import (
convert_qwen35_moe_state_dict as convert_qwen35_moe_state_dict,
)
from fairseq2.models.qwen.interop import (
convert_qwen35_state_dict as convert_qwen35_state_dict,
)
from fairseq2.models.qwen.interop import (
convert_qwen_state_dict as convert_qwen_state_dict,
)
from fairseq2.models.qwen.sharder import get_qwen_shard_specs as get_qwen_shard_specs
from fairseq2.models.qwen.tokenizer import QwenTokenizer as QwenTokenizer
from fairseq2.models.qwen.tokenizer import QwenTokenizerConfig as QwenTokenizerConfig
from fairseq2.models.qwen.tokenizer import load_qwen_tokenizer as load_qwen_tokenizer
Loading
Loading