Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ The following is the list of models supported by MCore-Bridge:
| Series | model_type |
| -------- | ------------------------------------------------------------ |
| Qwen | qwen2, qwen2_moe<br />qwen3, qwen3_moe, qwen3_next |
| DeepSeek | deepseek_v3, deepseek_v32 |
| DeepSeek | deepseek_v3, deepseek_v32, deepseek_v4 |
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The PR adds deepseek_v4 to the list of supported models, but the actual implementation appears to be missing. The file src/mcore_bridge/model/gpts/deepseek_v4.py is empty in the provided context, and there are no changes to model registration or configuration logic to support this new model type. Please ensure the implementation is included or clarify if it relies on an existing model type.

| GLM | glm4, glm4_moe, glm4_moe_lite<br />glm_moe_dsa |
| MiniMax | minimax_m2 |
| Kimi | kimi_k2, kimi_k25 |
Expand Down
2 changes: 1 addition & 1 deletion README_zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ uv pip install -e . --torch-backend=auto
| 系列 | model_type |
| -------- | ------------------------------------------------------------ |
| Qwen | qwen2, qwen2_moe<br />qwen3, qwen3_moe, qwen3_next |
| DeepSeek | deepseek_v3, deepseek_v32 |
| DeepSeek | deepseek_v3, deepseek_v32, deepseek_v4 |
| GLM | glm4, glm4_moe, glm4_moe_lite<br />glm_moe_dsa |
| MiniMax | minimax_m2 |
| Kimi | kimi_k2, kimi_k25 |
Expand Down
2 changes: 1 addition & 1 deletion src/mcore_bridge/bridge/gpt_bridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -745,7 +745,7 @@ def _get_hf_experts_attr(self, is_mtp: bool = False):
return True, True
if self.model_type in {'glm4v_moe', 'kimi_vl', 'qwen3_omni_moe', 'qwen3_5_moe'} or self.llm_model_type in {
'qwen2_moe', 'qwen3_moe', 'deepseek_v2', 'deepseek_v3', 'kimi_k2', 'dots1', 'ernie4_5_moe', 'glm4_moe',
'glm4_moe_lite', 'minimax_m2', 'olmoe', 'qwen3_next', 'glm_moe_dsa', 'deepseek_v32'
'glm4_moe_lite', 'minimax_m2', 'olmoe', 'qwen3_next', 'glm_moe_dsa', 'deepseek_v32', 'deepseek_v4'
}:
return False, False
elif self.model_type in {'qwen3_vl_moe', 'llama4', 'gemma4'} or self.llm_model_type in {'gpt_oss'}:
Expand Down
14 changes: 13 additions & 1 deletion src/mcore_bridge/config/model_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,14 +196,26 @@ class ModelConfig(TransformerConfig):
linear_decoupled_in_proj: bool = False

# dsa
experimental_attention_variant: Optional[Literal['gated_delta_net', 'dsa']] = None
experimental_attention_variant: Optional[Literal['gated_delta_net', 'dsa', 'dsv4_hybrid']] = None
dsa_indexer_n_heads: Optional[int] = None
dsa_indexer_head_dim: Optional[int] = None
dsa_indexer_topk: Optional[int] = None
dsa_indexer_loss_coeff: Optional[float] = None
dsa_indexer_use_sparse_loss: bool = False
dsa_indexer_rotary_interleaved: bool = False

# deepseek-v4
csa_window_size: int = 128
csa_compress_ratios: Optional[List[int]] = None
csa_compress_rotary_base: float = 40000.0
o_groups: int = 8
o_lora_rank: int = 1024
enable_hyper_connections: bool = False
num_residual_streams: int = 4
mhc_sinkhorn_iterations: int = 20
mhc_init_gating_factor: float = 0.01
moe_n_hash_layers: int = 0

# mtp
mtp_decoder_input_detach: bool = False
mtp_shared_weights: bool = False
Expand Down
17 changes: 14 additions & 3 deletions src/mcore_bridge/config/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,15 @@
'dsa_indexer_head_dim': ['index_head_dim'],
'dsa_indexer_topk': ['index_topk'],
'dsa_indexer_rotary_interleaved': ['indexer_rope_interleave'],
# deepseek_v4
'csa_compress_ratios': ['compress_ratios'],
'csa_compress_rotary_base': ['compress_rope_theta'],
'o_groups': ['o_groups'],
'o_lora_rank': ['o_lora_rank'],
'num_residual_streams': ['hc_mult'],
'mhc_sinkhorn_iterations': ['hc_sinkhorn_iters'],
'moe_n_hash_layers': ['num_hash_layers'],
'activation_func_clamp_value': ['swiglu_limit'],
# other
'original_max_position_embeddings': ['original_max_position_embeddings'],
'partial_rotary_factor': ['partial_rotary_factor'],
Expand Down Expand Up @@ -88,7 +97,7 @@ def _convert_config(config, _internal_call=False) -> Dict[str, Any]:
else:
continue
else:
if k == 'kv_lora_rank':
if k in {'q_lora_rank', 'kv_lora_rank'}:
megatron_config['multi_latent_attention'] = True
elif k == 'hf_model_type':
if _internal_call:
Expand Down Expand Up @@ -134,16 +143,18 @@ def hf_to_mcore_config(hf_config: PretrainedConfig) -> Dict[str, Any]:
res.pop('ffn_hidden_size', None)
if llm_model_type in {'qwen2_moe', 'qwen3_next'} or hf_model_type == 'qwen3_5_moe':
res['moe_shared_expert_gate'] = True
if llm_model_type in {'deepseek', 'deepseek_v2', 'deepseek_v3', 'kimi_k2', 'deepseek_v32', 'dots1'
if llm_model_type in {'deepseek', 'deepseek_v2', 'deepseek_v3', 'kimi_k2', 'deepseek_v32', 'dots1', 'deepseek_v4'
} or hf_model_type == 'kimi_vl':
if llm_model_type != 'deepseek':
res['qk_layernorm'] = True
res['moe_router_load_balancing_type'] = 'seq_aux_loss'
res.pop('num_query_groups', None) # https://github.com/NVIDIA/Megatron-LM/issues/1475
if llm_model_type == 'dots1':
res['moe_router_score_function'] = 'sigmoid'
elif llm_model_type == 'deepseek_v32':
res['experimental_attention_variant'] = 'dsa'
elif llm_model_type == 'deepseek_v4':
res['experimental_attention_variant'] = 'dsv4_hybrid'
res['csa_window_size'] = window_size
elif llm_model_type == 'hunyuan':
# Since HunYuan’s attention applies RoPE before using q/k_layernorm,
# which is incompatible with megatron-core, support is not provided here.
Expand Down
1 change: 1 addition & 0 deletions src/mcore_bridge/model/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ class LLMModelType:
minimax_m2 = 'minimax_m2'
hy_v3 = 'hy_v3'
bailing_moe = 'bailing_moe'
deepseek_v4 = 'deepseek_v4'

qwen3_emb = 'qwen3_emb'

Expand Down
2 changes: 1 addition & 1 deletion src/mcore_bridge/model/gpts/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
from . import bailing_moe, glm4, hunyuan, llm, minimax_m2, olmoe, qwen3_emb, qwen3_next
from . import bailing_moe, deepseek_v4, glm4, hunyuan, llm, minimax_m2, olmoe, qwen3_emb, qwen3_next
22 changes: 22 additions & 0 deletions src/mcore_bridge/model/gpts/deepseek_v4.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
from mcore_bridge.bridge import GPTBridge

from ..constant import ModelType
from ..register import ModelLoader, ModelMeta, register_model


class DeepseekV4Loader(ModelLoader):
    """Model loader registered for the ``deepseek_v4`` model type.

    Adds nothing of its own; all behavior is inherited from ``ModelLoader``.
    """


class DeepseekV4Bridge(GPTBridge):
    """Bridge registered for the ``deepseek_v4`` model type.

    Adds nothing of its own; all behavior is inherited from ``GPTBridge``.
    """


# Register DeepSeek-V4 at import time: associates ModelType.deepseek_v4 and the
# 'deepseek_v4' name list with DeepseekV4Bridge and DeepseekV4Loader.
# NOTE(review): the second positional argument is presumably the list of
# HF `model_type` strings this entry matches — confirm against ModelMeta.
# NOTE(review): the constant is declared on LLMModelType in constant.py;
# presumably ModelType exposes it via inheritance — confirm.
register_model(
ModelMeta(
ModelType.deepseek_v4,
['deepseek_v4'],
bridge_cls=DeepseekV4Bridge,
loader=DeepseekV4Loader,
))
20 changes: 13 additions & 7 deletions src/mcore_bridge/model/modules/transformer_layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,13 +191,19 @@ def can_recompute_pre_mlp_layernorm_for_cudagraph():
if 'mlp' in self.config.recompute_modules:
if not self.is_moe_layer:
self.recompute_mlp = True
if hasattr(self.config, 'fine_grained_activation_offloading'):
self.offload_attn_norm = (
self.config.fine_grained_activation_offloading and 'attn_norm' in self.config.offload_modules
and not isinstance(self.input_layernorm, IdentityOp))
self.offload_mlp_norm = (
self.config.fine_grained_activation_offloading and 'mlp_norm' in self.config.offload_modules
and not isinstance(self.pre_mlp_layernorm, IdentityOp))
if hasattr(self, '_set_offload_modules'):
from megatron.core.transformer.transformer_layer import _get_offloading_interface
self._set_offload_modules()
self.off_interface = _get_offloading_interface()
self.mlp_norm_manager = None
Comment on lines +194 to +198
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The initialization of offloading managers for Megatron-Core 0.17+ is incomplete. Setting self.mlp_norm_manager = None without assigning a manager from self.off_interface effectively disables offloading for the MLP layer normalization, even when it is configured in offload_modules. Additionally, self.attn_norm_manager should be initialized to avoid a potential AttributeError in base-class methods that expect it to be present in newer versions of Megatron-Core.

Also, the local import of _get_offloading_interface inside __init__ is inefficient as it executes for every layer instantiation; consider moving it to the top of the file if possible.

        if hasattr(self, '_set_offload_modules'):
            from megatron.core.transformer.transformer_layer import _get_offloading_interface
            self._set_offload_modules()
            self.off_interface = _get_offloading_interface()
            offload_modules = getattr(self.config, 'offload_modules', []) or []
            is_offloading = getattr(self.config, 'fine_grained_activation_offloading', False)
            self.attn_norm_manager = self.off_interface.get_manager('attn_norm') if is_offloading and 'attn_norm' in offload_modules else None
            self.mlp_norm_manager = self.off_interface.get_manager('mlp_norm') if is_offloading and 'mlp_norm' in offload_modules else None

else:
if hasattr(self.config, 'fine_grained_activation_offloading'):
self.offload_attn_norm = (
self.config.fine_grained_activation_offloading and 'attn_norm' in self.config.offload_modules
and not isinstance(self.input_layernorm, IdentityOp))
self.offload_mlp_norm = (
self.config.fine_grained_activation_offloading and 'mlp_norm' in self.config.offload_modules
and not isinstance(self.pre_mlp_layernorm, IdentityOp))

# @jcasper how should we handle nvfuser?
# Set bias+dropout+add fusion grad_enable execution handler.
Expand Down
Loading