diff --git a/README.md b/README.md
index ec9236d..ba1c919 100644
--- a/README.md
+++ b/README.md
@@ -127,7 +127,7 @@ The following is the list of models supported by MCore-Bridge:
| Series | model_type |
| -------- | ------------------------------------------------------------ |
| Qwen | qwen2, qwen2_moe<br>qwen3, qwen3_moe, qwen3_next |
-| DeepSeek | deepseek_v3, deepseek_v32 |
+| DeepSeek | deepseek_v3, deepseek_v32, deepseek_v4 |
| GLM | glm4, glm4_moe, glm4_moe_lite<br>glm_moe_dsa |
| MiniMax | minimax_m2 |
| Kimi | kimi_k2, kimi_k25 |
diff --git a/README_zh.md b/README_zh.md
index a91e9f2..d600c7d 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -123,7 +123,7 @@ uv pip install -e . --torch-backend=auto
| 系列 | model_type |
| -------- | ------------------------------------------------------------ |
| Qwen | qwen2, qwen2_moe<br>qwen3, qwen3_moe, qwen3_next |
-| DeepSeek | deepseek_v3, deepseek_v32 |
+| DeepSeek | deepseek_v3, deepseek_v32, deepseek_v4 |
| GLM | glm4, glm4_moe, glm4_moe_lite<br>glm_moe_dsa |
| MiniMax | minimax_m2 |
| Kimi | kimi_k2, kimi_k25 |
diff --git a/src/mcore_bridge/bridge/gpt_bridge.py b/src/mcore_bridge/bridge/gpt_bridge.py
index 1d95f31..03a4e07 100644
--- a/src/mcore_bridge/bridge/gpt_bridge.py
+++ b/src/mcore_bridge/bridge/gpt_bridge.py
@@ -745,7 +745,7 @@ def _get_hf_experts_attr(self, is_mtp: bool = False):
return True, True
if self.model_type in {'glm4v_moe', 'kimi_vl', 'qwen3_omni_moe', 'qwen3_5_moe'} or self.llm_model_type in {
'qwen2_moe', 'qwen3_moe', 'deepseek_v2', 'deepseek_v3', 'kimi_k2', 'dots1', 'ernie4_5_moe', 'glm4_moe',
- 'glm4_moe_lite', 'minimax_m2', 'olmoe', 'qwen3_next', 'glm_moe_dsa', 'deepseek_v32'
+ 'glm4_moe_lite', 'minimax_m2', 'olmoe', 'qwen3_next', 'glm_moe_dsa', 'deepseek_v32', 'deepseek_v4'
}:
return False, False
elif self.model_type in {'qwen3_vl_moe', 'llama4', 'gemma4'} or self.llm_model_type in {'gpt_oss'}:
diff --git a/src/mcore_bridge/config/model_config.py b/src/mcore_bridge/config/model_config.py
index 8b5438a..bfcd6ce 100644
--- a/src/mcore_bridge/config/model_config.py
+++ b/src/mcore_bridge/config/model_config.py
@@ -196,7 +196,7 @@ class ModelConfig(TransformerConfig):
linear_decoupled_in_proj: bool = False
# dsa
- experimental_attention_variant: Optional[Literal['gated_delta_net', 'dsa']] = None
+ experimental_attention_variant: Optional[Literal['gated_delta_net', 'dsa', 'dsv4_hybrid']] = None
dsa_indexer_n_heads: Optional[int] = None
dsa_indexer_head_dim: Optional[int] = None
dsa_indexer_topk: Optional[int] = None
@@ -204,6 +204,18 @@ class ModelConfig(TransformerConfig):
dsa_indexer_use_sparse_loss: bool = False
dsa_indexer_rotary_interleaved: bool = False
+ # deepseek-v4
+ csa_window_size: int = 128
+ csa_compress_ratios: Optional[List[int]] = None
+ csa_compress_rotary_base: float = 40000.0
+ o_groups: int = 8
+ o_lora_rank: int = 1024
+ enable_hyper_connections: bool = False
+ num_residual_streams: int = 4
+ mhc_sinkhorn_iterations: int = 20
+ mhc_init_gating_factor: float = 0.01
+ moe_n_hash_layers: int = 0
+
# mtp
mtp_decoder_input_detach: bool = False
mtp_shared_weights: bool = False
diff --git a/src/mcore_bridge/config/parser.py b/src/mcore_bridge/config/parser.py
index 842c1bb..5acafbb 100644
--- a/src/mcore_bridge/config/parser.py
+++ b/src/mcore_bridge/config/parser.py
@@ -56,6 +56,15 @@
'dsa_indexer_head_dim': ['index_head_dim'],
'dsa_indexer_topk': ['index_topk'],
'dsa_indexer_rotary_interleaved': ['indexer_rope_interleave'],
+ # deepseek_v4
+ 'csa_compress_ratios': ['compress_ratios'],
+ 'csa_compress_rotary_base': ['compress_rope_theta'],
+ 'o_groups': ['o_groups'],
+ 'o_lora_rank': ['o_lora_rank'],
+ 'num_residual_streams': ['hc_mult'],
+ 'mhc_sinkhorn_iterations': ['hc_sinkhorn_iters'],
+ 'moe_n_hash_layers': ['num_hash_layers'],
+ 'activation_func_clamp_value': ['swiglu_limit'],
# other
'original_max_position_embeddings': ['original_max_position_embeddings'],
'partial_rotary_factor': ['partial_rotary_factor'],
@@ -88,7 +97,7 @@ def _convert_config(config, _internal_call=False) -> Dict[str, Any]:
else:
continue
else:
- if k == 'kv_lora_rank':
+ if k in {'q_lora_rank', 'kv_lora_rank'}:
megatron_config['multi_latent_attention'] = True
elif k == 'hf_model_type':
if _internal_call:
@@ -134,16 +143,18 @@ def hf_to_mcore_config(hf_config: PretrainedConfig) -> Dict[str, Any]:
res.pop('ffn_hidden_size', None)
if llm_model_type in {'qwen2_moe', 'qwen3_next'} or hf_model_type == 'qwen3_5_moe':
res['moe_shared_expert_gate'] = True
- if llm_model_type in {'deepseek', 'deepseek_v2', 'deepseek_v3', 'kimi_k2', 'deepseek_v32', 'dots1'
+ if llm_model_type in {'deepseek', 'deepseek_v2', 'deepseek_v3', 'kimi_k2', 'deepseek_v32', 'dots1', 'deepseek_v4'
} or hf_model_type == 'kimi_vl':
if llm_model_type != 'deepseek':
res['qk_layernorm'] = True
res['moe_router_load_balancing_type'] = 'seq_aux_loss'
- res.pop('num_query_groups', None) # https://github.com/NVIDIA/Megatron-LM/issues/1475
if llm_model_type == 'dots1':
res['moe_router_score_function'] = 'sigmoid'
elif llm_model_type == 'deepseek_v32':
res['experimental_attention_variant'] = 'dsa'
+ elif llm_model_type == 'deepseek_v4':
+ res['experimental_attention_variant'] = 'dsv4_hybrid'
+ res['csa_window_size'] = window_size
elif llm_model_type == 'hunyuan':
# Since HunYuan’s attention applies RoPE before using q/k_layernorm,
# which is incompatible with megatron-core, support is not provided here.
diff --git a/src/mcore_bridge/model/constant.py b/src/mcore_bridge/model/constant.py
index 9b8dc1b..9708f6a 100644
--- a/src/mcore_bridge/model/constant.py
+++ b/src/mcore_bridge/model/constant.py
@@ -9,6 +9,7 @@ class LLMModelType:
minimax_m2 = 'minimax_m2'
hy_v3 = 'hy_v3'
bailing_moe = 'bailing_moe'
+ deepseek_v4 = 'deepseek_v4'
qwen3_emb = 'qwen3_emb'
diff --git a/src/mcore_bridge/model/gpts/__init__.py b/src/mcore_bridge/model/gpts/__init__.py
index 52b007f..6eb44db 100644
--- a/src/mcore_bridge/model/gpts/__init__.py
+++ b/src/mcore_bridge/model/gpts/__init__.py
@@ -1,2 +1,2 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
-from . import bailing_moe, glm4, hunyuan, llm, minimax_m2, olmoe, qwen3_emb, qwen3_next
+from . import bailing_moe, deepseek_v4, glm4, hunyuan, llm, minimax_m2, olmoe, qwen3_emb, qwen3_next
diff --git a/src/mcore_bridge/model/gpts/deepseek_v4.py b/src/mcore_bridge/model/gpts/deepseek_v4.py
new file mode 100644
index 0000000..9109b78
--- /dev/null
+++ b/src/mcore_bridge/model/gpts/deepseek_v4.py
@@ -0,0 +1,22 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+from mcore_bridge.bridge import GPTBridge
+
+from ..constant import ModelType
+from ..register import ModelLoader, ModelMeta, register_model
+
+
+class DeepseekV4Loader(ModelLoader):
+    """Loader registered for DeepSeek-V4; defines no overrides on top of ``ModelLoader``."""
+
+
+class DeepseekV4Bridge(GPTBridge):
+    """Bridge registered for DeepSeek-V4; defines no overrides on top of ``GPTBridge``."""
+
+
+# Runs at import time: registers the DeepSeek-V4 bridge and loader under ModelType.deepseek_v4.
+register_model(
+    ModelMeta(
+        ModelType.deepseek_v4, ['deepseek_v4'],
+        bridge_cls=DeepseekV4Bridge,
+        loader=DeepseekV4Loader,
+    ))
diff --git a/src/mcore_bridge/model/modules/transformer_layer.py b/src/mcore_bridge/model/modules/transformer_layer.py
index 6d500e0..ce742c4 100644
--- a/src/mcore_bridge/model/modules/transformer_layer.py
+++ b/src/mcore_bridge/model/modules/transformer_layer.py
@@ -191,13 +191,19 @@ def can_recompute_pre_mlp_layernorm_for_cudagraph():
if 'mlp' in self.config.recompute_modules:
if not self.is_moe_layer:
self.recompute_mlp = True
- if hasattr(self.config, 'fine_grained_activation_offloading'):
- self.offload_attn_norm = (
- self.config.fine_grained_activation_offloading and 'attn_norm' in self.config.offload_modules
- and not isinstance(self.input_layernorm, IdentityOp))
- self.offload_mlp_norm = (
- self.config.fine_grained_activation_offloading and 'mlp_norm' in self.config.offload_modules
- and not isinstance(self.pre_mlp_layernorm, IdentityOp))
+ if hasattr(self, '_set_offload_modules'):
+ from megatron.core.transformer.transformer_layer import _get_offloading_interface
+ self._set_offload_modules()
+ self.off_interface = _get_offloading_interface()
+ self.mlp_norm_manager = None
+ else:
+ if hasattr(self.config, 'fine_grained_activation_offloading'):
+ self.offload_attn_norm = (
+ self.config.fine_grained_activation_offloading and 'attn_norm' in self.config.offload_modules
+ and not isinstance(self.input_layernorm, IdentityOp))
+ self.offload_mlp_norm = (
+ self.config.fine_grained_activation_offloading and 'mlp_norm' in self.config.offload_modules
+ and not isinstance(self.pre_mlp_layernorm, IdentityOp))
# @jcasper how should we handle nvfuser?
# Set bias+dropout+add fusion grad_enable execution handler.