From 8403e3c7aa44c98086b4be69125e696fbc280f41 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Wed, 20 May 2026 11:47:26 +0800 Subject: [PATCH 1/3] support deepseek_v4 --- README.md | 2 +- README_zh.md | 2 +- src/mcore_bridge/model/gpts/deepseek_v4.py | 0 3 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 src/mcore_bridge/model/gpts/deepseek_v4.py diff --git a/README.md b/README.md index ec9236d..ba1c919 100644 --- a/README.md +++ b/README.md @@ -127,7 +127,7 @@ The following is the list of models supported by MCore-Bridge: | Series | model_type | | -------- | ------------------------------------------------------------ | | Qwen | qwen2, qwen2_moe
qwen3, qwen3_moe, qwen3_next | -| DeepSeek | deepseek_v3, deepseek_v32 | +| DeepSeek | deepseek_v3, deepseek_v32, deepseek_v4 | | GLM | glm4, glm4_moe, glm4_moe_lite
glm_moe_dsa | | MiniMax | minimax_m2 | | Kimi | kimi_k2, kimi_k25 | diff --git a/README_zh.md b/README_zh.md index a91e9f2..d600c7d 100644 --- a/README_zh.md +++ b/README_zh.md @@ -123,7 +123,7 @@ uv pip install -e . --torch-backend=auto | 系列 | model_type | | -------- | ------------------------------------------------------------ | | Qwen | qwen2, qwen2_moe
qwen3, qwen3_moe, qwen3_next | -| DeepSeek | deepseek_v3, deepseek_v32 | +| DeepSeek | deepseek_v3, deepseek_v32, deepseek_v4 | | GLM | glm4, glm4_moe, glm4_moe_lite
glm_moe_dsa | | MiniMax | minimax_m2 | | Kimi | kimi_k2, kimi_k25 | diff --git a/src/mcore_bridge/model/gpts/deepseek_v4.py b/src/mcore_bridge/model/gpts/deepseek_v4.py new file mode 100644 index 0000000..e69de29 From 358e51f47618fd84514d7a9ce8eaf9dc796660ad Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Wed, 20 May 2026 13:39:59 +0800 Subject: [PATCH 2/3] update --- .../model/modules/transformer_layer.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/mcore_bridge/model/modules/transformer_layer.py b/src/mcore_bridge/model/modules/transformer_layer.py index 6d500e0..ce742c4 100644 --- a/src/mcore_bridge/model/modules/transformer_layer.py +++ b/src/mcore_bridge/model/modules/transformer_layer.py @@ -191,13 +191,19 @@ def can_recompute_pre_mlp_layernorm_for_cudagraph(): if 'mlp' in self.config.recompute_modules: if not self.is_moe_layer: self.recompute_mlp = True - if hasattr(self.config, 'fine_grained_activation_offloading'): - self.offload_attn_norm = ( - self.config.fine_grained_activation_offloading and 'attn_norm' in self.config.offload_modules - and not isinstance(self.input_layernorm, IdentityOp)) - self.offload_mlp_norm = ( - self.config.fine_grained_activation_offloading and 'mlp_norm' in self.config.offload_modules - and not isinstance(self.pre_mlp_layernorm, IdentityOp)) + if hasattr(self, '_set_offload_modules'): + from megatron.core.transformer.transformer_layer import _get_offloading_interface + self._set_offload_modules() + self.off_interface = _get_offloading_interface() + self.mlp_norm_manager = None + else: + if hasattr(self.config, 'fine_grained_activation_offloading'): + self.offload_attn_norm = ( + self.config.fine_grained_activation_offloading and 'attn_norm' in self.config.offload_modules + and not isinstance(self.input_layernorm, IdentityOp)) + self.offload_mlp_norm = ( + self.config.fine_grained_activation_offloading and 'mlp_norm' in self.config.offload_modules + and not isinstance(self.pre_mlp_layernorm, IdentityOp)) # @jcasper how should we handle nvfuser? # Set bias+dropout+add fusion grad_enable execution handler. From 9ea9b44ae6c99288be0531a8af020525949e96e3 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Wed, 20 May 2026 15:09:06 +0800 Subject: [PATCH 3/3] update --- src/mcore_bridge/bridge/gpt_bridge.py | 2 +- src/mcore_bridge/config/model_config.py | 14 +++++++++++++- src/mcore_bridge/config/parser.py | 17 ++++++++++++++--- src/mcore_bridge/model/constant.py | 1 + src/mcore_bridge/model/gpts/__init__.py | 2 +- src/mcore_bridge/model/gpts/deepseek_v4.py | 22 ++++++++++++++++++++++ 6 files changed, 52 insertions(+), 6 deletions(-) diff --git a/src/mcore_bridge/bridge/gpt_bridge.py b/src/mcore_bridge/bridge/gpt_bridge.py index 1d95f31..03a4e07 100644 --- a/src/mcore_bridge/bridge/gpt_bridge.py +++ b/src/mcore_bridge/bridge/gpt_bridge.py @@ -745,7 +745,7 @@ def _get_hf_experts_attr(self, is_mtp: bool = False): return True, True if self.model_type in {'glm4v_moe', 'kimi_vl', 'qwen3_omni_moe', 'qwen3_5_moe'} or self.llm_model_type in { 'qwen2_moe', 'qwen3_moe', 'deepseek_v2', 'deepseek_v3', 'kimi_k2', 'dots1', 'ernie4_5_moe', 'glm4_moe', - 'glm4_moe_lite', 'minimax_m2', 'olmoe', 'qwen3_next', 'glm_moe_dsa', 'deepseek_v32' + 'glm4_moe_lite', 'minimax_m2', 'olmoe', 'qwen3_next', 'glm_moe_dsa', 'deepseek_v32', 'deepseek_v4' }: return False, False elif self.model_type in {'qwen3_vl_moe', 'llama4', 'gemma4'} or self.llm_model_type in {'gpt_oss'}: diff --git a/src/mcore_bridge/config/model_config.py b/src/mcore_bridge/config/model_config.py index 8b5438a..bfcd6ce 100644 --- a/src/mcore_bridge/config/model_config.py +++ b/src/mcore_bridge/config/model_config.py @@ -196,7 +196,7 @@ class ModelConfig(TransformerConfig): linear_decoupled_in_proj: bool = False # dsa - experimental_attention_variant: Optional[Literal['gated_delta_net', 'dsa']] = None + experimental_attention_variant: Optional[Literal['gated_delta_net', 'dsa', 'dsv4_hybrid']] = None dsa_indexer_n_heads: Optional[int] = None dsa_indexer_head_dim: Optional[int] = None dsa_indexer_topk: Optional[int] = None @@ -204,6 +204,18 @@ class ModelConfig(TransformerConfig): dsa_indexer_use_sparse_loss: bool = False dsa_indexer_rotary_interleaved: bool = False + # deepseek-v4 + csa_window_size: int = 128 + csa_compress_ratios: Optional[List[int]] = None + csa_compress_rotary_base: float = 40000.0 + o_groups: int = 8 + o_lora_rank: int = 1024 + enable_hyper_connections: bool = False + num_residual_streams: int = 4 + mhc_sinkhorn_iterations: int = 20 + mhc_init_gating_factor: float = 0.01 + moe_n_hash_layers: int = 0 + # mtp mtp_decoder_input_detach: bool = False mtp_shared_weights: bool = False diff --git a/src/mcore_bridge/config/parser.py b/src/mcore_bridge/config/parser.py index 842c1bb..5acafbb 100644 --- a/src/mcore_bridge/config/parser.py +++ b/src/mcore_bridge/config/parser.py @@ -56,6 +56,15 @@ 'dsa_indexer_head_dim': ['index_head_dim'], 'dsa_indexer_topk': ['index_topk'], 'dsa_indexer_rotary_interleaved': ['indexer_rope_interleave'], + # deepseek_v4 + 'csa_compress_ratios': ['compress_ratios'], + 'csa_compress_rotary_base': ['compress_rope_theta'], + 'o_groups': ['o_groups'], + 'o_lora_rank': ['o_lora_rank'], + 'num_residual_streams': ['hc_mult'], + 'mhc_sinkhorn_iterations': ['hc_sinkhorn_iters'], + 'moe_n_hash_layers': ['num_hash_layers'], + 'activation_func_clamp_value': ['swiglu_limit'], # other 'original_max_position_embeddings': ['original_max_position_embeddings'], 'partial_rotary_factor': ['partial_rotary_factor'], @@ -88,7 +97,7 @@ def _convert_config(config, _internal_call=False) -> Dict[str, Any]: else: continue else: - if k == 'kv_lora_rank': + if k in {'q_lora_rank', 'kv_lora_rank'}: megatron_config['multi_latent_attention'] = True elif k == 'hf_model_type': if _internal_call: @@ -134,16 +143,18 @@ def hf_to_mcore_config(hf_config: PretrainedConfig) -> Dict[str, Any]: res.pop('ffn_hidden_size', None) if llm_model_type in {'qwen2_moe', 'qwen3_next'} or hf_model_type == 'qwen3_5_moe': res['moe_shared_expert_gate'] = True - if llm_model_type in {'deepseek', 'deepseek_v2', 'deepseek_v3', 'kimi_k2', 'deepseek_v32', 'dots1' + if llm_model_type in {'deepseek', 'deepseek_v2', 'deepseek_v3', 'kimi_k2', 'deepseek_v32', 'dots1', 'deepseek_v4' } or hf_model_type == 'kimi_vl': if llm_model_type != 'deepseek': res['qk_layernorm'] = True res['moe_router_load_balancing_type'] = 'seq_aux_loss' - res.pop('num_query_groups', None) # https://github.com/NVIDIA/Megatron-LM/issues/1475 if llm_model_type == 'dots1': res['moe_router_score_function'] = 'sigmoid' elif llm_model_type == 'deepseek_v32': res['experimental_attention_variant'] = 'dsa' + elif llm_model_type == 'deepseek_v4': + res['experimental_attention_variant'] = 'dsv4_hybrid' + res['csa_window_size'] = window_size elif llm_model_type == 'hunyuan': # Since HunYuan’s attention applies RoPE before using q/k_layernorm, # which is incompatible with megatron-core, support is not provided here. diff --git a/src/mcore_bridge/model/constant.py b/src/mcore_bridge/model/constant.py index 9b8dc1b..9708f6a 100644 --- a/src/mcore_bridge/model/constant.py +++ b/src/mcore_bridge/model/constant.py @@ -9,6 +9,7 @@ class LLMModelType: minimax_m2 = 'minimax_m2' hy_v3 = 'hy_v3' bailing_moe = 'bailing_moe' + deepseek_v4 = 'deepseek_v4' qwen3_emb = 'qwen3_emb' diff --git a/src/mcore_bridge/model/gpts/__init__.py b/src/mcore_bridge/model/gpts/__init__.py index 52b007f..6eb44db 100644 --- a/src/mcore_bridge/model/gpts/__init__.py +++ b/src/mcore_bridge/model/gpts/__init__.py @@ -1,2 +1,2 @@ # Copyright (c) ModelScope Contributors. All rights reserved. -from . import bailing_moe, glm4, hunyuan, llm, minimax_m2, olmoe, qwen3_emb, qwen3_next +from . import bailing_moe, deepseek_v4, glm4, hunyuan, llm, minimax_m2, olmoe, qwen3_emb, qwen3_next diff --git a/src/mcore_bridge/model/gpts/deepseek_v4.py b/src/mcore_bridge/model/gpts/deepseek_v4.py index e69de29..9109b78 100644 --- a/src/mcore_bridge/model/gpts/deepseek_v4.py +++ b/src/mcore_bridge/model/gpts/deepseek_v4.py @@ -0,0 +1,22 @@ +# Copyright (c) ModelScope Contributors. All rights reserved. +from mcore_bridge.bridge import GPTBridge + +from ..constant import ModelType +from ..register import ModelLoader, ModelMeta, register_model + + +class DeepseekV4Loader(ModelLoader): + pass + + +class DeepseekV4Bridge(GPTBridge): + pass + + +register_model( + ModelMeta( + ModelType.deepseek_v4, + ['deepseek_v4'], + bridge_cls=DeepseekV4Bridge, + loader=DeepseekV4Loader, + ))