-
Notifications
You must be signed in to change notification settings - Fork 13
[model] Support deepseek-v4 #86
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,2 +1,2 @@ | ||
| # Copyright (c) ModelScope Contributors. All rights reserved. | ||
| from . import bailing_moe, glm4, hunyuan, llm, minimax_m2, olmoe, qwen3_emb, qwen3_next | ||
| from . import bailing_moe, deepseek_v4, glm4, hunyuan, llm, minimax_m2, olmoe, qwen3_emb, qwen3_next |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,22 @@ | ||
| # Copyright (c) ModelScope Contributors. All rights reserved. | ||
| from mcore_bridge.bridge import GPTBridge | ||
|
|
||
| from ..constant import ModelType | ||
| from ..register import ModelLoader, ModelMeta, register_model | ||
|
|
||
|
|
||
| class DeepseekV4Loader(ModelLoader): | ||
| pass | ||
|
|
||
|
|
||
| class DeepseekV4Bridge(GPTBridge): | ||
| pass | ||
|
|
||
|
|
||
| register_model( | ||
| ModelMeta( | ||
| ModelType.deepseek_v4, | ||
| ['deepseek_v4'], | ||
| bridge_cls=DeepseekV4Bridge, | ||
| loader=DeepseekV4Loader, | ||
| )) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -191,13 +191,19 @@ def can_recompute_pre_mlp_layernorm_for_cudagraph(): | |
| if 'mlp' in self.config.recompute_modules: | ||
| if not self.is_moe_layer: | ||
| self.recompute_mlp = True | ||
| if hasattr(self.config, 'fine_grained_activation_offloading'): | ||
| self.offload_attn_norm = ( | ||
| self.config.fine_grained_activation_offloading and 'attn_norm' in self.config.offload_modules | ||
| and not isinstance(self.input_layernorm, IdentityOp)) | ||
| self.offload_mlp_norm = ( | ||
| self.config.fine_grained_activation_offloading and 'mlp_norm' in self.config.offload_modules | ||
| and not isinstance(self.pre_mlp_layernorm, IdentityOp)) | ||
| if hasattr(self, '_set_offload_modules'): | ||
| from megatron.core.transformer.transformer_layer import _get_offloading_interface | ||
| self._set_offload_modules() | ||
| self.off_interface = _get_offloading_interface() | ||
| self.mlp_norm_manager = None | ||
|
Comment on lines
+194
to
+198
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The initialization of offloading managers for Megatron-Core 0.17+ is incomplete. Setting `self.mlp_norm_manager = None` […] Also, the local import of `_get_offloading_interface` […]
if hasattr(self, '_set_offload_modules'):
from megatron.core.transformer.transformer_layer import _get_offloading_interface
self._set_offload_modules()
self.off_interface = _get_offloading_interface()
offload_modules = getattr(self.config, 'offload_modules', []) or []
is_offloading = getattr(self.config, 'fine_grained_activation_offloading', False)
self.attn_norm_manager = self.off_interface.get_manager('attn_norm') if is_offloading and 'attn_norm' in offload_modules else None
self.mlp_norm_manager = self.off_interface.get_manager('mlp_norm') if is_offloading and 'mlp_norm' in offload_modules else None |
||
| else: | ||
| if hasattr(self.config, 'fine_grained_activation_offloading'): | ||
| self.offload_attn_norm = ( | ||
| self.config.fine_grained_activation_offloading and 'attn_norm' in self.config.offload_modules | ||
| and not isinstance(self.input_layernorm, IdentityOp)) | ||
| self.offload_mlp_norm = ( | ||
| self.config.fine_grained_activation_offloading and 'mlp_norm' in self.config.offload_modules | ||
| and not isinstance(self.pre_mlp_layernorm, IdentityOp)) | ||
|
|
||
| # @jcasper how should we handle nvfuser? | ||
| # Set bias+dropout+add fusion grad_enable execution handler. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The PR adds `deepseek_v4` to the list of supported models, but the actual implementation appears to be missing. The file `src/mcore_bridge/model/gpts/deepseek_v4.py` is empty in the provided context, and there are no changes to model registration or configuration logic to support this new model type. Please ensure the implementation is included or clarify if it relies on an existing model type.