sgl-project · bluecoffee8 · May 27, 2026 · May 28, 2026 · May 28, 2026 · May 29, 2026
@@ -0,0 +1,33 @@
+{
+  "architectures": [
+    "LlamaForCausalLMEagle3"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "fc_norm": true,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 12288,
+  "max_position_embeddings": 2048,
+  "max_window_layers": 48,
+  "model_type": "llama",
+  "norm_output": true,
+  "num_attention_heads": 32,
+  "num_hidden_layers": 1,
+  "num_key_value_heads":4,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.53.2",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936,
+  "draft_vocab_size": 32000
+}
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+ROOT_DIR=$(dirname $SCRIPT_DIR)
+export TORCHINDUCTOR_CACHE_DIR=$ROOT_DIR/cache/compiled_kernels
+
+# support tp4/tp8 train eagle3 for Qwen3-30B-A3B
+NUM_GPUS=${1:-4}
+TP_SIZE=${2:-4}
+BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64}
+
+torchrun \
+    --standalone \
+    --nproc_per_node $NUM_GPUS \
+    $ROOT_DIR/scripts/train_eagle3.py \
+    --target-model-path Qwen/Qwen3-30B-A3B-Instruct-2507 \
+    --draft-model-config $ROOT_DIR/configs/qwen3-30B-A3B-eagle3.1.json \
+    --train-data-path $ROOT_DIR/cache/dataset/sharegpt_train.jsonl \
+    --build-dataset-num-proc $BUILD_DATASET_NUM_PROC \
+    --output-dir $ROOT_DIR/outputs/qwen3-30b-a3b-instruct-eagle3-sharegpt \
+    --num-epochs 10 \
+    --batch-size 1 \
+    --learning-rate 1e-4 \
+    --max-length 4096 \
+    --chat-template qwen \
+    --cache-dir $ROOT_DIR/cache \
+    --embedding-key model.embed_tokens.weight \
+    --tp-size $TP_SIZE \
+    --target-model-backend sglang
@@ -41,6 +41,15 @@ class Eagle3DraftModel(PreTrainedModel, ABC):
     the abstract methods to support training with TTT.
     """
 
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.num_aux_hidden_states = getattr(config, "num_aux_hidden_states", None)
+        if self.num_aux_hidden_states is None:
+            eagle_config = getattr(config, "eagle_config", None) or {}
+            layer_ids = eagle_config.get("eagle_aux_hidden_state_layer_ids")
+            self.num_aux_hidden_states = len(layer_ids) if layer_ids else 3
+
     @abstractmethod
     def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         """

@@ -1326,16 +1326,26 @@ def __init__(self, config, quant_config=None, attention_backend="sdpa") -> None:
         )
         self.midlayer = LlamaDecoderLayer(config, attention_backend=attention_backend)
 
-        if hasattr(config, "target_hidden_size"):
-            self.fc = torch.nn.Linear(
-                config.target_hidden_size * 3, config.hidden_size, bias=False
+        self.target_hidden_size = getattr(config, "target_hidden_size", config.hidden_size)
+
+        self.fc = torch.nn.Linear(
+            self.target_hidden_size * self.num_aux_hidden_states,
+            config.hidden_size,
+            bias=False,
+        )
+        use_fc_norm = getattr(config, "fc_norm", None)
+        if use_fc_norm:
+            self.fc_norm = nn.ModuleList(
+                [
+                    LlamaRMSNorm(self.target_hidden_size, eps=config.rms_norm_eps)
+                    for _ in range(self.num_aux_hidden_states)
+                ]
             )
         else:
-            self.fc = torch.nn.Linear(
-                config.hidden_size * 3, config.hidden_size, bias=False
-            )
+            self.fc_norm = None
 
         self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm_output = getattr(config, "norm_output", True)
         self.lm_head = nn.Linear(
             config.hidden_size, config.draft_vocab_size, bias=False
         )
@@ -1406,12 +1416,20 @@ def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
     def project_hidden_states(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Project concatenated auxiliary hidden states through ``self.fc``.
+
+        The last dimension of ``hidden_states`` must equal
+        ``self.target_hidden_size * self.num_aux_hidden_states``. When
+        ``self.fc_norm`` is set, the input is split along the last dimension
+        into ``self.num_aux_hidden_states`` chunks, each chunk is passed
+        through its own entry of ``self.fc_norm``, and the results are
+        concatenated again before the linear projection.
+
+        Args:
+            hidden_states: tensor whose last dimension holds the concatenated
+                auxiliary hidden states.
+
+        Returns:
+            The output of ``self.fc`` applied to the (optionally normalized)
+            input.
+        """
-        # eagle 3 requires hidden states from 3 layers
-        assert hidden_states.size(-1) == self.config.hidden_size * 3
+        assert hidden_states.size(-1) == self.target_hidden_size * self.num_aux_hidden_states
+        if self.fc_norm is not None:
+            # One norm module per chunk; zip pairs them up in order.
+            chunks = hidden_states.chunk(self.num_aux_hidden_states, dim=-1)
+            hidden_states = torch.cat(
+                [norm(chunk) for norm, chunk in zip(self.fc_norm, chunks)],
+                dim=-1,
+            )
         return self.fc(hidden_states)
 
     def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Return ``self.lm_head`` logits for ``hidden_states``.
+
+        ``self.norm`` is applied first when ``self.norm_output`` is truthy;
+        otherwise the hidden states are fed to ``self.lm_head`` unchanged.
+        """
-        norm_hidden_states = self.norm(hidden_states)
+        if self.norm_output:
+            norm_hidden_states = self.norm(hidden_states)
+        else:
+            norm_hidden_states = hidden_states
         return self.lm_head(norm_hidden_states)
 
     def backbone(