sgl-project · elad-inferize · Apr 16, 2026 · gemini-code-assist · Apr 16, 2026 · gemini-code-assist
@@ -28,7 +28,7 @@
 from transformers.cache_utils import DynamicCache
 
 from specforge.core.eagle3_adapters import BackendAdapter, SdpaLikeAdapter, UspAdapter
-from specforge.core.loss import LogSoftmaxLoss
+from specforge.core.loss import LogSoftmaxLoss, _compute_loss
 from specforge.modeling.draft import Eagle3DraftModel
 from specforge.utils import padding
 
@@ -92,7 +92,12 @@ def _acc_and_loss(
             )
             acc = local_correct / local_denom
 
-        loss = LogSoftmaxLoss.apply(logits, target_p, position_mask)
+        try:
+            loss = LogSoftmaxLoss.apply(logits, target_p, position_mask)
+        except RuntimeError:
+            # Fused Triton kernel has a block-size ceiling (131072); fall back
+            # to the @torch.compile reference for large-vocab models.
+            loss = _compute_loss(logits, target_p, position_mask)
-        try:
-            loss = LogSoftmaxLoss.apply(logits, target_p, position_mask)
-        except RuntimeError:
-            # Fused Triton kernel has a block-size ceiling (131072); fall back
-            # to the @torch.compile reference for large-vocab models.
-            loss = _compute_loss(logits, target_p, position_mask)
+        if logits.shape[-1] <= 131072:
+            loss = LogSoftmaxLoss.apply(logits, target_p, position_mask)
+        else:
+            # Fused Triton kernel has a block-size ceiling (131072); fall back
+            # to the @torch.compile reference for large-vocab models.
+            loss = _compute_loss(logits, target_p, position_mask)
-        try:
-            loss = LogSoftmaxLoss.apply(logits, target_p, position_mask)
-        except RuntimeError:
-            # Fused Triton kernel has a block-size ceiling (131072); fall back
-            # to the @torch.compile reference for large-vocab models.
-            loss = _compute_loss(logits, target_p, position_mask)
+        if logits.shape[-1] <= 131072:
+            loss = LogSoftmaxLoss.apply(logits, target_p, position_mask)
+        else:
+            # Fused Triton kernel has a block-size ceiling (131072); fall back
+            # to the @torch.compile reference for large-vocab models.
+            loss = _compute_loss(logits, target_p, position_mask)
         loss = adapter.reduce_loss(loss)
         return acc, loss
 
@@ -553,7 +558,12 @@ def forward(
                 )
 
             # Step 5.6: calculate loss, in-place modifies logits!
-            loss = LogSoftmaxLoss.apply(logits, target_p, position_mask)
+            try:
+                loss = LogSoftmaxLoss.apply(logits, target_p, position_mask)
+            except RuntimeError:
+                # Fused Triton kernel has a block-size ceiling (131072); fall
+                # back to the @torch.compile reference for large-vocab models.
+                loss = _compute_loss(logits, target_p, position_mask)
-            try:
-                loss = LogSoftmaxLoss.apply(logits, target_p, position_mask)
-            except RuntimeError:
-                # Fused Triton kernel has a block-size ceiling (131072); fall
-                # back to the @torch.compile reference for large-vocab models.
-                loss = _compute_loss(logits, target_p, position_mask)
+            if logits.shape[-1] <= 131072:
+                loss = LogSoftmaxLoss.apply(logits, target_p, position_mask)
+            else:
+                # Fused Triton kernel has a block-size ceiling (131072); fall
+                # back to the @torch.compile reference for large-vocab models.
+                loss = _compute_loss(logits, target_p, position_mask)
-            try:
-                loss = LogSoftmaxLoss.apply(logits, target_p, position_mask)
-            except RuntimeError:
-                # Fused Triton kernel has a block-size ceiling (131072); fall
-                # back to the @torch.compile reference for large-vocab models.
-                loss = _compute_loss(logits, target_p, position_mask)
+            if logits.shape[-1] <= 131072:
+                loss = LogSoftmaxLoss.apply(logits, target_p, position_mask)
+            else:
+                # Fused Triton kernel has a block-size ceiling (131072); fall
+                # back to the @torch.compile reference for large-vocab models.
+                loss = _compute_loss(logits, target_p, position_mask)
             plosses.append(loss)
 
             if not is_last:

@@ -349,7 +349,14 @@ def from_pretrained(
     def set_aux_hidden_states_layers(
         self, aux_hidden_states_layers: Optional[List[int]] = None
     ) -> None:
-        self.model_runner.model.set_eagle3_layers_to_capture(aux_hidden_states_layers)
+        # Some target models (e.g., Kimi-K2.5) load via a multimodal wrapper
+        # that delegates to a text backbone at .language_model.  The EAGLE-3
+        # helper set_eagle3_layers_to_capture is defined on the text backbone,
+        # not the outer wrapper.
+        inner = getattr(
+            self.model_runner.model, "language_model", self.model_runner.model
+        )
+        inner.set_eagle3_layers_to_capture(aux_hidden_states_layers)
 
     @torch.no_grad
     def _extend(

@@ -164,10 +164,27 @@ def wrap_eagle3_logits_processors_in_module(
     module: nn.Module, return_full_logits: bool = False
 ):
     """
-    This function will wrap the SGLang's original logits processor with the modified one for EAGLE3.
+    Wrap SGLang's original logits processors with the EAGLE3 variant.
+
+    Implementation notes:
+      1. Iterate over a materialized list so mutations to _modules do not
+         corrupt the iterator returned by named_modules().
+      2. Use module.set_submodule(dotted_name, wrapped) so nested
+         LogitsProcessors (e.g. language_model.logits_processor) are actually
+         replaced in their parent module, instead of creating a literal
+         dotted-name attribute on the root module.
+      3. If the root module itself is a LogitsProcessor (empty name), it
+         cannot be replaced in place; a warning is printed and it is skipped.
     """
-    for name, submodule in module.named_modules():
-        if isinstance(submodule, LogitsProcessor):
-            wrapped = LogitsProcessorForEAGLE3(submodule, return_full_logits)
-            setattr(module, name, wrapped)
-            print(f"wrapped {name} with LogitsProcessorForEAGLE3")
+    to_wrap = [
+        (name, submodule)
+        for name, submodule in list(module.named_modules())
+        if isinstance(submodule, LogitsProcessor)
+    ]
+    for name, submodule in to_wrap:
+        wrapped = LogitsProcessorForEAGLE3(submodule, return_full_logits)
-        wrapped = LogitsProcessorForEAGLE3(submodule, return_full_logits)
+        wrapped = LogitsProcessorForEAGLE3(submodule, return_logits=return_full_logits)
-        wrapped = LogitsProcessorForEAGLE3(submodule, return_full_logits)
+        wrapped = LogitsProcessorForEAGLE3(submodule, return_logits=return_full_logits)
+        if name == "":
+            print(
+                "warning: root module is a LogitsProcessor; cannot replace in-place"
+            )
+            continue
+        module.set_submodule(name, wrapped)
+        print(f"wrapped {name} with LogitsProcessorForEAGLE3")