From 88beb3b85df2545259aef3b892997605672db2a5 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 4 Jun 2026 23:42:12 +0000
Subject: [PATCH 01/34] Enable fused linear layers to load themselves

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/layers/linear.py | 32 ++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index e50a0e6b0025..62f978489f54 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -3,6 +3,7 @@
 
 import itertools
 from abc import abstractmethod
+from collections.abc import Iterable
 
 import torch
 from torch.nn.parameter import Parameter, UninitializedParameter
@@ -968,6 +969,21 @@ def weight_loader_v2(
             tp_rank=self.tp_rank,
         )
 
+    def load_weights(
+        self, weights: Iterable[tuple[str, torch.Tensor]]
+    ) -> Iterable[str]:
+        for name, loaded_weight in weights:
+            shard_id_str, _, param_name = name.partition(".")
+            # If the shard_id is not an integer, the weight is not sharded
+            try:
+                shard_id = int(shard_id_str)
+            except ValueError:
+                shard_id = None
+            # If param_name is "bias" get it from self, otherwise load into self
+            param: Parameter = getattr(self, param_name, self)
+            param.weight_loader(param, loaded_weight, shard_id)
+            yield param_name
+
 
 class QKVParallelLinear(ColumnParallelLinear):
     """Linear layers for the attention's QKV transformation.
@@ -1383,6 +1399,22 @@ def weight_loader(
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
+    def load_weights(
+        self, weights: Iterable[tuple[str, torch.Tensor]]
+    ) -> Iterable[str]:
+        for name, loaded_weight in weights:
+            shard_id_str, _, param_name = name.partition(".")
+            # If the shard_id is not valid, the weight is not sharded
+            try:
+                self.validate_shard_id(shard_id_str)
+                shard_id = shard_id_str
+            except ValueError:
+                shard_id = None
+            # If param_name is "bias" get it from self, otherwise load into self
+            param: Parameter = getattr(self, param_name, self)
+            param.weight_loader(param, loaded_weight, shard_id)
+            yield param_name
+
 
 # --8<-- [start:row_parallel_linear]
 @PluggableLayer.register("row_parallel_linear")

From 41f4584b00b019160dda51d9ea97d419d6f3cb8b Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 4 Jun 2026 23:42:50 +0000
Subject: [PATCH 02/34] Enable GPTQ extra bias skipping in AutoWeightsLoader

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .../model_executor/models/transformers/base.py |  3 ---
 vllm/model_executor/models/utils.py            | 18 ++++++++----------
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py
index 35897ce7dbca..e26ef8f4b0b3 100644
--- a/vllm/model_executor/models/transformers/base.py
+++ b/vllm/model_executor/models/transformers/base.py
@@ -154,9 +154,6 @@ def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""):
                     "Transformers modeling backend does "
                     "not support MXFP4 quantization yet."
                 )
-            # Skip loading extra bias for GPTQ models.
-            if "gptq" in quant_method_name:
-                self.ignore_unexpected_suffixes.append(".bias")
 
         self._patch_config()
         from_config_kwargs = dict(
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 83d113415dce..6a23d95e138b 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -348,16 +348,14 @@ def load_weights(
         # We look at the causal model's direct children for this reason.
         modules = (self.module, *self.module.children())
         iterator = (m.quant_config for m in modules if hasattr(m, "quant_config"))
-        quant_config = next(iterator, None)
-        cache_scale_mapper = (
-            quant_config.get_cache_scale_mapper() if quant_config is not None else None
-        )
-        if cache_scale_mapper is not None:
-            mapper = (
-                mapper | cache_scale_mapper
-                if mapper is not None
-                else cache_scale_mapper
-            )
+        if quant_config := next(iterator, None):
+            # Skip loading extra bias for GPTQ models
+            if "gptq" in quant_config.get_name():
+                self.ignore_unexpected_suffixes.append(".bias")
+            # Get mappings for KV cache quantization scales
+            if cache_scale_mapper := quant_config.get_cache_scale_mapper():
+                mapper = mapper or WeightsMapper()
+                mapper |= cache_scale_mapper
         if mapper is not None:
             weights = mapper.apply(weights)
         # filter out weights with first-prefix/substr to skip in name

From fcd151b147c926e4eeb2ada5e9ce690a7daaf278 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 4 Jun 2026 23:44:31 +0000
Subject: [PATCH 03/34] Use fused-layer load_weights in the OLMo and OPT weight loaders

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/models/olmo.py | 52 +++++++-----------------------
 vllm/model_executor/models/opt.py  | 44 ++++---------------------
 2 files changed, 18 insertions(+), 78 deletions(-)

diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py
index 4491a6a3ea1b..73645c2b72b4 100644
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -48,13 +48,12 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (
     AutoWeightsLoader,
-    is_pp_missing_parameter,
+    WeightsMapper,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -300,43 +299,6 @@ def forward(
         hidden_states = self.norm(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters(remove_duplicate=False))
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     """
@@ -355,6 +317,16 @@ class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
         ],
     }
 
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+            ".gate_proj": ".gate_up_proj.0",
+            ".up_proj": ".gate_up_proj.1",
+        }
+    )
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
@@ -409,4 +381,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
                 ["lm_head.weight"] if self.config.tie_word_embeddings else None
             ),
         )
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index 81653b9516ac..80e05836b07e 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -44,14 +44,12 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (
     AutoWeightsLoader,
     WeightsMapper,
-    is_pp_missing_parameter,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -325,41 +323,6 @@ def forward(
             input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
         )
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-        ]
-        params_dict = dict(self.named_parameters(remove_duplicate=False))
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class OPTForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     packed_modules_mapping = {
@@ -367,9 +330,14 @@ class OPTForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     }
 
     hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+        },
         orig_to_new_prefix={
             "decoder.": "model.decoder.",
-        }
+        },
     )
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):

From a9788ab71eb33f37b849129b1090131f27e8a1a5 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 5 Jun 2026 10:39:22 +0000
Subject: [PATCH 04/34] Fix LoRA loading for OLMo and OPT

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/lora/layers/column_parallel_linear.py | 10 ++++------
 vllm/lora/utils.py                         | 23 ++++++++++++++++------
 vllm/model_executor/layers/linear.py       |  8 +++++++-
 vllm/model_executor/models/olmo.py         | 12 -----------
 vllm/model_executor/models/opt.py          |  4 ----
 5 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py
index 8a86191b8918..f9643809c7f4 100644
--- a/vllm/lora/layers/column_parallel_linear.py
+++ b/vllm/lora/layers/column_parallel_linear.py
@@ -422,9 +422,8 @@ def can_replace_layer(
         packed_modules_list: list,
         model_config: PretrainedConfig | None = None,
     ) -> bool:
-        return (
-            type(source_layer) is maybe_get_oot_by_class(QKVParallelLinear)
-            and len(packed_modules_list) == 1
+        return type(source_layer) is maybe_get_oot_by_class(QKVParallelLinear) and (
+            len(packed_modules_list) == 1 or source_layer.checkpoint_format == "fused"
         )
 
 
@@ -483,9 +482,8 @@ def can_replace_layer(
         packed_modules_list: list,
         model_config: PretrainedConfig | None = None,
     ) -> bool:
-        return (
-            type(source_layer) is maybe_get_oot_by_class(QKVParallelLinear)
-            and len(packed_modules_list) == 3
+        return type(source_layer) is maybe_get_oot_by_class(QKVParallelLinear) and (
+            len(packed_modules_list) == 3 or source_layer.checkpoint_format == "sharded"
         )
 
 
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index d5c9a1a6ff8a..bd23fb69214b 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -34,7 +34,11 @@
     VocabParallelEmbeddingWithLoRA,
 )
 from vllm.model_executor.layers.fused_moe import FusedMoE
-from vllm.model_executor.layers.linear import LinearBase
+from vllm.model_executor.layers.linear import (
+    LinearBase,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+)
 from vllm.model_executor.utils import get_moe_expert_mapping, get_packed_modules_mapping
 from vllm.transformers_utils.repo_utils import hf_api
 
@@ -219,11 +223,18 @@ def get_supported_lora_modules(model: nn.Module) -> list[str]:
             for name in embedding_modules:
                 supported_lora_modules.add(name)
 
-        # get all the linear subfixes.
-        if isinstance(module, (LinearBase,)):
-            supported_lora_modules.add(name.split(".")[-1])
-
-        if isinstance(module, (FusedMoE,)):
+        if (
+            isinstance(module, QKVParallelLinear)
+            and module.checkpoint_format == "sharded"
+        ):
+            supported_lora_modules.update(["q", "k", "v"])
+        elif (
+            isinstance(module, MergedColumnParallelLinear)
+            and module.checkpoint_format == "sharded"
+        ):
+            shard_ids = [str(i) for i in range(len(module.output_sizes))]
+            supported_lora_modules.update(shard_ids)
+        elif isinstance(module, (LinearBase, FusedMoE)):
             supported_lora_modules.add(name.split(".")[-1])
 
     return list(supported_lora_modules)
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 62f978489f54..40e98ac34ad6 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -4,6 +4,7 @@
 import itertools
 from abc import abstractmethod
 from collections.abc import Iterable
+from typing import Literal
 
 import torch
 from torch.nn.parameter import Parameter, UninitializedParameter
@@ -645,8 +646,8 @@ def __init__(
         self.output_sizes = output_sizes
         self.tp_size = get_tensor_model_parallel_world_size() if not disable_tp else 1
         self.tp_rank = get_tensor_model_parallel_rank() if not disable_tp else 0
-
         assert all(output_size % self.tp_size == 0 for output_size in output_sizes)
+        self.checkpoint_format: Literal["sharded", "fused"] | None = None
         super().__init__(
             input_size=input_size,
             output_size=sum(output_sizes),
@@ -977,8 +978,10 @@ def load_weights(
             # If the shard_id is not an integer, the weight is not sharded
             try:
                 shard_id = int(shard_id_str)
+                self.checkpoint_format = "sharded"
             except ValueError:
                 shard_id = None
+                self.checkpoint_format = "fused"
             # If param_name is "bias" get it from self, otherwise load into self
             param: Parameter = getattr(self, param_name, self)
             param.weight_loader(param, loaded_weight, shard_id)
@@ -1056,6 +1059,7 @@ def __init__(
             self.num_kv_heads * self.head_size * tp_size,  # k_proj
             self.num_kv_heads * self.v_head_size * tp_size,  # v_proj
         ]
+        self.checkpoint_format: Literal["fused", "sharded"] | None = None
 
         super().__init__(
             input_size=input_size,
@@ -1408,8 +1412,10 @@ def load_weights(
             try:
                 self.validate_shard_id(shard_id_str)
                 shard_id = shard_id_str
+                self.checkpoint_format = "sharded"
             except ValueError:
                 shard_id = None
+                self.checkpoint_format = "fused"
             # If param_name is "bias" get it from self, otherwise load into self
             param: Parameter = getattr(self, param_name, self)
             param.weight_loader(param, loaded_weight, shard_id)
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py
index 73645c2b72b4..297f39726a1e 100644
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -305,18 +305,6 @@ class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     Extremely barebones HF model wrapper.
     """
 
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={
             ".q_proj": ".qkv_proj.q",
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index 80e05836b07e..ccb6798dec75 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -325,10 +325,6 @@ def forward(
 
 
 class OPTForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
-    packed_modules_mapping = {
-        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-    }
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={
             ".q_proj": ".qkv_proj.q",

From cca665e8984fbbcd4c923939dde073b4192bf7a6 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 5 Jun 2026 13:13:48 +0000
Subject: [PATCH 05/34] Delete some more load_weights methods

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/models/baichuan.py  | 54 ++++-----------------
 vllm/model_executor/models/gemma.py     | 63 +++++--------------------
 vllm/model_executor/models/internlm2.py | 51 ++++----------------
 vllm/model_executor/models/olmo2.py     | 61 +++++-------------------
 vllm/model_executor/models/orion.py     | 52 +++++---------------
 vllm/model_executor/models/phi.py       | 60 ++++-------------------
 vllm/model_executor/models/qwen.py      | 60 ++++++-----------------
 vllm/model_executor/models/qwen_vl.py   |  8 ----
 vllm/model_executor/models/stablelm.py  | 52 +++++---------------
 vllm/model_executor/models/step1.py     | 56 +++++-----------------
 10 files changed, 96 insertions(+), 421 deletions(-)

diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index bc1cd2ed811b..85cb254670e3 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -60,7 +60,7 @@
 from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
 from .utils import (
     AutoWeightsLoader,
-    is_pp_missing_parameter,
+    WeightsMapper,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -342,52 +342,14 @@ def forward(
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
-                continue
-
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
-    packed_modules_mapping = {
-        "W_pack": ["W_pack"],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".gate_proj": ".gate_up_proj.0",
+            ".up_proj": ".gate_up_proj.1",
+        }
+    )
 
     def __init__(
         self,
@@ -447,7 +409,7 @@ def compute_logits(
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
     def lm_head_weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
         # Unlike Baichuan, Baichuan2 normalizes the head weights.
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 6e35020a6eac..64808e95ae2b 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -42,13 +42,12 @@
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (
     AutoWeightsLoader,
-    is_pp_missing_parameter,
+    WeightsMapper,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -324,57 +323,17 @@ def forward(
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            for param_name, shard_name, shard_id in stacked_params_mapping:
-                if shard_name not in name:
-                    continue
-                name = name.replace(shard_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-
-        return loaded_params
-
 
 class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+            ".gate_proj": ".gate_up_proj.0",
+            ".up_proj": ".gate_up_proj.1",
+        }
+    )
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -421,4 +380,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             self,
             skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 6b1712ede320..4010dd54d7d4 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -35,7 +35,6 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
@@ -43,7 +42,7 @@
 from .utils import (
     AutoWeightsLoader,
     StageMissingLayer,
-    is_pp_missing_parameter,
+    WeightsMapper,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -309,48 +308,14 @@ def forward(
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("gate_up_proj", "w1", 0),
-            ("gate_up_proj", "w3", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
-                continue
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
-    packed_modules_mapping = {
-        "wqkv": ["wqkv"],
-        "gate_up_proj": ["w1", "w3"],
-    }
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".w1": ".gate_up_proj.0",
+            ".w3": ".gate_up_proj.1",
+        }
+    )
 
     def __init__(
         self,
@@ -409,7 +374,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             self,
             skip_prefixes=(["output."] if self.config.tie_word_embeddings else None),
         )
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
 
 @default_pooling_type(tok_pooling_type="ALL")
diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py
index 212140fe15ea..5f3b1c9a839c 100644
--- a/vllm/model_executor/models/olmo2.py
+++ b/vllm/model_executor/models/olmo2.py
@@ -52,12 +52,11 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP
 from vllm.model_executor.models.utils import (
     AutoWeightsLoader,
+    WeightsMapper,
     extract_layer_index,
-    is_pp_missing_parameter,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -342,59 +341,21 @@ def forward(
         hidden_states = self.norm(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
-        ]
-
-        params_dict = dict(self.named_parameters(remove_duplicate=False))
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if is_pp_missing_parameter(name, self):
-                continue
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader  # type: ignore
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class Olmo2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     """
     Extremely barebones HF model wrapper.
     """
 
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+            ".gate_proj": ".gate_up_proj.0",
+            ".up_proj": ".gate_up_proj.1",
+        }
+    )
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -450,4 +411,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
                 ["lm_head.weight"] if self.config.tie_word_embeddings else None
             ),
         )
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py
index 3cacb9d61cd5..52addf4cef97 100644
--- a/vllm/model_executor/models/orion.py
+++ b/vllm/model_executor/models/orion.py
@@ -32,13 +32,12 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsPP
 from .utils import (
     AutoWeightsLoader,
-    is_pp_missing_parameter,
+    WeightsMapper,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -277,45 +276,18 @@ def forward(
         hidden_states = self.norm(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class OrionForCausalLM(nn.Module, SupportsPP):
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+            ".gate_proj": ".gate_up_proj.0",
+            ".up_proj": ".gate_up_proj.1",
+        }
+    )
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
@@ -362,4 +334,4 @@ def compute_logits(
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index 75c42c0d3930..c417c658d2e2 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -62,13 +62,12 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (
     AutoWeightsLoader,
-    is_pp_missing_parameter,
+    WeightsMapper,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -257,56 +256,15 @@ def forward(
 
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-
-        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
-                continue
-
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # pylint: disable=E1136
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ]
-    }
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+        }
+    )
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -360,4 +318,4 @@ def compute_logits(
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index b4526beac637..f5fa9e7e1d24 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -34,12 +34,12 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (
-    is_pp_missing_parameter,
+    AutoWeightsLoader,
+    WeightsMapper,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -263,6 +263,13 @@ def forward(
 
 
 class QWenBaseModel(nn.Module):
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".w2": ".gate_up_proj.0",
+            ".w1": ".gate_up_proj.1",
+        }
+    )
+
     def __init__(
         self,
         *,
@@ -304,53 +311,14 @@ def compute_logits(
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("gate_up_proj", "w2", 0),
-            ("gate_up_proj", "w1", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
-                continue
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # Skip layers on other devices.
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # Skip layers on other devices.
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
+        )
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
 
 class QWenLMHeadModel(QWenBaseModel, SupportsPP, SupportsLoRA):
-    packed_modules_mapping = {
-        "c_attn": ["c_attn"],
-        "gate_up_proj": [
-            "w2",
-            "w1",
-        ],
-    }
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         if hasattr(config, "visual"):
diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py
index e2232956ea86..9d3980af76a2 100644
--- a/vllm/model_executor/models/qwen_vl.py
+++ b/vllm/model_executor/models/qwen_vl.py
@@ -586,14 +586,6 @@ def _get_prompt_updates(
 class QwenVLForConditionalGeneration(
     QWenBaseModel, SupportsPP, SupportsLoRA, SupportsMultiModal
 ):
-    packed_modules_mapping = {
-        "c_attn": ["c_attn"],
-        "gate_up_proj": [
-            "w2",
-            "w1",
-        ],
-    }
-
     embed_input_ids = SupportsMultiModal.embed_input_ids
 
     def get_mm_mapping(self) -> MultiModelKeys:
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py
index 034c9c18ff7b..17349767b94c 100644
--- a/vllm/model_executor/models/stablelm.py
+++ b/vllm/model_executor/models/stablelm.py
@@ -45,13 +45,12 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsPP
 from .utils import (
     AutoWeightsLoader,
-    is_pp_missing_parameter,
+    WeightsMapper,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -266,45 +265,18 @@ def forward(
         hidden_states = self.norm(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class StablelmForCausalLM(nn.Module, SupportsPP):
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+            ".gate_proj": ".gate_up_proj.0",
+            ".up_proj": ".gate_up_proj.1",
+        }
+    )
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
@@ -351,4 +323,4 @@ def compute_logits(
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/step1.py b/vllm/model_executor/models/step1.py
index 07653fa6b377..529b405533ae 100644
--- a/vllm/model_executor/models/step1.py
+++ b/vllm/model_executor/models/step1.py
@@ -30,7 +30,6 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.interfaces import (
     EagleModelMixin,
     SupportsEagle,
@@ -40,7 +39,7 @@
 from vllm.model_executor.models.utils import (
     AutoWeightsLoader,
     PPMissingLayer,
-    is_pp_missing_parameter,
+    WeightsMapper,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -48,11 +47,6 @@
 from vllm.sequence import IntermediateTensors
 from vllm.v1.attention.backend import AttentionType
 
-STEP_PACKED_MODULES_MAPPING = {
-    "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-    "gate_up_proj": ["gate_proj", "up_proj"],
-}
-
 
 def _get_step_alibi_slopes(total_num_heads: int) -> torch.Tensor:
     """Reference ALiBi slopes used by Step models."""
@@ -242,42 +236,6 @@ def forward(
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            (".qkv_proj", ".q_proj", "q"),
-            (".qkv_proj", ".k_proj", "k"),
-            (".qkv_proj", ".v_proj", "v"),
-            (".gate_up_proj", ".gate_proj", 0),
-            (".gate_up_proj", ".up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)  # type: ignore[name-defined]
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class StepDecoderModel(nn.Module, EagleModelMixin):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -354,7 +312,15 @@ def forward(
 
 
 class Step1ForCausalLM(nn.Module, SupportsPP, SupportsEagle, SupportsEagle3):
-    packed_modules_mapping = STEP_PACKED_MODULES_MAPPING
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+            ".gate_proj": ".gate_up_proj.0",
+            ".up_proj": ".gate_up_proj.1",
+        }
+    )
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -413,4 +379,4 @@ def compute_logits(
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)

From e316238295637f433ce8d522afdffb91b39a5197 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 5 Jun 2026 14:57:07 +0000
Subject: [PATCH 06/34] Add patterns from `maybe_remap_kv_scale_name` to
 `QuantizationConfig.get_cache_scale_mapper`

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 tests/model_executor/test_weight_utils.py     | 121 ++++++++++++++++++
 .../layers/quantization/base_config.py        |  51 +++++++-
 .../model_executor/layers/quantization/fp8.py |  19 +--
 .../layers/quantization/quark/quark.py        |  19 +--
 4 files changed, 187 insertions(+), 23 deletions(-)

diff --git a/tests/model_executor/test_weight_utils.py b/tests/model_executor/test_weight_utils.py
index 260ebdcefb3b..9e67609b78e4 100644
--- a/tests/model_executor/test_weight_utils.py
+++ b/tests/model_executor/test_weight_utils.py
@@ -160,5 +160,126 @@ def test_missing_target_returns_none(self):
         assert result is None
 
 
+class TestKvCacheScaleMapper:
+    """The `WeightsMapper` returned by `get_cache_scale_mapper` replaces the
+    per-model `maybe_remap_kv_scale_name` calls. It must remap the same set of
+    checkpoint formats (the non-`params_dict`-dependent ones) and be idempotent
+    so it composes safely with a model's own qkv/gate_up `hf_to_vllm_mapper`."""
+
+    def _mapper(self):
+        # `get_cache_scale_mapper` is a staticmethod, so it can be called on the
+        # base class directly to get the default (non-config-specific) mapper.
+        from vllm.model_executor.layers.quantization.base_config import (
+            QuantizationConfig,
+        )
+
+        return QuantizationConfig.get_cache_scale_mapper()
+
+    def _map(self, name: str) -> str | None:
+        return self._mapper()._map_name(name)
+
+    @pytest.mark.parametrize(
+        "name,expected",
+        [
+            # Qwen3-MoE / llm-compressor fused qkv_proj
+            (
+                "model.layers.0.self_attn.qkv_proj.k_scale",
+                "model.layers.0.self_attn.attn.k_scale",
+            ),
+            (
+                "model.layers.0.self_attn.qkv_proj.v_scale",
+                "model.layers.0.self_attn.attn.v_scale",
+            ),
+            # ModelOpt / NVFP4 k_proj/v_proj
+            (
+                "model.layers.0.self_attn.k_proj.k_scale",
+                "model.layers.0.self_attn.attn.k_scale",
+            ),
+            (
+                "model.layers.0.self_attn.v_proj.v_scale",
+                "model.layers.0.self_attn.attn.v_scale",
+            ),
+            # deprecated fused kv_scale and bare scales
+            (
+                "model.layers.0.self_attn.kv_scale",
+                "model.layers.0.self_attn.attn.k_scale",
+            ),
+            (
+                "model.layers.0.self_attn.k_scale",
+                "model.layers.0.self_attn.attn.k_scale",
+            ),
+            # NemotronH mixer
+            (
+                "model.layers.0.mixer.k_proj.k_scale",
+                "model.layers.0.mixer.attn.k_scale",
+            ),
+            # already in vLLM form -> unchanged (idempotent)
+            (
+                "model.layers.0.self_attn.attn.k_scale",
+                "model.layers.0.self_attn.attn.k_scale",
+            ),
+            # non-kv scales must not be touched
+            (
+                "model.layers.0.self_attn.k_proj.weight_scale",
+                "model.layers.0.self_attn.k_proj.weight_scale",
+            ),
+            (
+                "model.layers.0.self_attn.k_proj.input_scale",
+                "model.layers.0.self_attn.k_proj.input_scale",
+            ),
+            # regular weights untouched
+            (
+                "model.layers.0.self_attn.q_proj.weight",
+                "model.layers.0.self_attn.q_proj.weight",
+            ),
+        ],
+    )
+    def test_remap(self, name, expected):
+        assert self._map(name) == expected
+
+    @pytest.mark.parametrize(
+        "name",
+        [
+            "model.layers.0.self_attn.k_scale",
+            "model.layers.0.self_attn.k_proj.k_scale",
+            "model.layers.0.self_attn.qkv_proj.v_scale",
+            "model.layers.0.mixer.k_proj.k_scale",
+        ],
+    )
+    def test_idempotent(self, name):
+        once = self._map(name)
+        assert once is not None
+        assert self._map(once) == once
+
+    def test_composes_with_qkv_mapper(self):
+        """Applied together with a model's qkv/gate_up mapper, the regex scale
+        rules run before the substr rename, so scales are normalized to `.attn.`
+        and regular projections are still fused correctly."""
+        from vllm.model_executor.models.utils import WeightsMapper
+
+        model_mapper = WeightsMapper(
+            orig_to_new_substr={
+                ".q_proj": ".qkv_proj.q",
+                ".k_proj": ".qkv_proj.k",
+                ".v_proj": ".qkv_proj.v",
+            }
+        )
+        # AutoWeightsLoader does `mapper |= cache_scale_mapper`
+        combined = model_mapper | self._mapper()
+
+        assert (
+            combined._map_name("model.layers.0.self_attn.q_proj.weight")
+            == "model.layers.0.self_attn.qkv_proj.q.weight"
+        )
+        assert (
+            combined._map_name("model.layers.0.self_attn.k_proj.k_scale")
+            == "model.layers.0.self_attn.attn.k_scale"
+        )
+        assert (
+            combined._map_name("model.layers.0.self_attn.k_scale")
+            == "model.layers.0.self_attn.attn.k_scale"
+        )
+
+
 if __name__ == "__main__":
     test_download_weights_from_hf()
diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py
index 3c03ff2233b0..141fcf1b4113 100644
--- a/vllm/model_executor/layers/quantization/base_config.py
+++ b/vllm/model_executor/layers/quantization/base_config.py
@@ -5,6 +5,7 @@
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Any
 
+import regex as re
 import torch
 from torch import nn
 from transformers import PretrainedConfig
@@ -19,10 +20,12 @@
 class QuantizeMethodBase(ABC):
     """Base class for different quantized methods."""
 
-    # Whether this method creates weights on meta device for online quantization.
-    # When True, weights are created on meta device and quantized layer-wise
-    # in process_weights_after_loading, reducing peak memory during loading.
     uses_meta_device: bool = False
+    """
+    Whether this method creates weights on meta device for online quantization.
+    When True, weights are created on meta device and quantized layer-wise
+    in process_weights_after_loading, reducing peak memory during loading.
+    """
 
     @abstractmethod
     def create_weights(
@@ -70,6 +73,18 @@ def method_has_implemented_embedding(method_class: type[QuantizeMethodBase]) ->
 class QuantizationConfig(ABC):
     """Base class for quantization configs."""
 
+    _ignore_unexpected_suffixes = (
+        ".q_scale",
+        ".k_scale",
+        ".v_scale",
+        ".q_zero_point",
+        ".k_zero_point",
+        ".v_zero_point",
+    )
+    """Suffixes of quantization parameters that may be present in the checkpoint but
+    not in the model, and should be ignored if unexpected during loading. These are used
+    after remapping, so should be in vLLM format (e.g. .q_scale, not .q.scale)."""
+
     def __init__(self):
         super().__init__()
         # mapping is updated by models as they initialize
@@ -162,14 +177,40 @@ def get_quant_method(
         """
         raise NotImplementedError
 
-    def get_cache_scale_mapper(self) -> "WeightsMapper | None":
+    @staticmethod
+    def get_cache_scale_mapper() -> "WeightsMapper":
         """Mapping from checkpoint KV-cache scale names to vLLM scale names.
 
         Returning a mapper here causes `AutoWeightsLoader` to apply it to the
         weight stream automatically; individual model `load_weights` methods
         do not need to know about KV-cache scales.
         """
-        return None
+        from vllm.model_executor.models.utils import WeightsMapper
+
+        orig_to_new_regex = {
+            # Deprecated fused kv_scale -> attn.k_scale
+            re.compile(r"\.kv_scale$"): r".attn.k_scale",
+            # ModelOpt: .self_attn.{k,v}_proj.{k,v}_scale -> .self_attn.attn.*
+            re.compile(r"\.self_attn\.[kv]_proj\.([kv])_scale$"): (
+                r".self_attn.attn.\1_scale"
+            ),
+            # Fused QKV / qkqkv proj: .self_attn.qk(qk)v_proj.{k,v}_scale -> attn
+            re.compile(r"\.self_attn\.qk(?:qk)?v_proj\.([kv])_scale$"): (
+                r".self_attn.attn.\1_scale"
+            ),
+            # NemotronH: .mixer.{k,v}_proj.{k,v}_scale -> .mixer.attn.*
+            re.compile(r"\.mixer\.[kv]_proj\.([kv])_scale$"): r".mixer.attn.\1_scale",
+            # HYV3: .self_attn.q.scale -> .self_attn.attn.q_scale
+            re.compile(r"\.self_attn\.q\.scale$"): r".self_attn.attn.q_scale",
+            # HYV3: .self_attn.{k,v}_cache.scale -> .self_attn.attn.{k,v}_scale
+            re.compile(r"\.self_attn\.([kv])_cache\.scale$"): (
+                r".self_attn.attn.\1_scale"
+            ),
+            # Default: .{q,k,v}_{scale,zero_point} -> .attn.* (unless already .attn)
+            re.compile(r"(?<!\.attn)\.([qkv])_scale$"): r".attn.\1_scale",
+            re.compile(r"(?<!\.attn)\.([qkv])_zero_point$"): r".attn.\1_zero_point",
+        }
+        return WeightsMapper(orig_to_new_regex=orig_to_new_regex)
 
     def apply_vllm_mapper(  # noqa: B027
         self, hf_to_vllm_mapper: "WeightsMapper"
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index a6461900d137..7b411b86b6f6 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -207,18 +207,19 @@ def get_quant_method(
             return Fp8KVCacheMethod(self)
         return None
 
-    def get_cache_scale_mapper(self) -> "WeightsMapper":
+    @staticmethod
+    def get_cache_scale_mapper() -> "WeightsMapper":
         """Map compressed-tensors KV-cache scale names to vLLM names."""
         from vllm.model_executor.models.utils import WeightsMapper
 
-        return WeightsMapper(
-            orig_to_new_suffix={
-                ".k_proj.output_scale": ".attn.k_scale",
-                ".v_proj.output_scale": ".attn.v_scale",
-                ".q_proj.output_scale": ".attn.q_scale",
-                ".self_attn.prob_output_scale": ".self_attn.attn.prob_scale",
-            }
-        )
+        orig_to_new_suffix = {
+            ".k_proj.output_scale": ".attn.k_scale",
+            ".v_proj.output_scale": ".attn.v_scale",
+            ".q_proj.output_scale": ".attn.q_scale",
+            ".self_attn.prob_output_scale": ".self_attn.attn.prob_scale",
+        }
+        cache_scale_mapper = WeightsMapper(orig_to_new_suffix=orig_to_new_suffix)
+        return cache_scale_mapper | QuantizationConfig.get_cache_scale_mapper()
 
 
 class CopyNumelCounter(TorchDispatchMode):
diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py
index 54dea48973b0..c888e523b20d 100644
--- a/vllm/model_executor/layers/quantization/quark/quark.py
+++ b/vllm/model_executor/layers/quantization/quark/quark.py
@@ -646,16 +646,17 @@ def get_scheme(
 
         return scheme
 
-    def get_cache_scale_mapper(self) -> "WeightsMapper":
+    @staticmethod
+    def get_cache_scale_mapper() -> "WeightsMapper":
         """Map Quark KV-cache scale names to vLLM names."""
-        return WeightsMapper(
-            orig_to_new_suffix={
-                ".k_proj.output_scale": ".attn.k_scale",
-                ".v_proj.output_scale": ".attn.v_scale",
-                ".q_proj.output_scale": ".attn.q_scale",
-                ".self_attn.prob_output_scale": ".self_attn.attn.prob_scale",
-            }
-        )
+        orig_to_new_suffix = {
+            ".k_proj.output_scale": ".attn.k_scale",
+            ".v_proj.output_scale": ".attn.v_scale",
+            ".q_proj.output_scale": ".attn.q_scale",
+            ".self_attn.prob_output_scale": ".self_attn.attn.prob_scale",
+        }
+        cache_scale_mapper = WeightsMapper(orig_to_new_suffix=orig_to_new_suffix)
+        return cache_scale_mapper | QuantizationConfig.get_cache_scale_mapper()
 
 
 class QuarkLinearMethod(LinearMethodBase):

From 665ca0c5879d7c7bc616d87d8872a89d1b6a6add Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 5 Jun 2026 14:59:28 +0000
Subject: [PATCH 07/34] Use new mappings in `AutoWeightsLoader`

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/models/utils.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 6a23d95e138b..f0f722299d78 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -19,9 +19,7 @@
     get_tensor_model_parallel_world_size,
 )
 from vllm.logger import init_logger
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig,
-)
+from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.model_loader.reload import (
     support_quantized_model_reload_from_hp_weights,
 )
@@ -59,6 +57,16 @@ def __or__(self, other: "WeightsMapper") -> "WeightsMapper":
         )
 
     def _map_name(self, key: str) -> str | None:
+        # Warn once when a checkpoint key uses the deprecated kv_scale suffix
+        if key.endswith(".kv_scale"):
+            logger.warning_once(
+                "DEPRECATED. Found kv_scale in the checkpoint. "
+                "This format is deprecated in favor of separate k_scale and "
+                "v_scale tensors and will be removed in a future release. "
+                "Functionally, we will remap kv_scale to k_scale and duplicate "
+                "k_scale to v_scale"
+            )
+
         for pattern, new_key in self.orig_to_new_regex.items():
             if pattern.search(key):
                 if new_key is None:
@@ -353,9 +361,11 @@ def load_weights(
             if "gptq" in quant_config.get_name():
                 self.ignore_unexpected_suffixes.append(".bias")
             # Get mappings for KV cache quantization scales
-            if cache_scale_mapper := quant_config.get_cache_scale_mapper():
-                mapper = mapper or WeightsMapper()
-                mapper |= cache_scale_mapper
+            mapper = mapper or WeightsMapper()
+            mapper |= quant_config.get_cache_scale_mapper()
+            self.ignore_unexpected_suffixes.extend(
+                quant_config._ignore_unexpected_suffixes
+            )
         if mapper is not None:
             weights = mapper.apply(weights)
         # filter out weights with first-prefix/substr to skip in name

From edf67e6bfe40b811d27c14fcdfc571b7ba20c433 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 5 Jun 2026 15:08:58 +0000
Subject: [PATCH 08/34] Remove some more load_weights methods

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/models/arcee.py        | 81 +++------------------
 vllm/model_executor/models/commandr.py     | 69 +++---------------
 vllm/model_executor/models/exaone.py       | 80 +++-----------------
 vllm/model_executor/models/exaone4.py      | 80 +++-----------------
 vllm/model_executor/models/gemma2.py       | 70 +++---------------
 vllm/model_executor/models/glm4.py         | 83 +++------------------
 vllm/model_executor/models/hyperclovax.py  | 81 +++------------------
 vllm/model_executor/models/jais2.py        | 71 +++---------------
 vllm/model_executor/models/nemotron.py     | 66 +++--------------
 vllm/model_executor/models/nemotron_nas.py | 73 +++----------------
 vllm/model_executor/models/ouro.py         | 74 +++----------------
 vllm/model_executor/models/rnj1.py         | 85 +++-------------------
 vllm/model_executor/models/seed_oss.py     | 71 +++---------------
 vllm/model_executor/models/solar.py        | 74 +++----------------
 vllm/model_executor/models/starcoder2.py   | 49 +++----------
 15 files changed, 159 insertions(+), 948 deletions(-)

diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py
index d25c954fc19e..844f7ff44342 100644
--- a/vllm/model_executor/models/arcee.py
+++ b/vllm/model_executor/models/arcee.py
@@ -26,10 +26,6 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader,
-    maybe_remap_kv_scale_name,
-)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import (
@@ -42,7 +38,7 @@
 from .utils import (
     AutoWeightsLoader,
     PPMissingLayer,
-    is_pp_missing_parameter,
+    WeightsMapper,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -276,67 +272,6 @@ def forward(
             return hidden_states, aux_hidden_states
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        """Load weights, mapping q/k/v projections to fused qkv_proj."""
-        stacked_params_mapping = [
-            (".qkv_proj", ".q_proj", "q"),
-            (".qkv_proj", ".k_proj", "k"),
-            (".qkv_proj", ".v_proj", "v"),
-        ]
-
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-
-        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
-                continue
-            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
-                continue
-
-            if "scale" in name or "zero_point" in name:
-                remapped_name = maybe_remap_kv_scale_name(name, params_dict)
-                if remapped_name is None:
-                    continue
-                name = remapped_name
-
-            mapped = False
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-
-                name = name.replace(weight_name, param_name)
-
-                if name.endswith(".bias") and name not in params_dict:
-                    mapped = True
-                    break
-
-                if is_pp_missing_parameter(name, self):
-                    mapped = True
-                    break
-
-                param = params_dict[name]
-                weight_loader = param.weight_loader  # type: ignore[attr-defined]
-                weight_loader(param, loaded_weight, shard_id)
-                loaded_params.add(name)
-                mapped = True
-                break
-
-            if mapped:
-                continue
-
-            if name.endswith(".bias") and name not in params_dict:
-                continue
-
-            if is_pp_missing_parameter(name, self):
-                continue
-
-            param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader", default_weight_loader)
-            weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-
-        return loaded_params
-
 
 class ArceeForCausalLM(
     nn.Module, SupportsLoRA, SupportsPP, SupportsEagle, SupportsEagle3
@@ -344,11 +279,13 @@ class ArceeForCausalLM(
     """Arcee Model for causal language modeling, integrated with vLLM
     runtime."""
 
-    # Map fused module names to their submodule components
-    # (for quantization and LoRA)
-    packed_modules_mapping = {
-        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-    }
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+        }
+    )
 
     def __init__(self, *, vllm_config, prefix: str = "") -> None:
         super().__init__()
@@ -420,4 +357,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         )
         # AutoWeightLoader handles weight name remapping, including fusing
         # separate q_proj, k_proj, v_proj into qkv_proj
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 317269ec3b6b..20241433c484 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -45,8 +45,6 @@
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader,
-    maybe_remap_kv_scale_name,
     row_parallel_weight_loader,
 )
 from vllm.model_executor.utils import set_weight_attrs
@@ -56,8 +54,8 @@
 from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
 from .utils import (
     AutoWeightsLoader,
+    WeightsMapper,
     extract_layer_index,
-    is_pp_missing_parameter,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -340,61 +338,18 @@ def forward(
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            for param_name, shard_name, shard_id in stacked_params_mapping:
-                if shard_name not in name:
-                    continue
-                name = name.replace(shard_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # Remapping the name of FP8 kv-scale.
-                name = maybe_remap_kv_scale_name(name, params_dict)
-                if name is None:
-                    continue
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+            ".gate_proj": ".gate_up_proj.0",
+            ".up_proj": ".gate_up_proj.1",
+        }
+    )
+
     # LoRA specific attributes
     embedding_modules = {"embed_tokens": "input_embeddings"}
 
@@ -453,4 +408,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self, skip_prefixes=["lm_head", "rotary_emb.inv_freq"]
         )
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index dca05f72c696..13b29e433d34 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -50,17 +50,13 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader,
-    maybe_remap_kv_scale_name,
-)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (
     AutoWeightsLoader,
     PPMissingLayer,
-    is_pp_missing_parameter,
+    WeightsMapper,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -373,71 +369,17 @@ def forward(
         hidden_states, _ = self.ln_f(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            (".qkv_proj", ".q_proj", "q"),
-            (".qkv_proj", ".k_proj", "k"),
-            (".qkv_proj", ".v_proj", "v"),
-            (".gate_up_proj", ".c_fc_0", 0),
-            (".gate_up_proj", ".c_fc_1", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
-                continue
-            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
-                # Models trained using ColossalAI may include these tensors in
-                # the checkpoint. Skip them.
-                continue
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # Remapping the name of FP8 kv-scale.
-                name = maybe_remap_kv_scale_name(name, params_dict)
-                if name is None:
-                    continue
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "c_fc_0",
-            "c_fc_1",
-        ],
-    }
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+            ".c_fc_0": ".gate_up_proj.0",
+            ".c_fc_1": ".gate_up_proj.1",
+        }
+    )
 
     # LoRA specific attributes
     embedding_modules = {
@@ -509,4 +451,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             # processed with quantization, LoRA, fine-tuning, etc.
             skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py
index e38dbb5ee294..c583a776e390 100644
--- a/vllm/model_executor/models/exaone4.py
+++ b/vllm/model_executor/models/exaone4.py
@@ -46,10 +46,6 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader,
-    maybe_remap_kv_scale_name,
-)
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import set_default_rope_theta
 
@@ -57,8 +53,8 @@
 from .utils import (
     AutoWeightsLoader,
     PPMissingLayer,
+    WeightsMapper,
     extract_layer_index,
-    is_pp_missing_parameter,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -371,71 +367,17 @@ def forward(
         hidden_states = self.norm(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            (".qkv_proj", ".q_proj", "q"),
-            (".qkv_proj", ".k_proj", "k"),
-            (".qkv_proj", ".v_proj", "v"),
-            (".gate_up_proj", ".gate_proj", 0),
-            (".gate_up_proj", ".up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
-                continue
-            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
-                # Models trained using ColossalAI may include these tensors in
-                # the checkpoint. Skip them.
-                continue
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # Remapping the name of FP8 kv-scale.
-                name = maybe_remap_kv_scale_name(name, params_dict)
-                if name is None:
-                    continue
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+            ".gate_proj": ".gate_up_proj.0",
+            ".up_proj": ".gate_up_proj.1",
+        }
+    )
 
     # LoRA specific attributes
     embedding_modules = {
@@ -506,4 +448,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             # processed with quantization, LoRA, fine-tuning, etc.
             skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index 733eb3ed3c19..fd1d2027297d 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -39,17 +39,13 @@
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
-from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader,
-    maybe_remap_kv_scale_name,
-)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (
     AutoWeightsLoader,
+    WeightsMapper,
     extract_layer_index,
-    is_pp_missing_parameter,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -316,61 +312,17 @@ def forward(
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            for param_name, shard_name, shard_id in stacked_params_mapping:
-                if shard_name not in name:
-                    continue
-                name = name.replace(shard_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # Remapping the name of FP8 kv-scale.
-                name = maybe_remap_kv_scale_name(name, params_dict)
-                if name is None:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-
-        return loaded_params
-
 
 class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+            ".gate_proj": ".gate_up_proj.0",
+            ".up_proj": ".gate_up_proj.1",
+        }
+    )
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
@@ -418,4 +370,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             self,
             skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py
index 4587a6927663..b1ad99637fa9 100644
--- a/vllm/model_executor/models/glm4.py
+++ b/vllm/model_executor/models/glm4.py
@@ -39,10 +39,6 @@
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
-from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader,
-    maybe_remap_kv_scale_name,
-)
 from vllm.sequence import IntermediateTensors
 from vllm.v1.attention.backend import AttentionType
 
@@ -52,7 +48,7 @@
 from .utils import (
     AutoWeightsLoader,
     PPMissingLayer,
-    is_pp_missing_parameter,
+    WeightsMapper,
     maybe_prefix,
 )
 
@@ -237,74 +233,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             vllm_config=vllm_config, prefix=prefix, layer_type=Glm4DecoderLayer
         )
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            (".qkv_proj", ".q_proj", "q"),
-            (".qkv_proj", ".k_proj", "k"),
-            (".qkv_proj", ".v_proj", "v"),
-            (".gate_up_proj", ".gate_proj", 0),
-            (".gate_up_proj", ".up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
-            if spec_layer is not None:
-                continue
-            if "rotary_emb.inv_freq" in name:
-                continue
-            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
-                # Models trained using ColossalAI may include these tensors in
-                # the checkpoint. Skip them.
-                continue
-            if "scale" in name or "zero_point" in name:
-                # Remapping the name of FP8 kv-scale or zero point.
-                name = maybe_remap_kv_scale_name(name, params_dict)
-                if name is None:
-                    continue
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+            ".gate_proj": ".gate_up_proj.0",
+            ".up_proj": ".gate_up_proj.1",
+        }
+    )
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -364,7 +303,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             self,
             skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
 
 def get_spec_layer_idx_from_weight_name(
diff --git a/vllm/model_executor/models/hyperclovax.py b/vllm/model_executor/models/hyperclovax.py
index 2f54f78e7580..3eff84a4fe1a 100644
--- a/vllm/model_executor/models/hyperclovax.py
+++ b/vllm/model_executor/models/hyperclovax.py
@@ -50,10 +50,6 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader,
-    maybe_remap_kv_scale_name,
-)
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.hyperclovax import HyperCLOVAXConfig
 
@@ -61,7 +57,7 @@
 from .utils import (
     AutoWeightsLoader,
     PPMissingLayer,
-    is_pp_missing_parameter,
+    WeightsMapper,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -377,72 +373,17 @@ def forward(
         hidden_states = self.norm(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            (".qkv_proj", ".q_proj", "q"),
-            (".qkv_proj", ".k_proj", "k"),
-            (".qkv_proj", ".v_proj", "v"),
-            (".gate_up_proj", ".gate_proj", 0),
-            (".gate_up_proj", ".up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
-                continue
-            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
-                # Models trained using ColossalAI may include these tensors in
-                # the checkpoint. Skip them.
-                continue
-            if "scale" in name or "zero_point" in name:
-                # Remapping the name of FP8 kv-scale or zero point.
-                remapped_name = maybe_remap_kv_scale_name(name, params_dict)
-                if remapped_name is None:
-                    continue
-                name = remapped_name
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-
-                param = params_dict[name]
-                weight_loader = param.weight_loader  # type: ignore[attr-defined]
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class HyperCLOVAXForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+            ".gate_proj": ".gate_up_proj.0",
+            ".up_proj": ".gate_up_proj.1",
+        }
+    )
 
     # LoRA specific attributes
     embedding_modules = {
@@ -536,4 +477,4 @@ def load_weights(
             self,
             skip_prefixes=["lm_head."] if self.config.tie_word_embeddings else None,
         )
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/jais2.py b/vllm/model_executor/models/jais2.py
index dafa0f03ae9d..5a6d2f0c5fe0 100644
--- a/vllm/model_executor/models/jais2.py
+++ b/vllm/model_executor/models/jais2.py
@@ -51,18 +51,14 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader,
-    maybe_remap_kv_scale_name,
-)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (
     AutoWeightsLoader,
     PPMissingLayer,
+    WeightsMapper,
     extract_layer_index,
-    is_pp_missing_parameter,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -370,64 +366,15 @@ def forward(
         hidden_states, _ = self.norm(hidden_states + residual), residual
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            (".qkv_proj", ".q_proj", "q"),
-            (".qkv_proj", ".k_proj", "k"),
-            (".qkv_proj", ".v_proj", "v"),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
-                continue
-            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
-                # Models trained using ColossalAI may include these tensors in
-                # the checkpoint. Skip them.
-                continue
-            if "scale" in name:
-                name = maybe_remap_kv_scale_name(name, params_dict)
-                if name is None:
-                    continue
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # Remapping the name of FP8 kv-scale.
-                name = maybe_remap_kv_scale_name(name, params_dict)
-                if name is None:
-                    continue
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class Jais2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
-    packed_modules_mapping = {
-        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-    }
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+        }
+    )
 
     embedding_modules = {
         "embed_tokens": "input_embeddings",
@@ -494,4 +441,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             self,
             skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index 7b2e6b93b27e..4e3a4efc5a70 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -47,10 +47,6 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader,
-    maybe_remap_kv_scale_name,
-)
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.nemotron import NemotronConfig
 
@@ -58,7 +54,7 @@
 from .utils import (
     AutoWeightsLoader,
     PPMissingLayer,
-    is_pp_missing_parameter,
+    WeightsMapper,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -365,59 +361,15 @@ def forward(
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            (".qkv_proj", ".q_proj", "q"),
-            (".qkv_proj", ".k_proj", "k"),
-            (".qkv_proj", ".v_proj", "v"),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # Remapping the name of FP8 kv-scale.
-                name = maybe_remap_kv_scale_name(name, params_dict)
-                if name is None:
-                    continue
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-    }
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+        }
+    )
 
     # LoRA specific attributes
     embedding_modules = {
@@ -484,4 +436,4 @@ def compute_logits(
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py
index b974a3eb0851..86b348fcc9d8 100644
--- a/vllm/model_executor/models/nemotron_nas.py
+++ b/vllm/model_executor/models/nemotron_nas.py
@@ -42,10 +42,6 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader,
-    maybe_remap_kv_scale_name,
-)
 from vllm.model_executor.models.llama import LlamaAttention, LlamaMLP
 from vllm.sequence import IntermediateTensors
 from vllm.v1.attention.backend import AttentionType
@@ -54,7 +50,7 @@
 from .utils import (
     AutoWeightsLoader,
     PPMissingLayer,
-    is_pp_missing_parameter,
+    WeightsMapper,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -316,64 +312,17 @@ def forward(
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            (".qkv_proj", ".q_proj", "q"),
-            (".qkv_proj", ".k_proj", "k"),
-            (".qkv_proj", ".v_proj", "v"),
-            (".gate_up_proj", ".gate_proj", 0),
-            (".gate_up_proj", ".up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
-                continue
-            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
-                # Models trained using ColossalAI may include these tensors in
-                # the checkpoint. Skip them.
-                continue
-            if "scale" in name or "zero_point" in name:
-                # Remapping the name of FP8 kv-scale.
-                name = maybe_remap_kv_scale_name(name, params_dict)
-                if name is None:
-                    continue
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class DeciLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, HasNoOps):
-    packed_modules_mapping = {
-        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-        "gate_up_proj": ["gate_proj", "up_proj"],
-    }
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+            ".gate_proj": ".gate_up_proj.0",
+            ".up_proj": ".gate_up_proj.1",
+        }
+    )
 
     # LoRA specific attributes
     embedding_modules = {
@@ -463,4 +412,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             self,
             skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py
index 503d4b5c8343..ebbe2999829e 100644
--- a/vllm/model_executor/models/ouro.py
+++ b/vllm/model_executor/models/ouro.py
@@ -51,16 +51,13 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader,
-    maybe_remap_kv_scale_name,
-)
 from vllm.sequence import IntermediateTensors
 from vllm.v1.attention.backend import AttentionType
 
 from .interfaces import SupportsLoRA
 from .utils import (
     AutoWeightsLoader,
+    WeightsMapper,
     extract_layer_index,
     make_empty_intermediate_tensors_factory,
     make_layers,
@@ -376,66 +373,17 @@ def forward(
             hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters(remove_duplicate=False))
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
-                continue
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if name.endswith("scale"):
-                    # Remapping the name of FP8 kv-scale.
-                    name = maybe_remap_kv_scale_name(name, params_dict)
-                    if name is None:
-                        continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                if weight_loader == default_weight_loader:
-                    weight_loader(param, loaded_weight)
-                else:
-                    weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # Remapping the name of FP8 kv-scale.
-                name = maybe_remap_kv_scale_name(name, params_dict)
-                if name is None:
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class OuroForCausalLM(nn.Module, SupportsLoRA):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+            ".gate_proj": ".gate_up_proj.0",
+            ".up_proj": ".gate_up_proj.1",
+        }
+    )
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -492,4 +440,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             self,
             skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/rnj1.py b/vllm/model_executor/models/rnj1.py
index 68c3722e2bc1..37f0f6e1684a 100644
--- a/vllm/model_executor/models/rnj1.py
+++ b/vllm/model_executor/models/rnj1.py
@@ -30,18 +30,14 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader,
-    maybe_remap_kv_scale_name,
-)
 from vllm.sequence import IntermediateTensors
 from vllm.v1.attention.backend import AttentionType
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (
     AutoWeightsLoader,
+    WeightsMapper,
     extract_layer_index,
-    is_pp_missing_parameter,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -331,76 +327,17 @@ def forward(
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if (
-                self.quant_config
-                and self.quant_config.get_name() == "gguf"
-                and name.endswith("norm.weight")
-            ):
-                loaded_weight -= 1
-
-            if name.endswith((".k_scale", ".v_scale", ".q_scale", ".prob_scale")):
-                remapped_name = maybe_remap_kv_scale_name(name, params_dict)
-                if remapped_name is not None and remapped_name in params_dict:
-                    param = params_dict[remapped_name]
-                    weight_loader = getattr(
-                        param, "weight_loader", default_weight_loader
-                    )
-                    weight_loader(param, loaded_weight)
-                    loaded_params.add(remapped_name)
-                    continue
-
-            for param_name, shard_name, shard_id in stacked_params_mapping:
-                if shard_name not in name:
-                    continue
-                name = name.replace(shard_name, param_name)
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                name = maybe_remap_kv_scale_name(name, params_dict)
-                if name is None:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-
-        return loaded_params
-
 
 class Rnj1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+            ".gate_proj": ".gate_up_proj.0",
+            ".up_proj": ".gate_up_proj.1",
+        }
+    )
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
@@ -457,4 +394,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             self,
             skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py
index 48147f7334e8..af21f3ba17a9 100644
--- a/vllm/model_executor/models/seed_oss.py
+++ b/vllm/model_executor/models/seed_oss.py
@@ -49,10 +49,6 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader,
-    maybe_remap_kv_scale_name,
-)
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import set_default_rope_theta
 from vllm.v1.attention.backend import AttentionType
@@ -61,7 +57,7 @@
 from .utils import (
     AutoWeightsLoader,
     PPMissingLayer,
-    is_pp_missing_parameter,
+    WeightsMapper,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -362,62 +358,17 @@ def forward(
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters(remove_duplicate=False))
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
-                continue
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # Remapping the name of FP8 kv-scale.
-                name = maybe_remap_kv_scale_name(name, params_dict)
-                if name is None:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class SeedOssForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+            ".gate_proj": ".gate_up_proj.0",
+            ".up_proj": ".gate_up_proj.1",
+        }
+    )
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -477,4 +428,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             self,
             skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index 454a0e971125..275b8b4b9655 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -48,17 +48,13 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader,
-    maybe_remap_kv_scale_name,
-)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (
     AutoWeightsLoader,
     PPMissingLayer,
-    is_pp_missing_parameter,
+    WeightsMapper,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -348,65 +344,17 @@ def forward(
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            (".qkv_proj", ".q_proj", "q"),
-            (".qkv_proj", ".k_proj", "k"),
-            (".qkv_proj", ".v_proj", "v"),
-            (".gate_up_proj", ".gate_proj", 0),
-            (".gate_up_proj", ".up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # Remapping the name of FP8 kv-scale.
-                name = maybe_remap_kv_scale_name(name, params_dict)
-                if name is None:
-                    continue
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+            ".gate_proj": ".gate_up_proj.0",
+            ".up_proj": ".gate_up_proj.1",
+        }
+    )
 
     # LoRA specific attributes
     embedding_modules = {
@@ -469,4 +417,4 @@ def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py
index 5f08a59e2364..5ff3a4cbeeed 100644
--- a/vllm/model_executor/models/starcoder2.py
+++ b/vllm/model_executor/models/starcoder2.py
@@ -45,16 +45,12 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader,
-    maybe_remap_kv_scale_name,
-)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsPP
 from .utils import (
     AutoWeightsLoader,
-    is_pp_missing_parameter,
+    WeightsMapper,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -272,41 +268,16 @@ def forward(
         hidden_states = self.norm(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-        ]
-
-        params_dict = dict(self.named_parameters(remove_duplicate=False))
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                name = maybe_remap_kv_scale_name(name, params_dict)
-                if name is None:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class Starcoder2ForCausalLM(nn.Module, SupportsPP):
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+        }
+    )
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
@@ -362,4 +333,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
                 ["lm_head.weight"] if self.config.tie_word_embeddings else None
             ),
         )
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)

From b1f1c9d919a8e36f05e5b14988fde3c07f95b750 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 11 Jun 2026 10:30:58 +0000
Subject: [PATCH 09/34] Fix `load_weights` methods for fused case

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/layers/linear.py | 34 ++++++++++++++--------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 40e98ac34ad6..6e3bd57085f8 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -647,7 +647,7 @@ def __init__(
         self.tp_size = get_tensor_model_parallel_world_size() if not disable_tp else 1
         self.tp_rank = get_tensor_model_parallel_rank() if not disable_tp else 0
         assert all(output_size % self.tp_size == 0 for output_size in output_sizes)
-        self.checkpoint_format: Literal["sharded", "fused"] | None = None
+        self.checkpoint_format: Literal["fused", "sharded"] | None = None
         super().__init__(
             input_size=input_size,
             output_size=sum(output_sizes),
@@ -974,18 +974,18 @@ def load_weights(
         self, weights: Iterable[tuple[str, torch.Tensor]]
     ) -> Iterable[str]:
         for name, loaded_weight in weights:
-            shard_id_str, _, param_name = name.partition(".")
-            # If the shard_id is not an integer, the weight is not sharded
-            try:
+            if "." in name:
+                # Checkpoint is sharded
+                shard_id_str, _, name = name.partition(".")
                 shard_id = int(shard_id_str)
                 self.checkpoint_format = "sharded"
-            except ValueError:
+            else:
                 shard_id = None
                 self.checkpoint_format = "fused"
-            # If param_name is "bias" get it from self, otherwise load into self
-            param: Parameter = getattr(self, param_name, self)
+            # If name is "bias" get it from self, otherwise load into self
+            param: Parameter = getattr(self, name, self)
             param.weight_loader(param, loaded_weight, shard_id)
-            yield param_name
+            yield name
 
 
 class QKVParallelLinear(ColumnParallelLinear):
@@ -1407,19 +1407,19 @@ def load_weights(
         self, weights: Iterable[tuple[str, torch.Tensor]]
     ) -> Iterable[str]:
         for name, loaded_weight in weights:
-            shard_id_str, _, param_name = name.partition(".")
-            # If the shard_id is not valid, the weight is not sharded
-            try:
-                self.validate_shard_id(shard_id_str)
-                shard_id = shard_id_str
+            if "." in name:
+                # Checkpoint is sharded
+                shard_id, _, name = name.partition(".")
+                self.validate_shard_id(shard_id)
                 self.checkpoint_format = "sharded"
-            except ValueError:
+            else:
+                # Checkpoint is fused
                 shard_id = None
                 self.checkpoint_format = "fused"
-            # If param_name is "bias" get it from self, otherwise load into self
-            param: Parameter = getattr(self, param_name, self)
+            # If name is "bias" get it from self, otherwise load into self
+            param: Parameter = getattr(self, name, self)
             param.weight_loader(param, loaded_weight, shard_id)
-            yield param_name
+            yield name
 
 
 # --8<-- [start:row_parallel_linear]

From b5a719172b2d8f0a806302a7f99f561b95071d5d Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 11 Jun 2026 11:35:38 +0000
Subject: [PATCH 10/34] Fix BaiChuan tests that depend on old behaviour

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 tests/lora/conftest.py              | 21 ++++++++++++++
 tests/lora/test_lora_checkpoints.py | 44 ++++++++++-------------------
 2 files changed, 36 insertions(+), 29 deletions(-)

diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index dea54ed21aea..7d9e8444827f 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -19,6 +19,7 @@
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     MergedColumnParallelLinear,
+    QKVParallelLinear,
     RowParallelLinear,
 )
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
@@ -166,6 +167,26 @@ def dummy_model_gate_up(default_vllm_config) -> nn.Module:
     return model
 
 
+@pytest.fixture
+def baichuan_dummy_model(default_vllm_config, dist_init) -> nn.Module:
+    # Only includes BaiChuan's lora modules so get_supported_lora_modules will work
+    model = DummyLoRAModel(
+        OrderedDict(
+            [
+                ("W_pack", QKVParallelLinear(64, 8, 8)),
+                ("o_proj", RowParallelLinear(64, 64)),
+                ("gate_up_proj", MergedColumnParallelLinear(64, [16, 16])),
+                ("down_proj", RowParallelLinear(16, 64)),
+            ]
+        )
+    )
+    model.config = MagicMock()
+    # Match the expected format for BaiChuan checkpoints
+    model.W_pack.checkpoint_format = "fused"
+    model.gate_up_proj.checkpoint_format = "sharded"
+    return model
+
+
 @pytest.fixture(scope="session")
 def mixtral_lora_files():
     # Note: this module has incorrect adapter_config.json to test
diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py
index 7c263e2a2276..0a54a80242be 100644
--- a/tests/lora/test_lora_checkpoints.py
+++ b/tests/lora/test_lora_checkpoints.py
@@ -5,37 +5,26 @@
 
 from vllm.lora.lora_model import LoRAModel
 from vllm.lora.peft_helper import PEFTHelper
-from vllm.lora.utils import parse_fine_tuned_lora_name
+from vllm.lora.utils import get_supported_lora_modules, parse_fine_tuned_lora_name
 from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
 from vllm.model_executor.models.gemma4 import Gemma4ForCausalLM
 from vllm.model_executor.models.utils import WeightsMapper
 
-lora_lst = ["baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"]
-BAICHUAN_LORA_MODULES = [
-    "W_pack",
-    "o_proj",
-    "gate_up_proj",
-    "down_proj",
-]
 
-
-@pytest.mark.parametrize("lora_name", lora_lst)
+@pytest.mark.parametrize(
+    "lora_name",
+    ["baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"],
+)
 def test_load_checkpoints(
     lora_name,
     baichuan_lora_files,
     baichuan_zero_lora_files,
     baichuan_regex_lora_files,
     chatglm3_lora_files,
+    baichuan_dummy_model,
 ):
-    packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
-
-    expected_lora_lst: list[str] = []
-    for module in BAICHUAN_LORA_MODULES:
-        if module in packed_modules_mapping:
-            expected_lora_lst.extend(packed_modules_mapping[module])
-        else:
-            expected_lora_lst.append(module)
-    expected_lora_modules = set(expected_lora_lst)
+    expected_lora_modules = set(get_supported_lora_modules(baichuan_dummy_model))
+    weights_mapper = BaiChuanBaseForCausalLM.hf_to_vllm_mapper
     if lora_name == "baichuan7B":
         peft_helper = PEFTHelper.from_local_dir(
             baichuan_lora_files, max_position_embeddings=4096
@@ -49,6 +38,7 @@ def test_load_checkpoints(
             lora_model_id=1,
             device="cpu",
             model_vocab_size=64000,
+            weights_mapper=weights_mapper,
         )
     elif lora_name == "baichuan7B-zero":
         # Test that the target_modules contain prefix
@@ -64,6 +54,7 @@ def test_load_checkpoints(
             lora_model_id=1,
             device="cpu",
             model_vocab_size=64000,
+            weights_mapper=weights_mapper,
         )
     elif lora_name == "baichuan7B-zero-regex":
         # Test that the `target_modules` in the form of regular expressions,
@@ -78,6 +69,7 @@ def test_load_checkpoints(
             lora_model_id=1,
             device="cpu",
             model_vocab_size=64000,
+            weights_mapper=weights_mapper,
         )
     else:
         # For the baichuan7B model, load chatglm3-6b's LoRA,
@@ -97,22 +89,16 @@ def test_load_checkpoints(
             )
 
 
-def test_lora_weights_mapping(baichuan_lora_files):
-    packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
-
-    expected_lora_lst: list[str] = []
-    for module in BAICHUAN_LORA_MODULES:
-        if module in packed_modules_mapping:
-            expected_lora_lst.extend(packed_modules_mapping[module])
-        else:
-            expected_lora_lst.append(module)
-    expected_lora_modules = set(expected_lora_lst)
+def test_lora_weights_mapping(baichuan_lora_files, baichuan_dummy_model):
+    expected_lora_modules = set(get_supported_lora_modules(baichuan_dummy_model))
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "model.": "language_model.model.",
         },
         orig_to_new_substr={
             ".layers.": ".baichuan_layers.",
+            ".gate_proj": ".gate_up_proj.0",
+            ".up_proj": ".gate_up_proj.1",
         },
     )
     peft_helper = PEFTHelper.from_local_dir(

From f5383aa2d5dedd033770a0fbce1ac655387ac032 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 11 Jun 2026 12:52:19 +0000
Subject: [PATCH 11/34] Handle MergedColumnParallelLinear for LoRA too

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/lora/layers/column_parallel_linear.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py
index f9643809c7f4..7ec31c31253e 100644
--- a/vllm/lora/layers/column_parallel_linear.py
+++ b/vllm/lora/layers/column_parallel_linear.py
@@ -167,7 +167,10 @@ def can_replace_layer(
         if type(source_layer) is maybe_get_oot_by_class(ColumnParallelLinear):
             return True
         if isinstance(source_layer, maybe_get_oot_by_class(MergedColumnParallelLinear)):
-            if len(packed_modules_list) != 1:
+            if (
+                len(packed_modules_list) != 1
+                or source_layer.checkpoint_format == "sharded"
+            ):
                 return False
             # Exclude layers with 3+ output sizes - those are handled by
             # MergedColumnParallelLinearVariableSliceWithLoRA since this
@@ -347,7 +350,11 @@ def can_replace_layer(
         decorate: bool = True,
     ) -> bool:
         merged_cls = maybe_get_oot_by_class(MergedColumnParallelLinear)
-        if not isinstance(source_layer, merged_cls) or len(packed_modules_list) != 2:
+        if (
+            not isinstance(source_layer, merged_cls)
+            or len(source_layer.output_sizes) != 2
+            or source_layer.checkpoint_format == "fused"
+        ):
             return False
 
         tp_size = getattr(source_layer, "tp_size", 1)

From 82a7a64f5f824393eac66efa6c4ef5972c17f798 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 11 Jun 2026 13:59:45 +0000
Subject: [PATCH 12/34] Delete some more load_weights methods

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/models/chatglm.py        | 53 ++-----------
 vllm/model_executor/models/cohere_eagle.py   | 41 +---------
 vllm/model_executor/models/fairseq2_llama.py |  6 +-
 vllm/model_executor/models/glm4.py           | 26 +++----
 vllm/model_executor/models/glm4v.py          |  9 +++
 vllm/model_executor/models/gpt_j.py          | 61 +++------------
 vllm/model_executor/models/granite.py        | 74 ++++--------------
 vllm/model_executor/models/internlm2.py      | 20 +++--
 vllm/model_executor/models/jina.py           |  5 +-
 vllm/model_executor/models/llama.py          | 73 ++++--------------
 vllm/model_executor/models/mamba.py          | 25 +-----
 vllm/model_executor/models/mamba2.py         | 26 +------
 vllm/model_executor/models/mimo.py           | 57 ++------------
 vllm/model_executor/models/mistral_eagle.py  | 10 +--
 vllm/model_executor/models/mpt.py            | 17 ----
 vllm/model_executor/models/qwen2.py          | 81 ++++----------------
 vllm/model_executor/models/qwen2_rm.py       | 12 ---
 vllm/model_executor/models/qwen3.py          | 19 ++---
 vllm/model_executor/models/whisper.py        | 48 +++---------
 19 files changed, 133 insertions(+), 530 deletions(-)

diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index c5d857e7c3df..4363188ff6e1 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -30,7 +30,6 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
 
@@ -38,7 +37,6 @@
 from .utils import (
     AutoWeightsLoader,
     WeightsMapper,
-    is_pp_missing_parameter,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -316,12 +314,9 @@ def forward(
 
 @support_torch_compile
 class ChatGLMModel(nn.Module, SupportsQuant):
-    packed_modules_mapping = {
-        "linear_proj.merged_proj": [
-            "linear_proj.gate_proj",
-            "linear_proj.dense_h_to_4h",
-        ]
-    }
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={".word_embeddings": ""},
+    )
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -386,47 +381,11 @@ def forward(
         return hidden_states
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("linear_proj.merged_proj", "linear_proj.gate_proj", 0),
-            ("linear_proj.merged_proj", "linear_proj.dense_h_to_4h", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-
-        for name, loaded_weight in weights:
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                if "rotary_pos_emb.inv_freq" in name:
-                    continue
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
 
 class ChatGLMBaseModel(nn.Module):
-    hf_to_vllm_mapper = WeightsMapper(
-        orig_to_new_substr={".word_embeddings": ""},
-    )
-
     def __init__(
         self,
         *,
@@ -467,7 +426,7 @@ def compute_logits(
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
+        return loader.load_weights(weights)
 
 
 class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP, SupportsQuant):
diff --git a/vllm/model_executor/models/cohere_eagle.py b/vllm/model_executor/models/cohere_eagle.py
index 7b57c739ffe9..64ec0d6dd544 100644
--- a/vllm/model_executor/models/cohere_eagle.py
+++ b/vllm/model_executor/models/cohere_eagle.py
@@ -14,7 +14,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.commandr import (
     CohereDecoderLayer,
     CohereForCausalLM,
@@ -134,42 +133,6 @@ def forward(
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states, hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            (".qkv_proj", ".q_proj", "q"),
-            (".qkv_proj", ".k_proj", "k"),
-            (".qkv_proj", ".v_proj", "v"),
-            (".gate_up_proj", ".gate_proj", 0),
-            (".gate_up_proj", ".up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-
-        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
-                continue
-
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class EagleCohereForCausalLM(CohereForCausalLM):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -225,7 +188,9 @@ def _track_and_forward(inputs):
             ),
         )
 
-        loaded_weight_names = loader.load_weights(map(_track_and_forward, weights))
+        loaded_weight_names = loader.load_weights(
+            map(_track_and_forward, weights), mapper=self.hf_to_vllm_mapper
+        )
 
         # Embed tokens are tied with the target model and therefore not
         # present in the EAGLE checkpoint; mark them as loaded explicitly to
diff --git a/vllm/model_executor/models/fairseq2_llama.py b/vllm/model_executor/models/fairseq2_llama.py
index ca0e7e64df53..e898034fbfa5 100644
--- a/vllm/model_executor/models/fairseq2_llama.py
+++ b/vllm/model_executor/models/fairseq2_llama.py
@@ -79,10 +79,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(
-            (
-                self.reshape_fairseq2_weights(name, loaded_weight, params)
-                for name, loaded_weight in weights
-            )
+            self.reshape_fairseq2_weights(name, loaded_weight, params)
+            for name, loaded_weight in weights
         )
 
     def flag_sharded_weights(self, params: dict[str, Parameter]):
diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py
index b1ad99637fa9..e7414e799861 100644
--- a/vllm/model_executor/models/glm4.py
+++ b/vllm/model_executor/models/glm4.py
@@ -48,7 +48,6 @@
 from .utils import (
     AutoWeightsLoader,
     PPMissingLayer,
-    WeightsMapper,
     maybe_prefix,
 )
 
@@ -235,16 +234,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
 
 class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
-    hf_to_vllm_mapper = WeightsMapper(
-        orig_to_new_substr={
-            ".q_proj": ".qkv_proj.q",
-            ".k_proj": ".qkv_proj.k",
-            ".v_proj": ".qkv_proj.v",
-            ".gate_proj": ".gate_up_proj.0",
-            ".up_proj": ".gate_up_proj.1",
-        }
-    )
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
@@ -299,11 +288,16 @@ def compute_logits(
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(
-            self,
-            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
-        )
-        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
+        skip_prefixes = ["lm_head."] if self.config.tie_word_embeddings else []
+        # Skip the speculative (MTP) layers, which are loaded by the
+        # draft model instead.
+        num_nextn_layers = getattr(self.config, "num_nextn_predict_layers", 0)
+        skip_prefixes += [
+            f"model.layers.{self.config.num_hidden_layers + i}."
+            for i in range(num_nextn_layers)
+        ]
+        loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes)
+        return loader.load_weights(weights)
 
 
 def get_spec_layer_idx_from_weight_name(
diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py
index 9d08df4df8dc..8b9a8f088930 100644
--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -61,6 +61,7 @@
     SupportsMultiModal,
     SupportsPP,
 )
+from .utils import WeightsMapper
 
 
 class GLMVImagePixelInputs(TensorSchema):
@@ -376,6 +377,14 @@ def forward(self, images: torch.Tensor) -> torch.Tensor:
 
 
 class GLM4VModel(ChatGLMModel):
+    hf_to_vllm_mapper = ChatGLMModel.hf_to_vllm_mapper | WeightsMapper(
+        orig_to_new_substr={
+            # Vision GLU projections
+            "linear_proj.gate_proj": "linear_proj.merged_proj.0",
+            "linear_proj.dense_h_to_4h": "linear_proj.merged_proj.1",
+        }
+    )
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
 
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index 30da9b4dea23..12c90a53d7ff 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -43,16 +43,12 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader,
-    maybe_remap_kv_scale_name,
-)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsPP
 from .utils import (
     AutoWeightsLoader,
-    is_pp_missing_parameter,
+    WeightsMapper,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -239,51 +235,16 @@ def forward(
         hidden_states = self.ln_f(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if "attn.bias" in name or "attn.masked_bias" in name:
-                continue
-
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                name = maybe_remap_kv_scale_name(name, params_dict)
-                if name is None:
-                    continue
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class GPTJForCausalLM(nn.Module, SupportsPP):
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+        }
+    )
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
@@ -329,5 +290,5 @@ def compute_logits(
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights)
+        loader = AutoWeightsLoader(self, skip_substrs=["attn.bias", "attn.masked_bias"])
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index 2adc29f8d252..518dce8453d5 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -49,17 +49,13 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader,
-    maybe_remap_kv_scale_name,
-)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (
     AutoWeightsLoader,
     PPMissingLayer,
-    is_pp_missing_parameter,
+    WeightsMapper,
     make_layers,
     maybe_prefix,
 )
@@ -253,6 +249,16 @@ def forward(
 
 @support_torch_compile
 class GraniteModel(nn.Module):
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+            ".gate_proj": ".gate_up_proj.0",
+            ".up_proj": ".gate_up_proj.1",
+        }
+    )
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
@@ -323,65 +329,11 @@ def forward(
         return hidden_states
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            (".qkv_proj", ".q_proj", "q"),
-            (".qkv_proj", ".k_proj", "k"),
-            (".qkv_proj", ".v_proj", "v"),
-            (".gate_up_proj", ".gate_proj", 0),
-            (".gate_up_proj", ".up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # Remapping the name of FP8 kv-scale.
-                name = maybe_remap_kv_scale_name(name, params_dict)
-                if name is None:
-                    continue
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
 
 class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
-
     # LoRA specific attributes
     embedding_modules = {
         "embed_tokens": "input_embeddings",
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 4010dd54d7d4..8eeadf2d0202 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -248,6 +248,13 @@ def forward(
 
 @support_torch_compile
 class InternLM2Model(nn.Module):
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".w1": ".gate_up_proj.0",
+            ".w3": ".gate_up_proj.1",
+        }
+    )
+
     def __init__(
         self,
         *,
@@ -308,15 +315,12 @@ def forward(
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
+
 
 class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
-    hf_to_vllm_mapper = WeightsMapper(
-        orig_to_new_substr={
-            ".w1": ".gate_up_proj.0",
-            ".w3": ".gate_up_proj.1",
-        }
-    )
-
     def __init__(
         self,
         *,
@@ -374,7 +378,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             self,
             skip_prefixes=(["output."] if self.config.tie_word_embeddings else None),
         )
-        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
+        return loader.load_weights(weights)
 
 
 @default_pooling_type(tok_pooling_type="ALL")
diff --git a/vllm/model_executor/models/jina.py b/vllm/model_executor/models/jina.py
index 2b07937df08e..f1f585cdae8d 100644
--- a/vllm/model_executor/models/jina.py
+++ b/vllm/model_executor/models/jina.py
@@ -254,5 +254,8 @@ def _merge_weights(
                         tensor = tensor + (lora_B @ lora_A) * scaling
                 yield name, tensor
 
-        loaded = self.model.load_weights(_merge_weights(weights))
+        loader = AutoWeightsLoader(self.model, ignore_unexpected_prefixes=["lm_head."])
+        loaded = loader.load_weights(
+            _merge_weights(weights), mapper=self.model.hf_to_vllm_mapper
+        )
         return {f"model.{name}" for name in loaded}
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 39044f5e8b4a..6290509923a3 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -52,10 +52,6 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader,
-    maybe_remap_kv_scale_name,
-)
 from vllm.sequence import IntermediateTensors
 from vllm.v1.attention.backend import AttentionType
 
@@ -71,8 +67,8 @@
 from .utils import (
     AutoWeightsLoader,
     PPMissingLayer,
+    WeightsMapper,
     extract_layer_index,
-    is_pp_missing_parameter,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -349,6 +345,16 @@ def get_quant_config(self, vllm_config: VllmConfig) -> QuantizationConfig | None
     },
 )
 class LlamaModel(nn.Module, EagleModelMixin):
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+            ".gate_proj": ".gate_up_proj.0",
+            ".up_proj": ".gate_up_proj.1",
+        }
+    )
+
     def __init__(
         self,
         *,
@@ -435,66 +441,13 @@ def forward(
         return hidden_states
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            (".qkv_proj", ".q_proj", "q"),
-            (".qkv_proj", ".k_proj", "k"),
-            (".qkv_proj", ".v_proj", "v"),
-            (".gate_up_proj", ".gate_proj", 0),
-            (".gate_up_proj", ".up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
-                continue
-            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
-                # Models trained using ColossalAI may include these tensors in
-                # the checkpoint. Skip them.
-                continue
-            if "scale" in name or "zero_point" in name:
-                # Remapping the name of FP8 kv-scale or zero point.
-                name = maybe_remap_kv_scale_name(name, params_dict)
-                if name is None:
-                    continue
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
 
 class LlamaForCausalLM(
     LocalArgmaxMixin, nn.Module, SupportsLoRA, SupportsPP, SupportsEagle, SupportsEagle3
 ):
-    packed_modules_mapping = {
-        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-        "gate_up_proj": ["gate_proj", "up_proj"],
-    }
-
     # LoRA specific attributes
     embedding_modules = {
         "embed_tokens": "input_embeddings",
diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py
index ec2a7255eb66..6a77a58abf4d 100644
--- a/vllm/model_executor/models/mamba.py
+++ b/vllm/model_executor/models/mamba.py
@@ -26,7 +26,6 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.interfaces import (
     HasInnerState,
     IsAttentionFree,
@@ -37,7 +36,7 @@
 
 from .utils import (
     AutoWeightsLoader,
-    is_pp_missing_parameter,
+    WeightsMapper,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -170,28 +169,12 @@ def forward(
 
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if "A_log" in name:
-                name = name.replace("A_log", "A")
-            # Skip loading extra bias for GPTQ models.
-            if name.endswith(".bias") and name not in params_dict:
-                continue
-            if is_pp_missing_parameter(name, self):
-                continue
-
-            param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader", default_weight_loader)
-            weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class MambaForCausalLM(
     nn.Module, HasInnerState, IsAttentionFree, SupportsPP, SupportsMambaPrefixCaching
 ):
+    hf_to_vllm_mapper = WeightsMapper(orig_to_new_substr={".A_log": ".A"})
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
 
@@ -279,4 +262,4 @@ def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py
index deb20852a26a..343111ee0151 100644
--- a/vllm/model_executor/models/mamba2.py
+++ b/vllm/model_executor/models/mamba2.py
@@ -25,7 +25,6 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.interfaces import (
     HasInnerState,
     IsAttentionFree,
@@ -35,7 +34,7 @@
 
 from .utils import (
     AutoWeightsLoader,
-    is_pp_missing_parameter,
+    WeightsMapper,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -167,29 +166,12 @@ def forward(
 
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if "A_log" in name:
-                name = name.replace("A_log", "A")
-
-            # Skip loading extra bias for GPTQ models.
-            if name.endswith(".bias") and name not in params_dict:
-                continue
-            if is_pp_missing_parameter(name, self):
-                continue
-
-            param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader", default_weight_loader)
-            weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class Mamba2ForCausalLM(
     nn.Module, HasInnerState, IsAttentionFree, SupportsMambaPrefixCaching
 ):
+    hf_to_vllm_mapper = WeightsMapper(orig_to_new_substr={".A_log": ".A"})
+
     @classmethod
     def get_mamba_state_dtype_from_config(
         cls,
@@ -292,4 +274,4 @@ def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/mimo.py b/vllm/model_executor/models/mimo.py
index 4f67d468ace5..e4247fa8d8df 100644
--- a/vllm/model_executor/models/mimo.py
+++ b/vllm/model_executor/models/mimo.py
@@ -38,14 +38,10 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
-from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader,
-    maybe_remap_kv_scale_name,
-)
 from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM, Qwen2Model
 from vllm.sequence import IntermediateTensors
 
-from .utils import PPMissingLayer, is_pp_missing_parameter, maybe_prefix
+from .utils import AutoWeightsLoader, PPMissingLayer, maybe_prefix
 
 logger = init_logger(__name__)
 
@@ -89,50 +85,6 @@ def forward(
         hidden_states = hidden_states + residual
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters(remove_duplicate=False))
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if "mtp_layers" in name:
-                continue
-            if "rotary_emb.inv_freq" in name:
-                continue
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # Remapping the name of FP8 kv-scale.
-                name = maybe_remap_kv_scale_name(name, params_dict)
-                if name is None:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class MiMoForCausalLM(Qwen2ForCausalLM, nn.Module):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -167,6 +119,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.model.make_empty_intermediate_tensors
         )
 
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        """Load checkpoint weights into this model with `AutoWeightsLoader`.
+
+        Weights under `lm_head.` are skipped when `config.tie_word_embeddings`
+        is set, and weights under `model.mtp_layers.` are always skipped.
+        """
+        skip_prefixes = ["lm_head."] if self.config.tie_word_embeddings else []
+        # MTP layers are loaded by the draft model, not the main model.
+        skip_prefixes.append("model.mtp_layers.")
+        loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes)
+        return loader.load_weights(weights)
+
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
diff --git a/vllm/model_executor/models/mistral_eagle.py b/vllm/model_executor/models/mistral_eagle.py
index 8865742d6495..75d1ebb91a80 100644
--- a/vllm/model_executor/models/mistral_eagle.py
+++ b/vllm/model_executor/models/mistral_eagle.py
@@ -108,11 +108,6 @@ def forward(
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states, hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        # Pretend embed_tokens is loaded; the actual weight is shared
-        # from the target model at runtime by `load_eagle_model`.
-        return super().load_weights(weights) | {"embed_tokens.weight"}
-
 
 class EagleMistralForCausalLM(MistralForCausalLM):
     mistral_mapping = MistralForCausalLM.mistral_mapping | {
@@ -166,3 +161,8 @@ def embed_input_ids(
             multimodal_embeddings=multimodal_embeddings,
             is_multimodal=is_multimodal,
         )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        # Pretend embed_tokens is loaded; the actual weight is shared
+        # from the target model at runtime by `load_eagle_model`.
+        return super().load_weights(weights) | {"model.embed_tokens.weight"}
diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py
index 85933626cd30..8e509fbcb4c6 100644
--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@@ -27,13 +27,11 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsPP
 from .utils import (
     AutoWeightsLoader,
-    is_pp_missing_parameter,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -274,21 +272,6 @@ def forward(
         hidden_states = self.norm_f(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        params_dict = dict(self.named_parameters(remove_duplicate=False))
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            # Skip loading extra bias for GPTQ models.
-            if name.endswith(".bias") and name not in params_dict:
-                continue
-            if is_pp_missing_parameter(name, self):
-                continue
-            param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader", default_weight_loader)
-            weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class MPTForCausalLM(nn.Module, SupportsPP):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 9c39c6497082..e27ed683d3e1 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -54,10 +54,6 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader,
-    maybe_remap_kv_scale_name,
-)
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import is_interleaved, set_default_rope_theta
 from vllm.v1.attention.backend import AttentionType
@@ -72,8 +68,8 @@
 from .utils import (
     AutoWeightsLoader,
     PPMissingLayer,
+    WeightsMapper,
     extract_layer_index,
-    is_pp_missing_parameter,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -323,6 +319,16 @@ def forward(
     }
 )
 class Qwen2Model(nn.Module, EagleModelMixin):
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+            ".gate_proj": ".gate_up_proj.0",
+            ".up_proj": ".gate_up_proj.1",
+        }
+    )
+
     def __init__(
         self,
         *,
@@ -426,74 +432,13 @@ def forward(
         return hidden_states
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters(remove_duplicate=False))
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
-                continue
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                if name.endswith("scale"):
-                    # Remapping the name of FP8 kv-scale.
-                    name = maybe_remap_kv_scale_name(name, params_dict)
-                    if name is None:
-                        continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                if weight_loader == default_weight_loader:
-                    weight_loader(param, loaded_weight)
-                else:
-                    weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # Remapping the name of FP8 kv-scale.
-                name = maybe_remap_kv_scale_name(name, params_dict)
-                if name is None:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                if name not in params_dict:
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
 
 class Qwen2ForCausalLM(
     nn.Module, SupportsLoRA, SupportsPP, SupportsEagle, SupportsEagle3
 ):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config.get_text_config()
diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py
index cdf1a327efe5..08c036e1a9b8 100644
--- a/vllm/model_executor/models/qwen2_rm.py
+++ b/vllm/model_executor/models/qwen2_rm.py
@@ -28,18 +28,6 @@ class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP):
     is_pooling_model = True
     pooler: Pooler
 
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py
index b070eac32551..0bebd7d367e9 100644
--- a/vllm/model_executor/models/qwen3.py
+++ b/vllm/model_executor/models/qwen3.py
@@ -57,7 +57,12 @@
 )
 from .qwen2 import Qwen2MLP as Qwen3MLP
 from .qwen2 import Qwen2Model
-from .utils import AutoWeightsLoader, PPMissingLayer, extract_layer_index, maybe_prefix
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    extract_layer_index,
+    maybe_prefix,
+)
 
 logger = init_logger(__name__)
 
@@ -267,18 +272,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 class Qwen3ForCausalLM(
     LocalArgmaxMixin, nn.Module, SupportsLoRA, SupportsPP, SupportsEagle, SupportsEagle3
 ):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
-
     embedding_modules = {
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index 628186e7598b..a0a06264a7f6 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -44,7 +44,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.whisper_utils import (
     ISO639_1_SUPPORTED_LANGS,
 )
@@ -617,42 +616,6 @@ def get_encoder_outputs(
             return None
         return self.encoder(input_features)
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            (".self_attn.qkv_proj", ".self_attn.q_proj", "q"),
-            (".self_attn.qkv_proj", ".self_attn.k_proj", "k"),
-            (".self_attn.qkv_proj", ".self_attn.v_proj", "v"),
-            # MergedColumnParallelLinear uses integer indices (0, 1)
-            (".encoder_attn.kv_proj", ".encoder_attn.k_proj", 0),
-            (".encoder_attn.kv_proj", ".encoder_attn.v_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
 
 class WhisperProcessingInfo(BaseProcessingInfo):
     def get_hf_config(self) -> WhisperConfig:
@@ -808,7 +771,16 @@ class WhisperForConditionalGeneration(
     }
 
     hf_to_vllm_mapper = WeightsMapper(
-        orig_to_new_substr={".fc1.": ".mlp.fc1.", ".fc2.": ".mlp.fc2."}
+        orig_to_new_substr={
+            ".fc1.": ".mlp.fc1.",
+            ".fc2.": ".mlp.fc2.",
+            ".self_attn.q_proj": ".self_attn.qkv_proj.q",
+            ".self_attn.k_proj": ".self_attn.qkv_proj.k",
+            ".self_attn.v_proj": ".self_attn.qkv_proj.v",
+            # MergedColumnParallelLinear uses integer indices (0, 1)
+            ".encoder_attn.k_proj": ".encoder_attn.kv_proj.0",
+            ".encoder_attn.v_proj": ".encoder_attn.kv_proj.1",
+        }
     )
 
     # Whisper only supports audio-conditioned generation.

From c3a316a79e48efc4d2f0dab8ff55e6c79c5e8178 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 11 Jun 2026 14:09:33 +0000
Subject: [PATCH 13/34] Add debug logs while loading

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/layers/linear.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 6e3bd57085f8..dbcdc6e32ca2 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -979,9 +979,22 @@ def load_weights(
                 shard_id_str, _, name = name.partition(".")
                 shard_id = int(shard_id_str)
                 self.checkpoint_format = "sharded"
+                logger.debug(
+                    "Loading shard %s into %s for layer %s.%s",
+                    shard_id,
+                    name,
+                    self.prefix,
+                    name,
+                )
             else:
                 shard_id = None
                 self.checkpoint_format = "fused"
+                logger.debug(
+                    "Loading weight %s.%s with shape %s",
+                    self.prefix,
+                    name,
+                    loaded_weight.shape,
+                )
             # If name is "bias" get it from self, otherwise load into self
             param: Parameter = getattr(self, name, self)
             param.weight_loader(param, loaded_weight, shard_id)
@@ -1412,10 +1425,23 @@ def load_weights(
                 shard_id, _, name = name.partition(".")
                 self.validate_shard_id(shard_id)
                 self.checkpoint_format = "sharded"
+                logger.debug(
+                    "Loading shard %s into %s for layer %s.%s",
+                    shard_id,
+                    name,
+                    self.prefix,
+                    name,
+                )
             else:
                 # Checkpoint is fused
                 shard_id = None
                 self.checkpoint_format = "fused"
+                logger.debug(
+                    "Loading weight %s.%s with shape %s",
+                    self.prefix,
+                    name,
+                    loaded_weight.shape,
+                )
             # If name is "bias" get it from self, otherwise load into self
             param: Parameter = getattr(self, name, self)
             param.weight_loader(param, loaded_weight, shard_id)

From c802faadb715fbf4a5310fc6630050cd99c1fc33 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 11 Jun 2026 14:59:13 +0000
Subject: [PATCH 14/34] Fix late initialised biases

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/model_loader/reload/layerwise.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/model_loader/reload/layerwise.py b/vllm/model_executor/model_loader/reload/layerwise.py
index 6cf1c19cba43..d0d26fed3e6c 100644
--- a/vllm/model_executor/model_loader/reload/layerwise.py
+++ b/vllm/model_executor/model_loader/reload/layerwise.py
@@ -131,8 +131,11 @@ def initialize_online_processing(layer: torch.nn.Module):
     # Track loading progress to determine when to process/copy
     info.load_numel = 0
     info.load_numel_total = get_layer_size(layer)
+    _wrap_parameters_weight_loader(layer)
 
-    # Wrap each parameter's weight loader
+
+def _wrap_parameters_weight_loader(layer: torch.nn.Module) -> None:
+    """Wrap each parameter's weight loader."""
     # Note that nested wrapping will occur for shared tensors
     for name, tensor in get_layer_tensors(layer).items():
         if name in SKIP_TENSORS:
@@ -168,6 +171,12 @@ def online_process_loader(*args, **kwargs):
             logger.debug("%s: Excessive loading", layer.__class__.__name__)
             return
 
+        # Re-run on each load: layers may register parameters later (e.g., `bias`).
+        # Wrap late parameters and refresh `load_numel_total` so processing waits
+        # until all parameters are loaded.
+        info.load_numel_total = get_layer_size(layer)
+        _wrap_parameters_weight_loader(layer)
+
         # Bind and normalize arguments
         bound_args = loader_signature.bind(*args, **kwargs)
         bound_args.apply_defaults()

From 88de67d0a67a588881c78a361adcbe63fbc1c84f Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 11 Jun 2026 16:00:08 +0000
Subject: [PATCH 15/34] Fix GPTQ tests

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 tests/model_executor/test_weight_utils.py | 34 +++++++++++++++++++++++
 vllm/model_executor/layers/linear.py      | 14 ++++++++++
 vllm/model_executor/models/gemma.py       |  4 +--
 vllm/model_executor/models/granite.py     |  4 +--
 vllm/model_executor/models/interfaces.py  |  5 +++-
 vllm/model_executor/models/internlm2.py   |  4 +--
 vllm/model_executor/models/llama.py       |  3 +-
 vllm/model_executor/models/qwen2.py       |  3 +-
 vllm/model_executor/models/utils.py       | 28 +++++++++++++++++++
 9 files changed, 90 insertions(+), 9 deletions(-)

diff --git a/tests/model_executor/test_weight_utils.py b/tests/model_executor/test_weight_utils.py
index 9e67609b78e4..f2ac7a8ba26c 100644
--- a/tests/model_executor/test_weight_utils.py
+++ b/tests/model_executor/test_weight_utils.py
@@ -281,5 +281,39 @@ def test_composes_with_qkv_mapper(self):
         )
 
 
+def test_weights_mapper_get_packed_modules_mapping():
+    from vllm.model_executor.models.utils import WeightsMapper
+
+    mapper = WeightsMapper(
+        orig_to_new_substr={
+            ".q_proj": ".qkv_proj.q",
+            ".k_proj": ".qkv_proj.k",
+            ".v_proj": ".qkv_proj.v",
+            ".gate_proj": ".gate_up_proj.0",
+            ".up_proj": ".gate_up_proj.1",
+            # Non-fusion entries must not contribute
+            ".word_embeddings": "",
+            "llm.model.": "model.decoder.",
+            "llm.lm_head": "lm_head",
+        }
+    )
+    assert mapper.get_packed_modules_mapping() == {
+        "qkv_proj": ["qkv_proj.q", "qkv_proj.k", "qkv_proj.v"],
+        "gate_up_proj": ["gate_up_proj.0", "gate_up_proj.1"],
+    }
+
+    # Shard order comes from the shard id, not declaration order, and
+    # dotted module paths reduce to the last component
+    mapper = WeightsMapper(
+        orig_to_new_substr={
+            "linear_proj.dense_h_to_4h": "linear_proj.merged_proj.1",
+            "linear_proj.gate_proj": "linear_proj.merged_proj.0",
+        }
+    )
+    assert mapper.get_packed_modules_mapping() == {
+        "merged_proj": ["merged_proj.0", "merged_proj.1"],
+    }
+
+
 if __name__ == "__main__":
     test_download_weights_from_hf()
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index dbcdc6e32ca2..1dc03b34ecc4 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -997,6 +997,13 @@ def load_weights(
                 )
             # If name is "bias" get it from self, otherwise load into self
             param: Parameter = getattr(self, name, self)
+            if (
+                param is None
+                and name == "bias"
+                and self.quant_config is not None
+                and "gptq" in self.quant_config.get_name()
+            ):
+                continue
             param.weight_loader(param, loaded_weight, shard_id)
             yield name
 
@@ -1444,6 +1451,13 @@ def load_weights(
                 )
             # If name is "bias" get it from self, otherwise load into self
             param: Parameter = getattr(self, name, self)
+            if (
+                param is None
+                and name == "bias"
+                and self.quant_config is not None
+                and "gptq" in self.quant_config.get_name()
+            ):
+                continue
             param.weight_loader(param, loaded_weight, shard_id)
             yield name
 
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 64808e95ae2b..f95cdc161482 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -44,7 +44,7 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import SupportsLoRA, SupportsPP
+from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
 from .utils import (
     AutoWeightsLoader,
     WeightsMapper,
@@ -324,7 +324,7 @@ def forward(
         return hidden_states
 
 
-class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={
             ".q_proj": ".qkv_proj.q",
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index 518dce8453d5..77775873de8d 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -51,7 +51,7 @@
 )
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import SupportsLoRA, SupportsPP
+from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
 from .utils import (
     AutoWeightsLoader,
     PPMissingLayer,
@@ -248,7 +248,7 @@ def forward(
 
 
 @support_torch_compile
-class GraniteModel(nn.Module):
+class GraniteModel(nn.Module, SupportsQuant):
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={
             ".q_proj": ".qkv_proj.q",
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 68dbcf90f877..093514cc0da3 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -1032,8 +1032,11 @@ def _maybe_apply_model_mapping(self):
             return
         if (hf_to_vllm_mapper := self.hf_to_vllm_mapper) is not None:
             self.quant_config.apply_vllm_mapper(hf_to_vllm_mapper)
-        if self.packed_modules_mapping is not None:
+        if self.packed_modules_mapping:
             self.quant_config.packed_modules_mapping.update(self.packed_modules_mapping)
+        elif hf_to_vllm_mapper is not None:
+            packed_modules_mapping = hf_to_vllm_mapper.get_packed_modules_mapping()
+            self.quant_config.packed_modules_mapping.update(packed_modules_mapping)
 
 
 @runtime_checkable
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 8eeadf2d0202..eb726e48f956 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -37,7 +37,7 @@
 )
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import SupportsLoRA, SupportsPP
+from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
 from .interfaces_base import default_pooling_type
 from .utils import (
     AutoWeightsLoader,
@@ -247,7 +247,7 @@ def forward(
 
 
 @support_torch_compile
-class InternLM2Model(nn.Module):
+class InternLM2Model(nn.Module, SupportsQuant):
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={
             ".w1": ".gate_up_proj.0",
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 6290509923a3..b9474648b78e 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -63,6 +63,7 @@
     SupportsEagle3,
     SupportsLoRA,
     SupportsPP,
+    SupportsQuant,
 )
 from .utils import (
     AutoWeightsLoader,
@@ -344,7 +345,7 @@ def get_quant_config(self, vllm_config: VllmConfig) -> QuantizationConfig | None
         "inputs_embeds": {0: "b"},
     },
 )
-class LlamaModel(nn.Module, EagleModelMixin):
+class LlamaModel(nn.Module, EagleModelMixin, SupportsQuant):
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={
             ".q_proj": ".qkv_proj.q",
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index e27ed683d3e1..d5a9e1aab50a 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -64,6 +64,7 @@
     SupportsEagle3,
     SupportsLoRA,
     SupportsPP,
+    SupportsQuant,
 )
 from .utils import (
     AutoWeightsLoader,
@@ -318,7 +319,7 @@ def forward(
         "inputs_embeds": {0: "b"},
     }
 )
-class Qwen2Model(nn.Module, EagleModelMixin):
+class Qwen2Model(nn.Module, EagleModelMixin, SupportsQuant):
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={
             ".q_proj": ".qkv_proj.q",
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index b89f5b6db684..7161640f4e5f 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -120,6 +120,34 @@ def apply_dict(self, values: dict[str, Any]) -> dict[str, Any]:
             if (out_name := self._map_name(name)) is not None
         }
 
+    def get_packed_modules_mapping(self) -> dict[str, list[str]]:
+        """Derive a `packed_modules_mapping` from this mapper's fusion entries."""
+        qkv_order = {"q": 0, "k": 1, "v": 2}
+        packed: dict[str, list[tuple[int, str]]] = {}
+        mappings = (
+            self.orig_to_new_substr,
+            self.orig_to_new_prefix,
+            self.orig_to_new_suffix,
+        )
+        for mapping in mappings:
+            for new in mapping.values():
+                if new is None or "." not in new:
+                    continue
+                param_path, _, shard_id = new.rpartition(".")
+                if shard_id.isdigit():
+                    order = int(shard_id)
+                elif shard_id in qkv_order:
+                    order = qkv_order[shard_id]
+                else:
+                    continue
+                param_name = param_path.lstrip(".").rpartition(".")[2]
+                shards = packed.setdefault(param_name, [])
+                shards.append((order, f"{param_name}.{shard_id}"))
+        return {
+            name: [shard for _, shard in sorted(shards)]
+            for name, shards in packed.items()
+        }
+
 
 class AutoWeightsLoader:
     """

From 4069aaef2fcc0ca2b7c9ba9b70a65ba370c7faf2 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 11 Jun 2026 16:36:19 +0000
Subject: [PATCH 16/34] Fix BitsAndBytes loader lookup of `hf_to_vllm_mapper`

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .../model_loader/bitsandbytes_loader.py         | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py
index d10f3bfcbe9b..86be8ad1d179 100644
--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -563,7 +563,6 @@ def _initialize_loader_state(
         configuration.
         """
         self.is_pool_model = is_pooling_model(model)
-        self.modules_mapping = ParamMapping(get_packed_modules_mapping(model))
 
         if is_moe_model(model):
             self.expert_params_mapping = get_moe_expert_mapping(model)
@@ -573,10 +572,18 @@ def _initialize_loader_state(
                     "BitsAndBytes quantization yet. Ensure this model has "
                     "'get_expert_mapping' method."
                 )
-        # For some models like Molmo, we need to use hf_to_vllm_mapper
-        # to ensure correct loading of weights.
-        if hf_to_vllm_mapper := getattr(model, "hf_to_vllm_mapper", None):
-            self.weight_mapper = lambda name: hf_to_vllm_mapper._map_name(name)
+        packed_modules_mapping = get_packed_modules_mapping(model)
+        # `hf_to_vllm_mapper` may belong to model or base model
+        for module in (model, *model.children()):
+            if hf_to_vllm_mapper := getattr(module, "hf_to_vllm_mapper", None):
+                self.weight_mapper = lambda name: hf_to_vllm_mapper._map_name(name)
+                # If model had no `packed_modules_mapping`, try to get it from mapper
+                if not packed_modules_mapping:
+                    packed_modules_mapping = (
+                        hf_to_vllm_mapper.get_packed_modules_mapping()
+                    )
+                break
+        self.modules_mapping = ParamMapping(packed_modules_mapping)
 
         self._get_bnb_target_modules(model)
         self._classify_module_sharding(model)

From 872ff37de619f5c91ad442af908d15c9ce2793d1 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Sat, 13 Jun 2026 15:01:30 +0000
Subject: [PATCH 17/34] Make
 `vllm.model_executor.utils.get_packed_modules_mapping` check
 `hf_to_vllm_mapper` too

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/utils.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py
index a0269be855a9..d41cbf0f75ce 100644
--- a/vllm/model_executor/utils.py
+++ b/vllm/model_executor/utils.py
@@ -98,9 +98,26 @@ def replace_parameter(
     setattr(layer, param_name, new_param)
 
 
+def _get_packed_modules_mapping(module: torch.nn.Module) -> dict[str, list[str]]:
+    """Get the packed modules mapping from a module.
+
+    It could come from one of two places:
+
+    1. The module has a `packed_modules_mapping` attribute.
+    2. The module has a `hf_to_vllm_mapper` attribute, which can generate the mapping.
+
+    No module should have both attributes, and if it does,
+    the `packed_modules_mapping` attribute takes precedence."""
+    if packed_modules_mapping := getattr(module, "packed_modules_mapping", None):
+        return copy.deepcopy(packed_modules_mapping)
+    elif hf_to_vllm_mapper := getattr(module, "hf_to_vllm_mapper", None):
+        return hf_to_vllm_mapper.get_packed_modules_mapping()
+    else:
+        return {}
+
+
 def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]:
-    parent_map = getattr(model, "packed_modules_mapping", None)
-    parent_map = copy.deepcopy(parent_map) if parent_map is not None else {}
+    parent_map = _get_packed_modules_mapping(model)
 
     # don't infer mapping if the model has defined it explicitly.
     if parent_map:
@@ -108,8 +125,7 @@ def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]:
 
     # We only check main components instead of whole model submodules
     for child in model.children():
-        child_map = getattr(child, "packed_modules_mapping", None)
-        child_map = copy.deepcopy(child_map) if child_map is not None else {}
+        child_map = _get_packed_modules_mapping(child)
 
         if any((k in parent_map and parent_map[k] != v) for k, v in child_map.items()):
             raise ValueError(

From 3b73687bced12246532f3e6383c6b24ce4418c94 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Sat, 13 Jun 2026 15:02:29 +0000
Subject: [PATCH 18/34] Fix `WeightsMapper.get_packed_modules_mapping`

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/models/utils.py | 47 +++++++++++------------------
 1 file changed, 17 insertions(+), 30 deletions(-)

diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index e5366256f296..a5b8410c00c3 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -129,32 +129,20 @@ def apply_dict(self, values: dict[str, Any]) -> dict[str, Any]:
         }
 
     def get_packed_modules_mapping(self) -> dict[str, list[str]]:
-        """Derive a `packed_modules_mapping` from this mapper's fusion entries."""
-        qkv_order = {"q": 0, "k": 1, "v": 2}
-        packed: dict[str, list[tuple[int, str]]] = {}
-        mappings = (
-            self.orig_to_new_substr,
-            self.orig_to_new_prefix,
-            self.orig_to_new_suffix,
-        )
-        for mapping in mappings:
-            for new in mapping.values():
-                if new is None or "." not in new:
-                    continue
-                param_path, _, shard_id = new.rpartition(".")
-                if shard_id.isdigit():
-                    order = int(shard_id)
-                elif shard_id in qkv_order:
-                    order = qkv_order[shard_id]
-                else:
-                    continue
-                param_name = param_path.lstrip(".").rpartition(".")[2]
-                shards = packed.setdefault(param_name, [])
-                shards.append((order, f"{param_name}.{shard_id}"))
-        return {
-            name: [shard for _, shard in sorted(shards)]
-            for name, shards in packed.items()
-        }
+        """Derive a `packed_modules_mapping` from `self.orig_to_new_substr`."""
+        qkv_shards = {"q", "k", "v"}
+        packed_modules_mapping: dict[str, list[str]] = {}
+        for old, new in self.orig_to_new_substr.items():
+            if new is None or "." not in new:
+                continue
+            param_path, _, shard_id = new.rpartition(".")
+            # Is shard_id actually a shard ID?
+            if not (shard_id.isdigit() or shard_id in qkv_shards):
+                continue
+            _, _, weight_name = old.rpartition(".")
+            _, _, param_name = param_path.rpartition(".")
+            packed_modules_mapping.setdefault(param_name, []).append(weight_name)
+        return packed_modules_mapping
 
 
 class AutoWeightsLoader:
@@ -396,12 +384,11 @@ def load_weights(
             # Skip loading extra bias for GPTQ models
             if "gptq" in quant_config.get_name():
                 self.ignore_unexpected_suffixes.append(".bias")
-            # Get mappings for KV cache quantization scales
+            # Get mappings and ignore prefixes for KV cache quantization scales
             mapper = mapper or WeightsMapper()
             mapper |= quant_config.get_cache_scale_mapper()
-            self.ignore_unexpected_suffixes.extend(
-                quant_config._ignore_unexpected_suffixes
-            )
+            ignore_unexpected_prefixes = quant_config._ignore_unexpected_prefixes
+            self.ignore_unexpected_suffixes.extend(ignore_unexpected_prefixes)
         if mapper is not None:
             weights = mapper.apply(weights)
         # filter out weights with first-prefix/substr to skip in name

From c657d7d2855dd501594a8c6cf888183815ced54b Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Sat, 13 Jun 2026 15:03:18 +0000
Subject: [PATCH 19/34] Better `SupportsQuant._maybe_apply_model_mapping`

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/models/interfaces.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 093514cc0da3..89815722a1d7 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -1032,11 +1032,11 @@ def _maybe_apply_model_mapping(self):
             return
         if (hf_to_vllm_mapper := self.hf_to_vllm_mapper) is not None:
             self.quant_config.apply_vllm_mapper(hf_to_vllm_mapper)
+            if packed_modules_mapping := hf_to_vllm_mapper.get_packed_modules_mapping():
+                self.packed_modules_mapping = self.packed_modules_mapping or {}
+                self.packed_modules_mapping.update(packed_modules_mapping)
         if self.packed_modules_mapping:
             self.quant_config.packed_modules_mapping.update(self.packed_modules_mapping)
-        elif hf_to_vllm_mapper is not None:
-            packed_modules_mapping = hf_to_vllm_mapper.get_packed_modules_mapping()
-            self.quant_config.packed_modules_mapping.update(packed_modules_mapping)
 
 
 @runtime_checkable

From f2d548b1772934db9e91eb35f9d605e616f2a8ff Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Sat, 13 Jun 2026 15:03:37 +0000
Subject: [PATCH 20/34] `BitsAndBytesModelLoader` can be simpler now

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/model_loader/bitsandbytes_loader.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py
index 0162ad300a37..87a310d1fb23 100644
--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -563,6 +563,7 @@ def _initialize_loader_state(
         configuration.
         """
         self.is_pool_model = is_pooling_model(model)
+        self.modules_mapping = ParamMapping(get_packed_modules_mapping(model))
 
         if is_moe_model(model):
             self.expert_params_mapping = get_moe_expert_mapping(model)
@@ -572,18 +573,11 @@ def _initialize_loader_state(
                     "BitsAndBytes quantization yet. Ensure this model has "
                     "'get_expert_mapping' method."
                 )
-        packed_modules_mapping = get_packed_modules_mapping(model)
         # `hf_to_vllm_mapper` may belong to model or base model
         for module in (model, *model.children()):
             if hf_to_vllm_mapper := getattr(module, "hf_to_vllm_mapper", None):
                 self.weight_mapper = lambda name: hf_to_vllm_mapper._map_name(name)
-                # If model had no `packed_modules_mapping`, try to get it from mapper
-                if not packed_modules_mapping:
-                    packed_modules_mapping = (
-                        hf_to_vllm_mapper.get_packed_modules_mapping()
-                    )
                 break
-        self.modules_mapping = ParamMapping(packed_modules_mapping)
 
         self._get_bnb_target_modules(model)
         self._classify_module_sharding(model)

From 41e3a9e9c9e63b7d9bc1aaca5b38c0d02fa3809a Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Sat, 13 Jun 2026 15:04:42 +0000
Subject: [PATCH 21/34] Use `get_packed_modules_mapping` for
 `get_supported_lora_modules`

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/lora/utils.py | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index 11ff37465339..fd624593b998 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -34,11 +34,7 @@
     VocabParallelEmbeddingWithLoRA,
 )
 from vllm.model_executor.layers.fused_moe import MoERunner
-from vllm.model_executor.layers.linear import (
-    LinearBase,
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-)
+from vllm.model_executor.layers.linear import LinearBase
 from vllm.model_executor.utils import get_moe_expert_mapping, get_packed_modules_mapping
 from vllm.transformers_utils.repo_utils import hf_api
 
@@ -214,7 +210,8 @@ def get_supported_lora_modules(model: nn.Module) -> list[str]:
     In vLLM, all linear layers support LoRA.
     """
 
-    supported_lora_modules: set[str] = set()
+    packed_modules_mapping = get_packed_modules_mapping(model)
+    supported_lora_modules: set[str] = set(sum(packed_modules_mapping.values(), []))
     for name, module in model.named_modules():
         # get the embedding modules if the module's embedding_modules
         # is not empty.
@@ -224,18 +221,10 @@ def get_supported_lora_modules(model: nn.Module) -> list[str]:
                 supported_lora_modules.add(name)
 
         if (
-            isinstance(module, QKVParallelLinear)
-            and module.checkpoint_format == "sharded"
-        ):
-            supported_lora_modules.update(["q", "k", "v"])
-        elif (
-            isinstance(module, MergedColumnParallelLinear)
-            and module.checkpoint_format == "sharded"
+            isinstance(module, (LinearBase, MoERunner))
+            and (supported_name := name.split(".")[-1]) not in packed_modules_mapping
         ):
-            shard_ids = [str(i) for i in range(len(module.output_sizes))]
-            supported_lora_modules.update(shard_ids)
-        elif isinstance(module, (LinearBase, MoERunner)):
-            supported_lora_modules.add(name.split(".")[-1])
+            supported_lora_modules.add(supported_name)
 
     return list(supported_lora_modules)
 

From 68085a0ca4395cb96ecaf546bddc52e052fc4fc5 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Sat, 13 Jun 2026 15:12:50 +0000
Subject: [PATCH 22/34] Fix expected mapping in `WeightsMapper` packed modules test

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 tests/model_executor/test_weight_utils.py | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/tests/model_executor/test_weight_utils.py b/tests/model_executor/test_weight_utils.py
index f2ac7a8ba26c..202a41d36e3b 100644
--- a/tests/model_executor/test_weight_utils.py
+++ b/tests/model_executor/test_weight_utils.py
@@ -298,20 +298,8 @@ def test_weights_mapper_get_packed_modules_mapping():
         }
     )
     assert mapper.get_packed_modules_mapping() == {
-        "qkv_proj": ["qkv_proj.q", "qkv_proj.k", "qkv_proj.v"],
-        "gate_up_proj": ["gate_up_proj.0", "gate_up_proj.1"],
-    }
-
-    # Shard order comes from the shard id, not declaration order, and
-    # dotted module paths reduce to the last component
-    mapper = WeightsMapper(
-        orig_to_new_substr={
-            "linear_proj.dense_h_to_4h": "linear_proj.merged_proj.1",
-            "linear_proj.gate_proj": "linear_proj.merged_proj.0",
-        }
-    )
-    assert mapper.get_packed_modules_mapping() == {
-        "merged_proj": ["merged_proj.0", "merged_proj.1"],
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
     }
 
 

From 2f56a4264fb063e1a94818687952cd5517ed5d27 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Sat, 13 Jun 2026 15:25:41 +0000
Subject: [PATCH 23/34] Minor cleanups in interfaces, jina and qwen3

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/models/interfaces.py | 2 +-
 vllm/model_executor/models/jina.py       | 6 ++----
 vllm/model_executor/models/qwen3.py      | 7 +------
 3 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 89815722a1d7..3040f3283f08 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -1035,7 +1035,7 @@ def _maybe_apply_model_mapping(self):
             if packed_modules_mapping := hf_to_vllm_mapper.get_packed_modules_mapping():
                 self.packed_modules_mapping = self.packed_modules_mapping or {}
                 self.packed_modules_mapping.update(packed_modules_mapping)
-        if self.packed_modules_mapping:
+        if self.packed_modules_mapping is not None:
             self.quant_config.packed_modules_mapping.update(self.packed_modules_mapping)
 
 
diff --git a/vllm/model_executor/models/jina.py b/vllm/model_executor/models/jina.py
index f1f585cdae8d..82a534404027 100644
--- a/vllm/model_executor/models/jina.py
+++ b/vllm/model_executor/models/jina.py
@@ -255,7 +255,5 @@ def _merge_weights(
                 yield name, tensor
 
         loader = AutoWeightsLoader(self.model, ignore_unexpected_prefixes=["lm_head."])
-        loaded = loader.load_weights(
-            _merge_weights(weights), mapper=self.model.hf_to_vllm_mapper
-        )
-        return {f"model.{name}" for name in loaded}
+        weights = _merge_weights(weights)
+        return loader.load_weights(weights, mapper=self.model.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py
index 0bebd7d367e9..06f721209fd0 100644
--- a/vllm/model_executor/models/qwen3.py
+++ b/vllm/model_executor/models/qwen3.py
@@ -57,12 +57,7 @@
 )
 from .qwen2 import Qwen2MLP as Qwen3MLP
 from .qwen2 import Qwen2Model
-from .utils import (
-    AutoWeightsLoader,
-    PPMissingLayer,
-    extract_layer_index,
-    maybe_prefix,
-)
+from .utils import AutoWeightsLoader, PPMissingLayer, extract_layer_index, maybe_prefix
 
 logger = init_logger(__name__)
 

From d0518136e7d5d2a5299b21819abd70e558b2893c Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Sat, 13 Jun 2026 15:54:32 +0000
Subject: [PATCH 24/34] Fix prefixes/suffixes typo in `load_weights`

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/models/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index a5b8410c00c3..0d6ece74c435 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -387,8 +387,8 @@ def load_weights(
             # Get mappings and ignore prefixes for KV cache quantization scales
             mapper = mapper or WeightsMapper()
             mapper |= quant_config.get_cache_scale_mapper()
-            ignore_unexpected_prefixes = quant_config._ignore_unexpected_prefixes
-            self.ignore_unexpected_suffixes.extend(ignore_unexpected_prefixes)
+            ignore_unexpected_suffixes = quant_config._ignore_unexpected_suffixes
+            self.ignore_unexpected_suffixes.extend(ignore_unexpected_suffixes)
         if mapper is not None:
             weights = mapper.apply(weights)
         # filter out weights with first-prefix/substr to skip in name

From 5c2a354a2b1ff4b81cd1c0dfde3d4860b059ea8c Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Sat, 13 Jun 2026 17:22:52 +0000
Subject: [PATCH 25/34] Mapper must present both shard id and weight name as
 supported packings

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/models/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 0d6ece74c435..7c10c2e939ff 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -141,7 +141,8 @@ def get_packed_modules_mapping(self) -> dict[str, list[str]]:
                 continue
             _, _, weight_name = old.rpartition(".")
             _, _, param_name = param_path.rpartition(".")
-            packed_modules_mapping.setdefault(param_name, []).append(weight_name)
+            packed_names = packed_modules_mapping.setdefault(param_name, [])
+            packed_names.extend([weight_name, shard_id])
         return packed_modules_mapping
 
 

From b5bdb582b186999c6e864e958c1afe023f301eb9 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Sat, 13 Jun 2026 19:08:54 +0000
Subject: [PATCH 26/34] Fix LoRA HuggingFace test to use `LlamaModel` mapper

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 tests/lora/test_lora_huggingface.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py
index 7c7f4eb4b626..49601a7e0786 100644
--- a/tests/lora/test_lora_huggingface.py
+++ b/tests/lora/test_lora_huggingface.py
@@ -6,7 +6,7 @@
 from vllm.lora.lora_model import LoRAModel
 from vllm.lora.peft_helper import PEFTHelper
 from vllm.lora.utils import get_adapter_absolute_path
-from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM
+from vllm.model_executor.models.llama import LlamaModel
 
 # Provide absolute path and huggingface lora ids
 lora_fixture_name = ["llama32_lora_files", "llama32_lora_huggingface_id"]
@@ -23,7 +23,7 @@
 @pytest.mark.parametrize("lora_fixture_name", lora_fixture_name)
 def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
     lora_name = request.getfixturevalue(lora_fixture_name)
-    packed_modules_mapping = Qwen3ForCausalLM.packed_modules_mapping
+    packed_modules_mapping = LlamaModel.hf_to_vllm_mapper.get_packed_modules_mapping()
 
     expected_lora_lst: list[str] = []
     for module in LLAMA_LORA_MODULES:

From dbf02b1cbdebf8c7076eabb030e731b2bf501274 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Sat, 13 Jun 2026 19:10:47 +0000
Subject: [PATCH 27/34] `AutoWeightsLoader` inject packed mappings from mapper
 at load time

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/models/utils.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 7c10c2e939ff..1a22ec92a3b9 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -5,7 +5,7 @@
 from collections.abc import Callable, Iterable, Mapping
 from contextlib import contextmanager
 from dataclasses import dataclass, field
-from typing import Any, Literal, Protocol, overload
+from typing import TYPE_CHECKING, Any, Literal, Protocol, overload
 
 import regex as re
 import torch
@@ -19,7 +19,6 @@
     get_tensor_model_parallel_world_size,
 )
 from vllm.logger import init_logger
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.model_loader.reload import (
     support_quantized_model_reload_from_hp_weights,
 )
@@ -33,6 +32,9 @@
     direct_register_custom_op,
 )
 
+if TYPE_CHECKING:
+    from vllm.model_executor.layers.quantization import QuantizationConfig
+
 logger = init_logger(__name__)
 
 
@@ -390,6 +392,9 @@ def load_weights(
             mapper |= quant_config.get_cache_scale_mapper()
             ignore_unexpected_suffixes = quant_config._ignore_unexpected_suffixes
             self.ignore_unexpected_suffixes.extend(ignore_unexpected_suffixes)
+            # If mapper contains packed_modules_mapping, update them in quant_config
+            if packed_modules_mapping := mapper.get_packed_modules_mapping():
+                quant_config.packed_modules_mapping.update(packed_modules_mapping)
         if mapper is not None:
             weights = mapper.apply(weights)
         # filter out weights with first-prefix/substr to skip in name
@@ -758,9 +763,7 @@ def maybe_prefix(prefix: str, name: str) -> str:
     return name if not prefix else f"{prefix}.{name}"
 
 
-def get_draft_quant_config(
-    vllm_config: VllmConfig,
-) -> QuantizationConfig | None:
+def get_draft_quant_config(vllm_config: VllmConfig) -> "QuantizationConfig | None":
     """Get quantization config for Draft models.
 
     Draft models should use their own quantization config instead of the verifier/target

From a97a1a15395dcce03a886c707ad7320128765b5f Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Sat, 13 Jun 2026 22:55:05 +0000
Subject: [PATCH 28/34] Revert LoRA test changes

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 tests/lora/conftest.py              | 21 --------------
 tests/lora/test_lora_checkpoints.py | 44 +++++++++++++++++++----------
 tests/lora/test_lora_huggingface.py |  4 +--
 3 files changed, 31 insertions(+), 38 deletions(-)

diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 7d9e8444827f..dea54ed21aea 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -19,7 +19,6 @@
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     MergedColumnParallelLinear,
-    QKVParallelLinear,
     RowParallelLinear,
 )
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
@@ -167,26 +166,6 @@ def dummy_model_gate_up(default_vllm_config) -> nn.Module:
     return model
 
 
-@pytest.fixture
-def baichuan_dummy_model(default_vllm_config, dist_init) -> nn.Module:
-    # Only includes BaiChuan's lora modules so get_supported_lora_modules will work
-    model = DummyLoRAModel(
-        OrderedDict(
-            [
-                ("W_pack", QKVParallelLinear(64, 8, 8)),
-                ("o_proj", RowParallelLinear(64, 64)),
-                ("gate_up_proj", MergedColumnParallelLinear(64, [16, 16])),
-                ("down_proj", RowParallelLinear(16, 64)),
-            ]
-        )
-    )
-    model.config = MagicMock()
-    # Match the expected format for BaiChuan checkpoints
-    model.W_pack.checkpoint_format = "fused"
-    model.gate_up_proj.checkpoint_format = "sharded"
-    return model
-
-
 @pytest.fixture(scope="session")
 def mixtral_lora_files():
     # Note: this module has incorrect adapter_config.json to test
diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py
index 0a54a80242be..7c263e2a2276 100644
--- a/tests/lora/test_lora_checkpoints.py
+++ b/tests/lora/test_lora_checkpoints.py
@@ -5,26 +5,37 @@
 
 from vllm.lora.lora_model import LoRAModel
 from vllm.lora.peft_helper import PEFTHelper
-from vllm.lora.utils import get_supported_lora_modules, parse_fine_tuned_lora_name
+from vllm.lora.utils import parse_fine_tuned_lora_name
 from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
 from vllm.model_executor.models.gemma4 import Gemma4ForCausalLM
 from vllm.model_executor.models.utils import WeightsMapper
 
+lora_lst = ["baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"]
+BAICHUAN_LORA_MODULES = [
+    "W_pack",
+    "o_proj",
+    "gate_up_proj",
+    "down_proj",
+]
 
-@pytest.mark.parametrize(
-    "lora_name",
-    ["baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"],
-)
+
+@pytest.mark.parametrize("lora_name", lora_lst)
 def test_load_checkpoints(
     lora_name,
     baichuan_lora_files,
     baichuan_zero_lora_files,
     baichuan_regex_lora_files,
     chatglm3_lora_files,
-    baichuan_dummy_model,
 ):
-    expected_lora_modules = set(get_supported_lora_modules(baichuan_dummy_model))
-    weights_mapper = BaiChuanBaseForCausalLM.hf_to_vllm_mapper
+    packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
+
+    expected_lora_lst: list[str] = []
+    for module in BAICHUAN_LORA_MODULES:
+        if module in packed_modules_mapping:
+            expected_lora_lst.extend(packed_modules_mapping[module])
+        else:
+            expected_lora_lst.append(module)
+    expected_lora_modules = set(expected_lora_lst)
     if lora_name == "baichuan7B":
         peft_helper = PEFTHelper.from_local_dir(
             baichuan_lora_files, max_position_embeddings=4096
@@ -38,7 +49,6 @@ def test_load_checkpoints(
             lora_model_id=1,
             device="cpu",
             model_vocab_size=64000,
-            weights_mapper=weights_mapper,
         )
     elif lora_name == "baichuan7B-zero":
         # Test that the target_modules contain prefix
@@ -54,7 +64,6 @@ def test_load_checkpoints(
             lora_model_id=1,
             device="cpu",
             model_vocab_size=64000,
-            weights_mapper=weights_mapper,
         )
     elif lora_name == "baichuan7B-zero-regex":
         # Test that the `target_modules` in the form of regular expressions,
@@ -69,7 +78,6 @@ def test_load_checkpoints(
             lora_model_id=1,
             device="cpu",
             model_vocab_size=64000,
-            weights_mapper=weights_mapper,
         )
     else:
         # For the baichuan7B model, load chatglm3-6b's LoRA,
@@ -89,16 +97,22 @@ def test_load_checkpoints(
             )
 
 
-def test_lora_weights_mapping(baichuan_lora_files, baichuan_dummy_model):
-    expected_lora_modules = set(get_supported_lora_modules(baichuan_dummy_model))
+def test_lora_weights_mapping(baichuan_lora_files):
+    packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
+
+    expected_lora_lst: list[str] = []
+    for module in BAICHUAN_LORA_MODULES:
+        if module in packed_modules_mapping:
+            expected_lora_lst.extend(packed_modules_mapping[module])
+        else:
+            expected_lora_lst.append(module)
+    expected_lora_modules = set(expected_lora_lst)
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "model.": "language_model.model.",
         },
         orig_to_new_substr={
             ".layers.": ".baichuan_layers.",
-            ".gate_proj": ".gate_up_proj.0",
-            ".up_proj": ".gate_up_proj.1",
         },
     )
     peft_helper = PEFTHelper.from_local_dir(
diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py
index 49601a7e0786..53253278ad80 100644
--- a/tests/lora/test_lora_huggingface.py
+++ b/tests/lora/test_lora_huggingface.py
@@ -6,7 +6,7 @@
 from vllm.lora.lora_model import LoRAModel
 from vllm.lora.peft_helper import PEFTHelper
 from vllm.lora.utils import get_adapter_absolute_path
-from vllm.model_executor.models.llama import LlamaModel
+from vllm.model_executor.models.llama import LlamaForCausalLM
 
 # Provide absolute path and huggingface lora ids
 lora_fixture_name = ["llama32_lora_files", "llama32_lora_huggingface_id"]
@@ -23,7 +23,7 @@
 @pytest.mark.parametrize("lora_fixture_name", lora_fixture_name)
 def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
     lora_name = request.getfixturevalue(lora_fixture_name)
-    packed_modules_mapping = LlamaModel.hf_to_vllm_mapper.get_packed_modules_mapping()
+    packed_modules_mapping = LlamaForCausalLM.packed_modules_mapping
 
     expected_lora_lst: list[str] = []
     for module in LLAMA_LORA_MODULES:

From a33e90ebe146d2f99bc18254fc5b2cb128ae4049 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Sat, 13 Jun 2026 23:00:15 +0000
Subject: [PATCH 29/34] Revert quant/lora hacks; `get_packed_modules_mapping`
 -> `get_unfused_mapper`

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 tests/model_executor/test_weight_utils.py | 22 --------------
 vllm/lora/utils.py                        | 14 ++++-----
 vllm/lora/worker_manager.py               |  6 +++-
 vllm/model_executor/model_loader/utils.py |  2 +-
 vllm/model_executor/models/interfaces.py  |  5 +---
 vllm/model_executor/models/utils.py       | 35 +++++++++++------------
 vllm/model_executor/utils.py              | 24 +++-------------
 7 files changed, 35 insertions(+), 73 deletions(-)

diff --git a/tests/model_executor/test_weight_utils.py b/tests/model_executor/test_weight_utils.py
index 202a41d36e3b..9e67609b78e4 100644
--- a/tests/model_executor/test_weight_utils.py
+++ b/tests/model_executor/test_weight_utils.py
@@ -281,27 +281,5 @@ def test_composes_with_qkv_mapper(self):
         )
 
 
-def test_weights_mapper_get_packed_modules_mapping():
-    from vllm.model_executor.models.utils import WeightsMapper
-
-    mapper = WeightsMapper(
-        orig_to_new_substr={
-            ".q_proj": ".qkv_proj.q",
-            ".k_proj": ".qkv_proj.k",
-            ".v_proj": ".qkv_proj.v",
-            ".gate_proj": ".gate_up_proj.0",
-            ".up_proj": ".gate_up_proj.1",
-            # Non-fusion entries must not contribute
-            ".word_embeddings": "",
-            "llm.model.": "model.decoder.",
-            "llm.lm_head": "lm_head",
-        }
-    )
-    assert mapper.get_packed_modules_mapping() == {
-        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-        "gate_up_proj": ["gate_proj", "up_proj"],
-    }
-
-
 if __name__ == "__main__":
     test_download_weights_from_hf()
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index fd624593b998..828aea712d01 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -210,8 +210,7 @@ def get_supported_lora_modules(model: nn.Module) -> list[str]:
     In vLLM, all linear layers support LoRA.
     """
 
-    packed_modules_mapping = get_packed_modules_mapping(model)
-    supported_lora_modules: set[str] = set(sum(packed_modules_mapping.values(), []))
+    supported_lora_modules: set[str] = set()
     for name, module in model.named_modules():
         # get the embedding modules if the module's embedding_modules
         # is not empty.
@@ -220,11 +219,12 @@ def get_supported_lora_modules(model: nn.Module) -> list[str]:
             for name in embedding_modules:
                 supported_lora_modules.add(name)
 
-        if (
-            isinstance(module, (LinearBase, MoERunner))
-            and (supported_name := name.split(".")[-1]) not in packed_modules_mapping
-        ):
-            supported_lora_modules.add(supported_name)
+        # get all the linear suffixes.
+        if isinstance(module, (LinearBase,)):
+            supported_lora_modules.add(name.split(".")[-1])
+
+        if isinstance(module, (MoERunner,)):
+            supported_lora_modules.add(name.split(".")[-1])
 
     return list(supported_lora_modules)
 
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index 166d5c36ba57..785df09fe400 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -122,9 +122,13 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
             peft_helper.validate_legal(self.lora_config)
 
             # For some models like Qwen2VL, we need to use hf_to_vllm_mapper
-            # to ensure correct loading of lora weights.
+            # to ensure correct loading of lora weights. Drop the QKV/MLP fusion
+            # substr maps so constituent names (e.g. `q_proj`) survive for the
+            # LoRA manager to pack, while keeping genuine renames/prefixes.
             model = self._adapter_manager.model
             hf_to_vllm_mapper = getattr(model, "hf_to_vllm_mapper", None)
+            if hf_to_vllm_mapper is not None:
+                hf_to_vllm_mapper = hf_to_vllm_mapper.get_unfused_mapper()
 
             # Get model-defined prefixes to skip during LoRA loading.
             lora_skip_prefixes = getattr(model, "lora_skip_prefixes", None)
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index fc279c7e9c78..b4b4db11ed3a 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -290,6 +290,6 @@ def configure_quant_config(
 
         # pass mappings by reference to quant_config
         if hf_to_vllm_mapper is not None:
-            quant_config.apply_vllm_mapper(hf_to_vllm_mapper)
+            quant_config.apply_vllm_mapper(hf_to_vllm_mapper.get_unfused_mapper())
         if packed_mapping is not None:
             quant_config.packed_modules_mapping = packed_mapping
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 3040f3283f08..26356fce91cd 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -1031,10 +1031,7 @@ def _maybe_apply_model_mapping(self):
         if self.quant_config is None:
             return
         if (hf_to_vllm_mapper := self.hf_to_vllm_mapper) is not None:
-            self.quant_config.apply_vllm_mapper(hf_to_vllm_mapper)
-            if packed_modules_mapping := hf_to_vllm_mapper.get_packed_modules_mapping():
-                self.packed_modules_mapping = self.packed_modules_mapping or {}
-                self.packed_modules_mapping.update(packed_modules_mapping)
+            self.quant_config.apply_vllm_mapper(hf_to_vllm_mapper.get_unfused_mapper())
         if self.packed_modules_mapping is not None:
             self.quant_config.packed_modules_mapping.update(self.packed_modules_mapping)
 
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 1a22ec92a3b9..55f40fcdc435 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -4,7 +4,7 @@
 import itertools
 from collections.abc import Callable, Iterable, Mapping
 from contextlib import contextmanager
-from dataclasses import dataclass, field
+from dataclasses import dataclass, field, replace
 from typing import TYPE_CHECKING, Any, Literal, Protocol, overload
 
 import regex as re
@@ -130,22 +130,24 @@ def apply_dict(self, values: dict[str, Any]) -> dict[str, Any]:
             if (out_name := self._map_name(name)) is not None
         }
 
-    def get_packed_modules_mapping(self) -> dict[str, list[str]]:
-        """Derive a `packed_modules_mapping` from `self.orig_to_new_substr`."""
+    def get_unfused_mapper(self) -> "WeightsMapper":
+        """Mapper variant that drops the QKV/MLP fusion substr maps, keeping
+        all genuine renames/prefixes.
+
+        Consumers that reference the checkpoint's *unfused* module names — LoRA
+        name parsing and the quantization config's layer lists
+        (`modules_in_block_to_quantize`, ignored layers) — need the constituent
+        names (e.g. `q_proj`) to survive rather than being rewritten to the
+        fused vLLM name (`qkv_proj.q`)."""
         qkv_shards = {"q", "k", "v"}
-        packed_modules_mapping: dict[str, list[str]] = {}
+        substr = {}
         for old, new in self.orig_to_new_substr.items():
-            if new is None or "." not in new:
-                continue
-            param_path, _, shard_id = new.rpartition(".")
-            # Is shard_id actually a shard ID?
-            if not (shard_id.isdigit() or shard_id in qkv_shards):
-                continue
-            _, _, weight_name = old.rpartition(".")
-            _, _, param_name = param_path.rpartition(".")
-            packed_names = packed_modules_mapping.setdefault(param_name, [])
-            packed_names.extend([weight_name, shard_id])
-        return packed_modules_mapping
+            if new is not None and "." in new:
+                shard_id = new.rpartition(".")[2]
+                if shard_id.isdigit() or shard_id in qkv_shards:
+                    continue
+            substr[old] = new
+        return replace(self, orig_to_new_substr=substr)
 
 
 class AutoWeightsLoader:
@@ -392,9 +394,6 @@ def load_weights(
             mapper |= quant_config.get_cache_scale_mapper()
             ignore_unexpected_suffixes = quant_config._ignore_unexpected_suffixes
             self.ignore_unexpected_suffixes.extend(ignore_unexpected_suffixes)
-            # If mapper contains packed_modules_mapping, update them in quant_config
-            if packed_modules_mapping := mapper.get_packed_modules_mapping():
-                quant_config.packed_modules_mapping.update(packed_modules_mapping)
         if mapper is not None:
             weights = mapper.apply(weights)
         # filter out weights with first-prefix/substr to skip in name
diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py
index d41cbf0f75ce..a0269be855a9 100644
--- a/vllm/model_executor/utils.py
+++ b/vllm/model_executor/utils.py
@@ -98,26 +98,9 @@ def replace_parameter(
     setattr(layer, param_name, new_param)
 
 
-def _get_packed_modules_mapping(module: torch.nn.Module) -> dict[str, list[str]]:
-    """Get the packed modules mapping from a module.
-
-    It could come from one of two places:
-
-    1. The module has a `packed_modules_mapping` attribute.
-    2. The module has a `hf_to_vllm_mapper` attribute, which can generate the mapping.
-
-    No module should have both attributes, and if it does,
-    the `packed_modules_mapping` attribute takes precedence."""
-    if packed_modules_mapping := getattr(module, "packed_modules_mapping", None):
-        return copy.deepcopy(packed_modules_mapping)
-    elif hf_to_vllm_mapper := getattr(module, "hf_to_vllm_mapper", None):
-        return hf_to_vllm_mapper.get_packed_modules_mapping()
-    else:
-        return {}
-
-
 def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]:
-    parent_map = _get_packed_modules_mapping(model)
+    parent_map = getattr(model, "packed_modules_mapping", None)
+    parent_map = copy.deepcopy(parent_map) if parent_map is not None else {}
 
     # don't infer mapping if the model has defined it explicitly.
     if parent_map:
@@ -125,7 +108,8 @@ def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]:
 
     # We only check main components instead of whole model submodules
     for child in model.children():
-        child_map = _get_packed_modules_mapping(child)
+        child_map = getattr(child, "packed_modules_mapping", None)
+        child_map = copy.deepcopy(child_map) if child_map is not None else {}
 
         if any((k in parent_map and parent_map[k] != v) for k, v in child_map.items()):
             raise ValueError(

From 4e5c1e3b7280f502639a21c1c969361a8fa76b33 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Sat, 13 Jun 2026 23:04:32 +0000
Subject: [PATCH 30/34] Add `packed_modules_mapping` attrs back to models

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/models/arcee.py        | 5 +++++
 vllm/model_executor/models/baichuan.py     | 4 ++++
 vllm/model_executor/models/commandr.py     | 4 ++++
 vllm/model_executor/models/exaone.py       | 4 ++++
 vllm/model_executor/models/exaone4.py      | 4 ++++
 vllm/model_executor/models/gemma.py        | 4 ++++
 vllm/model_executor/models/gemma2.py       | 4 ++++
 vllm/model_executor/models/glm4.py         | 5 +++++
 vllm/model_executor/models/granite.py      | 4 ++++
 vllm/model_executor/models/hyperclovax.py  | 4 ++++
 vllm/model_executor/models/internlm2.py    | 5 +++++
 vllm/model_executor/models/jais2.py        | 3 +++
 vllm/model_executor/models/llama.py        | 4 ++++
 vllm/model_executor/models/nemotron.py     | 3 +++
 vllm/model_executor/models/nemotron_nas.py | 4 ++++
 vllm/model_executor/models/olmo.py         | 4 ++++
 vllm/model_executor/models/olmo2.py        | 4 ++++
 vllm/model_executor/models/opt.py          | 3 +++
 vllm/model_executor/models/ouro.py         | 4 ++++
 vllm/model_executor/models/phi.py          | 3 +++
 vllm/model_executor/models/qwen2.py        | 5 +++++
 vllm/model_executor/models/qwen2_rm.py     | 5 +++++
 vllm/model_executor/models/qwen3.py        | 4 ++++
 vllm/model_executor/models/rnj1.py         | 4 ++++
 vllm/model_executor/models/seed_oss.py     | 4 ++++
 vllm/model_executor/models/solar.py        | 5 ++++-
 vllm/model_executor/models/step1.py        | 4 ++++
 27 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py
index 844f7ff44342..c004fe793d0e 100644
--- a/vllm/model_executor/models/arcee.py
+++ b/vllm/model_executor/models/arcee.py
@@ -286,6 +286,11 @@ class ArceeForCausalLM(
             ".v_proj": ".qkv_proj.v",
         }
     )
+    # Map fused module names to their submodule components
+    # (for quantization and LoRA)
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+    }
 
     def __init__(self, *, vllm_config, prefix: str = "") -> None:
         super().__init__()
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index 85cb254670e3..d29b72733549 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -350,6 +350,10 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant
             ".up_proj": ".gate_up_proj.1",
         }
     )
+    packed_modules_mapping = {
+        "W_pack": ["W_pack"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
 
     def __init__(
         self,
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 6dae6b4bccda..96c3e4133e21 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -349,6 +349,10 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
             ".up_proj": ".gate_up_proj.1",
         }
     )
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
 
     # LoRA specific attributes
     embedding_modules = {"embed_tokens": "input_embeddings"}
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index d0523459482d..6ef94b099a29 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -377,6 +377,10 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
             ".c_fc_1": ".gate_up_proj.1",
         }
     )
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["c_fc_0", "c_fc_1"],
+    }
 
     # LoRA specific attributes
     embedding_modules = {
diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py
index ce14149acf35..7927eea6ac84 100644
--- a/vllm/model_executor/models/exaone4.py
+++ b/vllm/model_executor/models/exaone4.py
@@ -375,6 +375,10 @@ class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
             ".up_proj": ".gate_up_proj.1",
         }
     )
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
 
     # LoRA specific attributes
     embedding_modules = {
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index f95cdc161482..8c9f85d84e36 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -334,6 +334,10 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
             ".up_proj": ".gate_up_proj.1",
         }
     )
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index fd1d2027297d..334a5603c7fc 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -323,6 +323,10 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
             ".up_proj": ".gate_up_proj.1",
         }
     )
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py
index e7414e799861..3a25f90ad2a0 100644
--- a/vllm/model_executor/models/glm4.py
+++ b/vllm/model_executor/models/glm4.py
@@ -234,6 +234,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
 
 class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index d8625afd39f4..e520b17c3b16 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -334,6 +334,10 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
 
 class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     # LoRA specific attributes
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
     embedding_modules = {
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
diff --git a/vllm/model_executor/models/hyperclovax.py b/vllm/model_executor/models/hyperclovax.py
index 3eff84a4fe1a..7a531ffce1e6 100644
--- a/vllm/model_executor/models/hyperclovax.py
+++ b/vllm/model_executor/models/hyperclovax.py
@@ -384,6 +384,10 @@ class HyperCLOVAXForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
             ".up_proj": ".gate_up_proj.1",
         }
     )
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
 
     # LoRA specific attributes
     embedding_modules = {
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index eb726e48f956..743357e09d62 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -321,6 +321,11 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
 
 
 class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
+    packed_modules_mapping = {
+        "wqkv": ["wqkv"],
+        "gate_up_proj": ["w1", "w3"],
+    }
+
     def __init__(
         self,
         *,
diff --git a/vllm/model_executor/models/jais2.py b/vllm/model_executor/models/jais2.py
index d24337230942..23e0f640e39f 100644
--- a/vllm/model_executor/models/jais2.py
+++ b/vllm/model_executor/models/jais2.py
@@ -371,6 +371,9 @@ class Jais2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
             ".v_proj": ".qkv_proj.v",
         }
     )
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+    }
 
     embedding_modules = {
         "embed_tokens": "input_embeddings",
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index eaec6e42a3dc..a512751db41d 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -446,6 +446,10 @@ class LlamaForCausalLM(
     LocalArgmaxMixin, nn.Module, SupportsLoRA, SupportsPP, SupportsEagle, SupportsEagle3
 ):
     # LoRA specific attributes
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
     embedding_modules = {
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index b62e9f991c4f..da33584bb104 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -369,6 +369,9 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
             ".v_proj": ".qkv_proj.v",
         }
     )
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+    }
 
     # LoRA specific attributes
     embedding_modules = {
diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py
index b37ff8be0701..04044f6477ba 100644
--- a/vllm/model_executor/models/nemotron_nas.py
+++ b/vllm/model_executor/models/nemotron_nas.py
@@ -322,6 +322,10 @@ class DeciLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, HasNoOps):
             ".up_proj": ".gate_up_proj.1",
         }
     )
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
 
     # LoRA specific attributes
     embedding_modules = {
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py
index b084ca727af9..e9eaad16399b 100644
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -315,6 +315,10 @@ class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
             ".up_proj": ".gate_up_proj.1",
         }
     )
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py
index 893b41451bf5..e85541115e00 100644
--- a/vllm/model_executor/models/olmo2.py
+++ b/vllm/model_executor/models/olmo2.py
@@ -357,6 +357,10 @@ class Olmo2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
             ".up_proj": ".gate_up_proj.1",
         }
     )
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index ccb6798dec75..8669688aa74d 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -335,6 +335,9 @@ class OPTForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
             "decoder.": "model.decoder.",
         },
     )
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py
index ebbe2999829e..aacce6300399 100644
--- a/vllm/model_executor/models/ouro.py
+++ b/vllm/model_executor/models/ouro.py
@@ -384,6 +384,10 @@ class OuroForCausalLM(nn.Module, SupportsLoRA):
             ".up_proj": ".gate_up_proj.1",
         }
     )
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index c417c658d2e2..e1c0ed0b1625 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -265,6 +265,9 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
             ".v_proj": ".qkv_proj.v",
         }
     )
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index d5a9e1aab50a..7f76ffd1f6a1 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -440,6 +440,11 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
 class Qwen2ForCausalLM(
     nn.Module, SupportsLoRA, SupportsPP, SupportsEagle, SupportsEagle3
 ):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config.get_text_config()
diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py
index 08c036e1a9b8..008039e296c2 100644
--- a/vllm/model_executor/models/qwen2_rm.py
+++ b/vllm/model_executor/models/qwen2_rm.py
@@ -25,6 +25,11 @@
 
 
 class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+
     is_pooling_model = True
     pooler: Pooler
 
diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py
index 06f721209fd0..3c9517ec9b1c 100644
--- a/vllm/model_executor/models/qwen3.py
+++ b/vllm/model_executor/models/qwen3.py
@@ -267,6 +267,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 class Qwen3ForCausalLM(
     LocalArgmaxMixin, nn.Module, SupportsLoRA, SupportsPP, SupportsEagle, SupportsEagle3
 ):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
     embedding_modules = {
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
diff --git a/vllm/model_executor/models/rnj1.py b/vllm/model_executor/models/rnj1.py
index 37f0f6e1684a..1bea77c87935 100644
--- a/vllm/model_executor/models/rnj1.py
+++ b/vllm/model_executor/models/rnj1.py
@@ -338,6 +338,10 @@ class Rnj1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
             ".up_proj": ".gate_up_proj.1",
         }
     )
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py
index af21f3ba17a9..68d29b6640f6 100644
--- a/vllm/model_executor/models/seed_oss.py
+++ b/vllm/model_executor/models/seed_oss.py
@@ -369,6 +369,10 @@ class SeedOssForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
             ".up_proj": ".gate_up_proj.1",
         }
     )
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index 427d76d91946..07e2aa83c404 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -354,7 +354,10 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
             ".up_proj": ".gate_up_proj.1",
         }
     )
-
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
     # LoRA specific attributes
     embedding_modules = {
         "embed_tokens": "input_embeddings",
diff --git a/vllm/model_executor/models/step1.py b/vllm/model_executor/models/step1.py
index 529b405533ae..eab36640deeb 100644
--- a/vllm/model_executor/models/step1.py
+++ b/vllm/model_executor/models/step1.py
@@ -321,6 +321,10 @@ class Step1ForCausalLM(nn.Module, SupportsPP, SupportsEagle, SupportsEagle3):
             ".up_proj": ".gate_up_proj.1",
         }
     )
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()

From 735d725de9633e17a3263835971975688ac4baff Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Sat, 13 Jun 2026 23:14:52 +0000
Subject: [PATCH 31/34] Revert now unused LoRA things

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/lora/layers/column_parallel_linear.py | 21 ++++++++-------------
 vllm/model_executor/layers/linear.py       |  8 +-------
 2 files changed, 9 insertions(+), 20 deletions(-)

diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py
index 7ec31c31253e..8a86191b8918 100644
--- a/vllm/lora/layers/column_parallel_linear.py
+++ b/vllm/lora/layers/column_parallel_linear.py
@@ -167,10 +167,7 @@ def can_replace_layer(
         if type(source_layer) is maybe_get_oot_by_class(ColumnParallelLinear):
             return True
         if isinstance(source_layer, maybe_get_oot_by_class(MergedColumnParallelLinear)):
-            if (
-                len(packed_modules_list) != 1
-                or source_layer.checkpoint_format == "sharded"
-            ):
+            if len(packed_modules_list) != 1:
                 return False
             # Exclude layers with 3+ output sizes - those are handled by
             # MergedColumnParallelLinearVariableSliceWithLoRA since this
@@ -350,11 +347,7 @@ def can_replace_layer(
         decorate: bool = True,
     ) -> bool:
         merged_cls = maybe_get_oot_by_class(MergedColumnParallelLinear)
-        if (
-            not isinstance(source_layer, merged_cls)
-            or len(source_layer.output_sizes) != 2
-            or source_layer.checkpoint_format == "fused"
-        ):
+        if not isinstance(source_layer, merged_cls) or len(packed_modules_list) != 2:
             return False
 
         tp_size = getattr(source_layer, "tp_size", 1)
@@ -429,8 +422,9 @@ def can_replace_layer(
         packed_modules_list: list,
         model_config: PretrainedConfig | None = None,
     ) -> bool:
-        return type(source_layer) is maybe_get_oot_by_class(QKVParallelLinear) and (
-            len(packed_modules_list) == 1 or source_layer.checkpoint_format == "fused"
+        return (
+            type(source_layer) is maybe_get_oot_by_class(QKVParallelLinear)
+            and len(packed_modules_list) == 1
         )
 
 
@@ -489,8 +483,9 @@ def can_replace_layer(
         packed_modules_list: list,
         model_config: PretrainedConfig | None = None,
     ) -> bool:
-        return type(source_layer) is maybe_get_oot_by_class(QKVParallelLinear) and (
-            len(packed_modules_list) == 3 or source_layer.checkpoint_format == "sharded"
+        return (
+            type(source_layer) is maybe_get_oot_by_class(QKVParallelLinear)
+            and len(packed_modules_list) == 3
         )
 
 
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 916c15ca3c1f..e78d8de2b2ce 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -4,7 +4,6 @@
 import itertools
 from abc import abstractmethod
 from collections.abc import Iterable
-from typing import Literal
 
 import torch
 from torch.nn.parameter import Parameter
@@ -619,8 +618,8 @@ def __init__(
         self.output_sizes = output_sizes
         self.tp_size = get_tensor_model_parallel_world_size() if not disable_tp else 1
         self.tp_rank = get_tensor_model_parallel_rank() if not disable_tp else 0
+
         assert all(output_size % self.tp_size == 0 for output_size in output_sizes)
-        self.checkpoint_format: Literal["fused", "sharded"] | None = None
         super().__init__(
             input_size=input_size,
             output_size=sum(output_sizes),
@@ -920,7 +919,6 @@ def load_weights(
                 # Checkpoint is sharded
                 shard_id_str, _, name = name.partition(".")
                 shard_id = int(shard_id_str)
-                self.checkpoint_format = "sharded"
                 logger.debug(
                     "Loaded shard %s into %s for layer %s.%s",
                     shard_id,
@@ -930,7 +928,6 @@ def load_weights(
                 )
             else:
                 shard_id = None
-                self.checkpoint_format = "fused"
                 logger.debug(
                     "Loaded weight %s.%s with shape %s",
                     self.prefix,
@@ -1021,7 +1018,6 @@ def __init__(
             self.num_kv_heads * self.head_size * tp_size,  # k_proj
             self.num_kv_heads * self.v_head_size * tp_size,  # v_proj
         ]
-        self.checkpoint_format: Literal["fused", "sharded"] | None = None
 
         super().__init__(
             input_size=input_size,
@@ -1349,7 +1345,6 @@ def load_weights(
                 # Checkpoint is sharded
                 shard_id, _, name = name.partition(".")
                 self.validate_shard_id(shard_id)
-                self.checkpoint_format = "sharded"
                 logger.debug(
                     "Loaded shard %s into %s for layer %s.%s",
                     shard_id,
@@ -1360,7 +1355,6 @@ def load_weights(
             else:
                 # Checkpoint is fused
                 shard_id = None
-                self.checkpoint_format = "fused"
                 logger.debug(
                     "Loaded weight %s.%s with shape %s",
                     self.prefix,

From a7a65a9729dc56e888e723ddd6c8bd28203e8299 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Sat, 13 Jun 2026 23:15:53 +0000
Subject: [PATCH 32/34] Revert lora test

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 tests/lora/test_lora_huggingface.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py
index 53253278ad80..7c7f4eb4b626 100644
--- a/tests/lora/test_lora_huggingface.py
+++ b/tests/lora/test_lora_huggingface.py
@@ -6,7 +6,7 @@
 from vllm.lora.lora_model import LoRAModel
 from vllm.lora.peft_helper import PEFTHelper
 from vllm.lora.utils import get_adapter_absolute_path
-from vllm.model_executor.models.llama import LlamaForCausalLM
+from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM
 
 # Provide absolute path and huggingface lora ids
 lora_fixture_name = ["llama32_lora_files", "llama32_lora_huggingface_id"]
@@ -23,7 +23,7 @@
 @pytest.mark.parametrize("lora_fixture_name", lora_fixture_name)
 def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
     lora_name = request.getfixturevalue(lora_fixture_name)
-    packed_modules_mapping = LlamaForCausalLM.packed_modules_mapping
+    packed_modules_mapping = Qwen3ForCausalLM.packed_modules_mapping
 
     expected_lora_lst: list[str] = []
     for module in LLAMA_LORA_MODULES:

From 2aadacec3b806a6e4061afc3fbd526a0f27452da Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Sat, 13 Jun 2026 23:30:31 +0000
Subject: [PATCH 33/34] tweak diff

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/models/commandr.py | 1 -
 vllm/model_executor/models/qwen2_rm.py | 6 +++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 96c3e4133e21..2880a2c22103 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -353,7 +353,6 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
     }
-
     # LoRA specific attributes
     embedding_modules = {"embed_tokens": "input_embeddings"}
     # ModelOpt NVFP4 checkpoints carry raw quantizer-module state
diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py
index 008039e296c2..47184173d5a2 100644
--- a/vllm/model_executor/models/qwen2_rm.py
+++ b/vllm/model_executor/models/qwen2_rm.py
@@ -25,14 +25,14 @@
 
 
 class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP):
+    is_pooling_model = True
+    pooler: Pooler
+
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
     }
 
-    is_pooling_model = True
-    pooler: Pooler
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config

From ed3b43cade10c2de657cb16ea8e7d901dd73d5b6 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Sat, 13 Jun 2026 23:33:22 +0000
Subject: [PATCH 34/34] More accurate comment

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/layers/linear.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index e78d8de2b2ce..0ad5702f35df 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -934,7 +934,7 @@ def load_weights(
                     name,
                     loaded_weight.shape,
                 )
-            # If name is "bias" get it from self, otherwise load into self
+            # Use the attr called `name` as the target; fall back to self if absent
             param: Parameter = getattr(self, name, self)
             if (
                 param is None
@@ -1361,7 +1361,7 @@ def load_weights(
                     name,
                     loaded_weight.shape,
                 )
-            # If name is "bias" get it from self, otherwise load into self
+            # Use the attr called `name` as the target; fall back to self if absent
             param: Parameter = getattr(self, name, self)
             if (
                 param is None