From 88beb3b85df2545259aef3b892997605672db2a5 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 4 Jun 2026 23:42:12 +0000 Subject: [PATCH 01/34] Enable fused linear layers to load themselves Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/layers/linear.py | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index e50a0e6b0025..62f978489f54 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -3,6 +3,7 @@ import itertools from abc import abstractmethod +from collections.abc import Iterable import torch from torch.nn.parameter import Parameter, UninitializedParameter @@ -968,6 +969,21 @@ def weight_loader_v2( tp_rank=self.tp_rank, ) + def load_weights( + self, weights: Iterable[tuple[str, torch.Tensor]] + ) -> Iterable[str]: + for name, loaded_weight in weights: + shard_id_str, _, param_name = name.partition(".") + # If the shard_id is not an integer, the weight is not sharded + try: + shard_id = int(shard_id_str) + except ValueError: + shard_id = None + # If param_name is "bias" get it from self, otherwise load into self + param: Parameter = getattr(self, param_name, self) + param.weight_loader(param, loaded_weight, shard_id) + yield param_name + class QKVParallelLinear(ColumnParallelLinear): """Linear layers for the attention's QKV transformation. @@ -1383,6 +1399,22 @@ def weight_loader( assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) + def load_weights( + self, weights: Iterable[tuple[str, torch.Tensor]] + ) -> Iterable[str]: + for name, loaded_weight in weights: + shard_id_str, _, param_name = name.partition(".") + # If the shard_id is not valid, the weight is not sharded + try: + self.validate_shard_id(shard_id_str) + shard_id = shard_id_str + except ValueError: + shard_id = None + # If param_name is "bias" get it from self, otherwise load into self + param: Parameter = getattr(self, param_name, self) + param.weight_loader(param, loaded_weight, shard_id) + yield param_name + # --8<-- [start:row_parallel_linear] @PluggableLayer.register("row_parallel_linear") From 41f4584b00b019160dda51d9ea97d419d6f3cb8b Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 4 Jun 2026 23:42:50 +0000 Subject: [PATCH 02/34] Enable GPTQ extra bias skipping in AutoWeightsLoader Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../model_executor/models/transformers/base.py | 3 --- vllm/model_executor/models/utils.py | 18 ++++++++---------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py index 35897ce7dbca..e26ef8f4b0b3 100644 --- a/vllm/model_executor/models/transformers/base.py +++ b/vllm/model_executor/models/transformers/base.py @@ -154,9 +154,6 @@ def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""): "Transformers modeling backend does " "not support MXFP4 quantization yet." ) - # Skip loading extra bias for GPTQ models. - if "gptq" in quant_method_name: - self.ignore_unexpected_suffixes.append(".bias") self._patch_config() from_config_kwargs = dict( diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 83d113415dce..6a23d95e138b 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -348,16 +348,14 @@ def load_weights( # We look at the causal model's direct children for this reason. modules = (self.module, *self.module.children()) iterator = (m.quant_config for m in modules if hasattr(m, "quant_config")) - quant_config = next(iterator, None) - cache_scale_mapper = ( - quant_config.get_cache_scale_mapper() if quant_config is not None else None - ) - if cache_scale_mapper is not None: - mapper = ( - mapper | cache_scale_mapper - if mapper is not None - else cache_scale_mapper - ) + if quant_config := next(iterator, None): + # Skip loading extra bias for GPTQ models + if "gptq" in quant_config.get_name(): + self.ignore_unexpected_suffixes.append(".bias") + # Get mappings for KV cache quantization scales + if cache_scale_mapper := quant_config.get_cache_scale_mapper(): + mapper = mapper or WeightsMapper() + mapper |= cache_scale_mapper if mapper is not None: weights = mapper.apply(weights) # filter out weights with first-prefix/substr to skip in name From fcd151b147c926e4eeb2ada5e9ce690a7daaf278 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 4 Jun 2026 23:44:31 +0000 Subject: [PATCH 03/34] Try it on a couple of simple weight loaders Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/olmo.py | 52 +++++++----------------------- vllm/model_executor/models/opt.py | 44 ++++--------------------- 2 files changed, 18 insertions(+), 78 deletions(-) diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 4491a6a3ea1b..73645c2b72b4 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -48,13 +48,12 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP from .utils import ( AutoWeightsLoader, - is_pp_missing_parameter, + WeightsMapper, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -300,43 +299,6 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA): """ @@ -355,6 +317,16 @@ class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA): ], } + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + ".gate_proj": ".gate_up_proj.0", + ".up_proj": ".gate_up_proj.1", + } + ) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config @@ -409,4 +381,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: ["lm_head.weight"] if self.config.tie_word_embeddings else None ), ) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 81653b9516ac..80e05836b07e 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -44,14 +44,12 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP from .utils import ( AutoWeightsLoader, WeightsMapper, - is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -325,41 +323,6 @@ def forward( input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds ) - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ] - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class OPTForCausalLM(nn.Module, SupportsPP, SupportsLoRA): packed_modules_mapping = { @@ -367,9 +330,14 @@ class OPTForCausalLM(nn.Module, SupportsPP, SupportsLoRA): } hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + }, orig_to_new_prefix={ "decoder.": "model.decoder.", - } + }, ) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): From a9788ab71eb33f37b849129b1090131f27e8a1a5 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 5 Jun 2026 10:39:22 +0000 Subject: [PATCH 04/34] Fix LoRA loading for these two models Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/lora/layers/column_parallel_linear.py | 10 ++++------ vllm/lora/utils.py | 23 ++++++++++++++++------ vllm/model_executor/layers/linear.py | 8 +++++++- vllm/model_executor/models/olmo.py | 12 ----------- vllm/model_executor/models/opt.py | 4 ---- 5 files changed, 28 insertions(+), 29 deletions(-) diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py index 8a86191b8918..f9643809c7f4 100644 --- a/vllm/lora/layers/column_parallel_linear.py +++ b/vllm/lora/layers/column_parallel_linear.py @@ -422,9 +422,8 @@ def can_replace_layer( packed_modules_list: list, model_config: PretrainedConfig | None = None, ) -> bool: - return ( - type(source_layer) is maybe_get_oot_by_class(QKVParallelLinear) - and len(packed_modules_list) == 1 + return type(source_layer) is maybe_get_oot_by_class(QKVParallelLinear) and ( + len(packed_modules_list) == 1 or source_layer.checkpoint_format == "fused" ) @@ -483,9 +482,8 @@ def can_replace_layer( packed_modules_list: list, model_config: PretrainedConfig | None = None, ) -> bool: - return ( - type(source_layer) is maybe_get_oot_by_class(QKVParallelLinear) - and len(packed_modules_list) == 3 + return type(source_layer) is maybe_get_oot_by_class(QKVParallelLinear) and ( + len(packed_modules_list) == 3 or source_layer.checkpoint_format == "sharded" ) diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index d5c9a1a6ff8a..bd23fb69214b 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -34,7 +34,11 @@ VocabParallelEmbeddingWithLoRA, ) from vllm.model_executor.layers.fused_moe import FusedMoE -from vllm.model_executor.layers.linear import LinearBase +from vllm.model_executor.layers.linear import ( + LinearBase, + MergedColumnParallelLinear, + QKVParallelLinear, +) from vllm.model_executor.utils import get_moe_expert_mapping, get_packed_modules_mapping from vllm.transformers_utils.repo_utils import hf_api @@ -219,11 +223,18 @@ def get_supported_lora_modules(model: nn.Module) -> list[str]: for name in embedding_modules: supported_lora_modules.add(name) - # get all the linear subfixes. - if isinstance(module, (LinearBase,)): - supported_lora_modules.add(name.split(".")[-1]) - - if isinstance(module, (FusedMoE,)): + if ( + isinstance(module, QKVParallelLinear) + and module.checkpoint_format == "sharded" + ): + supported_lora_modules.update(["q", "k", "v"]) + elif ( + isinstance(module, MergedColumnParallelLinear) + and module.checkpoint_format == "sharded" + ): + shard_ids = [str(i) for i in range(len(module.output_sizes))] + supported_lora_modules.update(shard_ids) + elif isinstance(module, (LinearBase, FusedMoE)): supported_lora_modules.add(name.split(".")[-1]) return list(supported_lora_modules) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 62f978489f54..40e98ac34ad6 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -4,6 +4,7 @@ import itertools from abc import abstractmethod from collections.abc import Iterable +from typing import Literal import torch from torch.nn.parameter import Parameter, UninitializedParameter @@ -645,8 +646,8 @@ def __init__( self.output_sizes = output_sizes self.tp_size = get_tensor_model_parallel_world_size() if not disable_tp else 1 self.tp_rank = get_tensor_model_parallel_rank() if not disable_tp else 0 - assert all(output_size % self.tp_size == 0 for output_size in output_sizes) + self.checkpoint_format: Literal["sharded", "fused"] | None = None super().__init__( input_size=input_size, output_size=sum(output_sizes), @@ -977,8 +978,10 @@ def load_weights( # If the shard_id is not an integer, the weight is not sharded try: shard_id = int(shard_id_str) + self.checkpoint_format = "sharded" except ValueError: shard_id = None + self.checkpoint_format = "fused" # If param_name is "bias" get it from self, otherwise load into self param: Parameter = getattr(self, param_name, self) param.weight_loader(param, loaded_weight, shard_id) @@ -1056,6 +1059,7 @@ def __init__( self.num_kv_heads * self.head_size * tp_size, # k_proj self.num_kv_heads * self.v_head_size * tp_size, # v_proj ] + self.checkpoint_format: Literal["fused", "sharded"] | None = None super().__init__( input_size=input_size, @@ -1408,8 +1412,10 @@ def load_weights( try: self.validate_shard_id(shard_id_str) shard_id = shard_id_str + self.checkpoint_format = "sharded" except ValueError: shard_id = None + self.checkpoint_format = "fused" # If param_name is "bias" get it from self, otherwise load into self param: Parameter = getattr(self, param_name, self) param.weight_loader(param, loaded_weight, shard_id) diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 73645c2b72b4..297f39726a1e 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -305,18 +305,6 @@ class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA): Extremely barebones HF model wrapper. """ - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - hf_to_vllm_mapper = WeightsMapper( orig_to_new_substr={ ".q_proj": ".qkv_proj.q", diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 80e05836b07e..ccb6798dec75 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -325,10 +325,6 @@ def forward( class OPTForCausalLM(nn.Module, SupportsPP, SupportsLoRA): - packed_modules_mapping = { - "qkv_proj": ["q_proj", "k_proj", "v_proj"], - } - hf_to_vllm_mapper = WeightsMapper( orig_to_new_substr={ ".q_proj": ".qkv_proj.q", From cca665e8984fbbcd4c923939dde073b4192bf7a6 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 5 Jun 2026 13:13:48 +0000 Subject: [PATCH 05/34] Delete some more load_weights methods Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/baichuan.py | 54 ++++----------------- vllm/model_executor/models/gemma.py | 63 +++++-------------------- vllm/model_executor/models/internlm2.py | 51 ++++---------------- vllm/model_executor/models/olmo2.py | 61 +++++------------------- vllm/model_executor/models/orion.py | 52 +++++--------------- vllm/model_executor/models/phi.py | 60 ++++------------------- vllm/model_executor/models/qwen.py | 60 ++++++----------------- vllm/model_executor/models/qwen_vl.py | 8 ---- vllm/model_executor/models/stablelm.py | 52 +++++--------------- vllm/model_executor/models/step1.py | 56 +++++----------------- 10 files changed, 96 insertions(+), 421 deletions(-) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index bc1cd2ed811b..85cb254670e3 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -60,7 +60,7 @@ from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant from .utils import ( AutoWeightsLoader, - is_pp_missing_parameter, + WeightsMapper, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -342,52 +342,14 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant): - packed_modules_mapping = { - "W_pack": ["W_pack"], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".gate_proj": ".gate_up_proj.0", + ".up_proj": ".gate_up_proj.1", + } + ) def __init__( self, @@ -447,7 +409,7 @@ def compute_logits( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) def lm_head_weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): # Unlike Baichuan, Baichuan2 normalizes the head weights. diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 6e35020a6eac..64808e95ae2b 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -42,13 +42,12 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding -from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP from .utils import ( AutoWeightsLoader, - is_pp_missing_parameter, + WeightsMapper, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -324,57 +323,17 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - for param_name, shard_name, shard_id in stacked_params_mapping: - if shard_name not in name: - continue - name = name.replace(shard_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - - return loaded_params - class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + ".gate_proj": ".gate_up_proj.0", + ".up_proj": ".gate_up_proj.1", + } + ) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -421,4 +380,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: self, skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), ) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 6b1712ede320..4010dd54d7d4 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -35,7 +35,6 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -43,7 +42,7 @@ from .utils import ( AutoWeightsLoader, StageMissingLayer, - is_pp_missing_parameter, + WeightsMapper, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -309,48 +308,14 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("gate_up_proj", "w1", 0), - ("gate_up_proj", "w3", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): - packed_modules_mapping = { - "wqkv": ["wqkv"], - "gate_up_proj": ["w1", "w3"], - } + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".w1": ".gate_up_proj.0", + ".w3": ".gate_up_proj.1", + } + ) def __init__( self, @@ -409,7 +374,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: self, skip_prefixes=(["output."] if self.config.tie_word_embeddings else None), ) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) @default_pooling_type(tok_pooling_type="ALL") diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 212140fe15ea..5f3b1c9a839c 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -52,12 +52,11 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP from vllm.model_executor.models.utils import ( AutoWeightsLoader, + WeightsMapper, extract_layer_index, - is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -342,59 +341,21 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if is_pp_missing_parameter(name, self): - continue - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader # type: ignore - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class Olmo2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): """ Extremely barebones HF model wrapper. """ - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + ".gate_proj": ".gate_up_proj.0", + ".up_proj": ".gate_up_proj.1", + } + ) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -450,4 +411,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): ["lm_head.weight"] if self.config.tie_word_embeddings else None ), ) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 3cacb9d61cd5..52addf4cef97 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -32,13 +32,12 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP from .utils import ( AutoWeightsLoader, - is_pp_missing_parameter, + WeightsMapper, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -277,45 +276,18 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class OrionForCausalLM(nn.Module, SupportsPP): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + ".gate_proj": ".gate_up_proj.0", + ".up_proj": ".gate_up_proj.1", + } + ) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config @@ -362,4 +334,4 @@ def compute_logits( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 75c42c0d3930..c417c658d2e2 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -62,13 +62,12 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP from .utils import ( AutoWeightsLoader, - is_pp_missing_parameter, + WeightsMapper, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -257,56 +256,15 @@ def forward( return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # pylint: disable=E1136 - - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ] - } + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + } + ) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -360,4 +318,4 @@ def compute_logits( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index b4526beac637..f5fa9e7e1d24 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -34,12 +34,12 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP from .utils import ( - is_pp_missing_parameter, + AutoWeightsLoader, + WeightsMapper, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -263,6 +263,13 @@ def forward( class QWenBaseModel(nn.Module): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".w2": ".gate_up_proj.0", + ".w1": ".gate_up_proj.1", + } + ) + def __init__( self, *, @@ -304,53 +311,14 @@ def compute_logits( return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("gate_up_proj", "w2", 0), - ("gate_up_proj", "w1", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Skip layers on other devices. - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Skip layers on other devices. - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), + ) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) class QWenLMHeadModel(QWenBaseModel, SupportsPP, SupportsLoRA): - packed_modules_mapping = { - "c_attn": ["c_attn"], - "gate_up_proj": [ - "w2", - "w1", - ], - } - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config if hasattr(config, "visual"): diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index e2232956ea86..9d3980af76a2 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -586,14 +586,6 @@ def _get_prompt_updates( class QwenVLForConditionalGeneration( QWenBaseModel, SupportsPP, SupportsLoRA, SupportsMultiModal ): - packed_modules_mapping = { - "c_attn": ["c_attn"], - "gate_up_proj": [ - "w2", - "w1", - ], - } - embed_input_ids = SupportsMultiModal.embed_input_ids def get_mm_mapping(self) -> MultiModelKeys: diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 034c9c18ff7b..17349767b94c 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -45,13 +45,12 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP from .utils import ( AutoWeightsLoader, - is_pp_missing_parameter, + WeightsMapper, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -266,45 +265,18 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class StablelmForCausalLM(nn.Module, SupportsPP): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + ".gate_proj": ".gate_up_proj.0", + ".up_proj": ".gate_up_proj.1", + } + ) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config @@ -351,4 +323,4 @@ def compute_logits( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/step1.py b/vllm/model_executor/models/step1.py index 07653fa6b377..529b405533ae 100644 --- a/vllm/model_executor/models/step1.py +++ b/vllm/model_executor/models/step1.py @@ -30,7 +30,6 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import ( EagleModelMixin, SupportsEagle, @@ -40,7 +39,7 @@ from vllm.model_executor.models.utils import ( AutoWeightsLoader, PPMissingLayer, - is_pp_missing_parameter, + WeightsMapper, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -48,11 +47,6 @@ from vllm.sequence import IntermediateTensors from vllm.v1.attention.backend import AttentionType -STEP_PACKED_MODULES_MAPPING = { - "qkv_proj": ["q_proj", "k_proj", "v_proj"], - "gate_up_proj": ["gate_proj", "up_proj"], -} - def _get_step_alibi_slopes(total_num_heads: int) -> torch.Tensor: """Reference ALiBi slopes used by Step models.""" @@ -242,42 +236,6 @@ def forward( hidden_states = self.mlp(hidden_states) return hidden_states, residual - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - (".qkv_proj", ".q_proj", "q"), - (".qkv_proj", ".k_proj", "k"), - (".qkv_proj", ".v_proj", "v"), - (".gate_up_proj", ".gate_proj", 0), - (".gate_up_proj", ".up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) # type: ignore[name-defined] - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class StepDecoderModel(nn.Module, EagleModelMixin): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -354,7 +312,15 @@ def forward( class Step1ForCausalLM(nn.Module, SupportsPP, SupportsEagle, SupportsEagle3): - packed_modules_mapping = STEP_PACKED_MODULES_MAPPING + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + ".gate_proj": ".gate_up_proj.0", + ".up_proj": ".gate_up_proj.1", + } + ) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -413,4 +379,4 @@ def compute_logits( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) From e316238295637f433ce8d522afdffb91b39a5197 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 5 Jun 2026 14:57:07 +0000 Subject: [PATCH 06/34] Add patterns from `maybe_remap_kv_scale_name` to `QuantizationConfig.get_cache_scale_mapper` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/model_executor/test_weight_utils.py | 121 ++++++++++++++++++ .../layers/quantization/base_config.py | 51 +++++++- .../model_executor/layers/quantization/fp8.py | 19 +-- .../layers/quantization/quark/quark.py | 19 +-- 4 files changed, 187 insertions(+), 23 deletions(-) diff --git a/tests/model_executor/test_weight_utils.py b/tests/model_executor/test_weight_utils.py index 260ebdcefb3b..9e67609b78e4 100644 --- a/tests/model_executor/test_weight_utils.py +++ b/tests/model_executor/test_weight_utils.py @@ -160,5 +160,126 @@ def test_missing_target_returns_none(self): assert result is None +class TestKvCacheScaleMapper: + """The `WeightsMapper` returned by `get_cache_scale_mapper` replaces the + per-model `maybe_remap_kv_scale_name` calls. It must remap the same set of + checkpoint formats (the non-`params_dict`-dependent ones) and be idempotent + so it composes safely with a model's own qkv/gate_up `hf_to_vllm_mapper`.""" + + def _mapper(self): + # `get_cache_scale_mapper` does not use `self`; call it on the base + # class to get the default (non-config-specific) mapper. + from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, + ) + + return QuantizationConfig.get_cache_scale_mapper() + + def _map(self, name: str) -> str | None: + return self._mapper()._map_name(name) + + @pytest.mark.parametrize( + "name,expected", + [ + # Qwen3-MoE / llm-compressor fused qkv_proj + ( + "model.layers.0.self_attn.qkv_proj.k_scale", + "model.layers.0.self_attn.attn.k_scale", + ), + ( + "model.layers.0.self_attn.qkv_proj.v_scale", + "model.layers.0.self_attn.attn.v_scale", + ), + # ModelOpt / NVFP4 k_proj/v_proj + ( + "model.layers.0.self_attn.k_proj.k_scale", + "model.layers.0.self_attn.attn.k_scale", + ), + ( + "model.layers.0.self_attn.v_proj.v_scale", + "model.layers.0.self_attn.attn.v_scale", + ), + # deprecated fused kv_scale and bare scales + ( + "model.layers.0.self_attn.kv_scale", + "model.layers.0.self_attn.attn.k_scale", + ), + ( + "model.layers.0.self_attn.k_scale", + "model.layers.0.self_attn.attn.k_scale", + ), + # NemotronH mixer + ( + "model.layers.0.mixer.k_proj.k_scale", + "model.layers.0.mixer.attn.k_scale", + ), + # already in vLLM form -> unchanged (idempotent) + ( + "model.layers.0.self_attn.attn.k_scale", + "model.layers.0.self_attn.attn.k_scale", + ), + # non-kv scales must not be touched + ( + "model.layers.0.self_attn.k_proj.weight_scale", + "model.layers.0.self_attn.k_proj.weight_scale", + ), + ( + "model.layers.0.self_attn.k_proj.input_scale", + "model.layers.0.self_attn.k_proj.input_scale", + ), + # regular weights untouched + ( + "model.layers.0.self_attn.q_proj.weight", + "model.layers.0.self_attn.q_proj.weight", + ), + ], + ) + def test_remap(self, name, expected): + assert self._map(name) == expected + + @pytest.mark.parametrize( + "name", + [ + "model.layers.0.self_attn.k_scale", + "model.layers.0.self_attn.k_proj.k_scale", + "model.layers.0.self_attn.qkv_proj.v_scale", + "model.layers.0.mixer.k_proj.k_scale", + ], + ) + def test_idempotent(self, name): + once = self._map(name) + assert once is not None + assert self._map(once) == once + + def test_composes_with_qkv_mapper(self): + """Applied together with a model's qkv/gate_up mapper, the regex scale + rules run before the substr rename, so scales are normalized to `.attn.` + and regular projections are still fused correctly.""" + from vllm.model_executor.models.utils import WeightsMapper + + model_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + } + ) + # AutoWeightsLoader does `mapper |= cache_scale_mapper` + combined = model_mapper | self._mapper() + + assert ( + combined._map_name("model.layers.0.self_attn.q_proj.weight") + == "model.layers.0.self_attn.qkv_proj.q.weight" + ) + assert ( + combined._map_name("model.layers.0.self_attn.k_proj.k_scale") + == "model.layers.0.self_attn.attn.k_scale" + ) + assert ( + combined._map_name("model.layers.0.self_attn.k_scale") + == "model.layers.0.self_attn.attn.k_scale" + ) + + if __name__ == "__main__": test_download_weights_from_hf() diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 3c03ff2233b0..141fcf1b4113 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -5,6 +5,7 @@ from abc import ABC, abstractmethod from typing import TYPE_CHECKING, Any +import regex as re import torch from torch import nn from transformers import PretrainedConfig @@ -19,10 +20,12 @@ class QuantizeMethodBase(ABC): """Base class for different quantized methods.""" - # Whether this method creates weights on meta device for online quantization. - # When True, weights are created on meta device and quantized layer-wise - # in process_weights_after_loading, reducing peak memory during loading. uses_meta_device: bool = False + """ + Whether this method creates weights on meta device for online quantization. + When True, weights are created on meta device and quantized layer-wise + in process_weights_after_loading, reducing peak memory during loading. + """ @abstractmethod def create_weights( @@ -70,6 +73,18 @@ def method_has_implemented_embedding(method_class: type[QuantizeMethodBase]) -> class QuantizationConfig(ABC): """Base class for quantization configs.""" + _ignore_unexpected_suffixes = ( + ".q_scale", + ".k_scale", + ".v_scale", + ".q_zero_point", + ".k_zero_point", + ".v_zero_point", + ) + """Suffixes of quantization parameters that may be present in the checkpoint but + not in the model, and should be ignored if unexpected during loading. These are used + after remapping, so should be in vLLM format (e.g. .q_scale, not .q.scale).""" + def __init__(self): super().__init__() # mapping is updated by models as they initialize @@ -162,14 +177,40 @@ def get_quant_method( """ raise NotImplementedError - def get_cache_scale_mapper(self) -> "WeightsMapper | None": + @staticmethod + def get_cache_scale_mapper() -> "WeightsMapper": """Mapping from checkpoint KV-cache scale names to vLLM scale names. Returning a mapper here causes `AutoWeightsLoader` to apply it to the weight stream automatically; individual model `load_weights` methods do not need to know about KV-cache scales. """ - return None + from vllm.model_executor.models.utils import WeightsMapper + + orig_to_new_regex = { + # Deprecated fused kv_scale -> attn.k_scale + re.compile(r"\.kv_scale$"): r".attn.k_scale", + # ModelOpt: .self_attn.{k,v}_proj.{k,v}_scale -> .self_attn.attn.* + re.compile(r"\.self_attn\.[kv]_proj\.([kv])_scale$"): ( + r".self_attn.attn.\1_scale" + ), + # Fused QKV / qkqkv proj: .self_attn.qk(qk)v_proj.{k,v}_scale -> attn + re.compile(r"\.self_attn\.qk(?:qk)?v_proj\.([kv])_scale$"): ( + r".self_attn.attn.\1_scale" + ), + # NemotronH: .mixer.{k,v}_proj.{k,v}_scale -> .mixer.attn.* + re.compile(r"\.mixer\.[kv]_proj\.([kv])_scale$"): r".mixer.attn.\1_scale", + # HYV3: .self_attn.q.scale -> .self_attn.attn.q_scale + re.compile(r"\.self_attn\.q\.scale$"): r".self_attn.attn.q_scale", + # HYV3: .self_attn.{k,v}_cache.scale -> .self_attn.attn.{k,v}_scale + re.compile(r"\.self_attn\.([kv])_cache\.scale$"): ( + r".self_attn.attn.\1_scale" + ), + # Default: .{q,k,v}_scale -> .attn.{q,k,v}_scale (unless already .attn) + re.compile(r"(? "WeightsMapper": + @staticmethod + def get_cache_scale_mapper() -> "WeightsMapper": """Map compressed-tensors KV-cache scale names to vLLM names.""" from vllm.model_executor.models.utils import WeightsMapper - return WeightsMapper( - orig_to_new_suffix={ - ".k_proj.output_scale": ".attn.k_scale", - ".v_proj.output_scale": ".attn.v_scale", - ".q_proj.output_scale": ".attn.q_scale", - ".self_attn.prob_output_scale": ".self_attn.attn.prob_scale", - } - ) + orig_to_new_suffix = { + ".k_proj.output_scale": ".attn.k_scale", + ".v_proj.output_scale": ".attn.v_scale", + ".q_proj.output_scale": ".attn.q_scale", + ".self_attn.prob_output_scale": ".self_attn.attn.prob_scale", + } + cache_scale_mapper = WeightsMapper(orig_to_new_suffix=orig_to_new_suffix) + return cache_scale_mapper | QuantizationConfig.get_cache_scale_mapper() class CopyNumelCounter(TorchDispatchMode): diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index 54dea48973b0..c888e523b20d 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -646,16 +646,17 @@ def get_scheme( return scheme - def get_cache_scale_mapper(self) -> "WeightsMapper": + @staticmethod + def get_cache_scale_mapper() -> "WeightsMapper": """Map Quark KV-cache scale names to vLLM names.""" - return WeightsMapper( - orig_to_new_suffix={ - ".k_proj.output_scale": ".attn.k_scale", - ".v_proj.output_scale": ".attn.v_scale", - ".q_proj.output_scale": ".attn.q_scale", - ".self_attn.prob_output_scale": ".self_attn.attn.prob_scale", - } - ) + orig_to_new_suffix = { + ".k_proj.output_scale": ".attn.k_scale", + ".v_proj.output_scale": ".attn.v_scale", + ".q_proj.output_scale": ".attn.q_scale", + ".self_attn.prob_output_scale": ".self_attn.attn.prob_scale", + } + cache_scale_mapper = WeightsMapper(orig_to_new_suffix=orig_to_new_suffix) + return cache_scale_mapper | QuantizationConfig.get_cache_scale_mapper() class QuarkLinearMethod(LinearMethodBase): From 665ca0c5879d7c7bc616d87d8872a89d1b6a6add Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 5 Jun 2026 14:59:28 +0000 Subject: [PATCH 07/34] Use new mappings in `AutoWeightsLoader` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/utils.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 6a23d95e138b..f0f722299d78 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -19,9 +19,7 @@ get_tensor_model_parallel_world_size, ) from vllm.logger import init_logger -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig, -) +from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.model_loader.reload import ( support_quantized_model_reload_from_hp_weights, ) @@ -59,6 +57,16 @@ def __or__(self, other: "WeightsMapper") -> "WeightsMapper": ) def _map_name(self, key: str) -> str | None: + # Deprecation warnings + if key.endswith(".kv_scale"): + logger.warning_once( + "DEPRECATED. Found kv_scale in the checkpoint. " + "This format is deprecated in favor of separate k_scale and " + "v_scale tensors and will be removed in a future release. " + "Functionally, we will remap kv_scale to k_scale and duplicate " + "k_scale to v_scale" + ) + for pattern, new_key in self.orig_to_new_regex.items(): if pattern.search(key): if new_key is None: @@ -353,9 +361,11 @@ def load_weights( if "gptq" in quant_config.get_name(): self.ignore_unexpected_suffixes.append(".bias") # Get mappings for KV cache quantization scales - if cache_scale_mapper := quant_config.get_cache_scale_mapper(): - mapper = mapper or WeightsMapper() - mapper |= cache_scale_mapper + mapper = mapper or WeightsMapper() + mapper |= quant_config.get_cache_scale_mapper() + self.ignore_unexpected_suffixes.extend( + quant_config._ignore_unexpected_suffixes + ) if mapper is not None: weights = mapper.apply(weights) # filter out weights with first-prefix/substr to skip in name From edf67e6bfe40b811d27c14fcdfc571b7ba20c433 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 5 Jun 2026 15:08:58 +0000 Subject: [PATCH 08/34] Remove some more load_weights methods Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/arcee.py | 81 +++------------------ vllm/model_executor/models/commandr.py | 69 +++--------------- vllm/model_executor/models/exaone.py | 80 +++----------------- vllm/model_executor/models/exaone4.py | 80 +++----------------- vllm/model_executor/models/gemma2.py | 70 +++--------------- vllm/model_executor/models/glm4.py | 83 +++------------------ vllm/model_executor/models/hyperclovax.py | 81 +++------------------ vllm/model_executor/models/jais2.py | 71 +++--------------- vllm/model_executor/models/nemotron.py | 66 +++-------------- vllm/model_executor/models/nemotron_nas.py | 73 +++---------------- vllm/model_executor/models/ouro.py | 74 +++---------------- vllm/model_executor/models/rnj1.py | 85 +++------------------- vllm/model_executor/models/seed_oss.py | 71 +++--------------- vllm/model_executor/models/solar.py | 74 +++---------------- vllm/model_executor/models/starcoder2.py | 49 +++---------- 15 files changed, 159 insertions(+), 948 deletions(-) diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py index d25c954fc19e..844f7ff44342 100644 --- a/vllm/model_executor/models/arcee.py +++ b/vllm/model_executor/models/arcee.py @@ -26,10 +26,6 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) from vllm.sequence import IntermediateTensors from .interfaces import ( @@ -42,7 +38,7 @@ from .utils import ( AutoWeightsLoader, PPMissingLayer, - is_pp_missing_parameter, + WeightsMapper, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -276,67 +272,6 @@ def forward( return hidden_states, aux_hidden_states return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - """Load weights, mapping q/k/v projections to fused qkv_proj.""" - stacked_params_mapping = [ - (".qkv_proj", ".q_proj", "q"), - (".qkv_proj", ".k_proj", "k"), - (".qkv_proj", ".v_proj", "v"), - ] - - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name: - continue - - if "scale" in name or "zero_point" in name: - remapped_name = maybe_remap_kv_scale_name(name, params_dict) - if remapped_name is None: - continue - name = remapped_name - - mapped = False - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - - name = name.replace(weight_name, param_name) - - if name.endswith(".bias") and name not in params_dict: - mapped = True - break - - if is_pp_missing_parameter(name, self): - mapped = True - break - - param = params_dict[name] - weight_loader = param.weight_loader # type: ignore[attr-defined] - weight_loader(param, loaded_weight, shard_id) - loaded_params.add(name) - mapped = True - break - - if mapped: - continue - - if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - - return loaded_params - class ArceeForCausalLM( nn.Module, SupportsLoRA, SupportsPP, SupportsEagle, SupportsEagle3 @@ -344,11 +279,13 @@ class ArceeForCausalLM( """Arcee Model for causal language modeling, integrated with vLLM runtime.""" - # Map fused module names to their submodule components - # (for quantization and LoRA) - packed_modules_mapping = { - "qkv_proj": ["q_proj", "k_proj", "v_proj"], - } + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + } + ) def __init__(self, *, vllm_config, prefix: str = "") -> None: super().__init__() @@ -420,4 +357,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: ) # AutoWeightLoader handles weight name remapping, including fusing # separate q_proj, k_proj, v_proj into qkv_proj - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 317269ec3b6b..20241433c484 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -45,8 +45,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, row_parallel_weight_loader, ) from vllm.model_executor.utils import set_weight_attrs @@ -56,8 +54,8 @@ from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant from .utils import ( AutoWeightsLoader, + WeightsMapper, extract_layer_index, - is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -340,61 +338,18 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - for param_name, shard_name, shard_id in stacked_params_mapping: - if shard_name not in name: - continue - name = name.replace(shard_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + ".gate_proj": ".gate_up_proj.0", + ".up_proj": ".gate_up_proj.1", + } + ) + # LoRA specific attributes embedding_modules = {"embed_tokens": "input_embeddings"} @@ -453,4 +408,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=["lm_head", "rotary_emb.inv_freq"] ) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index dca05f72c696..13b29e433d34 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -50,17 +50,13 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP from .utils import ( AutoWeightsLoader, PPMissingLayer, - is_pp_missing_parameter, + WeightsMapper, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -373,71 +369,17 @@ def forward( hidden_states, _ = self.ln_f(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - (".qkv_proj", ".q_proj", "q"), - (".qkv_proj", ".k_proj", "k"), - (".qkv_proj", ".v_proj", "v"), - (".gate_up_proj", ".c_fc_0", 0), - (".gate_up_proj", ".c_fc_1", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name: - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. - continue - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "c_fc_0", - "c_fc_1", - ], - } + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + ".c_fc_0": ".gate_up_proj.0", + ".c_fc_1": ".gate_up_proj.1", + } + ) # LoRA specific attributes embedding_modules = { @@ -509,4 +451,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: # processed with quantization, LoRA, fine-tuning, etc. skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), ) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index e38dbb5ee294..c583a776e390 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -46,10 +46,6 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import set_default_rope_theta @@ -57,8 +53,8 @@ from .utils import ( AutoWeightsLoader, PPMissingLayer, + WeightsMapper, extract_layer_index, - is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -371,71 +367,17 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - (".qkv_proj", ".q_proj", "q"), - (".qkv_proj", ".k_proj", "k"), - (".qkv_proj", ".v_proj", "v"), - (".gate_up_proj", ".gate_proj", 0), - (".gate_up_proj", ".up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name: - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. - continue - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + ".gate_proj": ".gate_up_proj.0", + ".up_proj": ".gate_up_proj.1", + } + ) # LoRA specific attributes embedding_modules = { @@ -506,4 +448,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: # processed with quantization, LoRA, fine-tuning, etc. skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), ) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 733eb3ed3c19..fd1d2027297d 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -39,17 +39,13 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP from .utils import ( AutoWeightsLoader, + WeightsMapper, extract_layer_index, - is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -316,61 +312,17 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - for param_name, shard_name, shard_id in stacked_params_mapping: - if shard_name not in name: - continue - name = name.replace(shard_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - - return loaded_params - class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + ".gate_proj": ".gate_up_proj.0", + ".up_proj": ".gate_up_proj.1", + } + ) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config @@ -418,4 +370,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: self, skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), ) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index 4587a6927663..b1ad99637fa9 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -39,10 +39,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) from vllm.sequence import IntermediateTensors from vllm.v1.attention.backend import AttentionType @@ -52,7 +48,7 @@ from .utils import ( AutoWeightsLoader, PPMissingLayer, - is_pp_missing_parameter, + WeightsMapper, maybe_prefix, ) @@ -237,74 +233,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): vllm_config=vllm_config, prefix=prefix, layer_type=Glm4DecoderLayer ) - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - (".qkv_proj", ".q_proj", "q"), - (".qkv_proj", ".k_proj", "k"), - (".qkv_proj", ".v_proj", "v"), - (".gate_up_proj", ".gate_proj", 0), - (".gate_up_proj", ".up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) - if spec_layer is not None: - continue - if "rotary_emb.inv_freq" in name: - continue - if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name: - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. - continue - if "scale" in name or "zero_point" in name: - # Remapping the name of FP8 kv-scale or zero point. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + ".gate_proj": ".gate_up_proj.0", + ".up_proj": ".gate_up_proj.1", + } + ) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -364,7 +303,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: self, skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), ) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) def get_spec_layer_idx_from_weight_name( diff --git a/vllm/model_executor/models/hyperclovax.py b/vllm/model_executor/models/hyperclovax.py index 2f54f78e7580..3eff84a4fe1a 100644 --- a/vllm/model_executor/models/hyperclovax.py +++ b/vllm/model_executor/models/hyperclovax.py @@ -50,10 +50,6 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.hyperclovax import HyperCLOVAXConfig @@ -61,7 +57,7 @@ from .utils import ( AutoWeightsLoader, PPMissingLayer, - is_pp_missing_parameter, + WeightsMapper, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -377,72 +373,17 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - (".qkv_proj", ".q_proj", "q"), - (".qkv_proj", ".k_proj", "k"), - (".qkv_proj", ".v_proj", "v"), - (".gate_up_proj", ".gate_proj", 0), - (".gate_up_proj", ".up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name: - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. - continue - if "scale" in name or "zero_point" in name: - # Remapping the name of FP8 kv-scale or zero point. - remapped_name = maybe_remap_kv_scale_name(name, params_dict) - if remapped_name is None: - continue - name = remapped_name - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader # type: ignore[attr-defined] - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class HyperCLOVAXForCausalLM(nn.Module, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + ".gate_proj": ".gate_up_proj.0", + ".up_proj": ".gate_up_proj.1", + } + ) # LoRA specific attributes embedding_modules = { @@ -536,4 +477,4 @@ def load_weights( self, skip_prefixes=["lm_head."] if self.config.tie_word_embeddings else None, ) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/jais2.py b/vllm/model_executor/models/jais2.py index dafa0f03ae9d..5a6d2f0c5fe0 100644 --- a/vllm/model_executor/models/jais2.py +++ b/vllm/model_executor/models/jais2.py @@ -51,18 +51,14 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP from .utils import ( AutoWeightsLoader, PPMissingLayer, + WeightsMapper, extract_layer_index, - is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -370,64 +366,15 @@ def forward( hidden_states, _ = self.norm(hidden_states + residual), residual return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - (".qkv_proj", ".q_proj", "q"), - (".qkv_proj", ".k_proj", "k"), - (".qkv_proj", ".v_proj", "v"), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name: - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. - continue - if "scale" in name: - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class Jais2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": ["q_proj", "k_proj", "v_proj"], - } + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + } + ) embedding_modules = { "embed_tokens": "input_embeddings", @@ -494,4 +441,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: self, skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), ) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 7b2e6b93b27e..4e3a4efc5a70 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -47,10 +47,6 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.nemotron import NemotronConfig @@ -58,7 +54,7 @@ from .utils import ( AutoWeightsLoader, PPMissingLayer, - is_pp_missing_parameter, + WeightsMapper, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -365,59 +361,15 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - (".qkv_proj", ".q_proj", "q"), - (".qkv_proj", ".k_proj", "k"), - (".qkv_proj", ".v_proj", "v"), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - } + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + } + ) # LoRA specific attributes embedding_modules = { @@ -484,4 +436,4 @@ def compute_logits( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index b974a3eb0851..86b348fcc9d8 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -42,10 +42,6 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) from vllm.model_executor.models.llama import LlamaAttention, LlamaMLP from vllm.sequence import IntermediateTensors from vllm.v1.attention.backend import AttentionType @@ -54,7 +50,7 @@ from .utils import ( AutoWeightsLoader, PPMissingLayer, - is_pp_missing_parameter, + WeightsMapper, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -316,64 +312,17 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - (".qkv_proj", ".q_proj", "q"), - (".qkv_proj", ".k_proj", "k"), - (".qkv_proj", ".v_proj", "v"), - (".gate_up_proj", ".gate_proj", 0), - (".gate_up_proj", ".up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name: - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. - continue - if "scale" in name or "zero_point" in name: - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class DeciLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, HasNoOps): - packed_modules_mapping = { - "qkv_proj": ["q_proj", "k_proj", "v_proj"], - "gate_up_proj": ["gate_proj", "up_proj"], - } + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + ".gate_proj": ".gate_up_proj.0", + ".up_proj": ".gate_up_proj.1", + } + ) # LoRA specific attributes embedding_modules = { @@ -463,4 +412,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: self, skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), ) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py index 503d4b5c8343..ebbe2999829e 100644 --- a/vllm/model_executor/models/ouro.py +++ b/vllm/model_executor/models/ouro.py @@ -51,16 +51,13 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) from vllm.sequence import IntermediateTensors from vllm.v1.attention.backend import AttentionType from .interfaces import SupportsLoRA from .utils import ( AutoWeightsLoader, + WeightsMapper, extract_layer_index, make_empty_intermediate_tensors_factory, make_layers, @@ -376,66 +373,17 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if name.endswith("scale"): - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - if weight_loader == default_weight_loader: - weight_loader(param, loaded_weight) - else: - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class OuroForCausalLM(nn.Module, SupportsLoRA): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + ".gate_proj": ".gate_up_proj.0", + ".up_proj": ".gate_up_proj.1", + } + ) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -492,4 +440,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: self, skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), ) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/rnj1.py b/vllm/model_executor/models/rnj1.py index 68c3722e2bc1..37f0f6e1684a 100644 --- a/vllm/model_executor/models/rnj1.py +++ b/vllm/model_executor/models/rnj1.py @@ -30,18 +30,14 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) from vllm.sequence import IntermediateTensors from vllm.v1.attention.backend import AttentionType from .interfaces import SupportsLoRA, SupportsPP from .utils import ( AutoWeightsLoader, + WeightsMapper, extract_layer_index, - is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -331,76 +327,17 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if ( - self.quant_config - and self.quant_config.get_name() == "gguf" - and name.endswith("norm.weight") - ): - loaded_weight -= 1 - - if name.endswith((".k_scale", ".v_scale", ".q_scale", ".prob_scale")): - remapped_name = maybe_remap_kv_scale_name(name, params_dict) - if remapped_name is not None and remapped_name in params_dict: - param = params_dict[remapped_name] - weight_loader = getattr( - param, "weight_loader", default_weight_loader - ) - weight_loader(param, loaded_weight) - loaded_params.add(remapped_name) - continue - - for param_name, shard_name, shard_id in stacked_params_mapping: - if shard_name not in name: - continue - name = name.replace(shard_name, param_name) - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - if name.endswith(".bias") and name not in params_dict: - continue - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - - return loaded_params - class Rnj1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + ".gate_proj": ".gate_up_proj.0", + ".up_proj": ".gate_up_proj.1", + } + ) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config @@ -457,4 +394,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: self, skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), ) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py index 48147f7334e8..af21f3ba17a9 100644 --- a/vllm/model_executor/models/seed_oss.py +++ b/vllm/model_executor/models/seed_oss.py @@ -49,10 +49,6 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import set_default_rope_theta from vllm.v1.attention.backend import AttentionType @@ -61,7 +57,7 @@ from .utils import ( AutoWeightsLoader, PPMissingLayer, - is_pp_missing_parameter, + WeightsMapper, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -362,62 +358,17 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class SeedOssForCausalLM(nn.Module, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + ".gate_proj": ".gate_up_proj.0", + ".up_proj": ".gate_up_proj.1", + } + ) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -477,4 +428,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: self, skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), ) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 454a0e971125..275b8b4b9655 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -48,17 +48,13 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP from .utils import ( AutoWeightsLoader, PPMissingLayer, - is_pp_missing_parameter, + WeightsMapper, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -348,65 +344,17 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - (".qkv_proj", ".q_proj", "q"), - (".qkv_proj", ".k_proj", "k"), - (".qkv_proj", ".v_proj", "v"), - (".gate_up_proj", ".gate_proj", 0), - (".gate_up_proj", ".up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + ".gate_proj": ".gate_up_proj.0", + ".up_proj": ".gate_up_proj.1", + } + ) # LoRA specific attributes embedding_modules = { @@ -469,4 +417,4 @@ def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 5f08a59e2364..5ff3a4cbeeed 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -45,16 +45,12 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP from .utils import ( AutoWeightsLoader, - is_pp_missing_parameter, + WeightsMapper, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -272,41 +268,16 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ] - - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class Starcoder2ForCausalLM(nn.Module, SupportsPP): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + } + ) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config @@ -362,4 +333,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: ["lm_head.weight"] if self.config.tie_word_embeddings else None ), ) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) From b1f1c9d919a8e36f05e5b14988fde3c07f95b750 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 11 Jun 2026 10:30:58 +0000 Subject: [PATCH 09/34] Fix `load_weights` methods for fused case Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/layers/linear.py | 34 ++++++++++++++-------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 40e98ac34ad6..6e3bd57085f8 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -647,7 +647,7 @@ def __init__( self.tp_size = get_tensor_model_parallel_world_size() if not disable_tp else 1 self.tp_rank = get_tensor_model_parallel_rank() if not disable_tp else 0 assert all(output_size % self.tp_size == 0 for output_size in output_sizes) - self.checkpoint_format: Literal["sharded", "fused"] | None = None + self.checkpoint_format: Literal["fused", "sharded"] | None = None super().__init__( input_size=input_size, output_size=sum(output_sizes), @@ -974,18 +974,18 @@ def load_weights( self, weights: Iterable[tuple[str, torch.Tensor]] ) -> Iterable[str]: for name, loaded_weight in weights: - shard_id_str, _, param_name = name.partition(".") - # If the shard_id is not an integer, the weight is not sharded - try: + if "." in name: + # Checkpoint is sharded + shard_id_str, _, name = name.partition(".") shard_id = int(shard_id_str) self.checkpoint_format = "sharded" - except ValueError: + else: shard_id = None self.checkpoint_format = "fused" - # If param_name is "bias" get it from self, otherwise load into self - param: Parameter = getattr(self, param_name, self) + # If name is "bias" get it from self, otherwise load into self + param: Parameter = getattr(self, name, self) param.weight_loader(param, loaded_weight, shard_id) - yield param_name + yield name class QKVParallelLinear(ColumnParallelLinear): @@ -1407,19 +1407,19 @@ def load_weights( self, weights: Iterable[tuple[str, torch.Tensor]] ) -> Iterable[str]: for name, loaded_weight in weights: - shard_id_str, _, param_name = name.partition(".") - # If the shard_id is not valid, the weight is not sharded - try: - self.validate_shard_id(shard_id_str) - shard_id = shard_id_str + if "." in name: + # Checkpoint is sharded + shard_id, _, name = name.partition(".") + self.validate_shard_id(shard_id) self.checkpoint_format = "sharded" - except ValueError: + else: + # Checkpoint is fused shard_id = None self.checkpoint_format = "fused" - # If param_name is "bias" get it from self, otherwise load into self - param: Parameter = getattr(self, param_name, self) + # If name is "bias" get it from self, otherwise load into self + param: Parameter = getattr(self, name, self) param.weight_loader(param, loaded_weight, shard_id) - yield param_name + yield name # --8<-- [start:row_parallel_linear] From b5a719172b2d8f0a806302a7f99f561b95071d5d Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 11 Jun 2026 11:35:38 +0000 Subject: [PATCH 10/34] Fix BaiChuan tests that depend on old behaviour Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/lora/conftest.py | 21 ++++++++++++++ tests/lora/test_lora_checkpoints.py | 44 ++++++++++------------------- 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index dea54ed21aea..7d9e8444827f 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -19,6 +19,7 @@ from vllm.model_executor.layers.linear import ( ColumnParallelLinear, MergedColumnParallelLinear, + QKVParallelLinear, RowParallelLinear, ) from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -166,6 +167,26 @@ def dummy_model_gate_up(default_vllm_config) -> nn.Module: return model +@pytest.fixture +def baichuan_dummy_model(default_vllm_config, dist_init) -> nn.Module: + # Only includes BaiChuan's lora modules so get_supported_lora_modules will work + model = DummyLoRAModel( + OrderedDict( + [ + ("W_pack", QKVParallelLinear(64, 8, 8)), + ("o_proj", RowParallelLinear(64, 64)), + ("gate_up_proj", MergedColumnParallelLinear(64, [16, 16])), + ("down_proj", RowParallelLinear(16, 64)), + ] + ) + ) + model.config = MagicMock() + # Match the expected format for BaiChuan checkpoints + model.W_pack.checkpoint_format = "fused" + model.gate_up_proj.checkpoint_format = "sharded" + return model + + @pytest.fixture(scope="session") def mixtral_lora_files(): # Note: this module has incorrect adapter_config.json to test diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index 7c263e2a2276..0a54a80242be 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -5,37 +5,26 @@ from vllm.lora.lora_model import LoRAModel from vllm.lora.peft_helper import PEFTHelper -from vllm.lora.utils import parse_fine_tuned_lora_name +from vllm.lora.utils import get_supported_lora_modules, parse_fine_tuned_lora_name from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM from vllm.model_executor.models.gemma4 import Gemma4ForCausalLM from vllm.model_executor.models.utils import WeightsMapper -lora_lst = ["baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"] -BAICHUAN_LORA_MODULES = [ - "W_pack", - "o_proj", - "gate_up_proj", - "down_proj", -] - -@pytest.mark.parametrize("lora_name", lora_lst) +@pytest.mark.parametrize( + "lora_name", + ["baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"], +) def test_load_checkpoints( lora_name, baichuan_lora_files, baichuan_zero_lora_files, baichuan_regex_lora_files, chatglm3_lora_files, + baichuan_dummy_model, ): - packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping - - expected_lora_lst: list[str] = [] - for module in BAICHUAN_LORA_MODULES: - if module in packed_modules_mapping: - expected_lora_lst.extend(packed_modules_mapping[module]) - else: - expected_lora_lst.append(module) - expected_lora_modules = set(expected_lora_lst) + expected_lora_modules = set(get_supported_lora_modules(baichuan_dummy_model)) + weights_mapper = BaiChuanBaseForCausalLM.hf_to_vllm_mapper if lora_name == "baichuan7B": peft_helper = PEFTHelper.from_local_dir( baichuan_lora_files, max_position_embeddings=4096 @@ -49,6 +38,7 @@ def test_load_checkpoints( lora_model_id=1, device="cpu", model_vocab_size=64000, + weights_mapper=weights_mapper, ) elif lora_name == "baichuan7B-zero": # Test that the target_modules contain prefix @@ -64,6 +54,7 @@ def test_load_checkpoints( lora_model_id=1, device="cpu", model_vocab_size=64000, + weights_mapper=weights_mapper, ) elif lora_name == "baichuan7B-zero-regex": # Test that the `target_modules` in the form of regular expressions, @@ -78,6 +69,7 @@ def test_load_checkpoints( lora_model_id=1, device="cpu", model_vocab_size=64000, + weights_mapper=weights_mapper, ) else: # For the baichuan7B model, load chatglm3-6b's LoRA, @@ -97,22 +89,16 @@ def test_load_checkpoints( ) -def test_lora_weights_mapping(baichuan_lora_files): - packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping - - expected_lora_lst: list[str] = [] - for module in BAICHUAN_LORA_MODULES: - if module in packed_modules_mapping: - expected_lora_lst.extend(packed_modules_mapping[module]) - else: - expected_lora_lst.append(module) - expected_lora_modules = set(expected_lora_lst) +def test_lora_weights_mapping(baichuan_lora_files, baichuan_dummy_model): + expected_lora_modules = set(get_supported_lora_modules(baichuan_dummy_model)) hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ "model.": "language_model.model.", }, orig_to_new_substr={ ".layers.": ".baichuan_layers.", + ".gate_proj": ".gate_up_proj.0", + ".up_proj": ".gate_up_proj.1", }, ) peft_helper = PEFTHelper.from_local_dir( From f5383aa2d5dedd033770a0fbce1ac655387ac032 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 11 Jun 2026 12:52:19 +0000 Subject: [PATCH 11/34] Handle MergedColumnParallelLinear for LoRA too Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/lora/layers/column_parallel_linear.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py index f9643809c7f4..7ec31c31253e 100644 --- a/vllm/lora/layers/column_parallel_linear.py +++ b/vllm/lora/layers/column_parallel_linear.py @@ -167,7 +167,10 @@ def can_replace_layer( if type(source_layer) is maybe_get_oot_by_class(ColumnParallelLinear): return True if isinstance(source_layer, maybe_get_oot_by_class(MergedColumnParallelLinear)): - if len(packed_modules_list) != 1: + if ( + len(packed_modules_list) != 1 + or source_layer.checkpoint_format == "sharded" + ): return False # Exclude layers with 3+ output sizes - those are handled by # MergedColumnParallelLinearVariableSliceWithLoRA since this @@ -347,7 +350,11 @@ def can_replace_layer( decorate: bool = True, ) -> bool: merged_cls = maybe_get_oot_by_class(MergedColumnParallelLinear) - if not isinstance(source_layer, merged_cls) or len(packed_modules_list) != 2: + if ( + not isinstance(source_layer, merged_cls) + or len(source_layer.output_sizes) != 2 + or source_layer.checkpoint_format == "fused" + ): return False tp_size = getattr(source_layer, "tp_size", 1) From 82a7a64f5f824393eac66efa6c4ef5972c17f798 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 11 Jun 2026 13:59:45 +0000 Subject: [PATCH 12/34] Delete some more load_weights methods Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/chatglm.py | 53 ++----------- vllm/model_executor/models/cohere_eagle.py | 41 +--------- vllm/model_executor/models/fairseq2_llama.py | 6 +- vllm/model_executor/models/glm4.py | 26 +++---- vllm/model_executor/models/glm4v.py | 9 +++ vllm/model_executor/models/gpt_j.py | 61 +++------------ vllm/model_executor/models/granite.py | 74 ++++-------------- vllm/model_executor/models/internlm2.py | 20 +++-- vllm/model_executor/models/jina.py | 5 +- vllm/model_executor/models/llama.py | 73 ++++-------------- vllm/model_executor/models/mamba.py | 25 +----- vllm/model_executor/models/mamba2.py | 26 +------ vllm/model_executor/models/mimo.py | 57 ++------------ vllm/model_executor/models/mistral_eagle.py | 10 +-- vllm/model_executor/models/mpt.py | 17 ---- vllm/model_executor/models/qwen2.py | 81 ++++---------------- vllm/model_executor/models/qwen2_rm.py | 12 --- vllm/model_executor/models/qwen3.py | 19 ++--- vllm/model_executor/models/whisper.py | 48 +++--------- 19 files changed, 133 insertions(+), 530 deletions(-) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index c5d857e7c3df..4363188ff6e1 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -30,7 +30,6 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.chatglm import ChatGLMConfig @@ -38,7 +37,6 @@ from .utils import ( AutoWeightsLoader, WeightsMapper, - is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -316,12 +314,9 @@ def forward( @support_torch_compile class ChatGLMModel(nn.Module, SupportsQuant): - packed_modules_mapping = { - "linear_proj.merged_proj": [ - "linear_proj.gate_proj", - "linear_proj.dense_h_to_4h", - ] - } + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={".word_embeddings": ""}, + ) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -386,47 +381,11 @@ def forward( return hidden_states def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("linear_proj.merged_proj", "linear_proj.gate_proj", 0), - ("linear_proj.merged_proj", "linear_proj.dense_h_to_4h", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - - for name, loaded_weight in weights: - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - if "rotary_pos_emb.inv_freq" in name: - continue - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) class ChatGLMBaseModel(nn.Module): - hf_to_vllm_mapper = WeightsMapper( - orig_to_new_substr={".word_embeddings": ""}, - ) - def __init__( self, *, @@ -467,7 +426,7 @@ def compute_logits( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) - return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + return loader.load_weights(weights) class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP, SupportsQuant): diff --git a/vllm/model_executor/models/cohere_eagle.py b/vllm/model_executor/models/cohere_eagle.py index 7b57c739ffe9..64ec0d6dd544 100644 --- a/vllm/model_executor/models/cohere_eagle.py +++ b/vllm/model_executor/models/cohere_eagle.py @@ -14,7 +14,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding -from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.commandr import ( CohereDecoderLayer, CohereForCausalLM, @@ -134,42 +133,6 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states, hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - (".qkv_proj", ".q_proj", "q"), - (".qkv_proj", ".k_proj", "k"), - (".qkv_proj", ".v_proj", "v"), - (".gate_up_proj", ".gate_proj", 0), - (".gate_up_proj", ".up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - if name.endswith(".bias") and name not in params_dict: - continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class EagleCohereForCausalLM(CohereForCausalLM): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -225,7 +188,9 @@ def _track_and_forward(inputs): ), ) - loaded_weight_names = loader.load_weights(map(_track_and_forward, weights)) + loaded_weight_names = loader.load_weights( + map(_track_and_forward, weights), mapper=self.hf_to_vllm_mapper + ) # Embed tokens are tied with the target model and therefore not # present in the EAGLE checkpoint; mark them as loaded explicitly to diff --git a/vllm/model_executor/models/fairseq2_llama.py b/vllm/model_executor/models/fairseq2_llama.py index ca0e7e64df53..e898034fbfa5 100644 --- a/vllm/model_executor/models/fairseq2_llama.py +++ b/vllm/model_executor/models/fairseq2_llama.py @@ -79,10 +79,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), ) return loader.load_weights( - ( - self.reshape_fairseq2_weights(name, loaded_weight, params) - for name, loaded_weight in weights - ) + self.reshape_fairseq2_weights(name, loaded_weight, params) + for name, loaded_weight in weights ) def flag_sharded_weights(self, params: dict[str, Parameter]): diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index b1ad99637fa9..e7414e799861 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -48,7 +48,6 @@ from .utils import ( AutoWeightsLoader, PPMissingLayer, - WeightsMapper, maybe_prefix, ) @@ -235,16 +234,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): - hf_to_vllm_mapper = WeightsMapper( - orig_to_new_substr={ - ".q_proj": ".qkv_proj.q", - ".k_proj": ".qkv_proj.k", - ".v_proj": ".qkv_proj.v", - ".gate_proj": ".gate_up_proj.0", - ".up_proj": ".gate_up_proj.1", - } - ) - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config @@ -299,11 +288,16 @@ def compute_logits( return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader( - self, - skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), - ) - return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + skip_prefixes = ["lm_head."] if self.config.tie_word_embeddings else [] + # Skip the speculative (MTP) layers, which are loaded by the + # draft model instead. + num_nextn_layers = getattr(self.config, "num_nextn_predict_layers", 0) + skip_prefixes += [ + f"model.layers.{self.config.num_hidden_layers + i}." + for i in range(num_nextn_layers) + ] + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) + return loader.load_weights(weights) def get_spec_layer_idx_from_weight_name( diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 9d08df4df8dc..8b9a8f088930 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -61,6 +61,7 @@ SupportsMultiModal, SupportsPP, ) +from .utils import WeightsMapper class GLMVImagePixelInputs(TensorSchema): @@ -376,6 +377,14 @@ def forward(self, images: torch.Tensor) -> torch.Tensor: class GLM4VModel(ChatGLMModel): + hf_to_vllm_mapper = ChatGLMModel.hf_to_vllm_mapper | WeightsMapper( + orig_to_new_substr={ + # Vision GLU projections + "linear_proj.gate_proj": "linear_proj.merged_proj.0", + "linear_proj.dense_h_to_4h": "linear_proj.merged_proj.1", + } + ) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix) diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 30da9b4dea23..12c90a53d7ff 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -43,16 +43,12 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP from .utils import ( AutoWeightsLoader, - is_pp_missing_parameter, + WeightsMapper, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -239,51 +235,16 @@ def forward( hidden_states = self.ln_f(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "attn.bias" in name or "attn.masked_bias" in name: - continue - - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class GPTJForCausalLM(nn.Module, SupportsPP): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + } + ) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config @@ -329,5 +290,5 @@ def compute_logits( return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) - return loader.load_weights(weights) + loader = AutoWeightsLoader(self, skip_substrs=["attn.bias", "attn.masked_bias"]) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 2adc29f8d252..518dce8453d5 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -49,17 +49,13 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP from .utils import ( AutoWeightsLoader, PPMissingLayer, - is_pp_missing_parameter, + WeightsMapper, make_layers, maybe_prefix, ) @@ -253,6 +249,16 @@ def forward( @support_torch_compile class GraniteModel(nn.Module): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + ".gate_proj": ".gate_up_proj.0", + ".up_proj": ".gate_up_proj.1", + } + ) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -323,65 +329,11 @@ def forward( return hidden_states def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - (".qkv_proj", ".q_proj", "q"), - (".qkv_proj", ".k_proj", "k"), - (".qkv_proj", ".v_proj", "v"), - (".gate_up_proj", ".gate_proj", 0), - (".gate_up_proj", ".up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - # LoRA specific attributes embedding_modules = { "embed_tokens": "input_embeddings", diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 4010dd54d7d4..8eeadf2d0202 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -248,6 +248,13 @@ def forward( @support_torch_compile class InternLM2Model(nn.Module): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".w1": ".gate_up_proj.0", + ".w3": ".gate_up_proj.1", + } + ) + def __init__( self, *, @@ -308,15 +315,12 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): - hf_to_vllm_mapper = WeightsMapper( - orig_to_new_substr={ - ".w1": ".gate_up_proj.0", - ".w3": ".gate_up_proj.1", - } - ) - def __init__( self, *, @@ -374,7 +378,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: self, skip_prefixes=(["output."] if self.config.tie_word_embeddings else None), ) - return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + return loader.load_weights(weights) @default_pooling_type(tok_pooling_type="ALL") diff --git a/vllm/model_executor/models/jina.py b/vllm/model_executor/models/jina.py index 2b07937df08e..f1f585cdae8d 100644 --- a/vllm/model_executor/models/jina.py +++ b/vllm/model_executor/models/jina.py @@ -254,5 +254,8 @@ def _merge_weights( tensor = tensor + (lora_B @ lora_A) * scaling yield name, tensor - loaded = self.model.load_weights(_merge_weights(weights)) + loader = AutoWeightsLoader(self.model, ignore_unexpected_prefixes=["lm_head."]) + loaded = loader.load_weights( + _merge_weights(weights), mapper=self.model.hf_to_vllm_mapper + ) return {f"model.{name}" for name in loaded} diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 39044f5e8b4a..6290509923a3 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -52,10 +52,6 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) from vllm.sequence import IntermediateTensors from vllm.v1.attention.backend import AttentionType @@ -71,8 +67,8 @@ from .utils import ( AutoWeightsLoader, PPMissingLayer, + WeightsMapper, extract_layer_index, - is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -349,6 +345,16 @@ def get_quant_config(self, vllm_config: VllmConfig) -> QuantizationConfig | None }, ) class LlamaModel(nn.Module, EagleModelMixin): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + ".gate_proj": ".gate_up_proj.0", + ".up_proj": ".gate_up_proj.1", + } + ) + def __init__( self, *, @@ -435,66 +441,13 @@ def forward( return hidden_states def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - (".qkv_proj", ".q_proj", "q"), - (".qkv_proj", ".k_proj", "k"), - (".qkv_proj", ".v_proj", "v"), - (".gate_up_proj", ".gate_proj", 0), - (".gate_up_proj", ".up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name: - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. - continue - if "scale" in name or "zero_point" in name: - # Remapping the name of FP8 kv-scale or zero point. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) class LlamaForCausalLM( LocalArgmaxMixin, nn.Module, SupportsLoRA, SupportsPP, SupportsEagle, SupportsEagle3 ): - packed_modules_mapping = { - "qkv_proj": ["q_proj", "k_proj", "v_proj"], - "gate_up_proj": ["gate_proj", "up_proj"], - } - # LoRA specific attributes embedding_modules = { "embed_tokens": "input_embeddings", diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index ec2a7255eb66..6a77a58abf4d 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -26,7 +26,6 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import ( HasInnerState, IsAttentionFree, @@ -37,7 +36,7 @@ from .utils import ( AutoWeightsLoader, - is_pp_missing_parameter, + WeightsMapper, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -170,28 +169,12 @@ def forward( return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "A_log" in name: - name = name.replace("A_log", "A") - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class MambaForCausalLM( nn.Module, HasInnerState, IsAttentionFree, SupportsPP, SupportsMambaPrefixCaching ): + hf_to_vllm_mapper = WeightsMapper(orig_to_new_substr={".A_log": ".A"}) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config @@ -279,4 +262,4 @@ def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py index deb20852a26a..343111ee0151 100644 --- a/vllm/model_executor/models/mamba2.py +++ b/vllm/model_executor/models/mamba2.py @@ -25,7 +25,6 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import ( HasInnerState, IsAttentionFree, @@ -35,7 +34,7 @@ from .utils import ( AutoWeightsLoader, - is_pp_missing_parameter, + WeightsMapper, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -167,29 +166,12 @@ def forward( return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "A_log" in name: - name = name.replace("A_log", "A") - - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class Mamba2ForCausalLM( nn.Module, HasInnerState, IsAttentionFree, SupportsMambaPrefixCaching ): + hf_to_vllm_mapper = WeightsMapper(orig_to_new_substr={".A_log": ".A"}) + @classmethod def get_mamba_state_dtype_from_config( cls, @@ -292,4 +274,4 @@ def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/mimo.py b/vllm/model_executor/models/mimo.py index 4f67d468ace5..e4247fa8d8df 100644 --- a/vllm/model_executor/models/mimo.py +++ b/vllm/model_executor/models/mimo.py @@ -38,14 +38,10 @@ from vllm.logger import init_logger from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM, Qwen2Model from vllm.sequence import IntermediateTensors -from .utils import PPMissingLayer, is_pp_missing_parameter, maybe_prefix +from .utils import AutoWeightsLoader, PPMissingLayer, maybe_prefix logger = init_logger(__name__) @@ -89,50 +85,6 @@ def forward( hidden_states = hidden_states + residual return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "mtp_layers" in name: - continue - if "rotary_emb.inv_freq" in name: - continue - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class MiMoForCausalLM(Qwen2ForCausalLM, nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -167,6 +119,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.model.make_empty_intermediate_tensors ) + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + skip_prefixes = ["lm_head."] if self.config.tie_word_embeddings else [] + # MTP layers are loaded by the draft model, not the main model. + skip_prefixes.append("model.mtp_layers.") + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) + return loader.load_weights(weights) + def compute_logits( self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/mistral_eagle.py b/vllm/model_executor/models/mistral_eagle.py index 8865742d6495..75d1ebb91a80 100644 --- a/vllm/model_executor/models/mistral_eagle.py +++ b/vllm/model_executor/models/mistral_eagle.py @@ -108,11 +108,6 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states, hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - # Pretend embed_tokens is loaded; the actual weight is shared - # from the target model at runtime by `load_eagle_model`. - return super().load_weights(weights) | {"embed_tokens.weight"} - class EagleMistralForCausalLM(MistralForCausalLM): mistral_mapping = MistralForCausalLM.mistral_mapping | { @@ -166,3 +161,8 @@ def embed_input_ids( multimodal_embeddings=multimodal_embeddings, is_multimodal=is_multimodal, ) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + # Pretend embed_tokens is loaded; the actual weight is shared + # from the target model at runtime by `load_eagle_model`. + return super().load_weights(weights) | {"model.embed_tokens.weight"} diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 85933626cd30..8e509fbcb4c6 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -27,13 +27,11 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding -from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP from .utils import ( AutoWeightsLoader, - is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -274,21 +272,6 @@ def forward( hidden_states = self.norm_f(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class MPTForCausalLM(nn.Module, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 9c39c6497082..e27ed683d3e1 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -54,10 +54,6 @@ ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import is_interleaved, set_default_rope_theta from vllm.v1.attention.backend import AttentionType @@ -72,8 +68,8 @@ from .utils import ( AutoWeightsLoader, PPMissingLayer, + WeightsMapper, extract_layer_index, - is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, @@ -323,6 +319,16 @@ def forward( } ) class Qwen2Model(nn.Module, EagleModelMixin): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + ".gate_proj": ".gate_up_proj.0", + ".up_proj": ".gate_up_proj.1", + } + ) + def __init__( self, *, @@ -426,74 +432,13 @@ def forward( return hidden_states def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - if name.endswith("scale"): - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - if weight_loader == default_weight_loader: - weight_loader(param, loaded_weight) - else: - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - if is_pp_missing_parameter(name, self): - continue - if name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) class Qwen2ForCausalLM( nn.Module, SupportsLoRA, SupportsPP, SupportsEagle, SupportsEagle3 ): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config.get_text_config() diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index cdf1a327efe5..08c036e1a9b8 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -28,18 +28,6 @@ class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP): is_pooling_model = True pooler: Pooler - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index b070eac32551..0bebd7d367e9 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -57,7 +57,12 @@ ) from .qwen2 import Qwen2MLP as Qwen3MLP from .qwen2 import Qwen2Model -from .utils import AutoWeightsLoader, PPMissingLayer, extract_layer_index, maybe_prefix +from .utils import ( + AutoWeightsLoader, + PPMissingLayer, + extract_layer_index, + maybe_prefix, +) logger = init_logger(__name__) @@ -267,18 +272,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): class Qwen3ForCausalLM( LocalArgmaxMixin, nn.Module, SupportsLoRA, SupportsPP, SupportsEagle, SupportsEagle3 ): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - embedding_modules = { "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 628186e7598b..a0a06264a7f6 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -44,7 +44,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead -from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.whisper_utils import ( ISO639_1_SUPPORTED_LANGS, ) @@ -617,42 +616,6 @@ def get_encoder_outputs( return None return self.encoder(input_features) - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - (".self_attn.qkv_proj", ".self_attn.q_proj", "q"), - (".self_attn.qkv_proj", ".self_attn.k_proj", "k"), - (".self_attn.qkv_proj", ".self_attn.v_proj", "v"), - # MergedColumnParallelLinear uses integer indices (0, 1) - (".encoder_attn.kv_proj", ".encoder_attn.k_proj", 0), - (".encoder_attn.kv_proj", ".encoder_attn.v_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - class WhisperProcessingInfo(BaseProcessingInfo): def get_hf_config(self) -> WhisperConfig: @@ -808,7 +771,16 @@ class WhisperForConditionalGeneration( } hf_to_vllm_mapper = WeightsMapper( - orig_to_new_substr={".fc1.": ".mlp.fc1.", ".fc2.": ".mlp.fc2."} + orig_to_new_substr={ + ".fc1.": ".mlp.fc1.", + ".fc2.": ".mlp.fc2.", + ".self_attn.q_proj": ".self_attn.qkv_proj.q", + ".self_attn.k_proj": ".self_attn.qkv_proj.k", + ".self_attn.v_proj": ".self_attn.qkv_proj.v", + # MergedColumnParallelLinear uses integer indices (0, 1) + ".encoder_attn.k_proj": ".encoder_attn.kv_proj.0", + ".encoder_attn.v_proj": ".encoder_attn.kv_proj.1", + } ) # Whisper only supports audio-conditioned generation. From c3a316a79e48efc4d2f0dab8ff55e6c79c5e8178 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 11 Jun 2026 14:09:33 +0000 Subject: [PATCH 13/34] Add debug logs while loading Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/layers/linear.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 6e3bd57085f8..dbcdc6e32ca2 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -979,9 +979,22 @@ def load_weights( shard_id_str, _, name = name.partition(".") shard_id = int(shard_id_str) self.checkpoint_format = "sharded" + logger.debug( + "Loaded shard %s into %s for layer %s.%s", + shard_id, + name, + self.prefix, + name, + ) else: shard_id = None self.checkpoint_format = "fused" + logger.debug( + "Loaded weight %s.%s with shape %s", + self.prefix, + name, + loaded_weight.shape, + ) # If name is "bias" get it from self, otherwise load into self param: Parameter = getattr(self, name, self) param.weight_loader(param, loaded_weight, shard_id) @@ -1412,10 +1425,23 @@ def load_weights( shard_id, _, name = name.partition(".") self.validate_shard_id(shard_id) self.checkpoint_format = "sharded" + logger.debug( + "Loaded shard %s into %s for layer %s.%s", + shard_id, + name, + self.prefix, + name, + ) else: # Checkpoint is fused shard_id = None self.checkpoint_format = "fused" + logger.debug( + "Loaded weight %s.%s with shape %s", + self.prefix, + name, + loaded_weight.shape, + ) # If name is "bias" get it from self, otherwise load into self param: Parameter = getattr(self, name, self) param.weight_loader(param, loaded_weight, shard_id) From c802faadb715fbf4a5310fc6630050cd99c1fc33 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 11 Jun 2026 14:59:13 +0000 Subject: [PATCH 14/34] Fix late initialised biases Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/model_loader/reload/layerwise.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/reload/layerwise.py b/vllm/model_executor/model_loader/reload/layerwise.py index 6cf1c19cba43..d0d26fed3e6c 100644 --- a/vllm/model_executor/model_loader/reload/layerwise.py +++ b/vllm/model_executor/model_loader/reload/layerwise.py @@ -131,8 +131,11 @@ def initialize_online_processing(layer: torch.nn.Module): # Track loading progress to determine when to process/copy info.load_numel = 0 info.load_numel_total = get_layer_size(layer) + _wrap_parameters_weight_loader(layer) - # Wrap each parameter's weight loader + +def _wrap_parameters_weight_loader(layer: torch.nn.Module) -> None: + """Wrap each parameter's weight loader.""" # Note that nested wrapping will occur for shared tensors for name, tensor in get_layer_tensors(layer).items(): if name in SKIP_TENSORS: @@ -168,6 +171,12 @@ def online_process_loader(*args, **kwargs): logger.debug("%s: Excessive loading", layer.__class__.__name__) return + # Re-run on each load: layers may register parameters later (e.g., `bias`). + # Wrap late parameters and refresh `load_numel_total` so processing waits + # until all parameters are loaded. + info.load_numel_total = get_layer_size(layer) + _wrap_parameters_weight_loader(layer) + # Bind and normalize arguments bound_args = loader_signature.bind(*args, **kwargs) bound_args.apply_defaults() From 88de67d0a67a588881c78a361adcbe63fbc1c84f Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 11 Jun 2026 16:00:08 +0000 Subject: [PATCH 15/34] Fix GPTQ tests Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/model_executor/test_weight_utils.py | 34 +++++++++++++++++++++++ vllm/model_executor/layers/linear.py | 14 ++++++++++ vllm/model_executor/models/gemma.py | 4 +-- vllm/model_executor/models/granite.py | 4 +-- vllm/model_executor/models/interfaces.py | 5 +++- vllm/model_executor/models/internlm2.py | 4 +-- vllm/model_executor/models/llama.py | 3 +- vllm/model_executor/models/qwen2.py | 3 +- vllm/model_executor/models/utils.py | 28 +++++++++++++++++++ 9 files changed, 90 insertions(+), 9 deletions(-) diff --git a/tests/model_executor/test_weight_utils.py b/tests/model_executor/test_weight_utils.py index 9e67609b78e4..f2ac7a8ba26c 100644 --- a/tests/model_executor/test_weight_utils.py +++ b/tests/model_executor/test_weight_utils.py @@ -281,5 +281,39 @@ def test_composes_with_qkv_mapper(self): ) +def test_weights_mapper_get_packed_modules_mapping(): + from vllm.model_executor.models.utils import WeightsMapper + + mapper = WeightsMapper( + orig_to_new_substr={ + ".q_proj": ".qkv_proj.q", + ".k_proj": ".qkv_proj.k", + ".v_proj": ".qkv_proj.v", + ".gate_proj": ".gate_up_proj.0", + ".up_proj": ".gate_up_proj.1", + # Non-fusion entries must not contribute + ".word_embeddings": "", + "llm.model.": "model.decoder.", + "llm.lm_head": "lm_head", + } + ) + assert mapper.get_packed_modules_mapping() == { + "qkv_proj": ["qkv_proj.q", "qkv_proj.k", "qkv_proj.v"], + "gate_up_proj": ["gate_up_proj.0", "gate_up_proj.1"], + } + + # Shard order comes from the shard id, not declaration order, and + # dotted module paths reduce to the last component + mapper = WeightsMapper( + orig_to_new_substr={ + "linear_proj.dense_h_to_4h": "linear_proj.merged_proj.1", + "linear_proj.gate_proj": "linear_proj.merged_proj.0", + } + ) + assert mapper.get_packed_modules_mapping() == { + "merged_proj": ["merged_proj.0", "merged_proj.1"], + } + + if __name__ == "__main__": test_download_weights_from_hf() diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index dbcdc6e32ca2..1dc03b34ecc4 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -997,6 +997,13 @@ def load_weights( ) # If name is "bias" get it from self, otherwise load into self param: Parameter = getattr(self, name, self) + if ( + param is None + and name == "bias" + and self.quant_config is not None + and "gptq" in self.quant_config.get_name() + ): + continue param.weight_loader(param, loaded_weight, shard_id) yield name @@ -1444,6 +1451,13 @@ def load_weights( ) # If name is "bias" get it from self, otherwise load into self param: Parameter = getattr(self, name, self) + if ( + param is None + and name == "bias" + and self.quant_config is not None + and "gptq" in self.quant_config.get_name() + ): + continue param.weight_loader(param, loaded_weight, shard_id) yield name diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 64808e95ae2b..f95cdc161482 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -44,7 +44,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP +from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant from .utils import ( AutoWeightsLoader, WeightsMapper, @@ -324,7 +324,7 @@ def forward( return hidden_states -class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): +class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant): hf_to_vllm_mapper = WeightsMapper( orig_to_new_substr={ ".q_proj": ".qkv_proj.q", diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 518dce8453d5..77775873de8d 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -51,7 +51,7 @@ ) from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP +from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant from .utils import ( AutoWeightsLoader, PPMissingLayer, @@ -248,7 +248,7 @@ def forward( @support_torch_compile -class GraniteModel(nn.Module): +class GraniteModel(nn.Module, SupportsQuant): hf_to_vllm_mapper = WeightsMapper( orig_to_new_substr={ ".q_proj": ".qkv_proj.q", diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 68dbcf90f877..093514cc0da3 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -1032,8 +1032,11 @@ def _maybe_apply_model_mapping(self): return if (hf_to_vllm_mapper := self.hf_to_vllm_mapper) is not None: self.quant_config.apply_vllm_mapper(hf_to_vllm_mapper) - if self.packed_modules_mapping is not None: + if self.packed_modules_mapping: self.quant_config.packed_modules_mapping.update(self.packed_modules_mapping) + elif hf_to_vllm_mapper is not None: + packed_modules_mapping = hf_to_vllm_mapper.get_packed_modules_mapping() + self.quant_config.packed_modules_mapping.update(packed_modules_mapping) @runtime_checkable diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 8eeadf2d0202..eb726e48f956 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -37,7 +37,7 @@ ) from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP +from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant from .interfaces_base import default_pooling_type from .utils import ( AutoWeightsLoader, @@ -247,7 +247,7 @@ def forward( @support_torch_compile -class InternLM2Model(nn.Module): +class InternLM2Model(nn.Module, SupportsQuant): hf_to_vllm_mapper = WeightsMapper( orig_to_new_substr={ ".w1": ".gate_up_proj.0", diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 6290509923a3..b9474648b78e 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -63,6 +63,7 @@ SupportsEagle3, SupportsLoRA, SupportsPP, + SupportsQuant, ) from .utils import ( AutoWeightsLoader, @@ -344,7 +345,7 @@ def get_quant_config(self, vllm_config: VllmConfig) -> QuantizationConfig | None "inputs_embeds": {0: "b"}, }, ) -class LlamaModel(nn.Module, EagleModelMixin): +class LlamaModel(nn.Module, EagleModelMixin, SupportsQuant): hf_to_vllm_mapper = WeightsMapper( orig_to_new_substr={ ".q_proj": ".qkv_proj.q", diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index e27ed683d3e1..d5a9e1aab50a 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -64,6 +64,7 @@ SupportsEagle3, SupportsLoRA, SupportsPP, + SupportsQuant, ) from .utils import ( AutoWeightsLoader, @@ -318,7 +319,7 @@ def forward( "inputs_embeds": {0: "b"}, } ) -class Qwen2Model(nn.Module, EagleModelMixin): +class Qwen2Model(nn.Module, EagleModelMixin, SupportsQuant): hf_to_vllm_mapper = WeightsMapper( orig_to_new_substr={ ".q_proj": ".qkv_proj.q", diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index b89f5b6db684..7161640f4e5f 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -120,6 +120,34 @@ def apply_dict(self, values: dict[str, Any]) -> dict[str, Any]: if (out_name := self._map_name(name)) is not None } + def get_packed_modules_mapping(self) -> dict[str, list[str]]: + """Derive a `packed_modules_mapping` from this mapper's fusion entries.""" + qkv_order = {"q": 0, "k": 1, "v": 2} + packed: dict[str, list[tuple[int, str]]] = {} + mappings = ( + self.orig_to_new_substr, + self.orig_to_new_prefix, + self.orig_to_new_suffix, + ) + for mapping in mappings: + for new in mapping.values(): + if new is None or "." not in new: + continue + param_path, _, shard_id = new.rpartition(".") + if shard_id.isdigit(): + order = int(shard_id) + elif shard_id in qkv_order: + order = qkv_order[shard_id] + else: + continue + param_name = param_path.lstrip(".").rpartition(".")[2] + shards = packed.setdefault(param_name, []) + shards.append((order, f"{param_name}.{shard_id}")) + return { + name: [shard for _, shard in sorted(shards)] + for name, shards in packed.items() + } + class AutoWeightsLoader: """ From 4069aaef2fcc0ca2b7c9ba9b70a65ba370c7faf2 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 11 Jun 2026 16:36:19 +0000 Subject: [PATCH 16/34] fix bnb Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../model_loader/bitsandbytes_loader.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index d10f3bfcbe9b..86be8ad1d179 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -563,7 +563,6 @@ def _initialize_loader_state( configuration. """ self.is_pool_model = is_pooling_model(model) - self.modules_mapping = ParamMapping(get_packed_modules_mapping(model)) if is_moe_model(model): self.expert_params_mapping = get_moe_expert_mapping(model) @@ -573,10 +572,18 @@ def _initialize_loader_state( "BitsAndBytes quantization yet. Ensure this model has " "'get_expert_mapping' method." ) - # For some models like Molmo, we need to use hf_to_vllm_mapper - # to ensure correct loading of weights. - if hf_to_vllm_mapper := getattr(model, "hf_to_vllm_mapper", None): - self.weight_mapper = lambda name: hf_to_vllm_mapper._map_name(name) + packed_modules_mapping = get_packed_modules_mapping(model) + # `hf_to_vllm_mapper` may belong to model or base model + for module in (model, *model.children()): + if hf_to_vllm_mapper := getattr(module, "hf_to_vllm_mapper", None): + self.weight_mapper = lambda name: hf_to_vllm_mapper._map_name(name) + # If model had no `packed_modules_mapping`, try to get it from mapper + if not packed_modules_mapping: + packed_modules_mapping = ( + hf_to_vllm_mapper.get_packed_modules_mapping() + ) + break + self.modules_mapping = ParamMapping(packed_modules_mapping) self._get_bnb_target_modules(model) self._classify_module_sharding(model) From 872ff37de619f5c91ad442af908d15c9ce2793d1 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 13 Jun 2026 15:01:30 +0000 Subject: [PATCH 17/34] Make `vllm.model_executor.utils.get_packed_modules_mapping` check `hf_to_vllm_mapper` too Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/utils.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index a0269be855a9..d41cbf0f75ce 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -98,9 +98,26 @@ def replace_parameter( setattr(layer, param_name, new_param) +def _get_packed_modules_mapping(module: torch.nn.Module) -> dict[str, list[str]]: + """Get the packed modules mapping from a module. + + It could come from one of two places: + + 1. The module has a `packed_modules_mapping` attribute. + 2. The module has a `hf_to_vllm_mapper` attribute, which can generate the mapping. + + No module should have both attributes, and if it does, + the `packed_modules_mapping` attribute takes precedence.""" + if packed_modules_mapping := getattr(module, "packed_modules_mapping", None): + return copy.deepcopy(packed_modules_mapping) + elif hf_to_vllm_mapper := getattr(module, "hf_to_vllm_mapper", None): + return hf_to_vllm_mapper.get_packed_modules_mapping() + else: + return {} + + def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]: - parent_map = getattr(model, "packed_modules_mapping", None) - parent_map = copy.deepcopy(parent_map) if parent_map is not None else {} + parent_map = _get_packed_modules_mapping(model) # don't infer mapping if the model has defined it explicitly. if parent_map: @@ -108,8 +125,7 @@ def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]: # We only check main components instead of whole model submodules for child in model.children(): - child_map = getattr(child, "packed_modules_mapping", None) - child_map = copy.deepcopy(child_map) if child_map is not None else {} + child_map = _get_packed_modules_mapping(child) if any((k in parent_map and parent_map[k] != v) for k, v in child_map.items()): raise ValueError( From 3b73687bced12246532f3e6383c6b24ce4418c94 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 13 Jun 2026 15:02:29 +0000 Subject: [PATCH 18/34] Fix `WeightsMapper.get_packed_modules_mapping` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/utils.py | 47 +++++++++++------------------ 1 file changed, 17 insertions(+), 30 deletions(-) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index e5366256f296..a5b8410c00c3 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -129,32 +129,20 @@ def apply_dict(self, values: dict[str, Any]) -> dict[str, Any]: } def get_packed_modules_mapping(self) -> dict[str, list[str]]: - """Derive a `packed_modules_mapping` from this mapper's fusion entries.""" - qkv_order = {"q": 0, "k": 1, "v": 2} - packed: dict[str, list[tuple[int, str]]] = {} - mappings = ( - self.orig_to_new_substr, - self.orig_to_new_prefix, - self.orig_to_new_suffix, - ) - for mapping in mappings: - for new in mapping.values(): - if new is None or "." not in new: - continue - param_path, _, shard_id = new.rpartition(".") - if shard_id.isdigit(): - order = int(shard_id) - elif shard_id in qkv_order: - order = qkv_order[shard_id] - else: - continue - param_name = param_path.lstrip(".").rpartition(".")[2] - shards = packed.setdefault(param_name, []) - shards.append((order, f"{param_name}.{shard_id}")) - return { - name: [shard for _, shard in sorted(shards)] - for name, shards in packed.items() - } + """Derive a `packed_modules_mapping` from `self.orig_to_new_substr`.""" + qkv_shards = {"q", "k", "v"} + packed_modules_mapping: dict[str, list[str]] = {} + for old, new in self.orig_to_new_substr.items(): + if new is None or "." not in new: + continue + param_path, _, shard_id = new.rpartition(".") + # Is shard_id actually a shard ID? + if not (shard_id.isdigit() or shard_id in qkv_shards): + continue + _, _, weight_name = old.rpartition(".") + _, _, param_name = param_path.rpartition(".") + packed_modules_mapping.setdefault(param_name, []).append(weight_name) + return packed_modules_mapping class AutoWeightsLoader: @@ -396,12 +384,11 @@ def load_weights( # Skip loading extra bias for GPTQ models if "gptq" in quant_config.get_name(): self.ignore_unexpected_suffixes.append(".bias") - # Get mappings for KV cache quantization scales + # Get mappings and ignore prefixes for KV cache quantization scales mapper = mapper or WeightsMapper() mapper |= quant_config.get_cache_scale_mapper() - self.ignore_unexpected_suffixes.extend( - quant_config._ignore_unexpected_suffixes - ) + ignore_unexpected_prefixes = quant_config._ignore_unexpected_prefixes + self.ignore_unexpected_suffixes.extend(ignore_unexpected_prefixes) if mapper is not None: weights = mapper.apply(weights) # filter out weights with first-prefix/substr to skip in name From c657d7d2855dd501594a8c6cf888183815ced54b Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 13 Jun 2026 15:03:18 +0000 Subject: [PATCH 19/34] Better `SupportsQuant._maybe_apply_model_mapping` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/interfaces.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 093514cc0da3..89815722a1d7 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -1032,11 +1032,11 @@ def _maybe_apply_model_mapping(self): return if (hf_to_vllm_mapper := self.hf_to_vllm_mapper) is not None: self.quant_config.apply_vllm_mapper(hf_to_vllm_mapper) + if packed_modules_mapping := hf_to_vllm_mapper.get_packed_modules_mapping(): + self.packed_modules_mapping = self.packed_modules_mapping or {} + self.packed_modules_mapping.update(packed_modules_mapping) if self.packed_modules_mapping: self.quant_config.packed_modules_mapping.update(self.packed_modules_mapping) - elif hf_to_vllm_mapper is not None: - packed_modules_mapping = hf_to_vllm_mapper.get_packed_modules_mapping() - self.quant_config.packed_modules_mapping.update(packed_modules_mapping) @runtime_checkable From f2d548b1772934db9e91eb35f9d605e616f2a8ff Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 13 Jun 2026 15:03:37 +0000 Subject: [PATCH 20/34] `BitsAndBytesModelLoader` can be simpler now Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/model_loader/bitsandbytes_loader.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index 0162ad300a37..87a310d1fb23 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -563,6 +563,7 @@ def _initialize_loader_state( configuration. """ self.is_pool_model = is_pooling_model(model) + self.modules_mapping = ParamMapping(get_packed_modules_mapping(model)) if is_moe_model(model): self.expert_params_mapping = get_moe_expert_mapping(model) @@ -572,18 +573,11 @@ def _initialize_loader_state( "BitsAndBytes quantization yet. Ensure this model has " "'get_expert_mapping' method." ) - packed_modules_mapping = get_packed_modules_mapping(model) # `hf_to_vllm_mapper` may belong to model or base model for module in (model, *model.children()): if hf_to_vllm_mapper := getattr(module, "hf_to_vllm_mapper", None): self.weight_mapper = lambda name: hf_to_vllm_mapper._map_name(name) - # If model had no `packed_modules_mapping`, try to get it from mapper - if not packed_modules_mapping: - packed_modules_mapping = ( - hf_to_vllm_mapper.get_packed_modules_mapping() - ) break - self.modules_mapping = ParamMapping(packed_modules_mapping) self._get_bnb_target_modules(model) self._classify_module_sharding(model) From 41e3a9e9c9e63b7d9bc1aaca5b38c0d02fa3809a Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 13 Jun 2026 15:04:42 +0000 Subject: [PATCH 21/34] Use `get_packed_modules_mapping` for `get_supported_lora_modules` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/lora/utils.py | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 11ff37465339..fd624593b998 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -34,11 +34,7 @@ VocabParallelEmbeddingWithLoRA, ) from vllm.model_executor.layers.fused_moe import MoERunner -from vllm.model_executor.layers.linear import ( - LinearBase, - MergedColumnParallelLinear, - QKVParallelLinear, -) +from vllm.model_executor.layers.linear import LinearBase from vllm.model_executor.utils import get_moe_expert_mapping, get_packed_modules_mapping from vllm.transformers_utils.repo_utils import hf_api @@ -214,7 +210,8 @@ def get_supported_lora_modules(model: nn.Module) -> list[str]: In vLLM, all linear layers support LoRA. """ - supported_lora_modules: set[str] = set() + packed_modules_mapping = get_packed_modules_mapping(model) + supported_lora_modules: set[str] = set(sum(packed_modules_mapping.values(), [])) for name, module in model.named_modules(): # get the embedding modules if the module's embedding_modules # is not empty. @@ -224,18 +221,10 @@ def get_supported_lora_modules(model: nn.Module) -> list[str]: supported_lora_modules.add(name) if ( - isinstance(module, QKVParallelLinear) - and module.checkpoint_format == "sharded" - ): - supported_lora_modules.update(["q", "k", "v"]) - elif ( - isinstance(module, MergedColumnParallelLinear) - and module.checkpoint_format == "sharded" + isinstance(module, (LinearBase, MoERunner)) + and (supported_name := name.split(".")[-1]) not in packed_modules_mapping ): - shard_ids = [str(i) for i in range(len(module.output_sizes))] - supported_lora_modules.update(shard_ids) - elif isinstance(module, (LinearBase, MoERunner)): - supported_lora_modules.add(name.split(".")[-1]) + supported_lora_modules.add(supported_name) return list(supported_lora_modules) From 68085a0ca4395cb96ecaf546bddc52e052fc4fc5 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 13 Jun 2026 15:12:50 +0000 Subject: [PATCH 22/34] Fix test Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/model_executor/test_weight_utils.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/tests/model_executor/test_weight_utils.py b/tests/model_executor/test_weight_utils.py index f2ac7a8ba26c..202a41d36e3b 100644 --- a/tests/model_executor/test_weight_utils.py +++ b/tests/model_executor/test_weight_utils.py @@ -298,20 +298,8 @@ def test_weights_mapper_get_packed_modules_mapping(): } ) assert mapper.get_packed_modules_mapping() == { - "qkv_proj": ["qkv_proj.q", "qkv_proj.k", "qkv_proj.v"], - "gate_up_proj": ["gate_up_proj.0", "gate_up_proj.1"], - } - - # Shard order comes from the shard id, not declaration order, and - # dotted module paths reduce to the last component - mapper = WeightsMapper( - orig_to_new_substr={ - "linear_proj.dense_h_to_4h": "linear_proj.merged_proj.1", - "linear_proj.gate_proj": "linear_proj.merged_proj.0", - } - ) - assert mapper.get_packed_modules_mapping() == { - "merged_proj": ["merged_proj.0", "merged_proj.1"], + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], } From 2f56a4264fb063e1a94818687952cd5517ed5d27 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 13 Jun 2026 15:25:41 +0000 Subject: [PATCH 23/34] tweaks Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/interfaces.py | 2 +- vllm/model_executor/models/jina.py | 6 ++---- vllm/model_executor/models/qwen3.py | 7 +------ 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 89815722a1d7..3040f3283f08 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -1035,7 +1035,7 @@ def _maybe_apply_model_mapping(self): if packed_modules_mapping := hf_to_vllm_mapper.get_packed_modules_mapping(): self.packed_modules_mapping = self.packed_modules_mapping or {} self.packed_modules_mapping.update(packed_modules_mapping) - if self.packed_modules_mapping: + if self.packed_modules_mapping is not None: self.quant_config.packed_modules_mapping.update(self.packed_modules_mapping) diff --git a/vllm/model_executor/models/jina.py b/vllm/model_executor/models/jina.py index f1f585cdae8d..82a534404027 100644 --- a/vllm/model_executor/models/jina.py +++ b/vllm/model_executor/models/jina.py @@ -255,7 +255,5 @@ def _merge_weights( yield name, tensor loader = AutoWeightsLoader(self.model, ignore_unexpected_prefixes=["lm_head."]) - loaded = loader.load_weights( - _merge_weights(weights), mapper=self.model.hf_to_vllm_mapper - ) - return {f"model.{name}" for name in loaded} + weights = _merge_weights(weights) + return loader.load_weights(weights, mapper=self.model.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 0bebd7d367e9..06f721209fd0 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -57,12 +57,7 @@ ) from .qwen2 import Qwen2MLP as Qwen3MLP from .qwen2 import Qwen2Model -from .utils import ( - AutoWeightsLoader, - PPMissingLayer, - extract_layer_index, - maybe_prefix, -) +from .utils import AutoWeightsLoader, PPMissingLayer, extract_layer_index, maybe_prefix logger = init_logger(__name__) From d0518136e7d5d2a5299b21819abd70e558b2893c Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 13 Jun 2026 15:54:32 +0000 Subject: [PATCH 24/34] typo Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index a5b8410c00c3..0d6ece74c435 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -387,8 +387,8 @@ def load_weights( # Get mappings and ignore prefixes for KV cache quantization scales mapper = mapper or WeightsMapper() mapper |= quant_config.get_cache_scale_mapper() - ignore_unexpected_prefixes = quant_config._ignore_unexpected_prefixes - self.ignore_unexpected_suffixes.extend(ignore_unexpected_prefixes) + ignore_unexpected_suffixes = quant_config._ignore_unexpected_suffixes + self.ignore_unexpected_suffixes.extend(ignore_unexpected_suffixes) if mapper is not None: weights = mapper.apply(weights) # filter out weights with first-prefix/substr to skip in name From 5c2a354a2b1ff4b81cd1c0dfde3d4860b059ea8c Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 13 Jun 2026 17:22:52 +0000 Subject: [PATCH 25/34] Mapper must present both shard id and weight name as supported packings Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 0d6ece74c435..7c10c2e939ff 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -141,7 +141,8 @@ def get_packed_modules_mapping(self) -> dict[str, list[str]]: continue _, _, weight_name = old.rpartition(".") _, _, param_name = param_path.rpartition(".") - packed_modules_mapping.setdefault(param_name, []).append(weight_name) + packed_names = packed_modules_mapping.setdefault(param_name, []) + packed_names.extend([weight_name, shard_id]) return packed_modules_mapping From b5bdb582b186999c6e864e958c1afe023f301eb9 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 13 Jun 2026 19:08:54 +0000 Subject: [PATCH 26/34] Fix test Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/lora/test_lora_huggingface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py index 7c7f4eb4b626..49601a7e0786 100644 --- a/tests/lora/test_lora_huggingface.py +++ b/tests/lora/test_lora_huggingface.py @@ -6,7 +6,7 @@ from vllm.lora.lora_model import LoRAModel from vllm.lora.peft_helper import PEFTHelper from vllm.lora.utils import get_adapter_absolute_path -from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM +from vllm.model_executor.models.llama import LlamaModel # Provide absolute path and huggingface lora ids lora_fixture_name = ["llama32_lora_files", "llama32_lora_huggingface_id"] @@ -23,7 +23,7 @@ @pytest.mark.parametrize("lora_fixture_name", lora_fixture_name) def test_load_checkpoints_from_huggingface(lora_fixture_name, request): lora_name = request.getfixturevalue(lora_fixture_name) - packed_modules_mapping = Qwen3ForCausalLM.packed_modules_mapping + packed_modules_mapping = LlamaModel.hf_to_vllm_mapper.get_packed_modules_mapping() expected_lora_lst: list[str] = [] for module in LLAMA_LORA_MODULES: From dbf02b1cbdebf8c7076eabb030e731b2bf501274 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 13 Jun 2026 19:10:47 +0000 Subject: [PATCH 27/34] `AutoWeightsLoader` inject packed mappings from mapper at load time Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/utils.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 7c10c2e939ff..1a22ec92a3b9 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -5,7 +5,7 @@ from collections.abc import Callable, Iterable, Mapping from contextlib import contextmanager from dataclasses import dataclass, field -from typing import Any, Literal, Protocol, overload +from typing import TYPE_CHECKING, Any, Literal, Protocol, overload import regex as re import torch @@ -19,7 +19,6 @@ get_tensor_model_parallel_world_size, ) from vllm.logger import init_logger -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.model_loader.reload import ( support_quantized_model_reload_from_hp_weights, ) @@ -33,6 +32,9 @@ direct_register_custom_op, ) +if TYPE_CHECKING: + from vllm.model_executor.layers.quantization import QuantizationConfig + logger = init_logger(__name__) @@ -390,6 +392,9 @@ def load_weights( mapper |= quant_config.get_cache_scale_mapper() ignore_unexpected_suffixes = quant_config._ignore_unexpected_suffixes self.ignore_unexpected_suffixes.extend(ignore_unexpected_suffixes) + # If mapper contains packed_modules_mapping, update them in quant_config + if packed_modules_mapping := mapper.get_packed_modules_mapping(): + quant_config.packed_modules_mapping.update(packed_modules_mapping) if mapper is not None: weights = mapper.apply(weights) # filter out weights with first-prefix/substr to skip in name @@ -758,9 +763,7 @@ def maybe_prefix(prefix: str, name: str) -> str: return name if not prefix else f"{prefix}.{name}" -def get_draft_quant_config( - vllm_config: VllmConfig, -) -> QuantizationConfig | None: +def get_draft_quant_config(vllm_config: VllmConfig) -> "QuantizationConfig | None": """Get quantization config for Draft models. Draft models should use their own quantization config instead of the verifier/target From a97a1a15395dcce03a886c707ad7320128765b5f Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 13 Jun 2026 22:55:05 +0000 Subject: [PATCH 28/34] revert lora test changes Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/lora/conftest.py | 21 -------------- tests/lora/test_lora_checkpoints.py | 44 +++++++++++++++++++---------- tests/lora/test_lora_huggingface.py | 4 +-- 3 files changed, 31 insertions(+), 38 deletions(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 7d9e8444827f..dea54ed21aea 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -19,7 +19,6 @@ from vllm.model_executor.layers.linear import ( ColumnParallelLinear, MergedColumnParallelLinear, - QKVParallelLinear, RowParallelLinear, ) from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -167,26 +166,6 @@ def dummy_model_gate_up(default_vllm_config) -> nn.Module: return model -@pytest.fixture -def baichuan_dummy_model(default_vllm_config, dist_init) -> nn.Module: - # Only includes BaiChuan's lora modules so get_supported_lora_modules will work - model = DummyLoRAModel( - OrderedDict( - [ - ("W_pack", QKVParallelLinear(64, 8, 8)), - ("o_proj", RowParallelLinear(64, 64)), - ("gate_up_proj", MergedColumnParallelLinear(64, [16, 16])), - ("down_proj", RowParallelLinear(16, 64)), - ] - ) - ) - model.config = MagicMock() - # Match the expected format for BaiChuan checkpoints - model.W_pack.checkpoint_format = "fused" - model.gate_up_proj.checkpoint_format = "sharded" - return model - - @pytest.fixture(scope="session") def mixtral_lora_files(): # Note: this module has incorrect adapter_config.json to test diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index 0a54a80242be..7c263e2a2276 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -5,26 +5,37 @@ from vllm.lora.lora_model import LoRAModel from vllm.lora.peft_helper import PEFTHelper -from vllm.lora.utils import get_supported_lora_modules, parse_fine_tuned_lora_name +from vllm.lora.utils import parse_fine_tuned_lora_name from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM from vllm.model_executor.models.gemma4 import Gemma4ForCausalLM from vllm.model_executor.models.utils import WeightsMapper +lora_lst = ["baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"] +BAICHUAN_LORA_MODULES = [ + "W_pack", + "o_proj", + "gate_up_proj", + "down_proj", +] -@pytest.mark.parametrize( - "lora_name", - ["baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"], -) + +@pytest.mark.parametrize("lora_name", lora_lst) def test_load_checkpoints( lora_name, baichuan_lora_files, baichuan_zero_lora_files, baichuan_regex_lora_files, chatglm3_lora_files, - baichuan_dummy_model, ): - expected_lora_modules = set(get_supported_lora_modules(baichuan_dummy_model)) - weights_mapper = BaiChuanBaseForCausalLM.hf_to_vllm_mapper + packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping + + expected_lora_lst: list[str] = [] + for module in BAICHUAN_LORA_MODULES: + if module in packed_modules_mapping: + expected_lora_lst.extend(packed_modules_mapping[module]) + else: + expected_lora_lst.append(module) + expected_lora_modules = set(expected_lora_lst) if lora_name == "baichuan7B": peft_helper = PEFTHelper.from_local_dir( baichuan_lora_files, max_position_embeddings=4096 @@ -38,7 +49,6 @@ def test_load_checkpoints( lora_model_id=1, device="cpu", model_vocab_size=64000, - weights_mapper=weights_mapper, ) elif lora_name == "baichuan7B-zero": # Test that the target_modules contain prefix @@ -54,7 +64,6 @@ def test_load_checkpoints( lora_model_id=1, device="cpu", model_vocab_size=64000, - weights_mapper=weights_mapper, ) elif lora_name == "baichuan7B-zero-regex": # Test that the `target_modules` in the form of regular expressions, @@ -69,7 +78,6 @@ def test_load_checkpoints( lora_model_id=1, device="cpu", model_vocab_size=64000, - weights_mapper=weights_mapper, ) else: # For the baichuan7B model, load chatglm3-6b's LoRA, @@ -89,16 +97,22 @@ def test_load_checkpoints( ) -def test_lora_weights_mapping(baichuan_lora_files, baichuan_dummy_model): - expected_lora_modules = set(get_supported_lora_modules(baichuan_dummy_model)) +def test_lora_weights_mapping(baichuan_lora_files): + packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping + + expected_lora_lst: list[str] = [] + for module in BAICHUAN_LORA_MODULES: + if module in packed_modules_mapping: + expected_lora_lst.extend(packed_modules_mapping[module]) + else: + expected_lora_lst.append(module) + expected_lora_modules = set(expected_lora_lst) hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ "model.": "language_model.model.", }, orig_to_new_substr={ ".layers.": ".baichuan_layers.", - ".gate_proj": ".gate_up_proj.0", - ".up_proj": ".gate_up_proj.1", }, ) peft_helper = PEFTHelper.from_local_dir( diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py index 49601a7e0786..53253278ad80 100644 --- a/tests/lora/test_lora_huggingface.py +++ b/tests/lora/test_lora_huggingface.py @@ -6,7 +6,7 @@ from vllm.lora.lora_model import LoRAModel from vllm.lora.peft_helper import PEFTHelper from vllm.lora.utils import get_adapter_absolute_path -from vllm.model_executor.models.llama import LlamaModel +from vllm.model_executor.models.llama import LlamaForCausalLM # Provide absolute path and huggingface lora ids lora_fixture_name = ["llama32_lora_files", "llama32_lora_huggingface_id"] @@ -23,7 +23,7 @@ @pytest.mark.parametrize("lora_fixture_name", lora_fixture_name) def test_load_checkpoints_from_huggingface(lora_fixture_name, request): lora_name = request.getfixturevalue(lora_fixture_name) - packed_modules_mapping = LlamaModel.hf_to_vllm_mapper.get_packed_modules_mapping() + packed_modules_mapping = LlamaForCausalLM.packed_modules_mapping expected_lora_lst: list[str] = [] for module in LLAMA_LORA_MODULES: From a33e90ebe146d2f99bc18254fc5b2cb128ae4049 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 13 Jun 2026 23:00:15 +0000 Subject: [PATCH 29/34] Revert quant/lora hacks; `get_packed_modules_mapping` -> `get_unfused_mapper` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/model_executor/test_weight_utils.py | 22 -------------- vllm/lora/utils.py | 14 ++++----- vllm/lora/worker_manager.py | 6 +++- vllm/model_executor/model_loader/utils.py | 2 +- vllm/model_executor/models/interfaces.py | 5 +--- vllm/model_executor/models/utils.py | 35 +++++++++++------------ vllm/model_executor/utils.py | 24 +++------------- 7 files changed, 35 insertions(+), 73 deletions(-) diff --git a/tests/model_executor/test_weight_utils.py b/tests/model_executor/test_weight_utils.py index 202a41d36e3b..9e67609b78e4 100644 --- a/tests/model_executor/test_weight_utils.py +++ b/tests/model_executor/test_weight_utils.py @@ -281,27 +281,5 @@ def test_composes_with_qkv_mapper(self): ) -def test_weights_mapper_get_packed_modules_mapping(): - from vllm.model_executor.models.utils import WeightsMapper - - mapper = WeightsMapper( - orig_to_new_substr={ - ".q_proj": ".qkv_proj.q", - ".k_proj": ".qkv_proj.k", - ".v_proj": ".qkv_proj.v", - ".gate_proj": ".gate_up_proj.0", - ".up_proj": ".gate_up_proj.1", - # Non-fusion entries must not contribute - ".word_embeddings": "", - "llm.model.": "model.decoder.", - "llm.lm_head": "lm_head", - } - ) - assert mapper.get_packed_modules_mapping() == { - "qkv_proj": ["q_proj", "k_proj", "v_proj"], - "gate_up_proj": ["gate_proj", "up_proj"], - } - - if __name__ == "__main__": test_download_weights_from_hf() diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index fd624593b998..828aea712d01 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -210,8 +210,7 @@ def get_supported_lora_modules(model: nn.Module) -> list[str]: In vLLM, all linear layers support LoRA. """ - packed_modules_mapping = get_packed_modules_mapping(model) - supported_lora_modules: set[str] = set(sum(packed_modules_mapping.values(), [])) + supported_lora_modules: set[str] = set() for name, module in model.named_modules(): # get the embedding modules if the module's embedding_modules # is not empty. @@ -220,11 +219,12 @@ def get_supported_lora_modules(model: nn.Module) -> list[str]: for name in embedding_modules: supported_lora_modules.add(name) - if ( - isinstance(module, (LinearBase, MoERunner)) - and (supported_name := name.split(".")[-1]) not in packed_modules_mapping - ): - supported_lora_modules.add(supported_name) + # get all the linear subfixes. + if isinstance(module, (LinearBase,)): + supported_lora_modules.add(name.split(".")[-1]) + + if isinstance(module, (MoERunner,)): + supported_lora_modules.add(name.split(".")[-1]) return list(supported_lora_modules) diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 166d5c36ba57..785df09fe400 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -122,9 +122,13 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel: peft_helper.validate_legal(self.lora_config) # For some models like Qwen2VL, we need to use hf_to_vllm_mapper - # to ensure correct loading of lora weights. + # to ensure correct loading of lora weights. Drop the QKV/MLP fusion + # substr maps so constituent names (e.g. `q_proj`) survive for the + # LoRA manager to pack, while keeping genuine renames/prefixes. model = self._adapter_manager.model hf_to_vllm_mapper = getattr(model, "hf_to_vllm_mapper", None) + if hf_to_vllm_mapper is not None: + hf_to_vllm_mapper = hf_to_vllm_mapper.get_unfused_mapper() # Get model-defined prefixes to skip during LoRA loading. lora_skip_prefixes = getattr(model, "lora_skip_prefixes", None) diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index fc279c7e9c78..b4b4db11ed3a 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -290,6 +290,6 @@ def configure_quant_config( # pass mappings by reference to quant_config if hf_to_vllm_mapper is not None: - quant_config.apply_vllm_mapper(hf_to_vllm_mapper) + quant_config.apply_vllm_mapper(hf_to_vllm_mapper.get_unfused_mapper()) if packed_mapping is not None: quant_config.packed_modules_mapping = packed_mapping diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 3040f3283f08..26356fce91cd 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -1031,10 +1031,7 @@ def _maybe_apply_model_mapping(self): if self.quant_config is None: return if (hf_to_vllm_mapper := self.hf_to_vllm_mapper) is not None: - self.quant_config.apply_vllm_mapper(hf_to_vllm_mapper) - if packed_modules_mapping := hf_to_vllm_mapper.get_packed_modules_mapping(): - self.packed_modules_mapping = self.packed_modules_mapping or {} - self.packed_modules_mapping.update(packed_modules_mapping) + self.quant_config.apply_vllm_mapper(hf_to_vllm_mapper.get_unfused_mapper()) if self.packed_modules_mapping is not None: self.quant_config.packed_modules_mapping.update(self.packed_modules_mapping) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 1a22ec92a3b9..55f40fcdc435 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -4,7 +4,7 @@ import itertools from collections.abc import Callable, Iterable, Mapping from contextlib import contextmanager -from dataclasses import dataclass, field +from dataclasses import dataclass, field, replace from typing import TYPE_CHECKING, Any, Literal, Protocol, overload import regex as re @@ -130,22 +130,24 @@ def apply_dict(self, values: dict[str, Any]) -> dict[str, Any]: if (out_name := self._map_name(name)) is not None } - def get_packed_modules_mapping(self) -> dict[str, list[str]]: - """Derive a `packed_modules_mapping` from `self.orig_to_new_substr`.""" + def get_unfused_mapper(self) -> "WeightsMapper": + """Mapper variant that drops the QKV/MLP fusion substr maps, keeping + all genuine renames/prefixes. + + Consumers that reference the checkpoint's *unfused* module names — LoRA + name parsing and the quantization config's layer lists + (`modules_in_block_to_quantize`, ignored layers) — need the constituent + names (e.g. `q_proj`) to survive rather than being rewritten to the + fused vLLM name (`qkv_proj.q`).""" qkv_shards = {"q", "k", "v"} - packed_modules_mapping: dict[str, list[str]] = {} + substr = {} for old, new in self.orig_to_new_substr.items(): - if new is None or "." not in new: - continue - param_path, _, shard_id = new.rpartition(".") - # Is shard_id actually a shard ID? - if not (shard_id.isdigit() or shard_id in qkv_shards): - continue - _, _, weight_name = old.rpartition(".") - _, _, param_name = param_path.rpartition(".") - packed_names = packed_modules_mapping.setdefault(param_name, []) - packed_names.extend([weight_name, shard_id]) - return packed_modules_mapping + if new is not None and "." in new: + shard_id = new.rpartition(".")[2] + if shard_id.isdigit() or shard_id in qkv_shards: + continue + substr[old] = new + return replace(self, orig_to_new_substr=substr) class AutoWeightsLoader: @@ -392,9 +394,6 @@ def load_weights( mapper |= quant_config.get_cache_scale_mapper() ignore_unexpected_suffixes = quant_config._ignore_unexpected_suffixes self.ignore_unexpected_suffixes.extend(ignore_unexpected_suffixes) - # If mapper contains packed_modules_mapping, update them in quant_config - if packed_modules_mapping := mapper.get_packed_modules_mapping(): - quant_config.packed_modules_mapping.update(packed_modules_mapping) if mapper is not None: weights = mapper.apply(weights) # filter out weights with first-prefix/substr to skip in name diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index d41cbf0f75ce..a0269be855a9 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -98,26 +98,9 @@ def replace_parameter( setattr(layer, param_name, new_param) -def _get_packed_modules_mapping(module: torch.nn.Module) -> dict[str, list[str]]: - """Get the packed modules mapping from a module. - - It could come from one of two places: - - 1. The module has a `packed_modules_mapping` attribute. - 2. The module has a `hf_to_vllm_mapper` attribute, which can generate the mapping. - - No module should have both attributes, and if it does, - the `packed_modules_mapping` attribute takes precedence.""" - if packed_modules_mapping := getattr(module, "packed_modules_mapping", None): - return copy.deepcopy(packed_modules_mapping) - elif hf_to_vllm_mapper := getattr(module, "hf_to_vllm_mapper", None): - return hf_to_vllm_mapper.get_packed_modules_mapping() - else: - return {} - - def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]: - parent_map = _get_packed_modules_mapping(model) + parent_map = getattr(model, "packed_modules_mapping", None) + parent_map = copy.deepcopy(parent_map) if parent_map is not None else {} # don't infer mapping if the model has defined it explicitly. if parent_map: @@ -125,7 +108,8 @@ def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]: # We only check main components instead of whole model submodules for child in model.children(): - child_map = _get_packed_modules_mapping(child) + child_map = getattr(child, "packed_modules_mapping", None) + child_map = copy.deepcopy(child_map) if child_map is not None else {} if any((k in parent_map and parent_map[k] != v) for k, v in child_map.items()): raise ValueError( From 4e5c1e3b7280f502639a21c1c969361a8fa76b33 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 13 Jun 2026 23:04:32 +0000 Subject: [PATCH 30/34] Add `packed_modules_mapping` attrs back to models Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/arcee.py | 5 +++++ vllm/model_executor/models/baichuan.py | 4 ++++ vllm/model_executor/models/commandr.py | 4 ++++ vllm/model_executor/models/exaone.py | 4 ++++ vllm/model_executor/models/exaone4.py | 4 ++++ vllm/model_executor/models/gemma.py | 4 ++++ vllm/model_executor/models/gemma2.py | 4 ++++ vllm/model_executor/models/glm4.py | 5 +++++ vllm/model_executor/models/granite.py | 4 ++++ vllm/model_executor/models/hyperclovax.py | 4 ++++ vllm/model_executor/models/internlm2.py | 5 +++++ vllm/model_executor/models/jais2.py | 3 +++ vllm/model_executor/models/llama.py | 4 ++++ vllm/model_executor/models/nemotron.py | 3 +++ vllm/model_executor/models/nemotron_nas.py | 4 ++++ vllm/model_executor/models/olmo.py | 4 ++++ vllm/model_executor/models/olmo2.py | 4 ++++ vllm/model_executor/models/opt.py | 3 +++ vllm/model_executor/models/ouro.py | 4 ++++ vllm/model_executor/models/phi.py | 3 +++ vllm/model_executor/models/qwen2.py | 5 +++++ vllm/model_executor/models/qwen2_rm.py | 5 +++++ vllm/model_executor/models/qwen3.py | 4 ++++ vllm/model_executor/models/rnj1.py | 4 ++++ vllm/model_executor/models/seed_oss.py | 4 ++++ vllm/model_executor/models/solar.py | 5 ++++- vllm/model_executor/models/step1.py | 4 ++++ 27 files changed, 109 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py index 844f7ff44342..c004fe793d0e 100644 --- a/vllm/model_executor/models/arcee.py +++ b/vllm/model_executor/models/arcee.py @@ -286,6 +286,11 @@ class ArceeForCausalLM( ".v_proj": ".qkv_proj.v", } ) + # Map fused module names to their submodule components + # (for quantization and LoRA) + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + } def __init__(self, *, vllm_config, prefix: str = "") -> None: super().__init__() diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 85cb254670e3..d29b72733549 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -350,6 +350,10 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant ".up_proj": ".gate_up_proj.1", } ) + packed_modules_mapping = { + "W_pack": ["W_pack"], + "gate_up_proj": ["gate_proj", "up_proj"], + } def __init__( self, diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 6dae6b4bccda..96c3e4133e21 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -349,6 +349,10 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant): ".up_proj": ".gate_up_proj.1", } ) + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + } # LoRA specific attributes embedding_modules = {"embed_tokens": "input_embeddings"} diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index d0523459482d..6ef94b099a29 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -377,6 +377,10 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ".c_fc_1": ".gate_up_proj.1", } ) + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["c_fc_0", "c_fc_1"], + } # LoRA specific attributes embedding_modules = { diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index ce14149acf35..7927eea6ac84 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -375,6 +375,10 @@ class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ".up_proj": ".gate_up_proj.1", } ) + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + } # LoRA specific attributes embedding_modules = { diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index f95cdc161482..8c9f85d84e36 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -334,6 +334,10 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant): ".up_proj": ".gate_up_proj.1", } ) + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index fd1d2027297d..334a5603c7fc 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -323,6 +323,10 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ".up_proj": ".gate_up_proj.1", } ) + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index e7414e799861..3a25f90ad2a0 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -234,6 +234,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + } + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index d8625afd39f4..e520b17c3b16 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -334,6 +334,10 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP): # LoRA specific attributes + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + } embedding_modules = { "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", diff --git a/vllm/model_executor/models/hyperclovax.py b/vllm/model_executor/models/hyperclovax.py index 3eff84a4fe1a..7a531ffce1e6 100644 --- a/vllm/model_executor/models/hyperclovax.py +++ b/vllm/model_executor/models/hyperclovax.py @@ -384,6 +384,10 @@ class HyperCLOVAXForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ".up_proj": ".gate_up_proj.1", } ) + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + } # LoRA specific attributes embedding_modules = { diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index eb726e48f956..743357e09d62 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -321,6 +321,11 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): + packed_modules_mapping = { + "wqkv": ["wqkv"], + "gate_up_proj": ["w1", "w3"], + } + def __init__( self, *, diff --git a/vllm/model_executor/models/jais2.py b/vllm/model_executor/models/jais2.py index d24337230942..23e0f640e39f 100644 --- a/vllm/model_executor/models/jais2.py +++ b/vllm/model_executor/models/jais2.py @@ -371,6 +371,9 @@ class Jais2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ".v_proj": ".qkv_proj.v", } ) + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + } embedding_modules = { "embed_tokens": "input_embeddings", diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index eaec6e42a3dc..a512751db41d 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -446,6 +446,10 @@ class LlamaForCausalLM( LocalArgmaxMixin, nn.Module, SupportsLoRA, SupportsPP, SupportsEagle, SupportsEagle3 ): # LoRA specific attributes + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + } embedding_modules = { "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index b62e9f991c4f..da33584bb104 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -369,6 +369,9 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ".v_proj": ".qkv_proj.v", } ) + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + } # LoRA specific attributes embedding_modules = { diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index b37ff8be0701..04044f6477ba 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -322,6 +322,10 @@ class DeciLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, HasNoOps): ".up_proj": ".gate_up_proj.1", } ) + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + } # LoRA specific attributes embedding_modules = { diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index b084ca727af9..e9eaad16399b 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -315,6 +315,10 @@ class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA): ".up_proj": ".gate_up_proj.1", } ) + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 893b41451bf5..e85541115e00 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -357,6 +357,10 @@ class Olmo2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): ".up_proj": ".gate_up_proj.1", } ) + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index ccb6798dec75..8669688aa74d 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -335,6 +335,9 @@ class OPTForCausalLM(nn.Module, SupportsPP, SupportsLoRA): "decoder.": "model.decoder.", }, ) + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py index ebbe2999829e..aacce6300399 100644 --- a/vllm/model_executor/models/ouro.py +++ b/vllm/model_executor/models/ouro.py @@ -384,6 +384,10 @@ class OuroForCausalLM(nn.Module, SupportsLoRA): ".up_proj": ".gate_up_proj.1", } ) + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index c417c658d2e2..e1c0ed0b1625 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -265,6 +265,9 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ".v_proj": ".qkv_proj.v", } ) + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index d5a9e1aab50a..7f76ffd1f6a1 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -440,6 +440,11 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: class Qwen2ForCausalLM( nn.Module, SupportsLoRA, SupportsPP, SupportsEagle, SupportsEagle3 ): + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + } + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config.get_text_config() diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 08c036e1a9b8..008039e296c2 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -25,6 +25,11 @@ class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP): + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + } + is_pooling_model = True pooler: Pooler diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 06f721209fd0..3c9517ec9b1c 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -267,6 +267,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): class Qwen3ForCausalLM( LocalArgmaxMixin, nn.Module, SupportsLoRA, SupportsPP, SupportsEagle, SupportsEagle3 ): + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + } embedding_modules = { "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", diff --git a/vllm/model_executor/models/rnj1.py b/vllm/model_executor/models/rnj1.py index 37f0f6e1684a..1bea77c87935 100644 --- a/vllm/model_executor/models/rnj1.py +++ b/vllm/model_executor/models/rnj1.py @@ -338,6 +338,10 @@ class Rnj1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ".up_proj": ".gate_up_proj.1", } ) + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py index af21f3ba17a9..68d29b6640f6 100644 --- a/vllm/model_executor/models/seed_oss.py +++ b/vllm/model_executor/models/seed_oss.py @@ -369,6 +369,10 @@ class SeedOssForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ".up_proj": ".gate_up_proj.1", } ) + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 427d76d91946..07e2aa83c404 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -354,7 +354,10 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ".up_proj": ".gate_up_proj.1", } ) - + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + } # LoRA specific attributes embedding_modules = { "embed_tokens": "input_embeddings", diff --git a/vllm/model_executor/models/step1.py b/vllm/model_executor/models/step1.py index 529b405533ae..eab36640deeb 100644 --- a/vllm/model_executor/models/step1.py +++ b/vllm/model_executor/models/step1.py @@ -321,6 +321,10 @@ class Step1ForCausalLM(nn.Module, SupportsPP, SupportsEagle, SupportsEagle3): ".up_proj": ".gate_up_proj.1", } ) + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() From 735d725de9633e17a3263835971975688ac4baff Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 13 Jun 2026 23:14:52 +0000 Subject: [PATCH 31/34] Revert now unused LoRA things Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/lora/layers/column_parallel_linear.py | 21 ++++++++------------- vllm/model_executor/layers/linear.py | 8 +------- 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py index 7ec31c31253e..8a86191b8918 100644 --- a/vllm/lora/layers/column_parallel_linear.py +++ b/vllm/lora/layers/column_parallel_linear.py @@ -167,10 +167,7 @@ def can_replace_layer( if type(source_layer) is maybe_get_oot_by_class(ColumnParallelLinear): return True if isinstance(source_layer, maybe_get_oot_by_class(MergedColumnParallelLinear)): - if ( - len(packed_modules_list) != 1 - or source_layer.checkpoint_format == "sharded" - ): + if len(packed_modules_list) != 1: return False # Exclude layers with 3+ output sizes - those are handled by # MergedColumnParallelLinearVariableSliceWithLoRA since this @@ -350,11 +347,7 @@ def can_replace_layer( decorate: bool = True, ) -> bool: merged_cls = maybe_get_oot_by_class(MergedColumnParallelLinear) - if ( - not isinstance(source_layer, merged_cls) - or len(source_layer.output_sizes) != 2 - or source_layer.checkpoint_format == "fused" - ): + if not isinstance(source_layer, merged_cls) or len(packed_modules_list) != 2: return False tp_size = getattr(source_layer, "tp_size", 1) @@ -429,8 +422,9 @@ def can_replace_layer( packed_modules_list: list, model_config: PretrainedConfig | None = None, ) -> bool: - return type(source_layer) is maybe_get_oot_by_class(QKVParallelLinear) and ( - len(packed_modules_list) == 1 or source_layer.checkpoint_format == "fused" + return ( + type(source_layer) is maybe_get_oot_by_class(QKVParallelLinear) + and len(packed_modules_list) == 1 ) @@ -489,8 +483,9 @@ def can_replace_layer( packed_modules_list: list, model_config: PretrainedConfig | None = None, ) -> bool: - return type(source_layer) is maybe_get_oot_by_class(QKVParallelLinear) and ( - len(packed_modules_list) == 3 or source_layer.checkpoint_format == "sharded" + return ( + type(source_layer) is maybe_get_oot_by_class(QKVParallelLinear) + and len(packed_modules_list) == 3 ) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 916c15ca3c1f..e78d8de2b2ce 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -4,7 +4,6 @@ import itertools from abc import abstractmethod from collections.abc import Iterable -from typing import Literal import torch from torch.nn.parameter import Parameter @@ -619,8 +618,8 @@ def __init__( self.output_sizes = output_sizes self.tp_size = get_tensor_model_parallel_world_size() if not disable_tp else 1 self.tp_rank = get_tensor_model_parallel_rank() if not disable_tp else 0 + assert all(output_size % self.tp_size == 0 for output_size in output_sizes) - self.checkpoint_format: Literal["fused", "sharded"] | None = None super().__init__( input_size=input_size, output_size=sum(output_sizes), @@ -920,7 +919,6 @@ def load_weights( # Checkpoint is sharded shard_id_str, _, name = name.partition(".") shard_id = int(shard_id_str) - self.checkpoint_format = "sharded" logger.debug( "Loaded shard %s into %s for layer %s.%s", shard_id, @@ -930,7 +928,6 @@ def load_weights( ) else: shard_id = None - self.checkpoint_format = "fused" logger.debug( "Loaded weight %s.%s with shape %s", self.prefix, @@ -1021,7 +1018,6 @@ def __init__( self.num_kv_heads * self.head_size * tp_size, # k_proj self.num_kv_heads * self.v_head_size * tp_size, # v_proj ] - self.checkpoint_format: Literal["fused", "sharded"] | None = None super().__init__( input_size=input_size, @@ -1349,7 +1345,6 @@ def load_weights( # Checkpoint is sharded shard_id, _, name = name.partition(".") self.validate_shard_id(shard_id) - self.checkpoint_format = "sharded" logger.debug( "Loaded shard %s into %s for layer %s.%s", shard_id, @@ -1360,7 +1355,6 @@ def load_weights( else: # Checkpoint is fused shard_id = None - self.checkpoint_format = "fused" logger.debug( "Loaded weight %s.%s with shape %s", self.prefix, From a7a65a9729dc56e888e723ddd6c8bd28203e8299 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 13 Jun 2026 23:15:53 +0000 Subject: [PATCH 32/34] Revert lora test Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/lora/test_lora_huggingface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py index 53253278ad80..7c7f4eb4b626 100644 --- a/tests/lora/test_lora_huggingface.py +++ b/tests/lora/test_lora_huggingface.py @@ -6,7 +6,7 @@ from vllm.lora.lora_model import LoRAModel from vllm.lora.peft_helper import PEFTHelper from vllm.lora.utils import get_adapter_absolute_path -from vllm.model_executor.models.llama import LlamaForCausalLM +from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM # Provide absolute path and huggingface lora ids lora_fixture_name = ["llama32_lora_files", "llama32_lora_huggingface_id"] @@ -23,7 +23,7 @@ @pytest.mark.parametrize("lora_fixture_name", lora_fixture_name) def test_load_checkpoints_from_huggingface(lora_fixture_name, request): lora_name = request.getfixturevalue(lora_fixture_name) - packed_modules_mapping = LlamaForCausalLM.packed_modules_mapping + packed_modules_mapping = Qwen3ForCausalLM.packed_modules_mapping expected_lora_lst: list[str] = [] for module in LLAMA_LORA_MODULES: From 2aadacec3b806a6e4061afc3fbd526a0f27452da Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 13 Jun 2026 23:30:31 +0000 Subject: [PATCH 33/34] tweak diff Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/commandr.py | 1 - vllm/model_executor/models/qwen2_rm.py | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 96c3e4133e21..2880a2c22103 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -353,7 +353,6 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant): "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], } - # LoRA specific attributes embedding_modules = {"embed_tokens": "input_embeddings"} # ModelOpt NVFP4 checkpoints carry raw quantizer-module state diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 008039e296c2..47184173d5a2 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -25,14 +25,14 @@ class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP): + is_pooling_model = True + pooler: Pooler + packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], } - is_pooling_model = True - pooler: Pooler - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config From ed3b43cade10c2de657cb16ea8e7d901dd73d5b6 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 13 Jun 2026 23:33:22 +0000 Subject: [PATCH 34/34] More accurate comment Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/layers/linear.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index e78d8de2b2ce..0ad5702f35df 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -934,7 +934,7 @@ def load_weights( name, loaded_weight.shape, ) - # If name is "bias" get it from self, otherwise load into self + # Load into self if name is not an attr of self param: Parameter = getattr(self, name, self) if ( param is None @@ -1361,7 +1361,7 @@ def load_weights( name, loaded_weight.shape, ) - # If name is "bias" get it from self, otherwise load into self + # Load into self if name is not an attr of self param: Parameter = getattr(self, name, self) if ( param is None