From 3562c7f19828113377e0e022516fb1bfe9ea8ee5 Mon Sep 17 00:00:00 2001 From: Tarek Ziade Date: Mon, 13 Apr 2026 08:29:37 +0200 Subject: [PATCH 01/38] audio tester --- tests/audio_tester.py | 322 ++++++++++++++++++ .../test_modeling_audioflamingo3.py | 166 ++------- .../test_modeling_granite_speech.py | 277 ++++++--------- .../qwen2_audio/test_modeling_qwen2_audio.py | 150 +------- 4 files changed, 444 insertions(+), 471 deletions(-) create mode 100644 tests/audio_tester.py diff --git a/tests/audio_tester.py b/tests/audio_tester.py new file mode 100644 index 000000000000..b2d900a2236d --- /dev/null +++ b/tests/audio_tester.py @@ -0,0 +1,322 @@ +# Copyright 2026 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tempfile +import unittest +from inspect import signature + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ( + GenerationTesterMixin, + ModelTesterMixin, + floats_tensor, + ids_tensor, + is_torch_available, + require_torch, + torch_device, +) +from .test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + +class AudioModelTester: + # If the model follows standard naming conventions, only `config_class` and + # `conditional_generation_class` need to be set (others are optional). 
+ config_class = None + conditional_generation_class = None + base_model_class = None + sequence_classification_class = None + + # Key name for the audio sub-config in the main config constructor. + # Override to "encoder_config" for models like GraniteSpeech. + audio_config_key = "audio_config" + + # Model attribute name for the audio encoder (used in SDPA dispatch tests). + # Set to None to skip audio encoder SDPA checking. + audio_tower_attr = "audio_tower" + + # Arguments that should be passed to the config class even if not in its signature. + forced_config_args = ["pad_token_id"] + + _required_attributes = ("config_class", "conditional_generation_class") + + @property + def all_model_classes(self): + return [ + model_class + for model_class in ( + self.base_model_class, + self.conditional_generation_class, + self.sequence_classification_class, + ) + if model_class is not None + ] + + @property + def pipeline_model_mapping(self): + return {"any-to-any": self.conditional_generation_class} + + def __init__(self, parent, **kwargs): + self.parent = parent + + # Standard defaults + kwargs.setdefault("batch_size", 3) + kwargs.setdefault("seq_length", 25) + kwargs.setdefault("feat_seq_length", 60) + kwargs.setdefault("num_mel_bins", 80) + kwargs.setdefault("is_training", True) + kwargs.setdefault("use_labels", True) + kwargs.setdefault("pad_token_id", 1) + kwargs.setdefault("bos_token_id", 1) + kwargs.setdefault("eos_token_id", 2) + kwargs.setdefault("audio_token_id", 0) + kwargs.setdefault("audio_token_index", 0) # Alias for models that use this name + kwargs.setdefault("ignore_index", -100) + kwargs.setdefault("scope", None) + + # Text config defaults (small Qwen2-style backbone) + kwargs.setdefault( + "text_config", + { + "model_type": "qwen2", + "intermediate_size": 36, + "initializer_range": 0.02, + "hidden_size": 32, + "max_position_embeddings": 52, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "vocab_size": 99, + 
"pad_token_id": 1, + }, + ) + + # Audio config defaults (small Whisper-style encoder) + kwargs.setdefault( + "audio_config", + { + "model_type": "qwen2_audio_encoder", + "d_model": 16, + "encoder_attention_heads": 4, + "encoder_ffn_dim": 16, + "encoder_layers": 2, + "num_mel_bins": 80, + "max_source_positions": 30, + "initializer_range": 0.02, + }, + ) + + # Optional projector config (e.g. GraniteSpeech uses a Q-Former projector) + kwargs.setdefault("projector_config", None) + + # Set all kwargs as instance attributes + for key, value in kwargs.items(): + setattr(self, key, value) + + # Derived from text config (needed by ModelTesterMixin) + self.vocab_size = self.text_config.get("vocab_size", 99) + self.hidden_size = self.text_config.get("hidden_size", 32) + self.num_hidden_layers = self.text_config.get("num_hidden_layers", 2) + self.num_attention_heads = self.text_config.get("num_attention_heads", 4) + self.encoder_seq_length = self.seq_length + + for required_attribute in self._required_attributes: + if getattr(self, required_attribute) is None: + raise ValueError( + f"You have inherited from AudioModelTester but did not set the {required_attribute} attribute." + ) + + # Because audio-LMs have some different standards in how they handle audio tokens, we need + # a few methods that can be overridden if required: + + def create_audio_features(self): + """Create audio feature tensor. Override for different shapes (e.g. [B, T, features]).""" + return floats_tensor([self.batch_size, self.num_mel_bins, self.feat_seq_length]) + + def create_attention_mask(self, input_ids): + """Create text attention mask. Override for models without a padding sentinel.""" + attention_mask = torch.ones_like(input_ids, dtype=torch.long).to(torch_device) + attention_mask[:, :1] = 0 # Padding sentinel + return attention_mask + + def get_num_audio_tokens(self, audio_features): + """Compute number of audio placeholder tokens from features. 
Override for different subsampling.""" + # Default: 2-stage pooling (common for Whisper-style encoders) + input_length = (audio_features.shape[-1] - 1) // 2 + 1 + return (input_length - 2) // 2 + 1 + + def place_audio_tokens(self, input_ids, config, num_audio_tokens): + """Place audio placeholder tokens in input_ids. Override for different placement.""" + input_ids = input_ids.clone() + input_ids[input_ids == self.audio_token_id] = self.pad_token_id + input_ids[:, 1 : 1 + num_audio_tokens] = self.audio_token_id + return input_ids + + def get_audio_feature_key(self): + """Key name for audio features in the inputs dict.""" + return "input_features" + + def get_audio_mask_key(self): + """Key name for audio attention mask. Return None if no audio mask needed.""" + return None + + def create_audio_mask(self, audio_features): + """Create audio-level attention mask. Override for bool masks or different shapes.""" + return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.long).to(torch_device) + + def get_additional_inputs(self, config, input_ids, audio_features): + """Return dict of model-specific extra inputs (e.g. 
image_sizes for multi-modal).""" + return {} + + # End of overridable methods + + @property + def config_args(self): + return list(signature(self.config_class.__init__).parameters.keys()) + + def get_config(self): + kwargs = {} + skip_keys = {"self", "text_config", self.audio_config_key, "projector_config"} + attribute_map = getattr(self.config_class, "attribute_map", {}) + model_name_to_common_name = {v: k for k, v in attribute_map.items()} + for k in self.config_args + self.forced_config_args: + if k in skip_keys: + continue + if hasattr(self, k) and k != "self": + kwargs[k] = getattr(self, k) + elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]): + kwargs[k] = getattr(self, model_name_to_common_name[k]) + kwargs["text_config"] = self.text_config + kwargs[self.audio_config_key] = self.audio_config + if self.projector_config is not None: + kwargs["projector_config"] = self.projector_config + return self.config_class(**kwargs) + + def prepare_config_and_inputs_for_common(self): + config = self.get_config() + audio_features = self.create_audio_features() + num_audio_tokens = self.get_num_audio_tokens(audio_features) + + input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2 + input_ids = self.place_audio_tokens(input_ids, config, num_audio_tokens) + attention_mask = self.create_attention_mask(input_ids) + + inputs_dict = { + self.get_audio_feature_key(): audio_features, + "input_ids": input_ids, + "attention_mask": attention_mask, + } + + audio_mask_key = self.get_audio_mask_key() + if audio_mask_key is not None: + inputs_dict[audio_mask_key] = self.create_audio_mask(audio_features) + + inputs_dict.update(self.get_additional_inputs(config, input_ids, audio_features)) + return config, inputs_dict + + +@require_torch +class AudioModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin): + """ + Base test class for Audio-Language Models. 
+ + Subclasses should set: + - `model_tester_class`: The tester class (subclass of AudioModelTester) + + Optional: + - `all_model_classes`: Override if not using default from model_tester + - `pipeline_model_mapping`: Override if not using default from model_tester + """ + + model_tester_class = None + all_model_classes = None + pipeline_model_mapping = None + + # Audio-LMs are always composite + _is_composite = True + + def setUp(self): + if self.model_tester_class is None: + raise ValueError( + "You have inherited from AudioModelTest but did not set the model_tester_class attribute." + ) + self.model_tester = self.model_tester_class(self) + self.config_tester = ConfigTester(self, config_class=self.model_tester.config_class, has_text_modality=False) + + if self.pipeline_model_mapping is None: + if self.all_model_classes is not None: + raise ValueError( + "Tests that inherit from `AudioModelTest` and set `all_model_classes` must manually set " + "`pipeline_model_mapping`." + ) + else: + self.pipeline_model_mapping = self.model_tester.pipeline_model_mapping + + if self.all_model_classes is None: + self.all_model_classes = self.model_tester.all_model_classes + + def test_config(self): + """Test config common functionality.""" + self.config_tester.run_common_tests() + + def test_sdpa_can_dispatch_composite_models(self): + """Verify SDPA toggles propagate correctly to audio and text sub-modules.""" + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + + if not self._is_composite: + self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + # SDPA (default) + model_sdpa = model_class.from_pretrained(tmpdirname) + model_sdpa = 
model_sdpa.eval().to(torch_device) + + text_attn = "sdpa" if model.language_model._supports_sdpa else "eager" + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + self.assertTrue(model.language_model.config._attn_implementation == text_attn) + + audio_tower_attr = self.model_tester.audio_tower_attr + if audio_tower_attr is not None: + audio_tower = getattr(model, audio_tower_attr) + audio_attn = "sdpa" if audio_tower._supports_sdpa else "eager" + self.assertTrue(audio_tower.config._attn_implementation == audio_attn) + + # Eager + model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") + model_eager = model_eager.eval().to(torch_device) + self.assertTrue(model_eager.config._attn_implementation == "eager") + self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") + + if audio_tower_attr is not None: + self.assertTrue(getattr(model_eager, audio_tower_attr).config._attn_implementation == "eager") + + for _, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + @unittest.skip("Audio-LMs have no separate base model without a head.") + def test_model_base_model_prefix(self): + pass diff --git a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py index 7301812e7032..8726443bbfca 100644 --- a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py +++ b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py @@ -15,7 +15,6 @@ """Testing suite for the PyTorch AudioFlamingo3 model.""" import json -import tempfile import unittest from pathlib import Path @@ -34,56 +33,21 @@ torch_device, ) -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import 
ModelTesterMixin, floats_tensor, ids_tensor +from ...audio_tester import AudioModelTest, AudioModelTester if is_torch_available(): import torch -class AudioFlamingo3ModelTester: - """ - Builds a tiny AudioFlamingo3 config and synthetic inputs that respect AF3's - post-pool token accounting: num tokens per sample == post-pool frame count. - """ +class AudioFlamingo3ModelTester(AudioModelTester): + config_class = AudioFlamingo3Config + conditional_generation_class = AudioFlamingo3ForConditionalGeneration - def __init__( - self, - parent, - audio_token_id=0, - seq_length=25, - feat_seq_length=60, - text_config=None, - audio_config=None, - is_training=True, - ): - self.parent = parent - self.audio_token_id = audio_token_id - self.seq_length = seq_length - self.feat_seq_length = feat_seq_length - self.is_training = is_training - - # Small text backbone (Qwen2-ish) - if text_config is None: - text_config = { - "model_type": "qwen2", - "intermediate_size": 36, - "initializer_range": 0.02, - "hidden_size": 32, - "max_position_embeddings": 52, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "num_key_value_heads": 2, - "use_labels": True, - "use_mrope": False, - "vocab_size": 99, - "pad_token_id": 1, # Ensure pad token != audio token - } - # Small audio encoder (AF3 Whisper-style) - if audio_config is None: - audio_config = { + def __init__(self, parent, **kwargs): + kwargs.setdefault( + "audio_config", + { "model_type": "audioflamingo3_encoder", "hidden_size": 16, "num_attention_heads": 4, @@ -92,70 +56,24 @@ def __init__( "num_mel_bins": 80, "max_source_positions": 30, "initializer_range": 0.02, - } - - self.text_config = text_config - self.audio_config = audio_config - - self.batch_size = 3 - self.vocab_size = text_config["vocab_size"] - self.hidden_size = text_config["hidden_size"] - self.num_attention_heads = text_config["num_attention_heads"] - self.num_hidden_layers = text_config["num_hidden_layers"] - self.encoder_seq_length = seq_length - - def 
get_config(self): - return AudioFlamingo3Config( - text_config=self.text_config, - audio_config=self.audio_config, - audio_token_id=self.audio_token_id, + }, ) + super().__init__(parent, **kwargs) - def prepare_config_and_inputs(self): - # (#windows == batch_size, n_mels, T_mel) - input_features_values = floats_tensor( - [self.batch_size, self.audio_config["num_mel_bins"], self.feat_seq_length] - ) - config = self.get_config() - # Per-window mel validity (all ones => full length) - input_features_mask = torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device) - return config, input_features_values, input_features_mask - - def _post_pool_tokens_per_window(self, T_mel): - # Mirror AF3 processor math: - pre = (T_mel - 1) // 2 + 1 - post = (pre - 2) // 2 + 1 - return post - - def prepare_config_and_inputs_for_common(self): - config, input_features_values, input_features_mask = self.prepare_config_and_inputs() - # Every window has same T_mel here - num_audio_tokens_per_sample = self._post_pool_tokens_per_window(input_features_values.shape[-1]) - - # Build token ids with valid range and K tokens - input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2 - attention_mask = torch.ones_like(input_ids, dtype=torch.long, device=torch_device) - attention_mask[:, :1] = 0 # left padding sentinel - - # Fill first K positions (after padding) with the audio token id, for each sample - input_ids[:, 1 : 1 + num_audio_tokens_per_sample] = config.audio_token_id - - inputs_dict = { - "input_features": input_features_values, - "input_features_mask": input_features_mask, - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict + def get_audio_mask_key(self): + return "input_features_mask" + + def create_audio_mask(self, audio_features): + return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device) @require_torch -class 
AudioFlamingo3ForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): +class AudioFlamingo3ForConditionalGenerationModelTest(AudioModelTest, unittest.TestCase): """ Model tester for `AudioFlamingo3ForConditionalGeneration`. """ - all_model_classes = (AudioFlamingo3ForConditionalGeneration,) if is_torch_available() else () + model_tester_class = AudioFlamingo3ModelTester # TODO: @eustlb, this is incorrect pipeline_model_mapping = ( { @@ -165,14 +83,10 @@ class AudioFlamingo3ForConditionalGenerationModelTest(ModelTesterMixin, Generati if is_torch_available() else {} ) - _is_composite = True - - def setUp(self): - self.model_tester = AudioFlamingo3ModelTester(self) - self.config_tester = ConfigTester(self, config_class=AudioFlamingo3Config, has_text_modality=False) @unittest.skip( - reason="This test does not apply to AudioFlamingo3 since inputs_embeds corresponding to audio tokens are replaced when input features are provided." + reason="This test does not apply to AudioFlamingo3 since inputs_embeds corresponding to audio tokens " + "are replaced when input features are provided." ) def test_inputs_embeds_matches_input_ids(self): pass @@ -190,48 +104,6 @@ def test_sdpa_can_dispatch_on_flash(self): def test_flash_attn_2_inference_equivalence_right_padding(self): pass - @unittest.skip(reason="AudioFlamingo3 has no separate base model without a head.") - def test_model_base_model_prefix(self): - pass - - def test_sdpa_can_dispatch_composite_models(self): - # AF3 is audio+text composite; verify SDPA toggles propagate to submodules. 
- if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - if not self._is_composite: - self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - # SDPA (default) - model_sdpa = model_class.from_pretrained(tmpdirname) - model_sdpa = model_sdpa.eval().to(torch_device) - - text_attn = "sdpa" if model.language_model._supports_sdpa else "eager" - audio_attn = "sdpa" if model.audio_tower._supports_sdpa else "eager" - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - self.assertTrue(model.language_model.config._attn_implementation == text_attn) - self.assertTrue(model.audio_tower.config._attn_implementation == audio_attn) - - # Eager - model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") - model_eager = model_eager.eval().to(torch_device) - self.assertTrue(model_eager.config._attn_implementation == "eager") - self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") - self.assertTrue(model_eager.audio_tower.config._attn_implementation == "eager") - - for _, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - @require_torch class AudioFlamingo3ForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py index c5e7aa3defcd..498f4fac0e12 100644 --- a/tests/models/granite_speech/test_modeling_granite_speech.py +++ b/tests/models/granite_speech/test_modeling_granite_speech.py 
@@ -13,7 +13,6 @@ # limitations under the License. """Testing suite for the IBM Granite Speech model.""" -import tempfile import unittest import pytest @@ -35,14 +34,8 @@ is_torch_available, ) -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - floats_tensor, - ids_tensor, -) -from ...test_pipeline_mixin import PipelineTesterMixin +from ...audio_tester import AudioModelTest, AudioModelTester +from ...test_modeling_common import floats_tensor if is_torch_available(): @@ -52,129 +45,101 @@ from datasets import load_dataset -class GraniteSpeechForConditionalGenerationModelTester: - def __init__( - self, - parent, - seq_length=7, - encoder_config={ - "model_type": "granite_speech_encoder", - "context_size": 200, - "conv_expansion_factor": 2, - "conv_kernel_size": 15, - "dim_head": 32, - "dropout": 0.1, - "feedforward_mult": 4, - "hidden_dim": 32, - "input_dim": 160, - "num_heads": 4, - "num_layers": 2, - "output_dim": 42, - }, - text_config={ - "model_type": "granite", - "is_training": True, - "seq_length": 7, - "use_token_type_ids": False, - "use_labels": True, - "vocab_size": 99, - "hidden_size": 32, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "intermediate_size": 37, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 580, - "type_vocab_size": 16, - "type_sequence_label_size": 2, - "initializer_range": 0.02, - "num_labels": 3, - "num_choices": 4, - "pad_token_id": 1, - }, - projector_config={ - "attention_probs_dropout_prob": 0.1, - "cross_attention_frequency": 1, - "encoder_hidden_size": 32, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 32, - "initializer_range": 0.02, - "intermediate_size": 256, - "layer_norm_eps": 1e-12, - "max_position_embeddings": 2048, - "model_type": "blip_2_qformer", - "num_attention_heads": 4, - 
"num_hidden_layers": 2, - "use_qformer_text_input": False, - "vocab_size": 30522, - }, - audio_token_index=0, - tie_word_embeddings=True, - initializer_range=0.02, - has_lora_adapter=True, - downsample_rate=5, - window_size=15, - is_training=True, - ): - self.parent = parent - self.encoder_config = encoder_config - self.text_config = text_config - self.projector_config = projector_config - self.audio_token_index = audio_token_index - self.tie_word_embeddings = tie_word_embeddings - self.initializer_range = initializer_range - self.has_lora_adapter = has_lora_adapter - self.downsample_rate = downsample_rate - self.window_size = window_size - self.is_training = is_training - - # Dims for audio features - self.sequence_dim = 844 - self.feature_dim = 160 - self.num_attention_heads = text_config["num_attention_heads"] - self.num_hidden_layers = text_config["num_hidden_layers"] - self.hidden_size = text_config["hidden_size"] - self.batch_size = 3 - self.pad_token_id = text_config["pad_token_id"] - self.seq_len = 7 - self.num_audio_tokens = 2 - self.seq_length = seq_length + self.num_audio_tokens - - def get_config(self): - return GraniteSpeechConfig( - encoder_config=self.encoder_config, - text_config=self.text_config, - projector_config=self.projector_config, - audio_token_index=self.audio_token_index, - tie_word_embeddings=self.tie_word_embeddings, - initializer_range=self.initializer_range, - has_lora_adapter=self.has_lora_adapter, +class GraniteSpeechModelTester(AudioModelTester): + config_class = GraniteSpeechConfig + conditional_generation_class = GraniteSpeechForConditionalGeneration + audio_config_key = "encoder_config" + audio_tower_attr = None # Encoder SDPA not checked + + def __init__(self, parent, **kwargs): + kwargs.setdefault("seq_length", 9) # 7 text + 2 audio tokens + kwargs.setdefault("num_audio_tokens", 2) + kwargs.setdefault("sequence_dim", 844) + kwargs.setdefault("feature_dim", 160) + kwargs.setdefault("audio_token_index", 0) + 
kwargs.setdefault("tie_word_embeddings", True) + kwargs.setdefault("initializer_range", 0.02) + kwargs.setdefault("has_lora_adapter", True) + kwargs.setdefault("downsample_rate", 5) + kwargs.setdefault("window_size", 15) + kwargs.setdefault( + "text_config", + { + "model_type": "granite", + "is_training": True, + "seq_length": 7, + "use_token_type_ids": False, + "use_labels": True, + "vocab_size": 99, + "hidden_size": 32, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 37, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 580, + "type_vocab_size": 16, + "type_sequence_label_size": 2, + "initializer_range": 0.02, + "num_labels": 3, + "num_choices": 4, + "pad_token_id": 1, + }, ) - - def prepare_config_and_inputs(self): - input_features = floats_tensor( - [self.batch_size, self.sequence_dim, self.feature_dim], + kwargs.setdefault( + "audio_config", + { + "model_type": "granite_speech_encoder", + "context_size": 200, + "conv_expansion_factor": 2, + "conv_kernel_size": 15, + "dim_head": 32, + "dropout": 0.1, + "feedforward_mult": 4, + "hidden_dim": 32, + "input_dim": 160, + "num_heads": 4, + "num_layers": 2, + "output_dim": 42, + }, + ) + kwargs.setdefault( + "projector_config", + { + "attention_probs_dropout_prob": 0.1, + "cross_attention_frequency": 1, + "encoder_hidden_size": 32, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 32, + "initializer_range": 0.02, + "intermediate_size": 256, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 2048, + "model_type": "blip_2_qformer", + "num_attention_heads": 4, + "num_hidden_layers": 2, + "use_qformer_text_input": False, + "vocab_size": 30522, + }, ) - config = self.get_config() - return config, input_features + super().__init__(parent, **kwargs) - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_features = config_and_inputs - 
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2 - attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) - input_ids[input_ids == config.audio_token_index] = self.pad_token_id + def create_audio_features(self): + return floats_tensor([self.batch_size, self.sequence_dim, self.feature_dim]) - input_ids[:, : self.num_audio_tokens] = config.audio_token_index + def create_attention_mask(self, input_ids): + return torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) - inputs_dict = { - "input_features": input_features, - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict + def get_num_audio_tokens(self, audio_features): + return self.num_audio_tokens + + def place_audio_tokens(self, input_ids, config, num_audio_tokens): + input_ids = input_ids.clone() + input_ids[input_ids == self.audio_token_id] = self.pad_token_id + input_ids[:, :num_audio_tokens] = self.audio_token_id + return input_ids def create_and_check_granite_speech_model_fp16_forward(self, config, input_ids, input_features, attention_mask): model = GraniteSpeechForConditionalGeneration(config=config) @@ -211,27 +176,16 @@ def create_and_check_granite_speech_model_fp16_autocast_forward( @require_torch -class GraniteSpeechForConditionalGenerationModelTest( - ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase -): +class GraniteSpeechForConditionalGenerationModelTest(AudioModelTest, unittest.TestCase): """ Model tester for `GraniteSpeechForConditionalGeneration`. 
""" - all_model_classes = (GraniteSpeechForConditionalGeneration,) if is_torch_available() else () + model_tester_class = GraniteSpeechModelTester pipeline_model_mapping = {"any-to-any": GraniteSpeechForConditionalGeneration} if is_torch_available() else {} - _is_composite = True - - def setUp(self): - self.model_tester = GraniteSpeechForConditionalGenerationModelTester(self) - self.config_tester = ConfigTester( - self, - config_class=GraniteSpeechConfig, - has_text_modality=False, - ) def test_inputs_embeds(self): - # overwrite inputs_embeds tests because we need to delete "input features" for the audio model + # Overwrite inputs_embeds tests because we need to delete "input_features" for the audio model config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: @@ -251,53 +205,12 @@ def test_inputs_embeds(self): with torch.no_grad(): model(**inputs) - def test_sdpa_can_dispatch_composite_models(self): - # overwrite because Granite Speech is audio+text model (not vision+text) - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - if not self._is_composite: - self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") - - for model_class in self.all_model_classes: - # NOTE - currently we only enable alternate attention implementations on - # the encapsulated LLM; in the future, this should be added for the conformer - # encoder as well. 
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model_sdpa = model_class.from_pretrained(tmpdirname) - model_sdpa = model_sdpa.eval().to(torch_device) - - text_attn = "sdpa" if model.language_model._supports_sdpa else "eager" - - # `None` as it is the requested one which will be assigned to each sub-config - # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - self.assertTrue(model.language_model.config._attn_implementation == text_attn) - - model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") - model_eager = model_eager.eval().to(torch_device) - self.assertTrue(model_eager.config._attn_implementation == "eager") - self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - @pytest.mark.generate @slow @unittest.skip(reason="Granite Speech doesn't support SDPA for all backbones") def test_eager_matches_sdpa_generate(self): pass - @unittest.skip(reason="GraniteSpeech has no separate base model without a head.") - def test_model_base_model_prefix(self): - pass - class GraniteSpeechForConditionalGenerationIntegrationTest(unittest.TestCase): def setUp(self): diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py index 4df16b9f6f4b..a1caaa4e7ae1 100644 --- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py +++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py @@ -13,7 +13,6 @@ # limitations under the License. 
"""Testing suite for the PyTorch Qwen2Audio model.""" -import tempfile import unittest from io import BytesIO from urllib.request import urlopen @@ -34,121 +33,29 @@ torch_device, ) -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -from ...test_pipeline_mixin import PipelineTesterMixin +from ...audio_tester import AudioModelTest, AudioModelTester if is_torch_available(): import torch -class Qwen2AudioModelTester: - def __init__( - self, - parent, - ignore_index=-100, - audio_token_index=0, - seq_length=25, - feat_seq_length=60, - text_config={ - "model_type": "qwen2", - "intermediate_size": 36, - "initializer_range": 0.02, - "hidden_size": 32, - "max_position_embeddings": 52, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "num_key_value_heads": 2, - "use_labels": True, - "use_mrope": False, - "vocab_size": 99, - "pad_token_id": 1, # can't be the same as the audio token id - }, - is_training=True, - audio_config={ - "model_type": "qwen2_audio_encoder", - "d_model": 16, - "encoder_attention_heads": 4, - "encoder_ffn_dim": 16, - "encoder_layers": 2, - "num_mel_bins": 80, - "max_source_positions": 30, - "initializer_range": 0.02, - }, - ): - self.parent = parent - self.ignore_index = ignore_index - self.audio_token_index = audio_token_index - self.text_config = text_config - self.audio_config = audio_config - self.seq_length = seq_length - self.feat_seq_length = feat_seq_length - - self.num_hidden_layers = text_config["num_hidden_layers"] - self.vocab_size = text_config["vocab_size"] - self.hidden_size = text_config["hidden_size"] - self.num_attention_heads = text_config["num_attention_heads"] - self.is_training = is_training - - self.batch_size = 3 - self.encoder_seq_length = seq_length - - def get_config(self): - return Qwen2AudioConfig( - text_config=self.text_config, - audio_config=self.audio_config, - 
ignore_index=self.ignore_index, - audio_token_index=self.audio_token_index, - ) +class Qwen2AudioModelTester(AudioModelTester): + config_class = Qwen2AudioConfig + conditional_generation_class = Qwen2AudioForConditionalGeneration - def prepare_config_and_inputs(self): - input_features_values = floats_tensor( - [ - self.batch_size, - self.audio_config["num_mel_bins"], - self.feat_seq_length, - ] - ) - config = self.get_config() - feature_attention_mask = torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.long).to(torch_device) - return config, input_features_values, feature_attention_mask - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_features_values, feature_attention_mask = config_and_inputs - input_length = (input_features_values.shape[-1] - 1) // 2 + 1 - num_audio_tokens = (input_length - 2) // 2 + 1 - input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1 - attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) - attention_mask[:, :1] = 0 - # we are giving 3 audios let's make sure we pass in 3 audios tokens - input_ids[:, 1 : 1 + num_audio_tokens] = config.audio_token_index - inputs_dict = { - "input_features": input_features_values, - "feature_attention_mask": feature_attention_mask, - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict + def get_audio_mask_key(self): + return "feature_attention_mask" @require_torch -class Qwen2AudioForConditionalGenerationModelTest( - ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase -): +class Qwen2AudioForConditionalGenerationModelTest(AudioModelTest, unittest.TestCase): """ Model tester for `Qwen2AudioForConditionalGeneration`. 
""" - all_model_classes = (Qwen2AudioForConditionalGeneration,) if is_torch_available() else () + model_tester_class = Qwen2AudioModelTester pipeline_model_mapping = {"any-to-any": Qwen2AudioForConditionalGeneration} if is_torch_available() else {} - _is_composite = True - - def setUp(self): - self.model_tester = Qwen2AudioModelTester(self) - self.config_tester = ConfigTester(self, config_class=Qwen2AudioConfig, has_text_modality=False) @unittest.skip(reason="Compile not yet supported because in Qwen2Audio models") @pytest.mark.torch_compile_test @@ -159,47 +66,6 @@ def test_sdpa_can_compile_dynamic(self): def test_sdpa_can_dispatch_on_flash(self): pass - @unittest.skip(reason="Qwen2Audio has no separate base model without a head.") - def test_model_base_model_prefix(self): - pass - - def test_sdpa_can_dispatch_composite_models(self): - # overwrite because Qwen2 is audio+text model (not vision+text) - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - if not self._is_composite: - self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model_sdpa = model_class.from_pretrained(tmpdirname) - model_sdpa = model_sdpa.eval().to(torch_device) - - text_attn = "sdpa" if model.language_model._supports_sdpa else "eager" - vision_attn = "sdpa" if model.audio_tower._supports_sdpa else "eager" - - # `None` as it is the requested one which will be assigned to each sub-config - # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - self.assertTrue(model.language_model.config._attn_implementation == text_attn) - 
self.assertTrue(model.audio_tower.config._attn_implementation == vision_attn) - - model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") - model_eager = model_eager.eval().to(torch_device) - self.assertTrue(model_eager.config._attn_implementation == "eager") - self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") - self.assertTrue(model_eager.audio_tower.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - @require_torch class Qwen2AudioForConditionalGenerationIntegrationTest(unittest.TestCase): From 0817bdbd3c4332e07216d1e50e84893810f8af2b Mon Sep 17 00:00:00 2001 From: Tarek Ziade Date: Mon, 13 Apr 2026 08:57:12 +0200 Subject: [PATCH 02/38] tweak check repo for audio tester --- utils/check_repo.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/utils/check_repo.py b/utils/check_repo.py index b1a3d158c716..0706e67236ee 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -776,6 +776,23 @@ def find_tested_models(test_file: str) -> set[str]: continue model_tested.add(tested_class) + # Same as above, but for AudioModelTester. Audio-LMs typically only set `conditional_generation_class` + # (no base_model_class). 
+ audio_class_match = re.search(r"class \w+\(AudioModelTester\)", content) + if audio_class_match is not None: + audio_content = content[audio_class_match.start() :] + for test_class_type in [ + "config_class", + "conditional_generation_class", + "base_model_class", + "sequence_classification_class", + ]: + tested_class = re.findall(rf"{test_class_type}\s+=.*", audio_content) + if tested_class: + tested_class = tested_class[0].split("=")[1].strip() + if tested_class != "None": + model_tested.add(tested_class) + return model_tested From 356c922ee0b3944d78c58c9753d8c1bc2d30ac7f Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Mon, 13 Apr 2026 14:06:53 +0200 Subject: [PATCH 03/38] audio -> ALM --- tests/{audio_tester.py => alm_tester.py} | 12 ++++++------ .../audioflamingo3/test_modeling_audioflamingo3.py | 6 +++--- .../granite_speech/test_modeling_granite_speech.py | 6 +++--- .../models/qwen2_audio/test_modeling_qwen2_audio.py | 6 +++--- utils/check_repo.py | 4 ++-- 5 files changed, 17 insertions(+), 17 deletions(-) rename tests/{audio_tester.py => alm_tester.py} (96%) diff --git a/tests/audio_tester.py b/tests/alm_tester.py similarity index 96% rename from tests/audio_tester.py rename to tests/alm_tester.py index b2d900a2236d..4c47cf7eb538 100644 --- a/tests/audio_tester.py +++ b/tests/alm_tester.py @@ -33,7 +33,7 @@ import torch -class AudioModelTester: +class ALMModelTester: # If the model follows standard naming conventions, only `config_class` and # `conditional_generation_class` need to be set (others are optional). config_class = None @@ -137,7 +137,7 @@ def __init__(self, parent, **kwargs): for required_attribute in self._required_attributes: if getattr(self, required_attribute) is None: raise ValueError( - f"You have inherited from AudioModelTester but did not set the {required_attribute} attribute." + f"You have inherited from ALMModelTester but did not set the {required_attribute} attribute." 
) # Because audio-LMs have some different standards in how they handle audio tokens, we need @@ -230,12 +230,12 @@ def prepare_config_and_inputs_for_common(self): @require_torch -class AudioModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin): +class ALMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin): """ Base test class for Audio-Language Models. Subclasses should set: - - `model_tester_class`: The tester class (subclass of AudioModelTester) + - `model_tester_class`: The tester class (subclass of ALMModelTester) Optional: - `all_model_classes`: Override if not using default from model_tester @@ -252,7 +252,7 @@ class AudioModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi def setUp(self): if self.model_tester_class is None: raise ValueError( - "You have inherited from AudioModelTest but did not set the model_tester_class attribute." + "You have inherited from ALMModelTest but did not set the model_tester_class attribute." ) self.model_tester = self.model_tester_class(self) self.config_tester = ConfigTester(self, config_class=self.model_tester.config_class, has_text_modality=False) @@ -260,7 +260,7 @@ def setUp(self): if self.pipeline_model_mapping is None: if self.all_model_classes is not None: raise ValueError( - "Tests that inherit from `AudioModelTest` and set `all_model_classes` must manually set " + "Tests that inherit from `ALMModelTest` and set `all_model_classes` must manually set " "`pipeline_model_mapping`." 
) else: diff --git a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py index 8726443bbfca..86d82cf4294d 100644 --- a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py +++ b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py @@ -33,14 +33,14 @@ torch_device, ) -from ...audio_tester import AudioModelTest, AudioModelTester +from ...alm_tester import ALMModelTest, ALMModelTester if is_torch_available(): import torch -class AudioFlamingo3ModelTester(AudioModelTester): +class AudioFlamingo3ModelTester(ALMModelTester): config_class = AudioFlamingo3Config conditional_generation_class = AudioFlamingo3ForConditionalGeneration @@ -68,7 +68,7 @@ def create_audio_mask(self, audio_features): @require_torch -class AudioFlamingo3ForConditionalGenerationModelTest(AudioModelTest, unittest.TestCase): +class AudioFlamingo3ForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase): """ Model tester for `AudioFlamingo3ForConditionalGeneration`. 
""" diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py index 498f4fac0e12..4b0e91ddbd36 100644 --- a/tests/models/granite_speech/test_modeling_granite_speech.py +++ b/tests/models/granite_speech/test_modeling_granite_speech.py @@ -34,7 +34,7 @@ is_torch_available, ) -from ...audio_tester import AudioModelTest, AudioModelTester +from ...alm_tester import ALMModelTest, ALMModelTester from ...test_modeling_common import floats_tensor @@ -45,7 +45,7 @@ from datasets import load_dataset -class GraniteSpeechModelTester(AudioModelTester): +class GraniteSpeechModelTester(ALMModelTester): config_class = GraniteSpeechConfig conditional_generation_class = GraniteSpeechForConditionalGeneration audio_config_key = "encoder_config" @@ -176,7 +176,7 @@ def create_and_check_granite_speech_model_fp16_autocast_forward( @require_torch -class GraniteSpeechForConditionalGenerationModelTest(AudioModelTest, unittest.TestCase): +class GraniteSpeechForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase): """ Model tester for `GraniteSpeechForConditionalGeneration`. 
""" diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py index a1caaa4e7ae1..5733a4347568 100644 --- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py +++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py @@ -33,14 +33,14 @@ torch_device, ) -from ...audio_tester import AudioModelTest, AudioModelTester +from ...alm_tester import ALMModelTest, ALMModelTester if is_torch_available(): import torch -class Qwen2AudioModelTester(AudioModelTester): +class Qwen2AudioModelTester(ALMModelTester): config_class = Qwen2AudioConfig conditional_generation_class = Qwen2AudioForConditionalGeneration @@ -49,7 +49,7 @@ def get_audio_mask_key(self): @require_torch -class Qwen2AudioForConditionalGenerationModelTest(AudioModelTest, unittest.TestCase): +class Qwen2AudioForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase): """ Model tester for `Qwen2AudioForConditionalGeneration`. """ diff --git a/utils/check_repo.py b/utils/check_repo.py index 0706e67236ee..3199d6cf4b2f 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -776,9 +776,9 @@ def find_tested_models(test_file: str) -> set[str]: continue model_tested.add(tested_class) - # Same as above, but for AudioModelTester. Audio-LMs typically only set `conditional_generation_class` + # Same as above, but for ALMModelTester. Audio-LMs typically only set `conditional_generation_class` # (no base_model_class). 
- audio_class_match = re.search(r"class \w+\(AudioModelTester\)", content) + audio_class_match = re.search(r"class \w+\(ALMModelTester\)", content) if audio_class_match is not None: audio_content = content[audio_class_match.start() :] for test_class_type in [ From 9663a8e56fe1c86b9833d251a90def0f4add31b8 Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Mon, 13 Apr 2026 17:38:32 +0200 Subject: [PATCH 04/38] ALMTester: no audio/text defaults; better input prep --- tests/alm_tester.py | 231 ++++++++++++++++++++++++++++---------------- 1 file changed, 146 insertions(+), 85 deletions(-) diff --git a/tests/alm_tester.py b/tests/alm_tester.py index 4c47cf7eb538..5fd50997f470 100644 --- a/tests/alm_tester.py +++ b/tests/alm_tester.py @@ -75,8 +75,11 @@ def __init__(self, parent, **kwargs): # Standard defaults kwargs.setdefault("batch_size", 3) - kwargs.setdefault("seq_length", 25) - kwargs.setdefault("feat_seq_length", 60) + + # TODO: explain here specifically why these values are chosen + kwargs.setdefault("seq_length", 32) + kwargs.setdefault("feat_seq_length", 128) + kwargs.setdefault("num_mel_bins", 80) kwargs.setdefault("is_training", True) kwargs.setdefault("use_labels", True) @@ -84,42 +87,17 @@ def __init__(self, parent, **kwargs): kwargs.setdefault("bos_token_id", 1) kwargs.setdefault("eos_token_id", 2) kwargs.setdefault("audio_token_id", 0) - kwargs.setdefault("audio_token_index", 0) # Alias for models that use this name kwargs.setdefault("ignore_index", -100) kwargs.setdefault("scope", None) - - # Text config defaults (small Qwen2-style backbone) - kwargs.setdefault( - "text_config", - { - "model_type": "qwen2", - "intermediate_size": 36, - "initializer_range": 0.02, - "hidden_size": 32, - "max_position_embeddings": 52, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "num_key_value_heads": 2, - "vocab_size": 99, - "pad_token_id": 1, - }, - ) - - # Audio config defaults (small Whisper-style encoder) - 
kwargs.setdefault( - "audio_config", - { - "model_type": "qwen2_audio_encoder", - "d_model": 16, - "encoder_attention_heads": 4, - "encoder_ffn_dim": 16, - "encoder_layers": 2, - "num_mel_bins": 80, - "max_source_positions": 30, - "initializer_range": 0.02, - }, - ) - + kwargs.setdefault("vocab_size", 99) + kwargs.setdefault("hidden_size", 32) + kwargs.setdefault("num_hidden_layers", 2) + kwargs.setdefault("num_attention_heads", 2) + kwargs.setdefault("num_key_value_heads", 2) + kwargs.setdefault("intermediate_size", 32) # Keep this divisible by 8 for fp16/bf16/fp32 16-bytes alignment + kwargs.setdefault("hidden_act", "gelu") + kwargs.setdefault("max_position_embeddings", 512) + # Optional projector config (e.g. GraniteSpeech uses a Q-Former projector) kwargs.setdefault("projector_config", None) @@ -127,14 +105,20 @@ def __init__(self, parent, **kwargs): for key, value in kwargs.items(): setattr(self, key, value) - # Derived from text config (needed by ModelTesterMixin) - self.vocab_size = self.text_config.get("vocab_size", 99) - self.hidden_size = self.text_config.get("hidden_size", 32) - self.num_hidden_layers = self.text_config.get("num_hidden_layers", 2) - self.num_attention_heads = self.text_config.get("num_attention_heads", 4) - self.encoder_seq_length = self.seq_length - - for required_attribute in self._required_attributes: + # # Derived from text config (needed by ModelTesterMixin) + # self.vocab_size = self.text_config.get("vocab_size", 99) + # self.hidden_size = self.text_config.get("hidden_size", 32) + # self.num_hidden_layers = self.text_config.get("num_hidden_layers", 2) + # self.num_attention_heads = self.text_config.get("num_attention_heads", 4) + # self.encoder_seq_length = self.seq_length + + for required_attribute in [ + # "base_model_class", # TODO: @eustlb, there is a discrepancy here between ALMs/ VLMs. 
XXModel and XXForConditionalGeneration + "config_class", + "conditional_generation_class", + "text_config_class", + "audio_config_class", + ]: if getattr(self, required_attribute) is None: raise ValueError( f"You have inherited from ALMModelTester but did not set the {required_attribute} attribute." @@ -148,22 +132,23 @@ def create_audio_features(self): return floats_tensor([self.batch_size, self.num_mel_bins, self.feat_seq_length]) def create_attention_mask(self, input_ids): - """Create text attention mask. Override for models without a padding sentinel.""" - attention_mask = torch.ones_like(input_ids, dtype=torch.long).to(torch_device) - attention_mask[:, :1] = 0 # Padding sentinel - return attention_mask + # TODO: check, this looks strange to force as default behavior + # Override for bidirectional attention models like Gemma3 + return torch.tril(torch.ones_like(input_ids).to(torch_device)) - def get_num_audio_tokens(self, audio_features): - """Compute number of audio placeholder tokens from features. Override for different subsampling.""" - # Default: 2-stage pooling (common for Whisper-style encoders) - input_length = (audio_features.shape[-1] - 1) // 2 + 1 - return (input_length - 2) // 2 + 1 + def get_audio_embeds_mask(self, audio_embeds_mask): + """Get audio embeds mask from audio mask. Override for different shapes.""" + raise NotImplementedError("This method should be overridden in the subclass") def place_audio_tokens(self, input_ids, config, num_audio_tokens): - """Place audio placeholder tokens in input_ids. Override for different placement.""" + """Place audio placeholder tokens at random positions in input_ids. 
Override for different placement.""" input_ids = input_ids.clone() input_ids[input_ids == self.audio_token_id] = self.pad_token_id - input_ids[:, 1 : 1 + num_audio_tokens] = self.audio_token_id + for i in range(input_ids.shape[0]): + n = num_audio_tokens[i].item() if isinstance(num_audio_tokens, torch.Tensor) else num_audio_tokens + available_positions = torch.arange(1, input_ids.shape[1]) # skip position 0 (BOS) + perm = torch.randperm(len(available_positions))[:n] + input_ids[i, available_positions[perm]] = self.audio_token_id return input_ids def get_audio_feature_key(self): @@ -174,9 +159,20 @@ def get_audio_mask_key(self): """Key name for audio attention mask. Return None if no audio mask needed.""" return None - def create_audio_mask(self, audio_features): - """Create audio-level attention mask. Override for bool masks or different shapes.""" - return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.long).to(torch_device) + def create_audio_mask(self): + """Create audio-level attention mask with contiguous valid regions per batch element. + + Each element gets a random offset and length, producing masks like [0, 0, 1, 1, 1, 0, 0]. + """ + # Sample lengths in [1, feat_seq_length] and offsets in [0, feat_seq_length - length] + lengths = ids_tensor([self.batch_size], vocab_size=self.feat_seq_length).abs() + 1 + lengths = lengths.clamp(max=self.feat_seq_length) + offsets = ids_tensor([self.batch_size], vocab_size=self.feat_seq_length).abs() + offsets = offsets % (self.feat_seq_length - lengths + 1) + + positions = torch.arange(self.feat_seq_length, device=torch_device)[None, :] + audio_mask = ((positions >= offsets[:, None]) & (positions < offsets[:, None] + lengths[:, None])).long() + return audio_mask def get_additional_inputs(self, config, input_ids, audio_features): """Return dict of model-specific extra inputs (e.g. 
image_sizes for multi-modal).""" @@ -184,50 +180,115 @@ def get_additional_inputs(self, config, input_ids, audio_features): # End of overridable methods - @property - def config_args(self): - return list(signature(self.config_class.__init__).parameters.keys()) - - def get_config(self): - kwargs = {} - skip_keys = {"self", "text_config", self.audio_config_key, "projector_config"} - attribute_map = getattr(self.config_class, "attribute_map", {}) - model_name_to_common_name = {v: k for k, v in attribute_map.items()} - for k in self.config_args + self.forced_config_args: - if k in skip_keys: - continue - if hasattr(self, k) and k != "self": - kwargs[k] = getattr(self, k) - elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]): - kwargs[k] = getattr(self, model_name_to_common_name[k]) - kwargs["text_config"] = self.text_config - kwargs[self.audio_config_key] = self.audio_config - if self.projector_config is not None: - kwargs["projector_config"] = self.projector_config - return self.config_class(**kwargs) - def prepare_config_and_inputs_for_common(self): - config = self.get_config() + # TODO: add a clear diagram that explains input prep + audio_features = self.create_audio_features() - num_audio_tokens = self.get_num_audio_tokens(audio_features) + audio_mask = self.create_audio_mask() + audio_embeds_mask = self.get_audio_embeds_mask(audio_mask) - input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2 + if audio_embeds_mask.shape[1] > self.seq_length: + raise ValueError( + f"`audio_embeds_mask` has more tokens per sequence than `seq_length` allows " + f"({audio_embeds_mask.shape[1]} > {self.seq_length}). " + "This likely indicates a mismatch between your feature extraction/configuration and your sequence length. " + "Please ensure `seq_length` is >= the number of audio embedding positions." 
+ ) + + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + special_tokens = [self.pad_token_id, self.bos_token_id, self.eos_token_id, self.audio_token_id] + for i in range(self.vocab_size): + if i not in special_tokens: + safe_token_id = i + break + else: + raise ValueError("vocab_size is too small and there is no token ID that is not a special token!") + + # Avoid flaky tests, clear any special tokens in ids_tensor + # audio_token_id is handled separately by place_audio_tokens() + input_ids[input_ids == self.pad_token_id] = safe_token_id + input_ids[input_ids == self.eos_token_id] = safe_token_id + + config = self.get_config() + num_audio_tokens = audio_embeds_mask.sum(dim=1) input_ids = self.place_audio_tokens(input_ids, config, num_audio_tokens) attention_mask = self.create_attention_mask(input_ids) inputs_dict = { - self.get_audio_feature_key(): audio_features, "input_ids": input_ids, "attention_mask": attention_mask, + self.get_audio_feature_key(): audio_features, } audio_mask_key = self.get_audio_mask_key() if audio_mask_key is not None: - inputs_dict[audio_mask_key] = self.create_audio_mask(audio_features) + inputs_dict[audio_mask_key] = audio_mask inputs_dict.update(self.get_additional_inputs(config, input_ids, audio_features)) return config, inputs_dict + @property + def config_args(self): + return list(signature(self.config_class.__init__).parameters.keys()) + + @property + def text_config_args(self): + args = list(signature(self.text_config_class.__init__).parameters.keys()) + for token_arg in ["pad_token_id", "bos_token_id", "eos_token_id"]: # Not always explicitly in the sig + if token_arg not in args: + args.append(token_arg) + return args + + @property + def audio_config_args(self): + return list(signature(self.audio_config_class.__init__).parameters.keys()) + + def get_config(self): + kwargs = {} + attribute_map = getattr(self.config_class, "attribute_map", {}) + model_name_to_common_name = {v: k for k, v in 
attribute_map.items()} + for k in self.config_args + self.forced_config_args: + if hasattr(self, k) and k != "self": + kwargs[k] = getattr(self, k) + elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]): + kwargs[k] = getattr(self, model_name_to_common_name[k]) + kwargs["text_config"] = self.get_text_config() + kwargs["audio_config"] = self.get_audio_config() + return self.config_class(**kwargs) + + def get_text_config(self): + kwargs = {} + attribute_map = getattr(self.text_config_class, "attribute_map", {}) + model_name_to_common_name = {v: k for k, v in attribute_map.items()} + for k in self.text_config_args: + if hasattr(self, k) and k != "self": + kwargs[k] = getattr(self, k) + elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]): + kwargs[k] = getattr(self, model_name_to_common_name[k]) + return self.text_config_class(**kwargs) + + def get_audio_config(self): + kwargs = {} + attribute_map = getattr(self.audio_config_class, "attribute_map", {}) + model_name_to_common_name = {v: k for k, v in attribute_map.items()} + for k in self.audio_config_args: + if hasattr(self, k) and k != "self": + kwargs[k] = getattr(self, k) + elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]): + kwargs[k] = getattr(self, model_name_to_common_name[k]) + return self.audio_config_class(**kwargs) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = self.base_model_class(config=config) + model.to(torch_device) + model.eval() + model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + @require_torch class ALMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin): From a599b1de9051e6f0f47867f05e08e5b07e2c7731 Mon Sep 17 00:00:00 2001 From: eustlb 
<94853470+eustlb@users.noreply.github.com> Date: Sun, 19 Apr 2026 15:24:48 +0200 Subject: [PATCH 05/38] update test_sdpa_can_dispatch_composite_models to handle ALMs --- tests/alm_tester.py | 49 ------------------- .../test_modeling_audioflamingo3.py | 49 +++++++++---------- tests/test_modeling_common.py | 26 ++++++---- 3 files changed, 39 insertions(+), 85 deletions(-) diff --git a/tests/alm_tester.py b/tests/alm_tester.py index 5fd50997f470..4223e9a87ca4 100644 --- a/tests/alm_tester.py +++ b/tests/alm_tester.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import tempfile import unittest from inspect import signature @@ -45,10 +44,6 @@ class ALMModelTester: # Override to "encoder_config" for models like GraniteSpeech. audio_config_key = "audio_config" - # Model attribute name for the audio encoder (used in SDPA dispatch tests). - # Set to None to skip audio encoder SDPA checking. - audio_tower_attr = "audio_tower" - # Arguments that should be passed to the config class even if not in its signature. 
forced_config_args = ["pad_token_id"] @@ -334,50 +329,6 @@ def test_config(self): """Test config common functionality.""" self.config_tester.run_common_tests() - def test_sdpa_can_dispatch_composite_models(self): - """Verify SDPA toggles propagate correctly to audio and text sub-modules.""" - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - if not self._is_composite: - self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - # SDPA (default) - model_sdpa = model_class.from_pretrained(tmpdirname) - model_sdpa = model_sdpa.eval().to(torch_device) - - text_attn = "sdpa" if model.language_model._supports_sdpa else "eager" - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - self.assertTrue(model.language_model.config._attn_implementation == text_attn) - - audio_tower_attr = self.model_tester.audio_tower_attr - if audio_tower_attr is not None: - audio_tower = getattr(model, audio_tower_attr) - audio_attn = "sdpa" if audio_tower._supports_sdpa else "eager" - self.assertTrue(audio_tower.config._attn_implementation == audio_attn) - - # Eager - model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") - model_eager = model_eager.eval().to(torch_device) - self.assertTrue(model_eager.config._attn_implementation == "eager") - self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") - - if audio_tower_attr is not None: - self.assertTrue(getattr(model_eager, audio_tower_attr).config._attn_implementation == "eager") - - for _, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - 
raise ValueError("The eager model should not have SDPA attention layers") - @unittest.skip("Audio-LMs have no separate base model without a head.") def test_model_base_model_prefix(self): pass diff --git a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py index 86d82cf4294d..153c6ba11b52 100644 --- a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py +++ b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py @@ -22,6 +22,8 @@ from transformers import ( AudioFlamingo3Config, + AudioFlamingo3EncoderConfig, + Qwen2Config, AudioFlamingo3ForConditionalGeneration, AutoProcessor, is_torch_available, @@ -43,29 +45,35 @@ class AudioFlamingo3ModelTester(ALMModelTester): config_class = AudioFlamingo3Config conditional_generation_class = AudioFlamingo3ForConditionalGeneration + text_config_class = Qwen2Config + audio_config_class = AudioFlamingo3EncoderConfig + def __init__(self, parent, **kwargs): - kwargs.setdefault( - "audio_config", - { - "model_type": "audioflamingo3_encoder", - "hidden_size": 16, - "num_attention_heads": 4, - "intermediate_size": 16, - "num_hidden_layers": 2, - "num_mel_bins": 80, - "max_source_positions": 30, - "initializer_range": 0.02, - }, - ) + # feat_seq_length → (L-1)//2+1 after conv2 → (·-2)//2+1 after avg_pool, so + # feat_seq_length=60 gives 15 audio embed tokens (fits inside seq_length=32 + BOS + text). + kwargs.setdefault("feat_seq_length", 60) + # Encoder adds a learned positional embedding of size max_source_positions to post-conv2 features, + # so it must equal (feat_seq_length - 1) // 2 + 1. 
+ kwargs.setdefault("max_source_positions", (kwargs["feat_seq_length"] - 1) // 2 + 1) super().__init__(parent, **kwargs) def get_audio_mask_key(self): return "input_features_mask" - def create_audio_mask(self, audio_features): + def create_audio_mask(self): return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device) + def get_audio_embeds_mask(self, audio_mask): + # Mirrors AudioFlamingo3Encoder._get_feat_extract_output_lengths: + # conv2 (k=3,s=2,p=1) then avg_pool (k=2,s=2). + input_lengths = audio_mask.sum(-1) + input_lengths = (input_lengths - 1) // 2 + 1 + output_lengths = (input_lengths - 2) // 2 + 1 + max_len = int(output_lengths.max().item()) + positions = torch.arange(max_len, device=audio_mask.device)[None, :] + return (positions < output_lengths[:, None]).long() + @require_torch class AudioFlamingo3ForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase): @@ -91,19 +99,6 @@ class AudioFlamingo3ForConditionalGenerationModelTest(ALMModelTest, unittest.Tes def test_inputs_embeds_matches_input_ids(self): pass - @unittest.skip(reason="Compile not yet supported for AudioFlamingo3 models") - @pytest.mark.torch_compile_test - def test_sdpa_can_compile_dynamic(self): - pass - - @unittest.skip(reason="Compile not yet supported for AudioFlamingo3 models") - def test_sdpa_can_dispatch_on_flash(self): - pass - - @unittest.skip(reason="AudioFlamingo3 tests avoid right-padding equivalence; fusion is in-place.") - def test_flash_attn_2_inference_equivalence_right_padding(self): - pass - @require_torch class AudioFlamingo3ForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 24f278c24704..ac754f3d672a 100644 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -3584,30 +3584,38 @@ def test_sdpa_can_dispatch_composite_models(self): model_sdpa = model_class.from_pretrained(tmpdirname) model_sdpa = model_sdpa.base_model - 
vision_model_names = {"visual", "image_tower", "vision_tower", "vision_model"} + modality_tower_names = { + "visual", + "image_tower", + "vision_tower", + "vision_model", + "audio_tower", + "audio_model", + } language_model_names = {"language_model", "model", "text_model"} - vision_model_name = [name for name in vision_model_names if hasattr(model_sdpa, name)] - vision_model_name = vision_model_name[0] if len(vision_model_name) > 0 else None + modality_tower_name = [name for name in modality_tower_names if hasattr(model_sdpa, name)] + modality_tower_name = modality_tower_name[0] if len(modality_tower_name) > 0 else None language_model_name = [name for name in language_model_names if hasattr(model_sdpa, name)] language_model_name = language_model_name[0] if len(language_model_name) > 0 else None - if language_model_name is None or vision_model_name is None: + if language_model_name is None or modality_tower_name is None: self.skipTest( - reason="Model does not have both vision and language sub-models, cannot test composite SDPA dispatch" + reason="Model does not have both a non-text modality tower and a language sub-model, " + "cannot test composite SDPA dispatch" ) - vision_model_sdpa = getattr(model_sdpa, vision_model_name) + modality_tower_sdpa = getattr(model_sdpa, modality_tower_name) language_model_sdpa = getattr(model_sdpa, language_model_name) text_attn = "sdpa" if language_model_sdpa._supports_sdpa else "eager" - vision_attn = "sdpa" if vision_model_sdpa._supports_sdpa else "eager" + modality_attn = "sdpa" if modality_tower_sdpa._supports_sdpa else "eager" # `None` as it is the requested one which will be assigned to each sub-config # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) self.assertTrue(language_model_sdpa.config._attn_implementation == text_attn) - self.assertTrue(vision_model_sdpa.config._attn_implementation == vision_attn) + self.assertTrue(modality_tower_sdpa.config._attn_implementation == 
modality_attn) model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") model_eager = model_eager.base_model self.assertTrue(getattr(model_eager, language_model_name).config._attn_implementation == "eager") - self.assertTrue(getattr(model_eager, vision_model_name).config._attn_implementation == "eager") + self.assertTrue(getattr(model_eager, modality_tower_name).config._attn_implementation == "eager") for name, submodule in model_eager.named_modules(): class_name = submodule.__class__.__name__ From a7d54dc554c80c19013c4ce7d04fa12748c23b9f Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Mon, 20 Apr 2026 10:43:46 +0200 Subject: [PATCH 06/38] propagate to other model classes --- tests/alm_tester.py | 13 +- tests/models/glmasr/test_modeling_glmasr.py | 170 +++------------- .../test_modeling_granite_speech.py | 94 +++------ .../test_modeling_musicflamingo.py | 183 ++++------------- .../qwen2_audio/test_modeling_qwen2_audio.py | 34 ++++ tests/models/voxtral/test_modeling_voxtral.py | 167 +++------------- .../test_modeling_voxtral_realtime.py | 189 +++++++----------- 7 files changed, 232 insertions(+), 618 deletions(-) diff --git a/tests/alm_tester.py b/tests/alm_tester.py index 4223e9a87ca4..4c104e6dd49d 100644 --- a/tests/alm_tester.py +++ b/tests/alm_tester.py @@ -136,14 +136,17 @@ def get_audio_embeds_mask(self, audio_embeds_mask): raise NotImplementedError("This method should be overridden in the subclass") def place_audio_tokens(self, input_ids, config, num_audio_tokens): - """Place audio placeholder tokens at random positions in input_ids. Override for different placement.""" + """Place audio placeholder tokens contiguously after BOS. Override for different placement. + + Deterministic placement (position 0 reserved for BOS; audio tokens at [1:1+n]) keeps + the tail of each sequence text-only, which downstream tests (e.g. resize_token_embeddings + overwriting column -2) rely on. 
+ """ input_ids = input_ids.clone() input_ids[input_ids == self.audio_token_id] = self.pad_token_id for i in range(input_ids.shape[0]): n = num_audio_tokens[i].item() if isinstance(num_audio_tokens, torch.Tensor) else num_audio_tokens - available_positions = torch.arange(1, input_ids.shape[1]) # skip position 0 (BOS) - perm = torch.randperm(len(available_positions))[:n] - input_ids[i, available_positions[perm]] = self.audio_token_id + input_ids[i, 1 : 1 + int(n)] = self.audio_token_id return input_ids def get_audio_feature_key(self): @@ -249,7 +252,7 @@ def get_config(self): elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]): kwargs[k] = getattr(self, model_name_to_common_name[k]) kwargs["text_config"] = self.get_text_config() - kwargs["audio_config"] = self.get_audio_config() + kwargs[self.audio_config_key] = self.get_audio_config() return self.config_class(**kwargs) def get_text_config(self): diff --git a/tests/models/glmasr/test_modeling_glmasr.py b/tests/models/glmasr/test_modeling_glmasr.py index 744e268e74c7..8b93ad64337d 100644 --- a/tests/models/glmasr/test_modeling_glmasr.py +++ b/tests/models/glmasr/test_modeling_glmasr.py @@ -13,7 +13,6 @@ # limitations under the License. 
"""Testing suite for the PyTorch glmasr model.""" -import tempfile import unittest import pytest @@ -22,8 +21,10 @@ AutoProcessor, GlmAsrConfig, GlmAsrForConditionalGeneration, + LlamaConfig, is_torch_available, ) +from transformers.models.glmasr.configuration_glmasr import GlmAsrEncoderConfig from transformers.testing_utils import ( cleanup, require_torch, @@ -31,123 +32,53 @@ torch_device, ) -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -from ...test_pipeline_mixin import PipelineTesterMixin +from ...alm_tester import ALMModelTest, ALMModelTester if is_torch_available(): import torch -class GlmAsrModelTester: - def __init__( - self, - parent, - ignore_index=-100, - audio_token_id=0, - seq_length=35, - feat_seq_length=64, - text_config={ - "model_type": "llama", - "intermediate_size": 64, - "initializer_range": 0.02, - "hidden_size": 16, - "max_position_embeddings": 52, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "use_labels": True, - "use_mrope": False, - "vocab_size": 99, - "head_dim": 8, - "pad_token_id": 1, # can't be the same as the audio token id - }, - is_training=True, - audio_config={ - "model_type": "glmasr_encoder", - "hidden_size": 128, - "num_attention_heads": 2, - "intermediate_size": 512, - "num_hidden_layers": 2, - "num_mel_bins": 128, - "max_source_positions": 32, - "initializer_range": 0.02, - }, - ): - self.parent = parent - self.ignore_index = ignore_index - self.audio_token_id = audio_token_id - self.text_config = text_config - self.audio_config = audio_config - self.seq_length = seq_length - self.feat_seq_length = feat_seq_length - - self.num_hidden_layers = text_config["num_hidden_layers"] - self.vocab_size = text_config["vocab_size"] - self.hidden_size = text_config["hidden_size"] - self.num_attention_heads = text_config["num_attention_heads"] - self.is_training = is_training - - 
self.batch_size = 3 - self.encoder_seq_length = seq_length - - def get_config(self): - return GlmAsrConfig( - text_config=self.text_config, - audio_config=self.audio_config, - ignore_index=self.ignore_index, - audio_token_id=self.audio_token_id, - ) +class GlmAsrModelTester(ALMModelTester): + config_class = GlmAsrConfig + conditional_generation_class = GlmAsrForConditionalGeneration + text_config_class = LlamaConfig + audio_config_class = GlmAsrEncoderConfig - def prepare_config_and_inputs(self): - input_features_values = floats_tensor( - [ - self.batch_size, - self.audio_config["num_mel_bins"], - self.feat_seq_length, - ] - ) - config = self.get_config() - input_features_mask = torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device) - return config, input_features_values, input_features_mask + def __init__(self, parent, **kwargs): + # feat_seq_length=64 → conv2 (s=2): post_conv=32 → merge_factor=4: 8 audio embed tokens. + kwargs.setdefault("feat_seq_length", 64) + kwargs.setdefault("seq_length", 35) + kwargs.setdefault("head_dim", 8) + super().__init__(parent, **kwargs) - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_features_values, input_features_mask = config_and_inputs - num_audio_tokens_per_batch_idx = 8 + def get_audio_mask_key(self): + return "input_features_mask" - input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1 - attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) - attention_mask[:, :1] = 0 + def create_audio_mask(self): + return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device) - input_ids[:, 1 : 1 + num_audio_tokens_per_batch_idx] = config.audio_token_id - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "input_features": input_features_values, - "input_features_mask": input_features_mask, - } - return 
config, inputs_dict + def get_audio_embeds_mask(self, audio_mask): + # conv1 (s=1) preserves length; conv2 (s=2, k=3, p=1) halves; merge_factor=4 post-projector. + audio_lengths = audio_mask.sum(-1) + for padding, kernel_size, stride in [(1, 3, 1), (1, 3, 2)]: + audio_lengths = (audio_lengths + 2 * padding - (kernel_size - 1) - 1) // stride + 1 + merge_factor = 4 + post_lengths = (audio_lengths - merge_factor) // merge_factor + 1 + max_len = int(post_lengths.max().item()) + positions = torch.arange(max_len, device=audio_mask.device)[None, :] + return (positions < post_lengths[:, None]).long() @require_torch -class GlmAsrForConditionalGenerationModelTest( - ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase -): +class GlmAsrForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase): """ Model tester for `GlmAsrForConditionalGeneration`. """ - all_model_classes = (GlmAsrForConditionalGeneration,) if is_torch_available() else () + model_tester_class = GlmAsrModelTester pipeline_model_mapping = {"audio-text-to-text": GlmAsrForConditionalGeneration} if is_torch_available() else {} - _is_composite = True - - def setUp(self): - self.model_tester = GlmAsrModelTester(self) - self.config_tester = ConfigTester(self, config_class=GlmAsrConfig, has_text_modality=False) - @unittest.skip( reason="This test does not apply to GlmAsr since inputs_embeds corresponding to audio tokens are replaced when input features are provided." ) @@ -167,47 +98,6 @@ def test_sdpa_can_dispatch_on_flash(self): def test_flash_attn_2_inference_equivalence_right_padding(self): pass - @unittest.skip(reason="GlmAsr has no separate base model without a head.") - def test_model_base_model_prefix(self): - pass - - def test_sdpa_can_dispatch_composite_models(self): - # GlmAsr is audio+text composite; verify SDPA toggles propagate to submodules. 
- if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - if not self._is_composite: - self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - # SDPA (default) - model_sdpa = model_class.from_pretrained(tmpdirname) - model_sdpa = model_sdpa.eval().to(torch_device) - - text_attn = "sdpa" if model.language_model._supports_sdpa else "eager" - audio_attn = "sdpa" if model.audio_tower._supports_sdpa else "eager" - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - self.assertTrue(model.language_model.config._attn_implementation == text_attn) - self.assertTrue(model.audio_tower.config._attn_implementation == audio_attn) - - # Eager - model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") - model_eager = model_eager.eval().to(torch_device) - self.assertTrue(model_eager.config._attn_implementation == "eager") - self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") - self.assertTrue(model_eager.audio_tower.config._attn_implementation == "eager") - - for _, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - @require_torch class GlmAsrForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py index 4b0e91ddbd36..f7c76cb4093e 100644 --- a/tests/models/granite_speech/test_modeling_granite_speech.py +++ b/tests/models/granite_speech/test_modeling_granite_speech.py @@ -19,7 
+19,9 @@ from transformers import ( AutoProcessor, + GraniteConfig, GraniteSpeechConfig, + GraniteSpeechEncoderConfig, GraniteSpeechForConditionalGeneration, ) from transformers.testing_utils import ( @@ -48,80 +50,39 @@ class GraniteSpeechModelTester(ALMModelTester): config_class = GraniteSpeechConfig conditional_generation_class = GraniteSpeechForConditionalGeneration + text_config_class = GraniteConfig + audio_config_class = GraniteSpeechEncoderConfig audio_config_key = "encoder_config" - audio_tower_attr = None # Encoder SDPA not checked def __init__(self, parent, **kwargs): kwargs.setdefault("seq_length", 9) # 7 text + 2 audio tokens kwargs.setdefault("num_audio_tokens", 2) kwargs.setdefault("sequence_dim", 844) kwargs.setdefault("feature_dim", 160) - kwargs.setdefault("audio_token_index", 0) - kwargs.setdefault("tie_word_embeddings", True) - kwargs.setdefault("initializer_range", 0.02) kwargs.setdefault("has_lora_adapter", True) kwargs.setdefault("downsample_rate", 5) kwargs.setdefault("window_size", 15) + # GraniteSpeechEncoderConfig fields (no attribute_map, so set explicitly). + kwargs.setdefault("input_dim", 160) + kwargs.setdefault("num_layers", 2) + kwargs.setdefault("hidden_dim", 32) + kwargs.setdefault("num_heads", 4) + kwargs.setdefault("dim_head", 8) + kwargs.setdefault("feedforward_mult", 4) + kwargs.setdefault("context_size", 200) + kwargs.setdefault("conv_kernel_size", 15) + kwargs.setdefault("conv_expansion_factor", 2) + kwargs.setdefault("output_dim", 42) + # Q-Former projector config (passed through as a dict; ALM's get_config forwards unknowns). 
kwargs.setdefault( - "text_config", + "projector_config", { - "model_type": "granite", - "is_training": True, - "seq_length": 7, - "use_token_type_ids": False, - "use_labels": True, - "vocab_size": 99, + "model_type": "blip_2_qformer", "hidden_size": 32, "num_hidden_layers": 2, "num_attention_heads": 4, - "intermediate_size": 37, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 580, - "type_vocab_size": 16, - "type_sequence_label_size": 2, - "initializer_range": 0.02, - "num_labels": 3, - "num_choices": 4, - "pad_token_id": 1, - }, - ) - kwargs.setdefault( - "audio_config", - { - "model_type": "granite_speech_encoder", - "context_size": 200, - "conv_expansion_factor": 2, - "conv_kernel_size": 15, - "dim_head": 32, - "dropout": 0.1, - "feedforward_mult": 4, - "hidden_dim": 32, - "input_dim": 160, - "num_heads": 4, - "num_layers": 2, - "output_dim": 42, - }, - ) - kwargs.setdefault( - "projector_config", - { - "attention_probs_dropout_prob": 0.1, - "cross_attention_frequency": 1, - "encoder_hidden_size": 32, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 32, - "initializer_range": 0.02, "intermediate_size": 256, - "layer_norm_eps": 1e-12, - "max_position_embeddings": 2048, - "model_type": "blip_2_qformer", - "num_attention_heads": 4, - "num_hidden_layers": 2, - "use_qformer_text_input": False, - "vocab_size": 30522, + "encoder_hidden_size": 32, }, ) super().__init__(parent, **kwargs) @@ -129,17 +90,16 @@ def __init__(self, parent, **kwargs): def create_audio_features(self): return floats_tensor([self.batch_size, self.sequence_dim, self.feature_dim]) - def create_attention_mask(self, input_ids): - return torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) + def create_audio_mask(self): + # Granite's encoder is fed the raw features; mask is all-ones over sequence_dim. 
+ return torch.ones([self.batch_size, self.sequence_dim], dtype=torch.bool).to(torch_device) - def get_num_audio_tokens(self, audio_features): - return self.num_audio_tokens + def get_audio_embeds_mask(self, audio_mask): + # Projector produces `num_audio_tokens` embeds per sample (fixed by window_size/downsample_rate). + return torch.ones([self.batch_size, self.num_audio_tokens], dtype=torch.long).to(torch_device) - def place_audio_tokens(self, input_ids, config, num_audio_tokens): - input_ids = input_ids.clone() - input_ids[input_ids == self.audio_token_id] = self.pad_token_id - input_ids[:, :num_audio_tokens] = self.audio_token_id - return input_ids + def create_attention_mask(self, input_ids): + return torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) def create_and_check_granite_speech_model_fp16_forward(self, config, input_ids, input_features, attention_mask): model = GraniteSpeechForConditionalGeneration(config=config) diff --git a/tests/models/musicflamingo/test_modeling_musicflamingo.py b/tests/models/musicflamingo/test_modeling_musicflamingo.py index 8c3b0ce549c8..9b8153705582 100644 --- a/tests/models/musicflamingo/test_modeling_musicflamingo.py +++ b/tests/models/musicflamingo/test_modeling_musicflamingo.py @@ -16,16 +16,17 @@ import json import os -import tempfile import unittest from pathlib import Path import pytest from transformers import ( + AudioFlamingo3EncoderConfig, AutoProcessor, MusicFlamingoConfig, MusicFlamingoForConditionalGeneration, + Qwen2Config, is_torch_available, ) from transformers.testing_utils import ( @@ -37,129 +38,60 @@ torch_device, ) -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...alm_tester import ALMModelTest, ALMModelTester +from ...test_modeling_common import ids_tensor if is_torch_available(): import torch -class MusicFlamingoModelTester: +class 
MusicFlamingoModelTester(ALMModelTester): """ Builds a tiny MusicFlamingo config and synthetic inputs that respect MusicFlamingo's post-pool token accounting: num tokens per sample == post-pool frame count. """ - def __init__( - self, - parent, - audio_token_id=0, - seq_length=25, - feat_seq_length=60, - text_config=None, - audio_config=None, - is_training=True, - ): - self.parent = parent - self.audio_token_id = audio_token_id - self.seq_length = seq_length - self.feat_seq_length = feat_seq_length - self.is_training = is_training - - # Small text backbone (Qwen2-ish) - if text_config is None: - text_config = { - "model_type": "qwen2", - "intermediate_size": 36, - "initializer_range": 0.02, - "hidden_size": 32, - "max_position_embeddings": 52, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "num_key_value_heads": 2, - "use_labels": True, - "use_mrope": False, - "vocab_size": 99, - "pad_token_id": 1, # Ensure pad token != audio token - } - # Small audio encoder (MusicFlamingo Whisper-style) - if audio_config is None: - audio_config = { - "model_type": "musicflamingo_encoder", - "hidden_size": 16, - "num_attention_heads": 4, - "intermediate_size": 16, - "num_hidden_layers": 2, - "num_mel_bins": 80, - "max_source_positions": 30, - "initializer_range": 0.02, - } + config_class = MusicFlamingoConfig + conditional_generation_class = MusicFlamingoForConditionalGeneration + text_config_class = Qwen2Config + audio_config_class = AudioFlamingo3EncoderConfig - self.text_config = text_config - self.audio_config = audio_config + def __init__(self, parent, **kwargs): + # feat_seq_length=60 → (60-1)//2+1=30 → (30-2)//2+1=15 audio embed tokens. 
+ kwargs.setdefault("feat_seq_length", 60) + kwargs.setdefault("max_source_positions", (kwargs["feat_seq_length"] - 1) // 2 + 1) + super().__init__(parent, **kwargs) - self.batch_size = 3 - self.vocab_size = text_config["vocab_size"] - self.hidden_size = text_config["hidden_size"] - self.num_attention_heads = text_config["num_attention_heads"] - self.num_hidden_layers = text_config["num_hidden_layers"] - self.encoder_seq_length = seq_length + def get_audio_mask_key(self): + return "input_features_mask" - def get_config(self): - return MusicFlamingoConfig( - text_config=self.text_config, - audio_config=self.audio_config, - audio_token_id=self.audio_token_id, - rope_parameters={"rope_type": "default", "rope_theta": 2048, "partial_rotary_factor": 0.5}, - ) + def create_audio_mask(self): + return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device) - def prepare_config_and_inputs(self): - # (#windows == batch_size, n_mels, T_mel) - input_features_values = floats_tensor( - [self.batch_size, self.audio_config["num_mel_bins"], self.feat_seq_length] - ) - config = self.get_config() - # Per-window mel validity (all ones => full length) - input_features_mask = torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device) - return config, input_features_values, input_features_mask - - def _post_pool_tokens_per_window(self, T_mel): - # Mirror MusicFlamingo processor math: - pre = (T_mel - 1) // 2 + 1 - post = (pre - 2) // 2 + 1 - return post - - def prepare_config_and_inputs_for_common(self): - config, input_features_values, input_features_mask = self.prepare_config_and_inputs() - # Every window has same T_mel here - num_audio_tokens_per_sample = self._post_pool_tokens_per_window(input_features_values.shape[-1]) - - # Build token ids with valid range and K tokens - input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2 - attention_mask = torch.ones_like(input_ids, 
dtype=torch.long, device=torch_device) - attention_mask[:, :1] = 0 # left padding sentinel - - # Fill first K positions (after padding) with the audio token id, for each sample - input_ids[:, 1 : 1 + num_audio_tokens_per_sample] = config.audio_token_id - - inputs_dict = { - "input_features": input_features_values, - "input_features_mask": input_features_mask, - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict + def get_audio_embeds_mask(self, audio_mask): + # AudioFlamingo3Encoder._get_feat_extract_output_lengths: conv2 (k=3,s=2) then avg_pool (k=2,s=2). + input_lengths = audio_mask.sum(-1) + input_lengths = (input_lengths - 1) // 2 + 1 + output_lengths = (input_lengths - 2) // 2 + 1 + max_len = int(output_lengths.max().item()) + positions = torch.arange(max_len, device=audio_mask.device)[None, :] + return (positions < output_lengths[:, None]).long() + + def get_config(self): + # MusicFlamingoConfig requires rope_parameters. + config = super().get_config() + config.rope_parameters = {"rope_type": "default", "rope_theta": 2048, "partial_rotary_factor": 0.5} + return config @require_torch -class MusicFlamingoForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): +class MusicFlamingoForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase): """ Model tester for `MusicFlamingoForConditionalGeneration`. 
""" - all_model_classes = (MusicFlamingoForConditionalGeneration,) if is_torch_available() else () + model_tester_class = MusicFlamingoModelTester pipeline_model_mapping = ( { "text-to-speech": MusicFlamingoForConditionalGeneration, @@ -168,11 +100,6 @@ class MusicFlamingoForConditionalGenerationModelTest(ModelTesterMixin, Generatio if is_torch_available() else {} ) - _is_composite = True - - def setUp(self): - self.model_tester = MusicFlamingoModelTester(self) - self.config_tester = ConfigTester(self, config_class=MusicFlamingoConfig, has_text_modality=False) def test_rotary_window_axis_resets_per_audio(self): config = self.model_tester.get_config() @@ -246,48 +173,6 @@ def test_sdpa_can_dispatch_on_flash(self): def test_flash_attn_2_inference_equivalence_right_padding(self): pass - @unittest.skip(reason="MusicFlamingo has no separate base model without a head.") - def test_model_base_model_prefix(self): - pass - - def test_sdpa_can_dispatch_composite_models(self): - # MusicFlamingo is audio+text composite; verify SDPA toggles propagate to submodules. 
- if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - if not self._is_composite: - self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - # SDPA (default) - model_sdpa = model_class.from_pretrained(tmpdirname) - model_sdpa = model_sdpa.eval().to(torch_device) - - text_attn = "sdpa" if model.language_model._supports_sdpa else "eager" - audio_attn = "sdpa" if model.audio_tower._supports_sdpa else "eager" - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - self.assertTrue(model.language_model.config._attn_implementation == text_attn) - self.assertTrue(model.audio_tower.config._attn_implementation == audio_attn) - - # Eager - model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") - model_eager = model_eager.eval().to(torch_device) - self.assertTrue(model_eager.config._attn_implementation == "eager") - self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") - self.assertTrue(model_eager.audio_tower.config._attn_implementation == "eager") - - for _, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - @require_torch class MusicFlamingoForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py index 5733a4347568..1130220301ea 100644 --- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py +++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py @@ -23,7 +23,9 @@ from 
transformers import ( AutoProcessor, Qwen2AudioConfig, + Qwen2AudioEncoderConfig, Qwen2AudioForConditionalGeneration, + Qwen2Config, is_torch_available, ) from transformers.testing_utils import ( @@ -43,10 +45,36 @@ class Qwen2AudioModelTester(ALMModelTester): config_class = Qwen2AudioConfig conditional_generation_class = Qwen2AudioForConditionalGeneration + text_config_class = Qwen2Config + audio_config_class = Qwen2AudioEncoderConfig + + def __init__(self, parent, **kwargs): + # feat_seq_length=60 → after conv2 s=2: 30 → after avg_pool s=2: 15 audio embed tokens. + kwargs.setdefault("feat_seq_length", 60) + # Encoder asserts input_features.shape[-1] == max_source_positions * conv1.stride * conv2.stride == 2 * max_source_positions. + kwargs.setdefault("max_source_positions", kwargs["feat_seq_length"] // 2) + # Qwen2AudioEncoderConfig only maps `num_hidden_layers`; override remaining size knobs explicitly. + kwargs.setdefault("d_model", 32) + kwargs.setdefault("encoder_attention_heads", 2) + kwargs.setdefault("encoder_ffn_dim", 32) + super().__init__(parent, **kwargs) def get_audio_mask_key(self): return "feature_attention_mask" + def create_audio_mask(self): + # Qwen2Audio expects full-length mel input; mask with all 1s. + return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device) + + def get_audio_embeds_mask(self, audio_mask): + # Mirrors Qwen2AudioEncoder._get_feat_extract_output_lengths: conv2 (k=3,s=2,p=1) then avg_pool (k=2,s=2). 
+ input_lengths = audio_mask.sum(-1) + input_lengths = (input_lengths - 1) // 2 + 1 + output_lengths = (input_lengths - 2) // 2 + 1 + max_len = int(output_lengths.max().item()) + positions = torch.arange(max_len, device=audio_mask.device)[None, :] + return (positions < output_lengths[:, None]).long() + @require_torch class Qwen2AudioForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase): @@ -66,6 +94,12 @@ def test_sdpa_can_compile_dynamic(self): def test_sdpa_can_dispatch_on_flash(self): pass + @unittest.skip( + reason="inputs_embeds is the audio-fused path; can't match raw token-only embeddings." + ) + def test_inputs_embeds_matches_input_ids(self): + pass + @require_torch class Qwen2AudioForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/voxtral/test_modeling_voxtral.py b/tests/models/voxtral/test_modeling_voxtral.py index 0cff2a66779b..adc8b1bdc767 100644 --- a/tests/models/voxtral/test_modeling_voxtral.py +++ b/tests/models/voxtral/test_modeling_voxtral.py @@ -13,12 +13,13 @@ # limitations under the License. 
"""Testing suite for the PyTorch Voxtral model.""" -import tempfile import unittest from transformers import ( AutoProcessor, + LlamaConfig, VoxtralConfig, + VoxtralEncoderConfig, VoxtralForConditionalGeneration, is_torch_available, ) @@ -30,126 +31,53 @@ torch_device, ) -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -from ...test_pipeline_mixin import PipelineTesterMixin +from ...alm_tester import ALMModelTest, ALMModelTester if is_torch_available(): import torch -class VoxtralModelTester: - def __init__( - self, - parent, - ignore_index=-100, - audio_token_id=0, - seq_length=35, - feat_seq_length=60, - text_config={ - "model_type": "llama", - "intermediate_size": 36, - "initializer_range": 0.02, - "hidden_size": 32, - "max_position_embeddings": 52, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "num_key_value_heads": 2, - "use_labels": True, - "use_mrope": False, - "vocab_size": 99, - "head_dim": 8, - "pad_token_id": 1, # can't be the same as the audio token id - }, - is_training=True, - audio_config={ - "model_type": "voxtral_encoder", - "hidden_size": 16, - "num_attention_heads": 4, - "intermediate_size": 16, - "num_hidden_layers": 2, - "num_mel_bins": 80, - "max_source_positions": 30, - "initializer_range": 0.02, - }, - ): - self.parent = parent - self.ignore_index = ignore_index - self.audio_token_id = audio_token_id - self.text_config = text_config - self.audio_config = audio_config - self.seq_length = seq_length - self.feat_seq_length = feat_seq_length - - self.num_hidden_layers = text_config["num_hidden_layers"] - self.vocab_size = text_config["vocab_size"] - self.hidden_size = text_config["hidden_size"] - self.num_attention_heads = text_config["num_attention_heads"] - self.is_training = is_training - - self.batch_size = 3 - self.encoder_seq_length = seq_length - - def get_config(self): - return 
VoxtralConfig( - text_config=self.text_config, - audio_config=self.audio_config, - ignore_index=self.ignore_index, - audio_token_id=self.audio_token_id, - ) - - def prepare_config_and_inputs(self): - input_features_values = floats_tensor( - [ - self.batch_size, - self.audio_config["num_mel_bins"], - self.feat_seq_length, - ] - ) - config = self.get_config() - return config, input_features_values +class VoxtralModelTester(ALMModelTester): + config_class = VoxtralConfig + conditional_generation_class = VoxtralForConditionalGeneration + text_config_class = LlamaConfig + audio_config_class = VoxtralEncoderConfig - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_features_values = config_and_inputs - num_audio_tokens_per_batch_idx = 30 + def __init__(self, parent, **kwargs): + # seq_length 35 = BOS + 30 audio + 4 text (keeps column -2 text-only for resize test). + kwargs.setdefault("seq_length", 35) + # feat_seq_length 60 → conv2(s=2) → 30 audio embeds (Voxtral's encoder does not apply avg_pool + # in the forward; projector reshapes to B*30 embeddings). + kwargs.setdefault("feat_seq_length", 60) + # Encoder asserts input_features.shape[-1] == max_source_positions * 2. + kwargs.setdefault("max_source_positions", kwargs["feat_seq_length"] // 2) + # Llama needs head_dim + kwargs.setdefault("head_dim", 8) + super().__init__(parent, **kwargs) - input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1 - attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) - attention_mask[:, :1] = 0 + def get_audio_embeds_mask(self, audio_mask): + # Voxtral encoder only applies conv2 (stride 2); no avg_pool in forward. 
+ output_length = (self.feat_seq_length - 1) // 2 + 1 + return torch.ones([self.batch_size, output_length], dtype=torch.long).to(torch_device) - input_ids[:, 1 : 1 + num_audio_tokens_per_batch_idx] = config.audio_token_id - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "input_features": input_features_values, - } - return config, inputs_dict + def create_audio_mask(self): + return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device) @require_torch -class VoxtralForConditionalGenerationModelTest( - ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase -): +class VoxtralForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase): """ Model tester for `VoxtralForConditionalGeneration`. """ - all_model_classes = (VoxtralForConditionalGeneration,) if is_torch_available() else () + model_tester_class = VoxtralModelTester pipeline_model_mapping = ( {"text-to-speech": VoxtralForConditionalGeneration, "any-to-any": VoxtralForConditionalGeneration} if is_torch_available() else {} ) - _is_composite = True - - def setUp(self): - self.model_tester = VoxtralModelTester(self) - self.config_tester = ConfigTester(self, config_class=VoxtralConfig, has_text_modality=False) - @unittest.skip( reason="This test does not apply to Voxtral since inputs_embeds corresponding to audio tokens are replaced when input features are provided." 
) @@ -192,47 +120,6 @@ def test_flash_attention_3_padding_matches_padding_free_with_position_ids(self): def test_flash_attention_3_padding_matches_padding_free_with_position_ids_and_fa_kwargs(self): pass - @unittest.skip(reason="Voxtral has no separate base model without a head.") - def test_model_base_model_prefix(self): - pass - - def test_sdpa_can_dispatch_composite_models(self): - # overwrite because Voxtral is audio+text model (not vision+text) - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - if not self._is_composite: - self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model_sdpa = model_class.from_pretrained(tmpdirname) - model_sdpa = model_sdpa.eval().to(torch_device) - - text_attn = "sdpa" if model.language_model._supports_sdpa else "eager" - vision_attn = "sdpa" if model.audio_tower._supports_sdpa else "eager" - - # `None` as it is the requested one which will be assigned to each sub-config - # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - self.assertTrue(model.language_model.config._attn_implementation == text_attn) - self.assertTrue(model.audio_tower.config._attn_implementation == vision_attn) - - model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") - model_eager = model_eager.eval().to(torch_device) - self.assertTrue(model_eager.config._attn_implementation == "eager") - self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") - self.assertTrue(model_eager.audio_tower.config._attn_implementation == "eager") - - for name, submodule in 
model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - @require_torch class VoxtralForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py b/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py index 9aa817f3cba6..4d5b464236b2 100644 --- a/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py +++ b/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py @@ -24,6 +24,10 @@ is_torch_available, ) from transformers.audio_utils import load_audio +from transformers.models.voxtral_realtime.configuration_voxtral_realtime import ( + VoxtralRealtimeEncoderConfig, + VoxtralRealtimeTextConfig, +) from transformers.testing_utils import ( cleanup, require_torch, @@ -31,10 +35,8 @@ torch_device, ) -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -from ...test_pipeline_mixin import PipelineTesterMixin +from ...alm_tester import ALMModelTest, ALMModelTester +from ...test_modeling_common import floats_tensor, ids_tensor if is_datasets_available(): @@ -44,136 +46,89 @@ import torch -class VoxtralRealtimeModelTester: - def __init__( - self, - parent, - ignore_index=-100, - audio_token_id=0, - seq_length=5, - feat_seq_length=40, - text_config={ - "model_type": "voxtral_realtime_text", - "intermediate_size": 36, - "initializer_range": 0.02, - "hidden_size": 32, - "max_position_embeddings": 52, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "num_key_value_heads": 2, - "use_labels": True, - "vocab_size": 99, - "head_dim": 8, - "pad_token_id": 1, # can't be the same as the audio token id - "hidden_act": "silu", - "rms_norm_eps": 1e-6, - "attention_dropout": 0.0, 
- "rope_parameters": { - "rope_type": "default", - "rope_theta": 10000.0, - }, - }, - is_training=True, - audio_config={ - "model_type": "voxtral_realtime_encoder", - "hidden_size": 16, - "num_attention_heads": 4, - "num_key_value_heads": 2, - "intermediate_size": 64, - "encoder_layers": 2, - "num_mel_bins": 80, - "max_position_embeddings": 100, - "initializer_range": 0.02, - "rms_norm_eps": 1e-6, - "activation_function": "silu", - "activation_dropout": 0.0, - "attention_dropout": 0.0, - "head_dim": 4, - "rope_parameters": { - "rope_type": "default", - "rope_theta": 10000.0, - }, - }, - ): - self.parent = parent - self.ignore_index = ignore_index - self.audio_token_id = audio_token_id - self.text_config = text_config - self.audio_config = audio_config - self.seq_length = seq_length - self.feat_seq_length = feat_seq_length - - self.num_hidden_layers = text_config["num_hidden_layers"] - self.vocab_size = text_config["vocab_size"] - self.hidden_size = text_config["hidden_size"] - self.num_attention_heads = text_config["num_attention_heads"] - self.is_training = is_training - - self.batch_size = 3 - self.encoder_seq_length = seq_length - self._max_new_tokens = None # this is used to set - - def get_config(self): - return VoxtralRealtimeConfig( - text_config=self.text_config, - audio_config=self.audio_config, - ignore_index=self.ignore_index, - audio_token_id=self.audio_token_id, - ) - - def prepare_config_and_inputs(self): - if self._max_new_tokens is not None: - feat_seq_length = self.feat_seq_length + self._max_new_tokens * 8 - else: - feat_seq_length = self.feat_seq_length - - input_features_values = floats_tensor( - [ - self.batch_size, - self.audio_config["num_mel_bins"], - feat_seq_length, - ] - ) - config = self.get_config() - return config, input_features_values +class VoxtralRealtimeModelTester(ALMModelTester): + config_class = VoxtralRealtimeConfig + conditional_generation_class = VoxtralRealtimeForConditionalGeneration + text_config_class = 
VoxtralRealtimeTextConfig + audio_config_class = VoxtralRealtimeEncoderConfig + + def __init__(self, parent, **kwargs): + # VoxtralRealtime does additive audio/text fusion: seq_length must equal num_audio_embeds. + # With audio_length_per_tok=8 (config default), num_audio_embeds = feat_seq_length // 8. + kwargs.setdefault("seq_length", 32) + kwargs.setdefault("feat_seq_length", kwargs["seq_length"] * 8) + # Audio encoder uses RoPE; max position must cover post-conv length (feat_seq_length // 2). + kwargs.setdefault("max_position_embeddings", kwargs["feat_seq_length"]) + kwargs.setdefault("head_dim", 8) + kwargs.setdefault("rms_norm_eps", 1e-6) + kwargs.setdefault("activation_function", "silu") + kwargs.setdefault("hidden_act", "silu") + super().__init__(parent, **kwargs) + self._max_new_tokens = None + + def get_audio_embeds_mask(self, audio_mask): + # Causal conv2 (stride 2, left-pad 1): post_conv_len = feat_seq_length // 2. + # Projector reshapes by downsample_factor=4 → post_conv_len // downsample_factor embeds. + downsample_factor = 4 + effective_feat = self.feat_seq_length + (self._max_new_tokens or 0) * 8 + post_conv_len = effective_feat // 2 + output_length = post_conv_len // downsample_factor + return torch.ones([self.batch_size, output_length], dtype=torch.long).to(torch_device) + + def create_audio_features(self): + effective_feat = self.feat_seq_length + (self._max_new_tokens or 0) * 8 + return floats_tensor([self.batch_size, self.num_mel_bins, effective_feat]) + + def create_audio_mask(self): + effective_feat = self.feat_seq_length + (self._max_new_tokens or 0) * 8 + return torch.ones([self.batch_size, effective_feat], dtype=torch.bool).to(torch_device) + + def place_audio_tokens(self, input_ids, config, num_audio_tokens): + # VoxtralRealtime fuses audio additively over the whole sequence; no placeholder token required. 
+ input_ids = input_ids.clone() + input_ids[input_ids == self.audio_token_id] = self.pad_token_id + return input_ids def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_features_values = config_and_inputs - num_audio_tokens_per_batch_idx = 30 + # Custom pipeline: input_ids at seq_length, audio covers seq_length (+ max_new_tokens extras + # during generation so the model can slice future-token audio per decode step). We do not run + # the base-class `audio_embeds_mask.shape[1] <= seq_length` invariant because, for this model, + # audio embeds legitimately exceed input length during generation. + audio_features = self.create_audio_features() + audio_mask = self.create_audio_mask() + + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + special_tokens = [self.pad_token_id, self.bos_token_id, self.eos_token_id, self.audio_token_id] + for safe_id in range(self.vocab_size): + if safe_id not in special_tokens: + break + else: + raise ValueError("vocab_size too small for a non-special safe token.") + input_ids[input_ids == self.pad_token_id] = safe_id + input_ids[input_ids == self.eos_token_id] = safe_id - input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1 - attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) - attention_mask[:, :1] = 0 + config = self.get_config() + # place_audio_tokens is a no-op for this model; call for symmetry. 
+ input_ids = self.place_audio_tokens(input_ids, config, torch.tensor([self.seq_length] * self.batch_size)) + attention_mask = self.create_attention_mask(input_ids) - input_ids[:, 1 : 1 + num_audio_tokens_per_batch_idx] = config.audio_token_id - inputs_dict = { + return config, { "input_ids": input_ids, "attention_mask": attention_mask, - "input_features": input_features_values, + "input_features": audio_features, } - return config, inputs_dict @require_torch -class VoxtralRealtimeForConditionalGenerationModelTest( - ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase -): +class VoxtralRealtimeForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase): """ Model tester for `VoxtralRealtimeForConditionalGeneration`. """ additional_model_inputs = ["input_features"] - - all_model_classes = (VoxtralRealtimeForConditionalGeneration,) if is_torch_available() else () + model_tester_class = VoxtralRealtimeModelTester pipeline_model_mapping = {"any-to-any": VoxtralRealtimeForConditionalGeneration} if is_torch_available() else {} - _is_composite = True - - def setUp(self): - self.model_tester = VoxtralRealtimeModelTester(self) - self.config_tester = ConfigTester(self, config_class=VoxtralRealtimeConfig, has_text_modality=False) - def _with_max_new_tokens(max_new_tokens): def decorator(test_func): @functools.wraps(test_func) From a302c3ecf6923a176c1dfff562e267aa157c09e0 Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Mon, 20 Apr 2026 16:46:03 +0200 Subject: [PATCH 07/38] cleaner --- tests/models/audioflamingo3/test_modeling_audioflamingo3.py | 3 ++- tests/models/glmasr/test_modeling_glmasr.py | 3 --- tests/models/granite_speech/test_modeling_granite_speech.py | 4 ---- tests/models/musicflamingo/test_modeling_musicflamingo.py | 2 ++ tests/models/qwen2_audio/test_modeling_qwen2_audio.py | 4 +++- tests/models/voxtral/test_modeling_voxtral.py | 3 --- 
.../models/voxtral_realtime/test_modeling_voxtral_realtime.py | 4 ---- 7 files changed, 7 insertions(+), 16 deletions(-) diff --git a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py index 153c6ba11b52..0d3dd954dda2 100644 --- a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py +++ b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py @@ -48,7 +48,6 @@ class AudioFlamingo3ModelTester(ALMModelTester): text_config_class = Qwen2Config audio_config_class = AudioFlamingo3EncoderConfig - def __init__(self, parent, **kwargs): # feat_seq_length → (L-1)//2+1 after conv2 → (·-2)//2+1 after avg_pool, so # feat_seq_length=60 gives 15 audio embed tokens (fits inside seq_length=32 + BOS + text). @@ -62,6 +61,8 @@ def get_audio_mask_key(self): return "input_features_mask" def create_audio_mask(self): + # Full-length mask matches real processor output and lets the audio encoder dispatch to Flash + # Attention (which rejects non-null attn_masks) on `test_sdpa_can_dispatch_on_flash`. return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device) def get_audio_embeds_mask(self, audio_mask): diff --git a/tests/models/glmasr/test_modeling_glmasr.py b/tests/models/glmasr/test_modeling_glmasr.py index 8b93ad64337d..59d8e5969523 100644 --- a/tests/models/glmasr/test_modeling_glmasr.py +++ b/tests/models/glmasr/test_modeling_glmasr.py @@ -55,9 +55,6 @@ def __init__(self, parent, **kwargs): def get_audio_mask_key(self): return "input_features_mask" - def create_audio_mask(self): - return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device) - def get_audio_embeds_mask(self, audio_mask): # conv1 (s=1) preserves length; conv2 (s=2, k=3, p=1) halves; merge_factor=4 post-projector. 
audio_lengths = audio_mask.sum(-1) diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py index f7c76cb4093e..61b6d4db53d8 100644 --- a/tests/models/granite_speech/test_modeling_granite_speech.py +++ b/tests/models/granite_speech/test_modeling_granite_speech.py @@ -90,10 +90,6 @@ def __init__(self, parent, **kwargs): def create_audio_features(self): return floats_tensor([self.batch_size, self.sequence_dim, self.feature_dim]) - def create_audio_mask(self): - # Granite's encoder is fed the raw features; mask is all-ones over sequence_dim. - return torch.ones([self.batch_size, self.sequence_dim], dtype=torch.bool).to(torch_device) - def get_audio_embeds_mask(self, audio_mask): # Projector produces `num_audio_tokens` embeds per sample (fixed by window_size/downsample_rate). return torch.ones([self.batch_size, self.num_audio_tokens], dtype=torch.long).to(torch_device) diff --git a/tests/models/musicflamingo/test_modeling_musicflamingo.py b/tests/models/musicflamingo/test_modeling_musicflamingo.py index 9b8153705582..25e714fc30ec 100644 --- a/tests/models/musicflamingo/test_modeling_musicflamingo.py +++ b/tests/models/musicflamingo/test_modeling_musicflamingo.py @@ -67,6 +67,8 @@ def get_audio_mask_key(self): return "input_features_mask" def create_audio_mask(self): + # Deterministic full-length mask — base default uses unseeded Python `random`, which makes + # multi-call generation-comparison tests (e.g. assisted decoding vs greedy) flaky. 
return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device) def get_audio_embeds_mask(self, audio_mask): diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py index 1130220301ea..7e45ecfc4150 100644 --- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py +++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py @@ -63,7 +63,9 @@ def get_audio_mask_key(self): return "feature_attention_mask" def create_audio_mask(self): - # Qwen2Audio expects full-length mel input; mask with all 1s. + # Deterministic full-length mask: the base default randomizes via Python's `random`, which isn't + # re-seeded per test call and desynchronizes the two `prepare_config_and_inputs_for_common` + # invocations inside generation-comparison tests (e.g. test_greedy_generate_dict_outputs). return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device) def get_audio_embeds_mask(self, audio_mask): diff --git a/tests/models/voxtral/test_modeling_voxtral.py b/tests/models/voxtral/test_modeling_voxtral.py index adc8b1bdc767..4f0c604ce05f 100644 --- a/tests/models/voxtral/test_modeling_voxtral.py +++ b/tests/models/voxtral/test_modeling_voxtral.py @@ -61,9 +61,6 @@ def get_audio_embeds_mask(self, audio_mask): output_length = (self.feat_seq_length - 1) // 2 + 1 return torch.ones([self.batch_size, output_length], dtype=torch.long).to(torch_device) - def create_audio_mask(self): - return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device) - @require_torch class VoxtralForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase): diff --git a/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py b/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py index 4d5b464236b2..f9699479aac9 100644 --- a/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py +++ 
b/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py @@ -79,10 +79,6 @@ def create_audio_features(self): effective_feat = self.feat_seq_length + (self._max_new_tokens or 0) * 8 return floats_tensor([self.batch_size, self.num_mel_bins, effective_feat]) - def create_audio_mask(self): - effective_feat = self.feat_seq_length + (self._max_new_tokens or 0) * 8 - return torch.ones([self.batch_size, effective_feat], dtype=torch.bool).to(torch_device) - def place_audio_tokens(self, input_ids, config, num_audio_tokens): # VoxtralRealtime fuses audio additively over the whole sequence; no placeholder token required. input_ids = input_ids.clone() From 8fcba58d5f4377f8cc86a20626706121d2936ff8 Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Mon, 20 Apr 2026 17:28:54 +0200 Subject: [PATCH 08/38] updates --- tests/alm_tester.py | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/tests/alm_tester.py b/tests/alm_tester.py index 4c104e6dd49d..385382a13dc2 100644 --- a/tests/alm_tester.py +++ b/tests/alm_tester.py @@ -35,19 +35,21 @@ class ALMModelTester: # If the model follows standard naming conventions, only `config_class` and # `conditional_generation_class` need to be set (others are optional). + # base_model_class = None, this should be added when #45534 is merged config_class = None + text_config_class = None + audio_config_class = None conditional_generation_class = None - base_model_class = None sequence_classification_class = None - - # Key name for the audio sub-config in the main config constructor. - # Override to "encoder_config" for models like GraniteSpeech. - audio_config_key = "audio_config" + # These attributes are required after the initialization phase of the tester. + _required_attributes = ("config_class", "conditional_generation_class") # Arguments that should be passed to the config class even if not in its signature. 
forced_config_args = ["pad_token_id"] - _required_attributes = ("config_class", "conditional_generation_class") + # Key name for the audio sub-config in the main config constructor. + # Override to "encoder_config" for models like GraniteSpeech. + audio_config_key = "audio_config" @property def all_model_classes(self): @@ -63,7 +65,13 @@ def all_model_classes(self): @property def pipeline_model_mapping(self): - return {"any-to-any": self.conditional_generation_class} + # TODO: @eustlb, we don't have pipeline testing for audio-text-to-text + mapping = { + "feature-extraction": self.base_model_class, + # "audio-text-to-text": self.conditional_generation_class, + } + # TODO: should we add automatic-speech-recognition with a special flag? + return mapping def __init__(self, parent, **kwargs): self.parent = parent @@ -92,21 +100,11 @@ def __init__(self, parent, **kwargs): kwargs.setdefault("intermediate_size", 32) # Keep this divisible by 8 for fp16/bf16/fp32 16-bytes alignment kwargs.setdefault("hidden_act", "gelu") kwargs.setdefault("max_position_embeddings", 512) - - # Optional projector config (e.g. GraniteSpeech uses a Q-Former projector) - kwargs.setdefault("projector_config", None) # Set all kwargs as instance attributes for key, value in kwargs.items(): setattr(self, key, value) - # # Derived from text config (needed by ModelTesterMixin) - # self.vocab_size = self.text_config.get("vocab_size", 99) - # self.hidden_size = self.text_config.get("hidden_size", 32) - # self.num_hidden_layers = self.text_config.get("num_hidden_layers", 2) - # self.num_attention_heads = self.text_config.get("num_attention_heads", 4) - # self.encoder_seq_length = self.seq_length - for required_attribute in [ # "base_model_class", # TODO: @eustlb, there is a discrepancy here between ALMs/ VLMs. 
XXModel and XXForConditionalGeneration "config_class", @@ -192,7 +190,7 @@ def prepare_config_and_inputs_for_common(self): "This likely indicates a mismatch between your feature extraction/configuration and your sequence length. " "Please ensure `seq_length` is >= the number of audio embedding positions." ) - + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) special_tokens = [self.pad_token_id, self.bos_token_id, self.eos_token_id, self.audio_token_id] @@ -229,7 +227,7 @@ def prepare_config_and_inputs_for_common(self): @property def config_args(self): return list(signature(self.config_class.__init__).parameters.keys()) - + @property def text_config_args(self): args = list(signature(self.text_config_class.__init__).parameters.keys()) @@ -310,9 +308,7 @@ class ALMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin) def setUp(self): if self.model_tester_class is None: - raise ValueError( - "You have inherited from ALMModelTest but did not set the model_tester_class attribute." 
- ) + raise ValueError("You have inherited from ALMModelTest but did not set the model_tester_class attribute.") self.model_tester = self.model_tester_class(self) self.config_tester = ConfigTester(self, config_class=self.model_tester.config_class, has_text_modality=False) @@ -332,6 +328,11 @@ def test_config(self): """Test config common functionality.""" self.config_tester.run_common_tests() + # TODO: @eustlb, remove this once #45534 is merged @unittest.skip("Audio-LMs have no separate base model without a head.") def test_model_base_model_prefix(self): pass + + # TODO: @eustlb, add this + # def test_mismatching_num_audio_tokens(self): + # pass From 66acc9ed86067a8d52faa9ed80cbb5e964f1d0d5 Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Mon, 20 Apr 2026 22:04:43 +0200 Subject: [PATCH 09/38] audio_mask_key + updates --- .../configuration_granite_speech.py | 5 +++ tests/alm_tester.py | 14 +++----- .../test_modeling_audioflamingo3.py | 7 ++-- tests/models/glmasr/test_modeling_glmasr.py | 3 +- .../test_modeling_granite_speech.py | 33 +++++++------------ .../test_modeling_musicflamingo.py | 3 +- .../qwen2_audio/test_modeling_qwen2_audio.py | 7 ++-- .../test_modeling_voxtral_realtime.py | 1 - 8 files changed, 27 insertions(+), 46 deletions(-) diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py index d02ac9998696..7d922992a10f 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -53,6 +53,11 @@ class GraniteSpeechEncoderConfig(PreTrainedConfig): ```""" model_type = "granite_speech_encoder" + attribute_map = { + "hidden_size": "hidden_dim", + "num_hidden_layers": "num_layers", + "num_attention_heads": "num_heads", + } input_dim: int = 160 num_layers: int = 10 diff --git a/tests/alm_tester.py b/tests/alm_tester.py index 
385382a13dc2..5ab4b76ce95b 100644 --- a/tests/alm_tester.py +++ b/tests/alm_tester.py @@ -35,7 +35,7 @@ class ALMModelTester: # If the model follows standard naming conventions, only `config_class` and # `conditional_generation_class` need to be set (others are optional). - # base_model_class = None, this should be added when #45534 is merged + base_model_class = None, # this should be added for most models when #45534 is merged config_class = None text_config_class = None audio_config_class = None @@ -50,6 +50,7 @@ class ALMModelTester: # Key name for the audio sub-config in the main config constructor. # Override to "encoder_config" for models like GraniteSpeech. audio_config_key = "audio_config" + audio_mask_key = None # to be set if audio-related mask has to be passed to the model's forward @property def all_model_classes(self): @@ -149,11 +150,7 @@ def place_audio_tokens(self, input_ids, config, num_audio_tokens): def get_audio_feature_key(self): """Key name for audio features in the inputs dict.""" - return "input_features" - - def get_audio_mask_key(self): - """Key name for audio attention mask. Return None if no audio mask needed.""" - return None + return "input_features" def create_audio_mask(self): """Create audio-level attention mask with contiguous valid regions per batch element. 
@@ -217,9 +214,8 @@ def prepare_config_and_inputs_for_common(self): self.get_audio_feature_key(): audio_features, } - audio_mask_key = self.get_audio_mask_key() - if audio_mask_key is not None: - inputs_dict[audio_mask_key] = audio_mask + if self.audio_mask_key is not None: + inputs_dict[self.audio_mask_key] = audio_mask inputs_dict.update(self.get_additional_inputs(config, input_ids, audio_features)) return config, inputs_dict diff --git a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py index 0d3dd954dda2..db17a400cab8 100644 --- a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py +++ b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py @@ -18,14 +18,12 @@ import unittest from pathlib import Path -import pytest - from transformers import ( AudioFlamingo3Config, AudioFlamingo3EncoderConfig, - Qwen2Config, AudioFlamingo3ForConditionalGeneration, AutoProcessor, + Qwen2Config, is_torch_available, ) from transformers.testing_utils import ( @@ -57,8 +55,7 @@ def __init__(self, parent, **kwargs): kwargs.setdefault("max_source_positions", (kwargs["feat_seq_length"] - 1) // 2 + 1) super().__init__(parent, **kwargs) - def get_audio_mask_key(self): - return "input_features_mask" + audio_mask_key = "input_features_mask" def create_audio_mask(self): # Full-length mask matches real processor output and lets the audio encoder dispatch to Flash diff --git a/tests/models/glmasr/test_modeling_glmasr.py b/tests/models/glmasr/test_modeling_glmasr.py index 59d8e5969523..5606f1c75fac 100644 --- a/tests/models/glmasr/test_modeling_glmasr.py +++ b/tests/models/glmasr/test_modeling_glmasr.py @@ -52,8 +52,7 @@ def __init__(self, parent, **kwargs): kwargs.setdefault("head_dim", 8) super().__init__(parent, **kwargs) - def get_audio_mask_key(self): - return "input_features_mask" + audio_mask_key = "input_features_mask" def get_audio_embeds_mask(self, audio_mask): # conv1 (s=1) preserves length; conv2 
(s=2, k=3, p=1) halves; merge_factor=4 post-projector. diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py index 61b6d4db53d8..dd36955f469a 100644 --- a/tests/models/granite_speech/test_modeling_granite_speech.py +++ b/tests/models/granite_speech/test_modeling_granite_speech.py @@ -56,35 +56,24 @@ class GraniteSpeechModelTester(ALMModelTester): def __init__(self, parent, **kwargs): kwargs.setdefault("seq_length", 9) # 7 text + 2 audio tokens + kwargs.setdefault("num_audio_tokens", 2) kwargs.setdefault("sequence_dim", 844) kwargs.setdefault("feature_dim", 160) kwargs.setdefault("has_lora_adapter", True) kwargs.setdefault("downsample_rate", 5) kwargs.setdefault("window_size", 15) - # GraniteSpeechEncoderConfig fields (no attribute_map, so set explicitly). - kwargs.setdefault("input_dim", 160) - kwargs.setdefault("num_layers", 2) - kwargs.setdefault("hidden_dim", 32) - kwargs.setdefault("num_heads", 4) kwargs.setdefault("dim_head", 8) - kwargs.setdefault("feedforward_mult", 4) - kwargs.setdefault("context_size", 200) - kwargs.setdefault("conv_kernel_size", 15) - kwargs.setdefault("conv_expansion_factor", 2) - kwargs.setdefault("output_dim", 42) - # Q-Former projector config (passed through as a dict; ALM's get_config forwards unknowns). 
- kwargs.setdefault( - "projector_config", - { - "model_type": "blip_2_qformer", - "hidden_size": 32, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "intermediate_size": 256, - "encoder_hidden_size": 32, - }, - ) + + kwargs["projector_config"] = { + "model_type": "blip_2_qformer", + "hidden_size": 32, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 256, + "encoder_hidden_size": 32, + } + super().__init__(parent, **kwargs) def create_audio_features(self): diff --git a/tests/models/musicflamingo/test_modeling_musicflamingo.py b/tests/models/musicflamingo/test_modeling_musicflamingo.py index 25e714fc30ec..19da6506d1ba 100644 --- a/tests/models/musicflamingo/test_modeling_musicflamingo.py +++ b/tests/models/musicflamingo/test_modeling_musicflamingo.py @@ -63,8 +63,7 @@ def __init__(self, parent, **kwargs): kwargs.setdefault("max_source_positions", (kwargs["feat_seq_length"] - 1) // 2 + 1) super().__init__(parent, **kwargs) - def get_audio_mask_key(self): - return "input_features_mask" + audio_mask_key = "input_features_mask" def create_audio_mask(self): # Deterministic full-length mask — base default uses unseeded Python `random`, which makes diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py index 7e45ecfc4150..b3010fa82539 100644 --- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py +++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py @@ -59,8 +59,7 @@ def __init__(self, parent, **kwargs): kwargs.setdefault("encoder_ffn_dim", 32) super().__init__(parent, **kwargs) - def get_audio_mask_key(self): - return "feature_attention_mask" + audio_mask_key = "feature_attention_mask" def create_audio_mask(self): # Deterministic full-length mask: the base default randomizes via Python's `random`, which isn't @@ -96,9 +95,7 @@ def test_sdpa_can_compile_dynamic(self): def test_sdpa_can_dispatch_on_flash(self): pass - @unittest.skip( - reason="inputs_embeds is 
the audio-fused path; can't match raw token-only embeddings." - ) + @unittest.skip(reason="inputs_embeds is the audio-fused path; can't match raw token-only embeddings.") def test_inputs_embeds_matches_input_ids(self): pass diff --git a/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py b/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py index f9699479aac9..86682cd558a0 100644 --- a/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py +++ b/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py @@ -91,7 +91,6 @@ def prepare_config_and_inputs_for_common(self): # the base-class `audio_embeds_mask.shape[1] <= seq_length` invariant because, for this model, # audio embeds legitimately exceed input length during generation. audio_features = self.create_audio_features() - audio_mask = self.create_audio_mask() input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) special_tokens = [self.pad_token_id, self.bos_token_id, self.eos_token_id, self.audio_token_id] From 63ca77e01e50951d999c5614214260e74e5234de Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Mon, 20 Apr 2026 22:08:14 +0200 Subject: [PATCH 10/38] typo --- tests/alm_tester.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/alm_tester.py b/tests/alm_tester.py index 5ab4b76ce95b..94e480e74b72 100644 --- a/tests/alm_tester.py +++ b/tests/alm_tester.py @@ -35,7 +35,7 @@ class ALMModelTester: # If the model follows standard naming conventions, only `config_class` and # `conditional_generation_class` need to be set (others are optional). 
- base_model_class = None, # this should be added for most models when #45534 is merged + base_model_class = None # this should be added for most models when #45534 is merged config_class = None text_config_class = None audio_config_class = None From 7588135e2623f693052ea709c390fbf2651a56f6 Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Mon, 20 Apr 2026 22:41:02 +0200 Subject: [PATCH 11/38] simplify granite speech --- .../configuration_granite_speech.py | 7 ++++++- .../test_modeling_granite_speech.py | 21 +++++++------------ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py index 7d922992a10f..dbdda02ccdb9 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -57,6 +57,7 @@ class GraniteSpeechEncoderConfig(PreTrainedConfig): "hidden_size": "hidden_dim", "num_hidden_layers": "num_layers", "num_attention_heads": "num_heads", + "num_mel_bins": "input_dim", } input_dim: int = 160 @@ -64,7 +65,7 @@ class GraniteSpeechEncoderConfig(PreTrainedConfig): hidden_dim: int = 1024 feedforward_mult: int = 4 num_heads: int = 8 - dim_head: int = 128 + dim_head: int | None = None output_dim: int = 42 context_size: int = 200 max_pos_emb: int = 512 @@ -72,6 +73,10 @@ class GraniteSpeechEncoderConfig(PreTrainedConfig): conv_kernel_size: int = 15 conv_expansion_factor: int = 2 + def __post_init__(self, **kwargs): + super().__post_init__(**kwargs) + if self.dim_head is None: + self.dim_head = self.hidden_dim // self.num_heads @auto_docstring(checkpoint="ibm-granite/granite-speech-3.3-2b") @strict diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py index dd36955f469a..18f07fc71bef 100644 --- 
a/tests/models/granite_speech/test_modeling_granite_speech.py +++ b/tests/models/granite_speech/test_modeling_granite_speech.py @@ -55,16 +55,6 @@ class GraniteSpeechModelTester(ALMModelTester): audio_config_key = "encoder_config" def __init__(self, parent, **kwargs): - kwargs.setdefault("seq_length", 9) # 7 text + 2 audio tokens - - kwargs.setdefault("num_audio_tokens", 2) - kwargs.setdefault("sequence_dim", 844) - kwargs.setdefault("feature_dim", 160) - kwargs.setdefault("has_lora_adapter", True) - kwargs.setdefault("downsample_rate", 5) - kwargs.setdefault("window_size", 15) - kwargs.setdefault("dim_head", 8) - kwargs["projector_config"] = { "model_type": "blip_2_qformer", "hidden_size": 32, @@ -77,11 +67,16 @@ def __init__(self, parent, **kwargs): super().__init__(parent, **kwargs) def create_audio_features(self): - return floats_tensor([self.batch_size, self.sequence_dim, self.feature_dim]) + # GraniteSpeech expects [B, seq_len, features] (time-first), unlike the standard [B, features, seq_len] + return floats_tensor([self.batch_size, self.feat_seq_length, self.num_mel_bins]) def get_audio_embeds_mask(self, audio_mask): - # Projector produces `num_audio_tokens` embeds per sample (fixed by window_size/downsample_rate). - return torch.ones([self.batch_size, self.num_audio_tokens], dtype=torch.long).to(torch_device) + # Projector: ceil(feat_seq_length / window_size) * (window_size // downsample_rate) tokens per sample. 
+ import math + + nblocks = math.ceil(self.feat_seq_length / self.window_size) + num_audio_tokens = nblocks * (self.window_size // self.downsample_rate) + return torch.ones([self.batch_size, num_audio_tokens], dtype=torch.long).to(torch_device) def create_attention_mask(self, input_ids): return torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) From 41fed1c820e2745e7c5c9f9bfb5dbfa2aca751a6 Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Mon, 20 Apr 2026 22:45:29 +0200 Subject: [PATCH 12/38] nits --- tests/models/audioflamingo3/test_modeling_audioflamingo3.py | 3 +-- tests/models/glmasr/test_modeling_glmasr.py | 3 +-- tests/models/musicflamingo/test_modeling_musicflamingo.py | 3 +-- tests/models/qwen2_audio/test_modeling_qwen2_audio.py | 3 +-- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py index db17a400cab8..9629fe3ba086 100644 --- a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py +++ b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py @@ -45,6 +45,7 @@ class AudioFlamingo3ModelTester(ALMModelTester): conditional_generation_class = AudioFlamingo3ForConditionalGeneration text_config_class = Qwen2Config audio_config_class = AudioFlamingo3EncoderConfig + audio_mask_key = "input_features_mask" def __init__(self, parent, **kwargs): # feat_seq_length → (L-1)//2+1 after conv2 → (·-2)//2+1 after avg_pool, so @@ -55,8 +56,6 @@ def __init__(self, parent, **kwargs): kwargs.setdefault("max_source_positions", (kwargs["feat_seq_length"] - 1) // 2 + 1) super().__init__(parent, **kwargs) - audio_mask_key = "input_features_mask" - def create_audio_mask(self): # Full-length mask matches real processor output and lets the audio encoder dispatch to Flash # Attention (which rejects non-null attn_masks) on `test_sdpa_can_dispatch_on_flash`. 
diff --git a/tests/models/glmasr/test_modeling_glmasr.py b/tests/models/glmasr/test_modeling_glmasr.py index 5606f1c75fac..76e4cd5cc6b5 100644 --- a/tests/models/glmasr/test_modeling_glmasr.py +++ b/tests/models/glmasr/test_modeling_glmasr.py @@ -44,6 +44,7 @@ class GlmAsrModelTester(ALMModelTester): conditional_generation_class = GlmAsrForConditionalGeneration text_config_class = LlamaConfig audio_config_class = GlmAsrEncoderConfig + audio_mask_key = "input_features_mask" def __init__(self, parent, **kwargs): # feat_seq_length=64 → conv2 (s=2): post_conv=32 → merge_factor=4: 8 audio embed tokens. @@ -52,8 +53,6 @@ def __init__(self, parent, **kwargs): kwargs.setdefault("head_dim", 8) super().__init__(parent, **kwargs) - audio_mask_key = "input_features_mask" - def get_audio_embeds_mask(self, audio_mask): # conv1 (s=1) preserves length; conv2 (s=2, k=3, p=1) halves; merge_factor=4 post-projector. audio_lengths = audio_mask.sum(-1) diff --git a/tests/models/musicflamingo/test_modeling_musicflamingo.py b/tests/models/musicflamingo/test_modeling_musicflamingo.py index 19da6506d1ba..6996ff4ccb71 100644 --- a/tests/models/musicflamingo/test_modeling_musicflamingo.py +++ b/tests/models/musicflamingo/test_modeling_musicflamingo.py @@ -56,6 +56,7 @@ class MusicFlamingoModelTester(ALMModelTester): conditional_generation_class = MusicFlamingoForConditionalGeneration text_config_class = Qwen2Config audio_config_class = AudioFlamingo3EncoderConfig + audio_mask_key = "input_features_mask" def __init__(self, parent, **kwargs): # feat_seq_length=60 → (60-1)//2+1=30 → (30-2)//2+1=15 audio embed tokens. @@ -63,8 +64,6 @@ def __init__(self, parent, **kwargs): kwargs.setdefault("max_source_positions", (kwargs["feat_seq_length"] - 1) // 2 + 1) super().__init__(parent, **kwargs) - audio_mask_key = "input_features_mask" - def create_audio_mask(self): # Deterministic full-length mask — base default uses unseeded Python `random`, which makes # multi-call generation-comparison tests (e.g. 
assisted decoding vs greedy) flaky. diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py index b3010fa82539..ade43ffabf39 100644 --- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py +++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py @@ -47,6 +47,7 @@ class Qwen2AudioModelTester(ALMModelTester): conditional_generation_class = Qwen2AudioForConditionalGeneration text_config_class = Qwen2Config audio_config_class = Qwen2AudioEncoderConfig + audio_mask_key = "feature_attention_mask" def __init__(self, parent, **kwargs): # feat_seq_length=60 → after conv2 s=2: 30 → after avg_pool s=2: 15 audio embed tokens. @@ -59,8 +60,6 @@ def __init__(self, parent, **kwargs): kwargs.setdefault("encoder_ffn_dim", 32) super().__init__(parent, **kwargs) - audio_mask_key = "feature_attention_mask" - def create_audio_mask(self): # Deterministic full-length mask: the base default randomizes via Python's `random`, which isn't # re-seeded per test call and desynchronizes the two `prepare_config_and_inputs_for_common` From e5971c7fab1a7e33ee64c15d62475e7cedf8224b Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Mon, 20 Apr 2026 23:05:07 +0200 Subject: [PATCH 13/38] some more cleaning --- .../models/qwen2_audio/configuration_qwen2_audio.py | 7 ++++++- tests/models/glmasr/test_modeling_glmasr.py | 3 --- tests/models/qwen2_audio/test_modeling_qwen2_audio.py | 4 ---- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py b/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py index a617f33e6177..6aec9eace900 100644 --- a/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py +++ b/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py @@ -42,7 +42,12 @@ class Qwen2AudioEncoderConfig(PreTrainedConfig): ```""" model_type = "qwen2_audio_encoder" - attribute_map = 
{"num_hidden_layers": "encoder_layers"} + attribute_map = { + "num_hidden_layers": "encoder_layers", + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + "intermediate_size": "encoder_ffn_dim", + } num_mel_bins: int = 128 encoder_layers: int = 32 diff --git a/tests/models/glmasr/test_modeling_glmasr.py b/tests/models/glmasr/test_modeling_glmasr.py index 76e4cd5cc6b5..0b2aae719d19 100644 --- a/tests/models/glmasr/test_modeling_glmasr.py +++ b/tests/models/glmasr/test_modeling_glmasr.py @@ -47,9 +47,6 @@ class GlmAsrModelTester(ALMModelTester): audio_mask_key = "input_features_mask" def __init__(self, parent, **kwargs): - # feat_seq_length=64 → conv2 (s=2): post_conv=32 → merge_factor=4: 8 audio embed tokens. - kwargs.setdefault("feat_seq_length", 64) - kwargs.setdefault("seq_length", 35) kwargs.setdefault("head_dim", 8) super().__init__(parent, **kwargs) diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py index ade43ffabf39..fc73d6dca607 100644 --- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py +++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py @@ -54,10 +54,6 @@ def __init__(self, parent, **kwargs): kwargs.setdefault("feat_seq_length", 60) # Encoder asserts input_features.shape[-1] == max_source_positions * conv1.stride * conv2.stride == 2 * max_source_positions. kwargs.setdefault("max_source_positions", kwargs["feat_seq_length"] // 2) - # Qwen2AudioEncoderConfig only maps `num_hidden_layers`; override remaining size knobs explicitly. 
- kwargs.setdefault("d_model", 32) - kwargs.setdefault("encoder_attention_heads", 2) - kwargs.setdefault("encoder_ffn_dim", 32) super().__init__(parent, **kwargs) def create_audio_mask(self): From 59703ddd3eab7cb978272dd7d83190620df02c20 Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Tue, 21 Apr 2026 17:57:32 +0200 Subject: [PATCH 14/38] add test_mismatching_num_audio_tokens --- tests/alm_tester.py | 87 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 75 insertions(+), 12 deletions(-) diff --git a/tests/alm_tester.py b/tests/alm_tester.py index 94e480e74b72..340aee77df5c 100644 --- a/tests/alm_tester.py +++ b/tests/alm_tester.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import unittest from inspect import signature @@ -145,12 +146,18 @@ def place_audio_tokens(self, input_ids, config, num_audio_tokens): input_ids[input_ids == self.audio_token_id] = self.pad_token_id for i in range(input_ids.shape[0]): n = num_audio_tokens[i].item() if isinstance(num_audio_tokens, torch.Tensor) else num_audio_tokens + if 1 + int(n) > self.seq_length: + raise ValueError( + f"Cannot place {int(n)} audio tokens after BOS in a sequence of length {self.seq_length}. " + "This likely indicates a mismatch between your feature extraction/configuration and your sequence length. " + "Please ensure `seq_length` is >= the number of audio embedding positions + 1." + ) input_ids[i, 1 : 1 + int(n)] = self.audio_token_id return input_ids def get_audio_feature_key(self): """Key name for audio features in the inputs dict.""" - return "input_features" + return "input_features" def create_audio_mask(self): """Create audio-level attention mask with contiguous valid regions per batch element. 
@@ -180,14 +187,6 @@ def prepare_config_and_inputs_for_common(self): audio_mask = self.create_audio_mask() audio_embeds_mask = self.get_audio_embeds_mask(audio_mask) - if audio_embeds_mask.shape[1] > self.seq_length: - raise ValueError( - f"`audio_embeds_mask` has more tokens per sequence than `seq_length` allows " - f"({audio_embeds_mask.shape[1]} > {self.seq_length}). " - "This likely indicates a mismatch between your feature extraction/configuration and your sequence length. " - "Please ensure `seq_length` is >= the number of audio embedding positions." - ) - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) special_tokens = [self.pad_token_id, self.bos_token_id, self.eos_token_id, self.audio_token_id] @@ -329,6 +328,70 @@ def test_config(self): def test_model_base_model_prefix(self): pass - # TODO: @eustlb, add this - # def test_mismatching_num_audio_tokens(self): - # pass + def test_mismatching_num_audio_tokens(self): + """ + Tests that ALMs throw an error with explicit message saying what is wrong + when number of audios don't match number of audio tokens in the text. + Also we need to test multi-audio cases when one prompt has multiple audio tokens. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + audio_feature_key = self.model_tester.get_audio_feature_key() + audio_mask_key = self.model_tester.audio_mask_key + + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + model.eval() + curr_input_dict = copy.deepcopy(input_dict) + _ = model(**curr_input_dict) # successful forward with no modifications + + # Test 1: remove one audio but leave the audio tokens in the text + curr_input_dict[audio_feature_key] = curr_input_dict[audio_feature_key][-1:, ...] + if audio_mask_key is not None: + curr_input_dict[audio_mask_key] = curr_input_dict[audio_mask_key][-1:, ...] 
+ with self.assertRaises(ValueError): + _ = model(**curr_input_dict) + + # Test 2: add one audio but leave the audio tokens in the text + curr_input_dict = copy.deepcopy(input_dict) + curr_input_dict[audio_feature_key] = torch.cat( + [curr_input_dict[audio_feature_key], curr_input_dict[audio_feature_key][:1, ...]], dim=0 + ) + if audio_mask_key is not None: + curr_input_dict[audio_mask_key] = torch.cat( + [curr_input_dict[audio_mask_key], curr_input_dict[audio_mask_key][:1, ...]], dim=0 + ) + with self.assertRaises(ValueError): + _ = model(**curr_input_dict) + + # Test 3: duplicate the text along the seq dim so each prompt has twice as many + # audio tokens, while leaving the audio features unchanged -> mismatch + curr_input_dict = copy.deepcopy(input_dict) + curr_input_dict["input_ids"] = torch.cat( + [curr_input_dict["input_ids"], curr_input_dict["input_ids"]], dim=1 + ) + curr_input_dict["attention_mask"] = torch.cat( + [curr_input_dict["attention_mask"], curr_input_dict["attention_mask"]], dim=1 + ) + with self.assertRaises(ValueError): + _ = model(**curr_input_dict) + + # Test 4: multi-audio valid case. A prompt may contain multiple audio segments; + # all audio segments are concatenated along the batch dim on the audio side. + # Duplicating input_ids along seq dim (-> [audios, audios] per prompt) and the + # audio features along batch dim (-> batch_size * 2) must forward successfully. 
+ curr_input_dict = copy.deepcopy(input_dict) + curr_input_dict["input_ids"] = torch.cat( + [curr_input_dict["input_ids"], curr_input_dict["input_ids"]], dim=1 + ) + curr_input_dict["attention_mask"] = torch.cat( + [curr_input_dict["attention_mask"], curr_input_dict["attention_mask"]], dim=1 + ) + curr_input_dict[audio_feature_key] = torch.cat( + [curr_input_dict[audio_feature_key], curr_input_dict[audio_feature_key]], dim=0 + ) + if audio_mask_key is not None: + curr_input_dict[audio_mask_key] = torch.cat( + [curr_input_dict[audio_mask_key], curr_input_dict[audio_mask_key]], dim=0 + ) + _ = model(**curr_input_dict) + From 6a67f32b5d4e58b55fab9858fea6afd41573deea Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Tue, 21 Apr 2026 18:00:14 +0200 Subject: [PATCH 15/38] add get_placeholder_mask --- .../audioflamingo3/modeling_audioflamingo3.py | 32 ++++++++++++++--- .../audioflamingo3/modular_audioflamingo3.py | 6 ++-- .../models/glmasr/modeling_glmasr.py | 32 ++++++++++++++--- .../granite_speech/modeling_granite_speech.py | 36 ++++++++++++++----- .../models/voxtral/modeling_voxtral.py | 32 ++++++++++++++--- .../models/voxtral/modular_voxtral.py | 32 ++++++++++++++--- 6 files changed, 142 insertions(+), 28 deletions(-) diff --git a/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py b/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py index 1fbbc733c308..43028ab1c74c 100644 --- a/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py +++ b/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py @@ -34,7 +34,7 @@ from ...modeling_outputs import BaseModelOutputWithPooling, CausalLMOutputWithPast from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack -from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging, 
torch_compilable_check from ...utils.generic import merge_with_config_defaults from ...utils.output_capturing import capture_outputs from ..auto import AutoModel, AutoModelForCausalLM @@ -473,6 +473,30 @@ def get_audio_features( return audio_output + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_audio_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_audio_mask = special_audio_mask.all(-1) + else: + special_audio_mask = input_ids == self.config.audio_token_id + + n_audio_tokens = special_audio_mask.sum() + n_audio_features = audio_features.shape[0] + special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + torch_compilable_check( + inputs_embeds[special_audio_mask].numel() == audio_features.numel(), + f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}", + ) + return special_audio_mask + @can_return_tuple @auto_docstring def forward( @@ -559,10 +583,10 @@ def forward( audio_embeds = self.get_audio_features(input_features, input_features_mask, return_dict=True).pooler_output # replace text-audio token placeholders with audio embeddings - audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1) - inputs_embeds = inputs_embeds.masked_scatter( - audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device) + special_audio_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, audio_features=audio_embeds ) + inputs_embeds = 
inputs_embeds.masked_scatter(special_audio_mask, audio_embeds.to(inputs_embeds.device)) outputs: CausalLMOutputWithPast = self.language_model( inputs_embeds=inputs_embeds, diff --git a/src/transformers/models/audioflamingo3/modular_audioflamingo3.py b/src/transformers/models/audioflamingo3/modular_audioflamingo3.py index c325bc85300e..20cf2189bffd 100644 --- a/src/transformers/models/audioflamingo3/modular_audioflamingo3.py +++ b/src/transformers/models/audioflamingo3/modular_audioflamingo3.py @@ -269,10 +269,10 @@ def forward( audio_embeds = self.get_audio_features(input_features, input_features_mask, return_dict=True).pooler_output # replace text-audio token placeholders with audio embeddings - audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1) - inputs_embeds = inputs_embeds.masked_scatter( - audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device) + special_audio_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, audio_features=audio_embeds ) + inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_embeds.to(inputs_embeds.device)) outputs: CausalLMOutputWithPast = self.language_model( inputs_embeds=inputs_embeds, diff --git a/src/transformers/models/glmasr/modeling_glmasr.py b/src/transformers/models/glmasr/modeling_glmasr.py index aff96cad3217..8b15a9241522 100644 --- a/src/transformers/models/glmasr/modeling_glmasr.py +++ b/src/transformers/models/glmasr/modeling_glmasr.py @@ -30,7 +30,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack -from ...utils import TransformersKwargs, auto_docstring, is_torch_available +from ...utils import TransformersKwargs, auto_docstring, is_torch_available, torch_compilable_check from ...utils.generic import can_return_tuple, maybe_autocast, merge_with_config_defaults from ...utils.output_capturing 
import capture_outputs from ..auto import AutoModel, AutoModelForCausalLM @@ -425,6 +425,30 @@ def get_audio_features( return audio_outputs + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_audio_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_audio_mask = special_audio_mask.all(-1) + else: + special_audio_mask = input_ids == self.config.audio_token_id + + n_audio_tokens = special_audio_mask.sum() + n_audio_features = audio_features.shape[0] + special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + torch_compilable_check( + inputs_embeds[special_audio_mask].numel() == audio_features.numel(), + f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}", + ) + return special_audio_mask + @can_return_tuple @auto_docstring def forward( @@ -477,10 +501,10 @@ def forward( audio_embeds = self.get_audio_features(input_features, input_features_mask, return_dict=True).pooler_output # replace text-audio token placeholders with audio embeddings - audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1) - inputs_embeds = inputs_embeds.masked_scatter( - audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device) + special_audio_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, audio_features=audio_embeds ) + inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_embeds.to(inputs_embeds.device)) outputs: CausalLMOutputWithPast = self.language_model( 
inputs_embeds=inputs_embeds, diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 0fbc1d1035bf..b417f844b428 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -514,6 +514,30 @@ def prepare_inputs_for_generation( model_inputs["input_features"] = input_features return model_inputs + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_audio_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_audio_mask = special_audio_mask.all(-1) + else: + special_audio_mask = input_ids == self.config.audio_token_id + + n_audio_tokens = special_audio_mask.sum() + n_audio_features = audio_features.shape[0] + special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + torch_compilable_check( + inputs_embeds[special_audio_mask].numel() == audio_features.numel(), + f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}", + ) + return special_audio_mask + def get_merged_audio_embeddings( self, input_ids: torch.Tensor, audio_features: torch.Tensor, input_features_mask: torch.Tensor | None = None ) -> torch.Tensor: @@ -534,20 +558,14 @@ def get_merged_audio_embeddings( llm_input_ids = torch.where(is_audio_index, 0, input_ids) inputs_embeds = self.language_model.get_input_embeddings()(llm_input_ids) # [bsz, # features, hidden size] - # Mask the audio 
features into the text embeddings - special_audio_mask = is_audio_index.unsqueeze(-1) audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype) if input_features_mask is not None: - torch_compilable_check( - not torch.all(is_audio_index.int().sum(dim=1) != input_features_mask.int().sum(dim=1)), - "Number of audio tokens does not match number of audio features", - ) audio_features = audio_features[input_features_mask] - inputs_embeds = inputs_embeds.masked_scatter( - special_audio_mask, - audio_features, + special_audio_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, audio_features=audio_features ) + inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_features) return inputs_embeds def generate(self, *args, **kwargs) -> torch.LongTensor: diff --git a/src/transformers/models/voxtral/modeling_voxtral.py b/src/transformers/models/voxtral/modeling_voxtral.py index 76da78cc558f..54466321b79e 100644 --- a/src/transformers/models/voxtral/modeling_voxtral.py +++ b/src/transformers/models/voxtral/modeling_voxtral.py @@ -32,7 +32,7 @@ from ...modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling, CausalLMOutputWithPast from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack -from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_compilable_check from ...utils.generic import merge_with_config_defaults from ...utils.output_capturing import capture_outputs from ..auto import AutoModel, AutoModelForCausalLM @@ -418,6 +418,30 @@ def get_audio_features( return audio_outputs + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder 
token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_audio_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_audio_mask = special_audio_mask.all(-1) + else: + special_audio_mask = input_ids == self.config.audio_token_id + + n_audio_tokens = special_audio_mask.sum() + n_audio_features = audio_features.shape[0] + special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + torch_compilable_check( + inputs_embeds[special_audio_mask].numel() == audio_features.numel(), + f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}", + ) + return special_audio_mask + @can_return_tuple @auto_docstring def forward( @@ -473,10 +497,10 @@ def forward( audio_embeds = self.get_audio_features(input_features, return_dict=True).pooler_output # replace text-audio token placeholders with audio embeddings - audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1) - inputs_embeds = inputs_embeds.masked_scatter( - audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device) + special_audio_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, audio_features=audio_embeds ) + inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_embeds.to(inputs_embeds.device)) outputs: BaseModelOutputWithPast = self.language_model( attention_mask=attention_mask, diff --git a/src/transformers/models/voxtral/modular_voxtral.py b/src/transformers/models/voxtral/modular_voxtral.py index c7b2c53e16d4..02e8e2806a0f 100644 --- a/src/transformers/models/voxtral/modular_voxtral.py +++ b/src/transformers/models/voxtral/modular_voxtral.py @@ -25,7 +25,7 @@ CausalLMOutputWithPast, ) from ...processing_utils import Unpack -from ...utils import 
TransformersKwargs, auto_docstring, can_return_tuple +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, torch_compilable_check from ...utils.generic import merge_with_config_defaults from ...utils.output_capturing import capture_outputs from ..auto import AutoModel, AutoModelForCausalLM @@ -187,6 +187,30 @@ def get_audio_features( return audio_outputs + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_audio_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_audio_mask = special_audio_mask.all(-1) + else: + special_audio_mask = input_ids == self.config.audio_token_id + + n_audio_tokens = special_audio_mask.sum() + n_audio_features = audio_features.shape[0] + special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + torch_compilable_check( + inputs_embeds[special_audio_mask].numel() == audio_features.numel(), + f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}", + ) + return special_audio_mask + @can_return_tuple @auto_docstring def forward( @@ -242,10 +266,10 @@ def forward( audio_embeds = self.get_audio_features(input_features, return_dict=True).pooler_output # replace text-audio token placeholders with audio embeddings - audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1) - inputs_embeds = inputs_embeds.masked_scatter( - audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device) + special_audio_mask = self.get_placeholder_mask( + input_ids, 
inputs_embeds=inputs_embeds, audio_features=audio_embeds ) + inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_embeds.to(inputs_embeds.device)) outputs: BaseModelOutputWithPast = self.language_model( attention_mask=attention_mask, From b59f9583755fba2afa5e9effd1103c180b34b341 Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Tue, 21 Apr 2026 18:36:56 +0200 Subject: [PATCH 16/38] specific to musicflamingo --- .../musicflamingo/modeling_musicflamingo.py | 39 +++++++++++++++++-- .../musicflamingo/modular_musicflamingo.py | 15 +++++-- 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/musicflamingo/modeling_musicflamingo.py b/src/transformers/models/musicflamingo/modeling_musicflamingo.py index adec95bbf3e1..3ebfc929f6a8 100644 --- a/src/transformers/models/musicflamingo/modeling_musicflamingo.py +++ b/src/transformers/models/musicflamingo/modeling_musicflamingo.py @@ -33,7 +33,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack -from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_available +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_available, torch_compilable_check from ..auto import AutoModel, AutoModelForCausalLM from .configuration_musicflamingo import MusicFlamingoConfig @@ -268,6 +268,30 @@ def get_audio_features( return audio_output + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. 
+ """ + if input_ids is None: + special_audio_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_audio_mask = special_audio_mask.all(-1) + else: + special_audio_mask = input_ids == self.config.audio_token_id + + n_audio_tokens = special_audio_mask.sum() + n_audio_features = audio_features.shape[0] + special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + torch_compilable_check( + inputs_embeds[special_audio_mask].numel() == audio_features.numel(), + f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}", + ) + return special_audio_mask + @can_return_tuple @auto_docstring def forward( @@ -344,10 +368,10 @@ def forward( ).pooler_output # replace text-audio token placeholders with audio embeddings - audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1) - inputs_embeds = inputs_embeds.masked_scatter( - audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device) + special_audio_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, audio_features=audio_embeds ) + inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_embeds.to(inputs_embeds.device)) outputs: CausalLMOutputWithPast = self.language_model( inputs_embeds=inputs_embeds, @@ -387,6 +411,13 @@ def _build_audio_timestamps( _, ends = torch.where(diff == -1) sample_lengths = (ends - starts).to(torch.long) + n_audio_tokens = audio_token_mask.sum() + n_audio_features = post_lengths.sum() + torch_compilable_check( + n_audio_tokens == n_audio_features, + f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}", + ) + # Account for 4x downsampling in audio encoder (conv2 and avg pooling) audio_embed_frame_step = self.config.audio_frame_step * 4 frame_offsets = ( diff --git 
a/src/transformers/models/musicflamingo/modular_musicflamingo.py b/src/transformers/models/musicflamingo/modular_musicflamingo.py index 7d98d0ffdeab..e16ae28f6c68 100644 --- a/src/transformers/models/musicflamingo/modular_musicflamingo.py +++ b/src/transformers/models/musicflamingo/modular_musicflamingo.py @@ -25,7 +25,7 @@ from ...modeling_outputs import BaseModelOutputWithPooling, CausalLMOutputWithPast from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack -from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_available +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_available, torch_compilable_check from ..audioflamingo3.configuration_audioflamingo3 import AudioFlamingo3Config from ..audioflamingo3.modeling_audioflamingo3 import ( AudioFlamingo3ForConditionalGeneration, @@ -274,6 +274,13 @@ def _build_audio_timestamps( _, ends = torch.where(diff == -1) sample_lengths = (ends - starts).to(torch.long) + n_audio_tokens = audio_token_mask.sum() + n_audio_features = post_lengths.sum() + torch_compilable_check( + n_audio_tokens == n_audio_features, + f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}", + ) + # Account for 4x downsampling in audio encoder (conv2 and avg pooling) audio_embed_frame_step = self.config.audio_frame_step * 4 frame_offsets = ( @@ -408,10 +415,10 @@ def forward( ).pooler_output # replace text-audio token placeholders with audio embeddings - audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1) - inputs_embeds = inputs_embeds.masked_scatter( - audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device) + special_audio_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, audio_features=audio_embeds ) + inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_embeds.to(inputs_embeds.device)) outputs: 
CausalLMOutputWithPast = self.language_model( inputs_embeds=inputs_embeds, From bb986b6631c08b9c7e269978ba27acc5d3568e86 Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Tue, 21 Apr 2026 18:37:09 +0200 Subject: [PATCH 17/38] granite speech fix --- tests/models/granite_speech/test_modeling_granite_speech.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py index 18f07fc71bef..3493fde4a267 100644 --- a/tests/models/granite_speech/test_modeling_granite_speech.py +++ b/tests/models/granite_speech/test_modeling_granite_speech.py @@ -74,8 +74,9 @@ def get_audio_embeds_mask(self, audio_mask): # Projector: ceil(feat_seq_length / window_size) * (window_size // downsample_rate) tokens per sample. import math - nblocks = math.ceil(self.feat_seq_length / self.window_size) - num_audio_tokens = nblocks * (self.window_size // self.downsample_rate) + config = self.get_config() + nblocks = math.ceil(self.feat_seq_length / config.window_size) + num_audio_tokens = nblocks * (config.window_size // config.downsample_rate) return torch.ones([self.batch_size, num_audio_tokens], dtype=torch.long).to(torch_device) def create_attention_mask(self, input_ids): From 670c68c238afa8643764f9db30f61f1bdb77147a Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Wed, 22 Apr 2026 11:54:35 +0200 Subject: [PATCH 18/38] let's factorise alm/vlm testers --- tests/alm_tester.py | 218 ++++---------------------------- tests/multimodal_tester.py | 253 +++++++++++++++++++++++++++++++++++++ tests/vlm_tester.py | 222 +++----------------------------- 3 files changed, 296 insertions(+), 397 deletions(-) create mode 100644 tests/multimodal_tester.py diff --git a/tests/alm_tester.py b/tests/alm_tester.py index 340aee77df5c..fd16623994ea 100644 --- a/tests/alm_tester.py +++ b/tests/alm_tester.py @@ 
-16,54 +16,27 @@ import unittest from inspect import signature -from .test_configuration_common import ConfigTester +from .multimodal_tester import MultiModalModelTest, MultiModalModelTester from .test_modeling_common import ( - GenerationTesterMixin, - ModelTesterMixin, floats_tensor, ids_tensor, is_torch_available, - require_torch, torch_device, ) -from .test_pipeline_mixin import PipelineTesterMixin if is_torch_available(): import torch -class ALMModelTester: - # If the model follows standard naming conventions, only `config_class` and - # `conditional_generation_class` need to be set (others are optional). - base_model_class = None # this should be added for most models when #45534 is merged - config_class = None - text_config_class = None +class ALMModelTester(MultiModalModelTester): audio_config_class = None - conditional_generation_class = None - sequence_classification_class = None - # These attributes are required after the initialization phase of the tester. - _required_attributes = ("config_class", "conditional_generation_class") - - # Arguments that should be passed to the config class even if not in its signature. - forced_config_args = ["pad_token_id"] - - # Key name for the audio sub-config in the main config constructor. - # Override to "encoder_config" for models like GraniteSpeech. audio_config_key = "audio_config" - audio_mask_key = None # to be set if audio-related mask has to be passed to the model's forward - - @property - def all_model_classes(self): - return [ - model_class - for model_class in ( - self.base_model_class, - self.conditional_generation_class, - self.sequence_classification_class, - ) - if model_class is not None - ] + # Name under which the audio mask is passed to the model's forward (e.g. "feature_attention_mask" + # for Qwen2Audio). Leave as `None` if the model does not consume a separate audio-level mask; + # `_prepare_modality_inputs` then skips adding it to the inputs dict. 
+ audio_mask_key = None + _required_attributes = MultiModalModelTester._required_attributes + ("audio_config_class",) @property def pipeline_model_mapping(self): @@ -76,61 +49,22 @@ def pipeline_model_mapping(self): return mapping def __init__(self, parent, **kwargs): - self.parent = parent - # Standard defaults - kwargs.setdefault("batch_size", 3) - - # TODO: explain here specifically why these values are chosen kwargs.setdefault("seq_length", 32) kwargs.setdefault("feat_seq_length", 128) kwargs.setdefault("num_mel_bins", 80) - kwargs.setdefault("is_training", True) - kwargs.setdefault("use_labels", True) kwargs.setdefault("pad_token_id", 1) - kwargs.setdefault("bos_token_id", 1) - kwargs.setdefault("eos_token_id", 2) kwargs.setdefault("audio_token_id", 0) - kwargs.setdefault("ignore_index", -100) - kwargs.setdefault("scope", None) - kwargs.setdefault("vocab_size", 99) - kwargs.setdefault("hidden_size", 32) - kwargs.setdefault("num_hidden_layers", 2) - kwargs.setdefault("num_attention_heads", 2) - kwargs.setdefault("num_key_value_heads", 2) - kwargs.setdefault("intermediate_size", 32) # Keep this divisible by 8 for fp16/bf16/fp32 16-bytes alignment - kwargs.setdefault("hidden_act", "gelu") - kwargs.setdefault("max_position_embeddings", 512) - - # Set all kwargs as instance attributes - for key, value in kwargs.items(): - setattr(self, key, value) - - for required_attribute in [ - # "base_model_class", # TODO: @eustlb, there is a discrepancy here between ALMs/ VLMs. XXModel and XXForConditionalGeneration - "config_class", - "conditional_generation_class", - "text_config_class", - "audio_config_class", - ]: - if getattr(self, required_attribute) is None: - raise ValueError( - f"You have inherited from ALMModelTester but did not set the {required_attribute} attribute." 
- ) - # Because audio-LMs have some different standards in how they handle audio tokens, we need - # a few methods that can be overridden if required: + super().__init__(parent, **kwargs) + + # -- Overridable ALM-specific hooks ------------------------------------------------------ def create_audio_features(self): """Create audio feature tensor. Override for different shapes (e.g. [B, T, features]).""" return floats_tensor([self.batch_size, self.num_mel_bins, self.feat_seq_length]) - def create_attention_mask(self, input_ids): - # TODO: check, this looks strange to force as default behavior - # Override for bidirectional attention models like Gemma3 - return torch.tril(torch.ones_like(input_ids).to(torch_device)) - def get_audio_embeds_mask(self, audio_embeds_mask): """Get audio embeds mask from audio mask. Override for different shapes.""" raise NotImplementedError("This method should be overridden in the subclass") @@ -174,115 +108,39 @@ def create_audio_mask(self): audio_mask = ((positions >= offsets[:, None]) & (positions < offsets[:, None] + lengths[:, None])).long() return audio_mask - def get_additional_inputs(self, config, input_ids, audio_features): - """Return dict of model-specific extra inputs (e.g. image_sizes for multi-modal).""" - return {} + # -- Hooks consumed by the shared base --------------------------------------------------- - # End of overridable methods + def _special_token_ids(self): + return super()._special_token_ids() | {self.audio_token_id} - def prepare_config_and_inputs_for_common(self): - # TODO: add a clear diagram that explains input prep + def _build_modality_sub_configs(self): + return {self.audio_config_key: self.get_audio_config()} + def _prepare_modality_inputs(self, input_ids, config): + # TODO: add a clear diagram that explains input prep ? 
audio_features = self.create_audio_features() audio_mask = self.create_audio_mask() audio_embeds_mask = self.get_audio_embeds_mask(audio_mask) - - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - special_tokens = [self.pad_token_id, self.bos_token_id, self.eos_token_id, self.audio_token_id] - for i in range(self.vocab_size): - if i not in special_tokens: - safe_token_id = i - break - else: - raise ValueError("vocab_size is too small and there is no token ID that is not a special token!") - - # Avoid flaky tests, clear any special tokens in ids_tensor - # audio_token_id is handled separately by place_audio_tokens() - input_ids[input_ids == self.pad_token_id] = safe_token_id - input_ids[input_ids == self.eos_token_id] = safe_token_id - - config = self.get_config() num_audio_tokens = audio_embeds_mask.sum(dim=1) input_ids = self.place_audio_tokens(input_ids, config, num_audio_tokens) - attention_mask = self.create_attention_mask(input_ids) - - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - self.get_audio_feature_key(): audio_features, - } + modality_inputs = {self.get_audio_feature_key(): audio_features} if self.audio_mask_key is not None: - inputs_dict[self.audio_mask_key] = audio_mask - - inputs_dict.update(self.get_additional_inputs(config, input_ids, audio_features)) - return config, inputs_dict - - @property - def config_args(self): - return list(signature(self.config_class.__init__).parameters.keys()) + modality_inputs[self.audio_mask_key] = audio_mask + return input_ids, modality_inputs, audio_features - @property - def text_config_args(self): - args = list(signature(self.text_config_class.__init__).parameters.keys()) - for token_arg in ["pad_token_id", "bos_token_id", "eos_token_id"]: # Not always explicitly in the sig - if token_arg not in args: - args.append(token_arg) - return args + # -- Audio sub-config construction ------------------------------------------------------- @property def 
audio_config_args(self): return list(signature(self.audio_config_class.__init__).parameters.keys()) - def get_config(self): - kwargs = {} - attribute_map = getattr(self.config_class, "attribute_map", {}) - model_name_to_common_name = {v: k for k, v in attribute_map.items()} - for k in self.config_args + self.forced_config_args: - if hasattr(self, k) and k != "self": - kwargs[k] = getattr(self, k) - elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]): - kwargs[k] = getattr(self, model_name_to_common_name[k]) - kwargs["text_config"] = self.get_text_config() - kwargs[self.audio_config_key] = self.get_audio_config() - return self.config_class(**kwargs) - - def get_text_config(self): - kwargs = {} - attribute_map = getattr(self.text_config_class, "attribute_map", {}) - model_name_to_common_name = {v: k for k, v in attribute_map.items()} - for k in self.text_config_args: - if hasattr(self, k) and k != "self": - kwargs[k] = getattr(self, k) - elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]): - kwargs[k] = getattr(self, model_name_to_common_name[k]) - return self.text_config_class(**kwargs) - def get_audio_config(self): - kwargs = {} - attribute_map = getattr(self.audio_config_class, "attribute_map", {}) - model_name_to_common_name = {v: k for k, v in attribute_map.items()} - for k in self.audio_config_args: - if hasattr(self, k) and k != "self": - kwargs[k] = getattr(self, k) - elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]): - kwargs[k] = getattr(self, model_name_to_common_name[k]) + kwargs = self._collect_kwargs(self.audio_config_args, self.audio_config_class) return self.audio_config_class(**kwargs) - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = self.base_model_class(config=config) - model.to(torch_device) - model.eval() - model(input_ids, attention_mask=input_mask) - 
result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - -@require_torch -class ALMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin): +class ALMModelTest(MultiModalModelTest): """ Base test class for Audio-Language Models. @@ -294,35 +152,6 @@ class ALMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin) - `pipeline_model_mapping`: Override if not using default from model_tester """ - model_tester_class = None - all_model_classes = None - pipeline_model_mapping = None - - # Audio-LMs are always composite - _is_composite = True - - def setUp(self): - if self.model_tester_class is None: - raise ValueError("You have inherited from ALMModelTest but did not set the model_tester_class attribute.") - self.model_tester = self.model_tester_class(self) - self.config_tester = ConfigTester(self, config_class=self.model_tester.config_class, has_text_modality=False) - - if self.pipeline_model_mapping is None: - if self.all_model_classes is not None: - raise ValueError( - "Tests that inherit from `ALMModelTest` and set `all_model_classes` must manually set " - "`pipeline_model_mapping`." 
- ) - else: - self.pipeline_model_mapping = self.model_tester.pipeline_model_mapping - - if self.all_model_classes is None: - self.all_model_classes = self.model_tester.all_model_classes - - def test_config(self): - """Test config common functionality.""" - self.config_tester.run_common_tests() - # TODO: @eustlb, remove this once #45534 is merged @unittest.skip("Audio-LMs have no separate base model without a head.") def test_model_base_model_prefix(self): @@ -394,4 +223,3 @@ def test_mismatching_num_audio_tokens(self): [curr_input_dict[audio_mask_key], curr_input_dict[audio_mask_key]], dim=0 ) _ = model(**curr_input_dict) - diff --git a/tests/multimodal_tester.py b/tests/multimodal_tester.py new file mode 100644 index 000000000000..1a52a5be303c --- /dev/null +++ b/tests/multimodal_tester.py @@ -0,0 +1,253 @@ +# Copyright 2026 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from inspect import signature + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ( + GenerationTesterMixin, + ModelTesterMixin, + ids_tensor, + is_torch_available, + require_torch, + torch_device, +) +from .test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + +class MultiModalModelTester: + """Shared tester base for VLM (vision-language) and ALM (audio-language). + + Concrete subclasses (e.g. 
`VLMModelTester`, `ALMModelTester`) supply: + - the modality-specific sub-config class (`vision_config_class` for VLMs, `audio_config_class` for ALMs, ...), + - the modality-specific defaults and helper methods, + - the hooks `_build_modality_sub_configs` and `_prepare_modality_inputs`, + - optionally an extended `_special_token_ids` and `pipeline_model_mapping`. + + This tester provides shared logic for evaluating and verifying models that combine text with other modalities, + centering on the needs of vision-language (VLM) and audio-language (ALM) models. + """ + + # `config_class`, `text_config_class` and `conditional_generation_class` must be set by every subclass (see + # `_required_attributes`); `base_model_class` and `sequence_classification_class` are optional at this level. + base_model_class = None + config_class = None + text_config_class = None + conditional_generation_class = None + sequence_classification_class = None + + # Required attributes after the initialization phase of the tester. Subclasses extend. + _required_attributes = ("config_class", "text_config_class", "conditional_generation_class") + + # Arguments that should be passed to the config class even if not in its signature + forced_config_args = ["pad_token_id"] + + @property + def all_model_classes(self): + # Models that set `all_model_classes` in their `XXXModelTest` class must have a new class that doesn't fit + # any of the common classes. + return [ + model_class + for model_class in ( + self.base_model_class, + self.conditional_generation_class, + self.sequence_classification_class, + ) + if model_class is not None + ] + + def __init__(self, parent, **kwargs): + self.parent = parent + + # Text-side defaults shared by every multimodal tester. Subclasses are expected to `setdefault` + # their modality-specific kwargs (and any differing values such as `pad_token_id`) *before* calling super.
+ kwargs.setdefault("batch_size", 3) + kwargs.setdefault("is_training", True) + kwargs.setdefault("use_input_mask", True) + kwargs.setdefault("use_labels", True) + kwargs.setdefault("vocab_size", 99) + kwargs.setdefault("hidden_size", 32) + kwargs.setdefault("num_hidden_layers", 2) + kwargs.setdefault("num_attention_heads", 2) + kwargs.setdefault("num_key_value_heads", 2) + kwargs.setdefault("intermediate_size", 32) # Keep this divisible by 8 for fp16/bf16/fp32 16-bytes alignment + kwargs.setdefault("hidden_act", "gelu") + kwargs.setdefault("max_position_embeddings", 512) + kwargs.setdefault("bos_token_id", 1) + kwargs.setdefault("eos_token_id", 2) + kwargs.setdefault("ignore_index", -100) + kwargs.setdefault("scope", None) + + for key, value in kwargs.items(): + setattr(self, key, value) + + self._check_required_attributes() + + def _check_required_attributes(self): + for required_attribute in self._required_attributes: + if getattr(self, required_attribute, None) is None: + raise ValueError( + f"You have inherited from {type(self).__name__} but did not set the {required_attribute} attribute." + ) + + # -- Overridable modality hooks ----------------------------------------------------------- + + def create_attention_mask(self, input_ids): + """Default causal (lower-triangular) attention mask. Override for bidirectional models like Gemma3.""" + return torch.tril(torch.ones_like(input_ids).to(torch_device)) + + def get_additional_inputs(self, config, input_ids, modality_tensor): + """Model-specific extra inputs (e.g. LlavaNext `image_sizes`, Qwen3VL `mm_token_type_ids`).""" + return {} + + def _special_token_ids(self): + """Special token ids that must never appear as random text tokens. 
Subclasses add modality tokens.""" + return {self.pad_token_id, self.bos_token_id, self.eos_token_id} + + def _build_modality_sub_configs(self): + """Return the {sub-config-key: sub-config-instance} entries for the main config constructor.""" + raise NotImplementedError + + def _prepare_modality_inputs(self, input_ids, config): + """Create modality features, place modality placeholder tokens in ``input_ids``, and return: + + (input_ids_with_placeholders, modality_inputs_dict, modality_tensor_for_additional_inputs) + """ + raise NotImplementedError + + # -- End of overridable hooks ------------------------------------------------------------- + + def _safe_token_id(self): + """Smallest token ID that is not a special token. Used to scrub random ids_tensor outputs.""" + special_tokens = self._special_token_ids() + for i in range(self.vocab_size): + if i not in special_tokens: + return i + raise ValueError("vocab_size is too small and there is no token ID that is not a special token!") + + def prepare_config_and_inputs_for_common(self): + config = self.get_config() + + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + # Avoid flaky tests by scrubbing any accidental special tokens produced by ids_tensor. + # Modality placeholder tokens are scrubbed and placed by `_prepare_modality_inputs`. + safe_token_id = self._safe_token_id() + input_ids[input_ids == self.pad_token_id] = safe_token_id + input_ids[input_ids == self.eos_token_id] = safe_token_id + + input_ids, modality_inputs, modality_tensor = self._prepare_modality_inputs(input_ids, config) + + # Create attention mask with final input_ids (after modality placeholders are placed) — important + # for models that derive padding from token values. 
+ attention_mask = self.create_attention_mask(input_ids) if self.use_input_mask else None + + inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask} + inputs_dict.update(modality_inputs) + inputs_dict.update(self.get_additional_inputs(config, input_ids, modality_tensor)) + return config, inputs_dict + + # -- Config construction helpers ---------------------------------------------------------- + + @property + def config_args(self): + return list(signature(self.config_class.__init__).parameters.keys()) + + @property + def text_config_args(self): + args = list(signature(self.text_config_class.__init__).parameters.keys()) + for token_arg in ["pad_token_id", "bos_token_id", "eos_token_id"]: # Not always explicitly in the sig + if token_arg not in args: + args.append(token_arg) + return args + + def _collect_kwargs(self, sig_keys, config_class): + """Collect kwargs for ``config_class`` by matching ``sig_keys`` (and its ``attribute_map``) against ``self``.""" + attribute_map = getattr(config_class, "attribute_map", {}) + model_name_to_common_name = {v: k for k, v in attribute_map.items()} + kwargs = {} + for k in sig_keys: + if hasattr(self, k) and k != "self": + kwargs[k] = getattr(self, k) + elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]): + kwargs[k] = getattr(self, model_name_to_common_name[k]) + return kwargs + + def get_config(self): + kwargs = self._collect_kwargs(self.config_args + self.forced_config_args, self.config_class) + kwargs["text_config"] = self.get_text_config() + kwargs.update(self._build_modality_sub_configs()) + return self.config_class(**kwargs) + + def get_text_config(self): + kwargs = self._collect_kwargs(self.text_config_args, self.text_config_class) + return self.text_config_class(**kwargs) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = self.base_model_class(config=config) + 
model.to(torch_device) + model.eval() + model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + +@require_torch +class MultiModalModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin): + """Shared test-class base for multimodal model families. + + Subclasses must set: + - ``model_tester_class``: The tester class (subclass of ``MultiModalModelTester``) + + Optional: + - ``all_model_classes``: override if not using the default from the model tester + - ``pipeline_model_mapping``: override if not using the default from the model tester + """ + + model_tester_class = None + all_model_classes = None + pipeline_model_mapping = None + + # Multimodal models are always composite + _is_composite = True + + def setUp(self): + if self.model_tester_class is None: + raise ValueError( + f"You have inherited from {type(self).__name__} but did not set the model_tester_class attribute." + ) + self.model_tester = self.model_tester_class(self) + self.config_tester = ConfigTester(self, config_class=self.model_tester.config_class, has_text_modality=False) + + if self.pipeline_model_mapping is None: + if self.all_model_classes is not None: + raise ValueError( + f"Tests that inherit from `{type(self).__name__}` and set `all_model_classes` must manually set " + "`pipeline_model_mapping`." 
+ ) + else: + self.pipeline_model_mapping = self.model_tester.pipeline_model_mapping + + if self.all_model_classes is None: + self.all_model_classes = self.model_tester.all_model_classes + + def test_config(self): + """Test config common functionality.""" + self.config_tester.run_common_tests() diff --git a/tests/vlm_tester.py b/tests/vlm_tester.py index c40b42785836..7a435028c5e4 100644 --- a/tests/vlm_tester.py +++ b/tests/vlm_tester.py @@ -16,90 +16,42 @@ import unittest from inspect import signature -from .test_configuration_common import ConfigTester +from .multimodal_tester import MultiModalModelTest, MultiModalModelTester from .test_modeling_common import ( - GenerationTesterMixin, - ModelTesterMixin, floats_tensor, - ids_tensor, is_torch_available, - require_torch, torch_device, ) -from .test_pipeline_mixin import PipelineTesterMixin if is_torch_available(): import torch -class VLMModelTester: - # If the model follows the standard naming conventions, only `base_model_class` needs to be set (the others are - # inferred from available public classes). - base_model_class = None - config_class = None - text_config_class = None +class VLMModelTester(MultiModalModelTester): vision_config_class = None - conditional_generation_class = None - sequence_classification_class = None - # These attributes are required after the initialization phase of the tester. - _required_attributes = ("base_model_class", "config_class", "conditional_generation_class") - - # Arguments that should be passed to the config class even if not in its signature - forced_config_args = ["pad_token_id"] - - @property - def all_model_classes(self): - # Models that set `all_model_classes` in their `XXXModelTest` class must have a new class that doesn't fit - # any of the common classes. 
- return [ - model_class - for model_class in ( - self.base_model_class, - self.conditional_generation_class, - self.sequence_classification_class, - ) - if model_class is not None - ] + _required_attributes = MultiModalModelTester._required_attributes + ("base_model_class", "vision_config_class") @property def pipeline_model_mapping(self): - mapping = { + return { "feature-extraction": self.base_model_class, "image-text-to-text": self.conditional_generation_class, } - return mapping def __init__(self, parent, **kwargs): - self.parent = parent - # Standard defaults - kwargs.setdefault("batch_size", 3) - kwargs.setdefault("is_training", True) - kwargs.setdefault("use_input_mask", True) kwargs.setdefault("use_token_type_ids", False) - kwargs.setdefault("use_labels", True) - kwargs.setdefault("vocab_size", 99) - kwargs.setdefault("hidden_size", 32) - kwargs.setdefault("num_hidden_layers", 2) - kwargs.setdefault("num_attention_heads", 2) - kwargs.setdefault("num_key_value_heads", 2) - kwargs.setdefault("intermediate_size", 32) # Keep this divisible by 8 for fp16/bf16/fp32 16-bytes alignment - kwargs.setdefault("hidden_act", "gelu") kwargs.setdefault("hidden_dropout_prob", 0.1) kwargs.setdefault("attention_probs_dropout_prob", 0.1) - kwargs.setdefault("max_position_embeddings", 512) kwargs.setdefault("type_vocab_size", 16) kwargs.setdefault("type_sequence_label_size", 2) kwargs.setdefault("initializer_range", 0.02) kwargs.setdefault("num_labels", 3) kwargs.setdefault("num_choices", 4) kwargs.setdefault("pad_token_id", 0) - kwargs.setdefault("bos_token_id", 1) - kwargs.setdefault("eos_token_id", 2) kwargs.setdefault("image_token_id", 3) kwargs.setdefault("is_decoder", False) - kwargs.setdefault("scope", None) kwargs.setdefault("expert_interval", 1) kwargs.setdefault("moe_layer_start_index", 0) kwargs.setdefault("moe_intermediate_size", 12) @@ -108,54 +60,29 @@ def __init__(self, parent, **kwargs): kwargs.setdefault("moe_num_shared_experts", 2) 
kwargs.setdefault("num_experts_per_tok", 2) kwargs.setdefault("num_experts", 8) - kwargs.setdefault("mamba_n_groups", 1) - kwargs.setdefault("mamba_n_heads", 16) - kwargs.setdefault("mamba_d_state", 16) - kwargs.setdefault("mamba_d_conv", 4) - kwargs.setdefault("mamba_expand", 2) - kwargs.setdefault("mamba_chunk_size", 16) kwargs.setdefault("image_size", 8) kwargs.setdefault("patch_size", 4) kwargs.setdefault("num_channels", 3) kwargs.setdefault("projection_dim", 32) kwargs.setdefault("projector_hidden_act", "gelu") - kwargs.setdefault("ignore_index", -100) kwargs.setdefault("vision_feature_select_strategy", "default") kwargs.setdefault("vision_feature_layer", -1) kwargs.setdefault("tie_word_embeddings", False) - - # Computed defaults (can still be overridden in derived classes) - kwargs.setdefault("head_dim", kwargs["hidden_size"] // kwargs["num_attention_heads"]) kwargs.setdefault("num_image_tokens", (kwargs["image_size"] // kwargs["patch_size"]) ** 2) kwargs.setdefault("seq_length", 7 + kwargs["num_image_tokens"]) - # Set all kwargs as instance attributes - for key, value in kwargs.items(): - setattr(self, key, value) + super().__init__(parent, **kwargs) - for required_attribute in [ - "base_model_class", - "config_class", - "conditional_generation_class", - "text_config_class", - "vision_config_class", - ]: - if getattr(self, required_attribute) is None: - raise ValueError( - f"You have inherited from VLMModelTester but did not set the {required_attribute} attribute." - ) + # Computed default depending on base-class defaults for hidden_size / num_attention_heads. 
+ if not hasattr(self, "head_dim"): + self.head_dim = self.hidden_size // self.num_attention_heads - # Because VLMs have some different standards in how they handle image tokens, we need a few methods - # that can be overridden if required: + # -- Overridable VLM-specific hooks ------------------------------------------------------ def create_pixel_values(self): # Override to 5D for patch-based models return floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size], scale=1.0) - def create_attention_mask(self, input_ids): - # Override for bidirectional attention models like Gemma3 - return torch.tril(torch.ones_like(input_ids).to(torch_device)) - def place_image_tokens(self, input_ids, config): # Override if the image tokens shouldn't be placed at the start of the test sequence image_token_id = getattr(config, "image_token_id", self.image_token_id) @@ -166,111 +93,31 @@ def place_image_tokens(self, input_ids, config): input_ids[:, : self.num_image_tokens] = image_token_id return input_ids - def get_additional_inputs(self, config, input_ids, pixel_values): - # Override for model-specific inputs like LlavaNext's image_sizes - return {} + # -- Hooks consumed by the shared base --------------------------------------------------- - # End of overridable methods + def _special_token_ids(self): + return super()._special_token_ids() | {self.image_token_id} - def prepare_config_and_inputs_for_common(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - pixel_values = self.create_pixel_values() - - config = self.get_config() - - special_tokens = [self.pad_token_id, self.bos_token_id, self.eos_token_id, self.image_token_id] - for i in range(self.vocab_size): - if i not in special_tokens: - # The smallest token ID that is not a special token - safe_token_id = i - break - else: - raise ValueError("vocab_size is too small and there is no token ID that is not a special token!") - - # Avoid flaky tests, clear any special 
tokens in ids_tensor - # image_token_id is handled separately by place_image_tokens() - input_ids[input_ids == self.pad_token_id] = safe_token_id - input_ids[input_ids == self.eos_token_id] = safe_token_id + def _build_modality_sub_configs(self): + return {"vision_config": self.get_vision_config()} + def _prepare_modality_inputs(self, input_ids, config): + pixel_values = self.create_pixel_values() input_ids = self.place_image_tokens(input_ids, config) + return input_ids, {"pixel_values": pixel_values}, pixel_values - # Create attention mask with final input_ids (after image tokens are placed) - # This is important for models that use padding masks based on token values - input_mask = None - if self.use_input_mask: - input_mask = self.create_attention_mask(input_ids) - - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask, "pixel_values": pixel_values} - - additional_inputs = self.get_additional_inputs(config, input_ids, pixel_values) - inputs_dict.update(additional_inputs) - - return config, inputs_dict - - @property - def config_args(self): - return list(signature(self.config_class.__init__).parameters.keys()) - - @property - def text_config_args(self): - args = list(signature(self.text_config_class.__init__).parameters.keys()) - for token_arg in ["pad_token_id", "bos_token_id", "eos_token_id"]: # Not always explicitly in the sig - if token_arg not in args: - args.append(token_arg) - return args + # -- Vision sub-config construction ------------------------------------------------------ @property def vision_config_args(self): return list(signature(self.vision_config_class.__init__).parameters.keys()) - def get_config(self): - kwargs = {} - attribute_map = getattr(self.config_class, "attribute_map", {}) - model_name_to_common_name = {v: k for k, v in attribute_map.items()} - for k in self.config_args + self.forced_config_args: - if hasattr(self, k) and k != "self": - kwargs[k] = getattr(self, k) - elif k in model_name_to_common_name and 
hasattr(self, model_name_to_common_name[k]): - kwargs[k] = getattr(self, model_name_to_common_name[k]) - kwargs["text_config"] = self.get_text_config() - kwargs["vision_config"] = self.get_vision_config() - return self.config_class(**kwargs) - - def get_text_config(self): - kwargs = {} - attribute_map = getattr(self.text_config_class, "attribute_map", {}) - model_name_to_common_name = {v: k for k, v in attribute_map.items()} - for k in self.text_config_args: - if hasattr(self, k) and k != "self": - kwargs[k] = getattr(self, k) - elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]): - kwargs[k] = getattr(self, model_name_to_common_name[k]) - return self.text_config_class(**kwargs) - def get_vision_config(self): - kwargs = {} - attribute_map = getattr(self.vision_config_class, "attribute_map", {}) - model_name_to_common_name = {v: k for k, v in attribute_map.items()} - for k in self.vision_config_args: - if hasattr(self, k) and k != "self": - kwargs[k] = getattr(self, k) - elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]): - kwargs[k] = getattr(self, model_name_to_common_name[k]) + kwargs = self._collect_kwargs(self.vision_config_args, self.vision_config_class) return self.vision_config_class(**kwargs) - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = self.base_model_class(config=config) - model.to(torch_device) - model.eval() - model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - -@require_torch -class VLMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin): +class VLMModelTest(MultiModalModelTest): """ Base test class for Vision-Language Models. 
@@ -282,35 +129,6 @@ class VLMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin) - `pipeline_model_mapping`: Override if not using default from model_tester """ - model_tester_class = None - all_model_classes = None - pipeline_model_mapping = None - - # VLMs are always composite - _is_composite = True - - def setUp(self): - if self.model_tester_class is None: - raise ValueError("You have inherited from VLMModelTest but did not set the model_tester_class attribute.") - self.model_tester = self.model_tester_class(self) - self.config_tester = ConfigTester(self, config_class=self.model_tester.config_class, has_text_modality=False) - - if self.pipeline_model_mapping is None: - if self.all_model_classes is not None: - raise ValueError( - "Tests that inherit from `VLMModelTest` and set `all_model_classes` must manually set " - "`pipeline_model_mapping`." - ) - else: - self.pipeline_model_mapping = self.model_tester.pipeline_model_mapping - - if self.all_model_classes is None: - self.all_model_classes = self.model_tester.all_model_classes - - def test_config(self): - """Test config common functionality.""" - self.config_tester.run_common_tests() - def test_mismatching_num_image_tokens(self): """ Tests that VLMs throw an error with explicit message saying what is wrong From c9534432c615de97e7d15c9c437e95af07866495 Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Wed, 22 Apr 2026 12:11:13 +0200 Subject: [PATCH 19/38] make fix-repo --- .../configuration_granite_speech.py | 1 + .../configuration_qwen2_5_omni.py | 7 +++- .../configuration_qwen3_omni_moe.py | 7 +++- .../vibevoice_asr/modeling_vibevoice_asr.py | 32 +++++++++++++++++- .../modeling_voxtral_realtime.py | 33 ++++++++++++++++++- .../test_modeling_granite_speech.py | 2 +- 6 files changed, 77 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py 
b/src/transformers/models/granite_speech/configuration_granite_speech.py index dbdda02ccdb9..e5532b3bf880 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -78,6 +78,7 @@ def __post_init__(self, **kwargs): if self.dim_head is None: self.dim_head = self.hidden_dim // self.num_heads + @auto_docstring(checkpoint="ibm-granite/granite-speech-3.3-2b") @strict class GraniteSpeechConfig(PreTrainedConfig): diff --git a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py index 1564d2b36de9..081823bf222f 100644 --- a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py @@ -99,7 +99,12 @@ class Qwen2_5OmniAudioEncoderConfig(PreTrainedConfig): ```""" model_type = "qwen2_5_omni_audio_encoder" - attribute_map = {"num_hidden_layers": "encoder_layers"} + attribute_map = { + "num_hidden_layers": "encoder_layers", + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + "intermediate_size": "encoder_ffn_dim", + } num_mel_bins: int = 128 encoder_layers: int = 32 diff --git a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py index 1ba13364401a..482030541e33 100644 --- a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py @@ -47,7 +47,12 @@ class Qwen3OmniMoeAudioEncoderConfig(PreTrainedConfig): """ model_type = "qwen3_omni_moe_audio_encoder" - attribute_map = {"num_hidden_layers": "encoder_layers"} + attribute_map = { + "num_hidden_layers": "encoder_layers", + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + "intermediate_size": "encoder_ffn_dim", + } num_mel_bins: 
int = 128 encoder_layers: int = 32 diff --git a/src/transformers/models/vibevoice_asr/modeling_vibevoice_asr.py b/src/transformers/models/vibevoice_asr/modeling_vibevoice_asr.py index 703bb6ca5130..5a1cb1b8895e 100644 --- a/src/transformers/models/vibevoice_asr/modeling_vibevoice_asr.py +++ b/src/transformers/models/vibevoice_asr/modeling_vibevoice_asr.py @@ -28,7 +28,13 @@ from ...modeling_outputs import BaseModelOutputWithPooling, CausalLMOutputWithPast from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack -from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling +from ...utils import ( + TransformersKwargs, + auto_docstring, + can_return_tuple, + is_torchdynamo_compiling, + torch_compilable_check, +) from ..auto import AutoModel, AutoModelForCausalLM from .configuration_vibevoice_asr import VibeVoiceAsrConfig @@ -362,6 +368,30 @@ def get_audio_features( return BaseModelOutputWithPooling(last_hidden_state=acoustic_latents, pooler_output=combined_features) + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. 
+ """ + if input_ids is None: + special_audio_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_audio_mask = special_audio_mask.all(-1) + else: + special_audio_mask = input_ids == self.config.audio_token_id + + n_audio_tokens = special_audio_mask.sum() + n_audio_features = audio_features.shape[0] + special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + torch_compilable_check( + inputs_embeds[special_audio_mask].numel() == audio_features.numel(), + f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}", + ) + return special_audio_mask + @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py b/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py index 07325b0ea559..dbecd9a6f530 100644 --- a/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py +++ b/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py @@ -39,7 +39,14 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack -from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging +from ...utils import ( + TransformersKwargs, + auto_docstring, + can_return_tuple, + is_torchdynamo_compiling, + logging, + torch_compilable_check, +) from ...utils.generic import maybe_autocast, merge_with_config_defaults from ...utils.output_capturing import capture_outputs from ..auto import AutoModel @@ -1007,6 +1014,30 @@ def get_audio_features( return audio_outputs + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor + ): + """ + Obtains multimodal 
placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_audio_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_audio_mask = special_audio_mask.all(-1) + else: + special_audio_mask = input_ids == self.config.audio_token_id + + n_audio_tokens = special_audio_mask.sum() + n_audio_features = audio_features.shape[0] + special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + torch_compilable_check( + inputs_embeds[special_audio_mask].numel() == audio_features.numel(), + f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}", + ) + return special_audio_mask + @can_return_tuple @auto_docstring def forward( diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py index 3493fde4a267..f54350185c43 100644 --- a/tests/models/granite_speech/test_modeling_granite_speech.py +++ b/tests/models/granite_speech/test_modeling_granite_speech.py @@ -63,7 +63,7 @@ def __init__(self, parent, **kwargs): "intermediate_size": 256, "encoder_hidden_size": 32, } - + super().__init__(parent, **kwargs) def create_audio_features(self): From 874040992375d09ff521abc400c3f32d80a1c8f0 Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Wed, 22 Apr 2026 15:56:38 +0200 Subject: [PATCH 20/38] unskip test_sdpa_can_dispatch_on_flash on qwen2_audio --- tests/models/qwen2_audio/test_modeling_qwen2_audio.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py index fc73d6dca607..669b5a4287a9 100644 --- 
a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py +++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py @@ -86,10 +86,6 @@ class Qwen2AudioForConditionalGenerationModelTest(ALMModelTest, unittest.TestCas def test_sdpa_can_compile_dynamic(self): pass - @unittest.skip(reason="Compile not yet supported because in Qwen2Audio models") - def test_sdpa_can_dispatch_on_flash(self): - pass - @unittest.skip(reason="inputs_embeds is the audio-fused path; can't match raw token-only embeddings.") def test_inputs_embeds_matches_input_ids(self): pass From dde65f61fa3bf84988411c25f3737c1f02ba08e2 Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Wed, 22 Apr 2026 16:24:31 +0200 Subject: [PATCH 21/38] should not be skipped --- tests/models/glmasr/test_modeling_glmasr.py | 15 --------------- .../musicflamingo/test_modeling_musicflamingo.py | 15 --------------- .../qwen2_audio/test_modeling_qwen2_audio.py | 5 ----- .../vibevoice_asr/test_modeling_vibevoice_asr.py | 14 -------------- .../test_modeling_voxtral_realtime.py | 4 ---- 5 files changed, 53 deletions(-) diff --git a/tests/models/glmasr/test_modeling_glmasr.py b/tests/models/glmasr/test_modeling_glmasr.py index 0b2aae719d19..b19e91a61209 100644 --- a/tests/models/glmasr/test_modeling_glmasr.py +++ b/tests/models/glmasr/test_modeling_glmasr.py @@ -15,8 +15,6 @@ import unittest -import pytest - from transformers import ( AutoProcessor, GlmAsrConfig, @@ -77,19 +75,6 @@ class GlmAsrForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase): def test_inputs_embeds_matches_input_ids(self): pass - @unittest.skip(reason="Compile not yet supported for GlmAsr models") - @pytest.mark.torch_compile_test - def test_sdpa_can_compile_dynamic(self): - pass - - @unittest.skip(reason="Compile not yet supported for GlmAsr models") - def test_sdpa_can_dispatch_on_flash(self): - pass - - @unittest.skip(reason="GlmAsr tests avoid right-padding equivalence; fusion is in-place.") - def 
test_flash_attn_2_inference_equivalence_right_padding(self): - pass - @require_torch class GlmAsrForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/musicflamingo/test_modeling_musicflamingo.py b/tests/models/musicflamingo/test_modeling_musicflamingo.py index 6996ff4ccb71..2615af219ff5 100644 --- a/tests/models/musicflamingo/test_modeling_musicflamingo.py +++ b/tests/models/musicflamingo/test_modeling_musicflamingo.py @@ -19,8 +19,6 @@ import unittest from pathlib import Path -import pytest - from transformers import ( AudioFlamingo3EncoderConfig, AutoProcessor, @@ -160,19 +158,6 @@ def test_build_audio_timestamps_reconstructs_windows_from_input_ids(self): def test_inputs_embeds_matches_input_ids(self): pass - @unittest.skip(reason="Compile not yet supported for MusicFlamingo models") - @pytest.mark.torch_compile_test - def test_sdpa_can_compile_dynamic(self): - pass - - @unittest.skip(reason="Compile not yet supported for MusicFlamingo models") - def test_sdpa_can_dispatch_on_flash(self): - pass - - @unittest.skip(reason="MusicFlamingo tests avoid right-padding equivalence; fusion is in-place.") - def test_flash_attn_2_inference_equivalence_right_padding(self): - pass - @require_torch class MusicFlamingoForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py index 669b5a4287a9..869e8ff93753 100644 --- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py +++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py @@ -81,11 +81,6 @@ class Qwen2AudioForConditionalGenerationModelTest(ALMModelTest, unittest.TestCas model_tester_class = Qwen2AudioModelTester pipeline_model_mapping = {"any-to-any": Qwen2AudioForConditionalGeneration} if is_torch_available() else {} - @unittest.skip(reason="Compile not yet supported because in Qwen2Audio models") - @pytest.mark.torch_compile_test - def 
test_sdpa_can_compile_dynamic(self): - pass - @unittest.skip(reason="inputs_embeds is the audio-fused path; can't match raw token-only embeddings.") def test_inputs_embeds_matches_input_ids(self): pass diff --git a/tests/models/vibevoice_asr/test_modeling_vibevoice_asr.py b/tests/models/vibevoice_asr/test_modeling_vibevoice_asr.py index be0ece165e36..fc8bb11568ea 100644 --- a/tests/models/vibevoice_asr/test_modeling_vibevoice_asr.py +++ b/tests/models/vibevoice_asr/test_modeling_vibevoice_asr.py @@ -17,7 +17,6 @@ import unittest from pathlib import Path -import pytest from parameterized import parameterized from transformers import ( @@ -150,19 +149,6 @@ def setUp(self): def test_inputs_embeds_matches_input_ids(self): pass - @unittest.skip(reason="Compile not yet supported for VibeVoiceAsr models") - @pytest.mark.torch_compile_test - def test_sdpa_can_compile_dynamic(self): - pass - - @unittest.skip(reason="Compile not yet supported for VibeVoiceAsr models") - def test_sdpa_can_dispatch_on_flash(self): - pass - - @unittest.skip(reason="VibeVoiceAsr tests avoid right-padding equivalence; fusion is in-place.") - def test_flash_attn_2_inference_equivalence_right_padding(self): - pass - @unittest.skip(reason="VibeVoiceAsr has no separate base model without a head.") def test_model_base_model_prefix(self): pass diff --git a/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py b/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py index 86682cd558a0..24bf9ccbd706 100644 --- a/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py +++ b/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py @@ -159,10 +159,6 @@ def test_generate_compile_model_forward_fullgraph(self): def test_generate_with_and_without_position_ids(self): super().test_generate_with_and_without_position_ids() - @unittest.skip(reason="VoxtralRealtime does not have a base model") - def test_model_base_model_prefix(self): - pass - @unittest.skip( reason="This test does 
not apply to VoxtralRealtime since input_features must be provided along input_ids" ) From 19b37c5adad555adb650fb9863fc0e3dc3b6d272 Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Wed, 22 Apr 2026 16:33:02 +0200 Subject: [PATCH 22/38] make fix-repo --- tests/models/qwen2_audio/test_modeling_qwen2_audio.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py index 869e8ff93753..1557217fdd63 100644 --- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py +++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py @@ -18,7 +18,6 @@ from urllib.request import urlopen import librosa -import pytest from transformers import ( AutoProcessor, From b47621a9fb02efeb51869df863e54356b1173671 Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Wed, 22 Apr 2026 17:51:47 +0200 Subject: [PATCH 23/38] test_mismatching_num_audio_tokens should be skipped for voxtral_realtime --- src/transformers/models/esm/configuration_esm.py | 4 ++-- .../voxtral_realtime/test_modeling_voxtral_realtime.py | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/esm/configuration_esm.py b/src/transformers/models/esm/configuration_esm.py index a00dcf8b39e3..7875d88ecee8 100644 --- a/src/transformers/models/esm/configuration_esm.py +++ b/src/transformers/models/esm/configuration_esm.py @@ -159,12 +159,12 @@ class EsmConfig(PreTrainedConfig): mask_token_id (`int`, *optional*): The index of the mask token in the vocabulary. This must be included in the config because of the "mask-dropout" scaling trick, which will scale the inputs depending on the number of masked tokens. + rope_theta (`float`, defaults to 10000.0): + The base period of the RoPE embeddings. Only used when `position_embedding_type` is set to `"rotary"`. 
position_embedding_type (`str`, *optional*, defaults to `"absolute"`): Type of position embedding. Choose either `"absolute"` or "rotary"`. emb_layer_norm_before (`bool`, *optional*): Whether to apply layer normalization after embeddings but before the main stem of the network. - rope_theta (`float`, defaults to 10000.0): - The base period of the RoPE embeddings. Only used when `position_embedding_type` is set to `"rotary"`. token_dropout (`bool`, defaults to `False`): When this is enabled, masked tokens are treated as if they had been dropped out by input dropout. is_folding_model (`bool`, defaults to `False`): diff --git a/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py b/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py index 24bf9ccbd706..150d7a894104 100644 --- a/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py +++ b/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py @@ -159,6 +159,13 @@ def test_generate_compile_model_forward_fullgraph(self): def test_generate_with_and_without_position_ids(self): super().test_generate_with_and_without_position_ids() + @unittest.skip( + reason="This test does not apply to VoxtralRealtime: audio tokens are not replaced in inputs_embeds, " + "audio and text embeddings are summed instead." 
+ ) + def test_mismatching_num_audio_tokens(self): + pass + @unittest.skip( reason="This test does not apply to VoxtralRealtime since input_features must be provided along input_ids" ) From b9d30be1262245c8e658dfdd3e8624660a10e660 Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Mon, 27 Apr 2026 14:48:59 +0900 Subject: [PATCH 24/38] nit --- tests/multimodal_tester.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/multimodal_tester.py b/tests/multimodal_tester.py index 1a52a5be303c..41c1be171dd7 100644 --- a/tests/multimodal_tester.py +++ b/tests/multimodal_tester.py @@ -31,7 +31,7 @@ class MultiModalModelTester: - """Shared tester base for VLM (vision-language) and ALM (audio-language). + """Shared tester base for VLM (vision-language) and ALM (audio-language) models. Concrete subclasses (e.g. `VLMModelTester`, `ALMModelTester`) supply: - the modality-specific sub-config class (`vision_config_class` for VLMs, `audio_config_class` for ALMs, ...), From 8d2e4b7623b88cafa969de8d63baddf3346eadeb Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Mon, 27 Apr 2026 14:57:30 +0900 Subject: [PATCH 25/38] _special_token_ids as property and skipped in prepare_config_and_inputs_for_common --- tests/alm_tester.py | 3 ++- tests/multimodal_tester.py | 7 ++++--- tests/vlm_tester.py | 3 ++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/alm_tester.py b/tests/alm_tester.py index fd16623994ea..25647221c3a5 100644 --- a/tests/alm_tester.py +++ b/tests/alm_tester.py @@ -110,8 +110,9 @@ def create_audio_mask(self): # -- Hooks consumed by the shared base --------------------------------------------------- + @property def _special_token_ids(self): - return super()._special_token_ids() | {self.audio_token_id} + return super()._special_token_ids | {self.audio_token_id} def _build_modality_sub_configs(self): return {self.audio_config_key: self.get_audio_config()} diff --git 
a/tests/multimodal_tester.py b/tests/multimodal_tester.py index 41c1be171dd7..72de0834bf55 100644 --- a/tests/multimodal_tester.py +++ b/tests/multimodal_tester.py @@ -115,6 +115,7 @@ def get_additional_inputs(self, config, input_ids, modality_tensor): """Model-specific extra inputs (e.g. LlavaNext `image_sizes`, Qwen3VL `mm_token_type_ids`).""" return {} + @property def _special_token_ids(self): """Special token ids that must never appear as random text tokens. Subclasses add modality tokens.""" return {self.pad_token_id, self.bos_token_id, self.eos_token_id} @@ -134,7 +135,7 @@ def _prepare_modality_inputs(self, input_ids, config): def _safe_token_id(self): """Smallest token ID that is not a special token. Used to scrub random ids_tensor outputs.""" - special_tokens = self._special_token_ids() + special_tokens = self._special_token_ids for i in range(self.vocab_size): if i not in special_tokens: return i @@ -148,8 +149,8 @@ def prepare_config_and_inputs_for_common(self): # Avoid flaky tests by scrubbing any accidental special tokens produced by ids_tensor. # Modality placeholder tokens are scrubbed and placed by `_prepare_modality_inputs`. 
safe_token_id = self._safe_token_id() - input_ids[input_ids == self.pad_token_id] = safe_token_id - input_ids[input_ids == self.eos_token_id] = safe_token_id + for token_id in self._special_token_ids: + input_ids[input_ids == token_id] = safe_token_id input_ids, modality_inputs, modality_tensor = self._prepare_modality_inputs(input_ids, config) diff --git a/tests/vlm_tester.py b/tests/vlm_tester.py index 7a435028c5e4..31914ebfc95d 100644 --- a/tests/vlm_tester.py +++ b/tests/vlm_tester.py @@ -95,8 +95,9 @@ def place_image_tokens(self, input_ids, config): # -- Hooks consumed by the shared base --------------------------------------------------- + @property def _special_token_ids(self): - return super()._special_token_ids() | {self.image_token_id} + return super()._special_token_ids | {self.image_token_id} def _build_modality_sub_configs(self): return {"vision_config": self.get_vision_config()} From cbd526f24f9fb976e5916f208e9693e86715d8f7 Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Mon, 27 Apr 2026 14:59:59 +0900 Subject: [PATCH 26/38] MoE params in common class --- tests/multimodal_tester.py | 8 ++++++++ tests/vlm_tester.py | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/multimodal_tester.py b/tests/multimodal_tester.py index 72de0834bf55..66c9ab12ddca 100644 --- a/tests/multimodal_tester.py +++ b/tests/multimodal_tester.py @@ -90,6 +90,14 @@ def __init__(self, parent, **kwargs): kwargs.setdefault("max_position_embeddings", 512) kwargs.setdefault("bos_token_id", 1) kwargs.setdefault("eos_token_id", 2) + kwargs.setdefault("expert_interval", 1) + kwargs.setdefault("moe_layer_start_index", 0) + kwargs.setdefault("moe_intermediate_size", 12) + kwargs.setdefault("shared_expert_intermediate_size", 36) + kwargs.setdefault("shared_expert_gate", True) + kwargs.setdefault("moe_num_shared_experts", 2) + kwargs.setdefault("num_experts_per_tok", 2) + kwargs.setdefault("num_experts", 8) 
kwargs.setdefault("ignore_index", -100) kwargs.setdefault("scope", None) diff --git a/tests/vlm_tester.py b/tests/vlm_tester.py index 31914ebfc95d..685dc09facd4 100644 --- a/tests/vlm_tester.py +++ b/tests/vlm_tester.py @@ -52,14 +52,6 @@ def __init__(self, parent, **kwargs): kwargs.setdefault("pad_token_id", 0) kwargs.setdefault("image_token_id", 3) kwargs.setdefault("is_decoder", False) - kwargs.setdefault("expert_interval", 1) - kwargs.setdefault("moe_layer_start_index", 0) - kwargs.setdefault("moe_intermediate_size", 12) - kwargs.setdefault("shared_expert_intermediate_size", 36) - kwargs.setdefault("shared_expert_gate", True) - kwargs.setdefault("moe_num_shared_experts", 2) - kwargs.setdefault("num_experts_per_tok", 2) - kwargs.setdefault("num_experts", 8) kwargs.setdefault("image_size", 8) kwargs.setdefault("patch_size", 4) kwargs.setdefault("num_channels", 3) From 12dfcd04bedab5f12a635ceb6e6536e033d78b2c Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Mon, 27 Apr 2026 16:17:26 +0900 Subject: [PATCH 27/38] add _TEXT_MODEL_TESTER_DEFAULTS to avoid divergence --- src/transformers/testing_utils.py | 28 +++++++++++++++ tests/causal_lm_tester.py | 60 +++++++------------------------ tests/multimodal_tester.py | 33 ++++++----------- 3 files changed, 51 insertions(+), 70 deletions(-) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 863242a695c6..908337fd4fd4 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -228,6 +228,34 @@ "conditional_generation_class": "ForConditionalGeneration", } +# Shared text-model defaults for CausalLMModelTester and MultiModalModelTester. 
+_TEXT_MODEL_TESTER_DEFAULTS = { + "batch_size": 13, + "seq_length": 7, + "is_training": True, + "use_input_mask": True, + "use_labels": True, + "vocab_size": 99, + "hidden_size": 32, + "num_hidden_layers": 2, + "num_attention_heads": 2, + "num_key_value_heads": 2, + "intermediate_size": 32, + "hidden_act": "gelu", + "max_position_embeddings": 512, + "pad_token_id": 0, + "bos_token_id": 1, + "eos_token_id": 2, + "expert_interval": 1, + "moe_layer_start_index": 0, + "moe_intermediate_size": 16, + "shared_expert_intermediate_size": 36, + "shared_expert_gate": True, + "moe_num_shared_experts": 2, + "num_experts_per_tok": 2, + "num_experts": 8, +} + if is_torch_available(): import torch diff --git a/tests/causal_lm_tester.py b/tests/causal_lm_tester.py index b3398f13c393..6b94a520d4f2 100644 --- a/tests/causal_lm_tester.py +++ b/tests/causal_lm_tester.py @@ -22,6 +22,7 @@ from transformers.models.auto.auto_factory import getattribute_from_module from transformers.testing_utils import ( _COMMON_MODEL_NAMES_MAP, + _TEXT_MODEL_TESTER_DEFAULTS, is_flaky, require_flash_attn, require_torch_accelerator, @@ -166,84 +167,43 @@ def pipeline_model_mapping(self): def __init__( self, parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=2, - num_key_value_heads=2, - intermediate_size=32, - hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, - max_position_embeddings=512, type_vocab_size=16, type_sequence_label_size=2, initializer_range=0.02, num_labels=3, num_choices=4, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, is_decoder=False, scope=None, - expert_interval=1, - moe_layer_start_index=0, - moe_intermediate_size=16, - shared_expert_intermediate_size=36, - shared_expert_gate=True, - moe_num_shared_experts=2, - num_experts_per_tok=2, - num_experts=8, mamba_n_groups=1, mamba_n_heads=16, 
mamba_d_state=16, mamba_d_conv=4, mamba_expand=2, mamba_chunk_size=16, + **kwargs, ): self._verify_and_infer_model_attributes() self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask + + # Apply shared text-model defaults, then let caller kwargs override + for key, default in _TEXT_MODEL_TESTER_DEFAULTS.items(): + setattr(self, key, kwargs.pop(key, default)) + + # CausalLM-specific defaults (not shared with multimodal testers) self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.type_sequence_label_size = type_sequence_label_size self.initializer_range = initializer_range self.num_labels = num_labels self.num_choices = num_choices - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id self.scope = scope self.head_dim = self.hidden_size // self.num_attention_heads self.is_decoder = is_decoder - self.expert_interval = expert_interval - self.moe_layer_start_index = moe_layer_start_index - self.moe_intermediate_size = moe_intermediate_size - self.shared_expert_intermediate_size = shared_expert_intermediate_size - self.shared_expert_gate = shared_expert_gate - self.moe_num_shared_experts = moe_num_shared_experts - self.num_experts_per_tok = num_experts_per_tok - self.num_experts = num_experts self.mamba_n_groups = mamba_n_groups self.mamba_n_heads = mamba_n_heads self.mamba_d_state = mamba_d_state @@ -252,6 
+212,10 @@ def __init__( self.mamba_chunk_size = mamba_chunk_size self.tie_word_embeddings = False + # Any remaining kwargs become attributes (for model-specific params) + for key, value in kwargs.items(): + setattr(self, key, value) + def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) diff --git a/tests/multimodal_tester.py b/tests/multimodal_tester.py index 66c9ab12ddca..7c1e0ea6a75f 100644 --- a/tests/multimodal_tester.py +++ b/tests/multimodal_tester.py @@ -15,6 +15,8 @@ from inspect import signature from .test_configuration_common import ConfigTester +from transformers.testing_utils import _TEXT_MODEL_TESTER_DEFAULTS + from .test_modeling_common import ( GenerationTesterMixin, ModelTesterMixin, @@ -74,30 +76,17 @@ def all_model_classes(self): def __init__(self, parent, **kwargs): self.parent = parent - # Text-side defaults shared by every multimodal tester. Subclasses are expected to `setdefault` - # their modality-specific kwargs (and any differing values such as `pad_token_id`) *before* calling super. + # Multimodal-specific overrides of shared defaults (applied before the shared + # defaults so they take precedence, but after any subclass setdefault calls). 
kwargs.setdefault("batch_size", 3) - kwargs.setdefault("is_training", True) - kwargs.setdefault("use_input_mask", True) - kwargs.setdefault("use_labels", True) - kwargs.setdefault("vocab_size", 99) - kwargs.setdefault("hidden_size", 32) - kwargs.setdefault("num_hidden_layers", 2) - kwargs.setdefault("num_attention_heads", 2) - kwargs.setdefault("num_key_value_heads", 2) - kwargs.setdefault("intermediate_size", 32) # Keep this divisible by 8 for fp16/bf16/fp32 16-bytes alignment - kwargs.setdefault("hidden_act", "gelu") - kwargs.setdefault("max_position_embeddings", 512) - kwargs.setdefault("bos_token_id", 1) - kwargs.setdefault("eos_token_id", 2) - kwargs.setdefault("expert_interval", 1) - kwargs.setdefault("moe_layer_start_index", 0) kwargs.setdefault("moe_intermediate_size", 12) - kwargs.setdefault("shared_expert_intermediate_size", 36) - kwargs.setdefault("shared_expert_gate", True) - kwargs.setdefault("moe_num_shared_experts", 2) - kwargs.setdefault("num_experts_per_tok", 2) - kwargs.setdefault("num_experts", 8) + + # Apply shared text-model defaults for anything not already set. + # Subclasses are expected to `setdefault` their modality-specific kwargs + # (and any differing values such as `pad_token_id`) *before* calling super. 
+ for key, default in _TEXT_MODEL_TESTER_DEFAULTS.items(): + kwargs.setdefault(key, default) + kwargs.setdefault("ignore_index", -100) kwargs.setdefault("scope", None) From 95b1f20296aa97dac7a1b2c10c44e9254231a01a Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Mon, 27 Apr 2026 16:28:22 +0900 Subject: [PATCH 28/38] nit --- tests/vlm_tester.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/vlm_tester.py b/tests/vlm_tester.py index 685dc09facd4..d8cae2e215f6 100644 --- a/tests/vlm_tester.py +++ b/tests/vlm_tester.py @@ -49,7 +49,6 @@ def __init__(self, parent, **kwargs): kwargs.setdefault("initializer_range", 0.02) kwargs.setdefault("num_labels", 3) kwargs.setdefault("num_choices", 4) - kwargs.setdefault("pad_token_id", 0) kwargs.setdefault("image_token_id", 3) kwargs.setdefault("is_decoder", False) kwargs.setdefault("image_size", 8) From c2aa666ec2790f78f03c7c41366f96513928432e Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Mon, 27 Apr 2026 16:36:31 +0900 Subject: [PATCH 29/38] clearer inits --- tests/alm_tester.py | 7 ++++--- tests/vlm_tester.py | 7 +++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/alm_tester.py b/tests/alm_tester.py index 25647221c3a5..fe339188cf52 100644 --- a/tests/alm_tester.py +++ b/tests/alm_tester.py @@ -49,12 +49,13 @@ def pipeline_model_mapping(self): return mapping def __init__(self, parent, **kwargs): - # Standard defaults + # Overrides of _TEXT_MODEL_TESTER_DEFAULTS kwargs.setdefault("seq_length", 32) - kwargs.setdefault("feat_seq_length", 128) + kwargs.setdefault("pad_token_id", 1) + # ALM-specific defaults + kwargs.setdefault("feat_seq_length", 128) kwargs.setdefault("num_mel_bins", 80) - kwargs.setdefault("pad_token_id", 1) kwargs.setdefault("audio_token_id", 0) super().__init__(parent, **kwargs) diff --git a/tests/vlm_tester.py b/tests/vlm_tester.py index d8cae2e215f6..be175032b34d 100644 --- a/tests/vlm_tester.py +++ 
b/tests/vlm_tester.py @@ -40,7 +40,11 @@ def pipeline_model_mapping(self): } def __init__(self, parent, **kwargs): - # Standard defaults + # Overrides of _TEXT_MODEL_TESTER_DEFAULTS + kwargs.setdefault("seq_length", 7 + kwargs.get("num_image_tokens", (kwargs.get("image_size", 8) // kwargs.get("patch_size", 4)) ** 2)) + kwargs.setdefault("pad_token_id", 0) + + # VLM-specific defaults kwargs.setdefault("use_token_type_ids", False) kwargs.setdefault("hidden_dropout_prob", 0.1) kwargs.setdefault("attention_probs_dropout_prob", 0.1) @@ -60,7 +64,6 @@ def __init__(self, parent, **kwargs): kwargs.setdefault("vision_feature_layer", -1) kwargs.setdefault("tie_word_embeddings", False) kwargs.setdefault("num_image_tokens", (kwargs["image_size"] // kwargs["patch_size"]) ** 2) - kwargs.setdefault("seq_length", 7 + kwargs["num_image_tokens"]) super().__init__(parent, **kwargs) From 5e36c9f87d717d43497b4ba9a73481c6f29d1a65 Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Mon, 27 Apr 2026 16:44:22 +0900 Subject: [PATCH 30/38] _prepare_modality_inputs return dict --- tests/alm_tester.py | 2 +- tests/models/gemma3/test_modeling_gemma3.py | 2 +- tests/models/llava_next/test_modeling_llava_next.py | 2 +- tests/models/qwen3_vl/test_modeling_qwen3_vl.py | 2 +- .../qwen3_vl_moe/test_modeling_qwen3_vl_moe.py | 2 +- tests/multimodal_tester.py | 13 ++++++++----- tests/vlm_tester.py | 2 +- 7 files changed, 14 insertions(+), 11 deletions(-) diff --git a/tests/alm_tester.py b/tests/alm_tester.py index fe339188cf52..b51cc4f11880 100644 --- a/tests/alm_tester.py +++ b/tests/alm_tester.py @@ -129,7 +129,7 @@ def _prepare_modality_inputs(self, input_ids, config): modality_inputs = {self.get_audio_feature_key(): audio_features} if self.audio_mask_key is not None: modality_inputs[self.audio_mask_key] = audio_mask - return input_ids, modality_inputs, audio_features + return input_ids, modality_inputs # -- Audio sub-config construction 
------------------------------------------------------- diff --git a/tests/models/gemma3/test_modeling_gemma3.py b/tests/models/gemma3/test_modeling_gemma3.py index fe65a3f83bcf..02a7004d73e3 100644 --- a/tests/models/gemma3/test_modeling_gemma3.py +++ b/tests/models/gemma3/test_modeling_gemma3.py @@ -281,7 +281,7 @@ def create_attention_mask(self, input_ids): # Gemma3 uses padding mask for bidirectional attention on image tokens return input_ids.ne(self.pad_token_id).to(torch_device) - def get_additional_inputs(self, config, input_ids, pixel_values): + def get_additional_inputs(self, config, input_ids, modality_inputs): # Gemma3 requires specific token_type_ids for bidirectional attention on image tokens token_type_ids = torch.zeros_like(input_ids) token_type_ids[input_ids == config.image_token_id] = 1 diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py index a5bd146fcc6d..6f3c2aa03751 100644 --- a/tests/models/llava_next/test_modeling_llava_next.py +++ b/tests/models/llava_next/test_modeling_llava_next.py @@ -84,7 +84,7 @@ def create_pixel_values(self): ] ) - def get_additional_inputs(self, config, input_ids, pixel_values): + def get_additional_inputs(self, config, input_ids, modality_inputs): """LlavaNext requires image_sizes tensor""" return { "image_sizes": torch.tensor([[self.image_size, self.image_size]] * self.batch_size), diff --git a/tests/models/qwen3_vl/test_modeling_qwen3_vl.py b/tests/models/qwen3_vl/test_modeling_qwen3_vl.py index 9874ce4a8203..d80cb3819486 100644 --- a/tests/models/qwen3_vl/test_modeling_qwen3_vl.py +++ b/tests/models/qwen3_vl/test_modeling_qwen3_vl.py @@ -107,7 +107,7 @@ def place_image_tokens(self, input_ids, config): input_ids[:, 0] = self.vision_start_token_id return input_ids - def get_additional_inputs(self, config, input_ids, pixel_values): + def get_additional_inputs(self, config, input_ids, modality_inputs): mm_token_type_ids = torch.zeros_like(input_ids) 
mm_token_type_ids[input_ids == self.image_token_id] = 1 return { diff --git a/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py b/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py index 0b0523de3b71..03a93ef1d7fd 100644 --- a/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py +++ b/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py @@ -106,7 +106,7 @@ def place_image_tokens(self, input_ids, config): input_ids[:, 0] = self.vision_start_token_id return input_ids - def get_additional_inputs(self, config, input_ids, pixel_values): + def get_additional_inputs(self, config, input_ids, modality_inputs): # Qwen3VL requires image_grid_thw tensor mm_token_type_ids = torch.zeros_like(input_ids) mm_token_type_ids[input_ids == self.image_token_id] = 1 diff --git a/tests/multimodal_tester.py b/tests/multimodal_tester.py index 7c1e0ea6a75f..3a91f536f429 100644 --- a/tests/multimodal_tester.py +++ b/tests/multimodal_tester.py @@ -108,8 +108,11 @@ def create_attention_mask(self, input_ids): """Default causal (lower-triangular) attention mask. Override for bidirectional models like Gemma3.""" return torch.tril(torch.ones_like(input_ids).to(torch_device)) - def get_additional_inputs(self, config, input_ids, modality_tensor): - """Model-specific extra inputs (e.g. LlavaNext `image_sizes`, Qwen3VL `mm_token_type_ids`).""" + def get_additional_inputs(self, config, input_ids, modality_inputs): + """Model-specific extra inputs (e.g. LlavaNext `image_sizes`, Qwen3VL `mm_token_type_ids`). + + ``modality_inputs`` is the full dict returned by ``_prepare_modality_inputs``. 
+ """ return {} @property @@ -124,7 +127,7 @@ def _build_modality_sub_configs(self): def _prepare_modality_inputs(self, input_ids, config): """Create modality features, place modality placeholder tokens in ``input_ids``, and return: - (input_ids_with_placeholders, modality_inputs_dict, modality_tensor_for_additional_inputs) + (input_ids_with_placeholders, modality_inputs_dict) """ raise NotImplementedError @@ -149,7 +152,7 @@ def prepare_config_and_inputs_for_common(self): for token_id in self._special_token_ids: input_ids[input_ids == token_id] = safe_token_id - input_ids, modality_inputs, modality_tensor = self._prepare_modality_inputs(input_ids, config) + input_ids, modality_inputs = self._prepare_modality_inputs(input_ids, config) # Create attention mask with final input_ids (after modality placeholders are placed) — important # for models that derive padding from token values. @@ -157,7 +160,7 @@ def prepare_config_and_inputs_for_common(self): inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask} inputs_dict.update(modality_inputs) - inputs_dict.update(self.get_additional_inputs(config, input_ids, modality_tensor)) + inputs_dict.update(self.get_additional_inputs(config, input_ids, modality_inputs)) return config, inputs_dict # -- Config construction helpers ---------------------------------------------------------- diff --git a/tests/vlm_tester.py b/tests/vlm_tester.py index be175032b34d..ba08097e048a 100644 --- a/tests/vlm_tester.py +++ b/tests/vlm_tester.py @@ -99,7 +99,7 @@ def _build_modality_sub_configs(self): def _prepare_modality_inputs(self, input_ids, config): pixel_values = self.create_pixel_values() input_ids = self.place_image_tokens(input_ids, config) - return input_ids, {"pixel_values": pixel_values}, pixel_values + return input_ids, {"pixel_values": pixel_values} # -- Vision sub-config construction ------------------------------------------------------ From 184227cb20e4034b175933e5307c852d84e60f22 Mon Sep 17 00:00:00 2001 
From: Tarek Ziade Date: Mon, 4 May 2026 15:13:17 +0200 Subject: [PATCH 31/38] format --- tests/multimodal_tester.py | 2 +- tests/vlm_tester.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/multimodal_tester.py b/tests/multimodal_tester.py index 3a91f536f429..22559876689b 100644 --- a/tests/multimodal_tester.py +++ b/tests/multimodal_tester.py @@ -14,9 +14,9 @@ from inspect import signature -from .test_configuration_common import ConfigTester from transformers.testing_utils import _TEXT_MODEL_TESTER_DEFAULTS +from .test_configuration_common import ConfigTester from .test_modeling_common import ( GenerationTesterMixin, ModelTesterMixin, diff --git a/tests/vlm_tester.py b/tests/vlm_tester.py index ba08097e048a..bce23b71e142 100644 --- a/tests/vlm_tester.py +++ b/tests/vlm_tester.py @@ -41,7 +41,10 @@ def pipeline_model_mapping(self): def __init__(self, parent, **kwargs): # Overrides of _TEXT_MODEL_TESTER_DEFAULTS - kwargs.setdefault("seq_length", 7 + kwargs.get("num_image_tokens", (kwargs.get("image_size", 8) // kwargs.get("patch_size", 4)) ** 2)) + kwargs.setdefault( + "seq_length", + 7 + kwargs.get("num_image_tokens", (kwargs.get("image_size", 8) // kwargs.get("patch_size", 4)) ** 2), + ) kwargs.setdefault("pad_token_id", 0) # VLM-specific defaults From d77fbb95d8a0c7539a6ee4ff6266bf53f04eed0d Mon Sep 17 00:00:00 2001 From: Tarek Ziade Date: Mon, 4 May 2026 15:14:41 +0200 Subject: [PATCH 32/38] split line for readability --- tests/vlm_tester.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/vlm_tester.py b/tests/vlm_tester.py index bce23b71e142..05be8bdfa8f1 100644 --- a/tests/vlm_tester.py +++ b/tests/vlm_tester.py @@ -43,7 +43,11 @@ def __init__(self, parent, **kwargs): # Overrides of _TEXT_MODEL_TESTER_DEFAULTS kwargs.setdefault( "seq_length", - 7 + kwargs.get("num_image_tokens", (kwargs.get("image_size", 8) // kwargs.get("patch_size", 4)) ** 2), + 7 + + kwargs.get( + "num_image_tokens", + 
(kwargs.get("image_size", 8) // kwargs.get("patch_size", 4)) ** 2, + ), ) kwargs.setdefault("pad_token_id", 0) From 902dbba3d740812b3c75903a0fad3c62b0ba6581 Mon Sep 17 00:00:00 2001 From: Tarek Ziade Date: Mon, 4 May 2026 15:22:47 +0200 Subject: [PATCH 33/38] ran python utils/check_modular_conversion.py --fix_and_overwrite --- .../configuration_granite_speech_plus.py | 13 ++++++- .../modeling_granite_speech_plus.py | 36 ++++++++++++++----- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/granite_speech_plus/configuration_granite_speech_plus.py b/src/transformers/models/granite_speech_plus/configuration_granite_speech_plus.py index c17c3f7391f9..1eec538091a4 100644 --- a/src/transformers/models/granite_speech_plus/configuration_granite_speech_plus.py +++ b/src/transformers/models/granite_speech_plus/configuration_granite_speech_plus.py @@ -62,13 +62,19 @@ class GraniteSpeechPlusEncoderConfig(PreTrainedConfig): ```""" model_type = "granite_speech_plus_encoder" + attribute_map = { + "hidden_size": "hidden_dim", + "num_hidden_layers": "num_layers", + "num_attention_heads": "num_heads", + "num_mel_bins": "input_dim", + } input_dim: int = 160 num_layers: int = 10 hidden_dim: int = 1024 feedforward_mult: int = 4 num_heads: int = 8 - dim_head: int = 128 + dim_head: int | None = None output_dim: int = 42 context_size: int = 200 max_pos_emb: int = 512 @@ -78,6 +84,11 @@ class GraniteSpeechPlusEncoderConfig(PreTrainedConfig): cat_hidden_layers: list[int] | None = None + def __post_init__(self, **kwargs): + super().__post_init__(**kwargs) + if self.dim_head is None: + self.dim_head = self.hidden_dim // self.num_heads + @auto_docstring(checkpoint="ibm-granite/granite-speech-4.1-2b-plus") @strict diff --git a/src/transformers/models/granite_speech_plus/modeling_granite_speech_plus.py b/src/transformers/models/granite_speech_plus/modeling_granite_speech_plus.py index 11020d261498..6293d9eb5941 100644 --- 
a/src/transformers/models/granite_speech_plus/modeling_granite_speech_plus.py +++ b/src/transformers/models/granite_speech_plus/modeling_granite_speech_plus.py @@ -537,6 +537,30 @@ def prepare_inputs_for_generation( model_inputs["input_features"] = input_features return model_inputs + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_audio_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_audio_mask = special_audio_mask.all(-1) + else: + special_audio_mask = input_ids == self.config.audio_token_id + + n_audio_tokens = special_audio_mask.sum() + n_audio_features = audio_features.shape[0] + special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + torch_compilable_check( + inputs_embeds[special_audio_mask].numel() == audio_features.numel(), + f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}", + ) + return special_audio_mask + def get_merged_audio_embeddings( self, input_ids: torch.Tensor, audio_features: torch.Tensor, input_features_mask: torch.Tensor | None = None ) -> torch.Tensor: @@ -557,20 +581,14 @@ def get_merged_audio_embeddings( llm_input_ids = torch.where(is_audio_index, 0, input_ids) inputs_embeds = self.language_model.get_input_embeddings()(llm_input_ids) # [bsz, # features, hidden size] - # Mask the audio features into the text embeddings - special_audio_mask = is_audio_index.unsqueeze(-1) audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype) if input_features_mask is not None: - 
torch_compilable_check( - not torch.all(is_audio_index.int().sum(dim=1) != input_features_mask.int().sum(dim=1)), - "Number of audio tokens does not match number of audio features", - ) audio_features = audio_features[input_features_mask] - inputs_embeds = inputs_embeds.masked_scatter( - special_audio_mask, - audio_features, + special_audio_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, audio_features=audio_features ) + inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_features) return inputs_embeds def generate(self, *args, **kwargs) -> torch.LongTensor: From dcdead1df6de9f3146da691d2536fbd9d26b8a5e Mon Sep 17 00:00:00 2001 From: Tarek Ziade Date: Tue, 5 May 2026 12:37:11 +0200 Subject: [PATCH 34/38] testing auto cancel From 628343dc23997c29d3f16bc33bb858b430819a1f Mon Sep 17 00:00:00 2001 From: Tarek Ziade Date: Tue, 5 May 2026 12:37:44 +0200 Subject: [PATCH 35/38] testing auto cancel - part 2 From c1a47720b70f00df9977e5f21ae35ce40712e93f Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Mon, 11 May 2026 11:14:04 +0200 Subject: [PATCH 36/38] remove comment --- tests/alm_tester.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/alm_tester.py b/tests/alm_tester.py index b51cc4f11880..c34d4d45524c 100644 --- a/tests/alm_tester.py +++ b/tests/alm_tester.py @@ -119,7 +119,6 @@ def _build_modality_sub_configs(self): return {self.audio_config_key: self.get_audio_config()} def _prepare_modality_inputs(self, input_ids, config): - # TODO: add a clear diagram that explains input prep ? 
audio_features = self.create_audio_features() audio_mask = self.create_audio_mask() audio_embeds_mask = self.get_audio_embeds_mask(audio_mask) From 9322315383d3be3f9efeff8018315319b8519cd4 Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Mon, 11 May 2026 11:34:20 +0200 Subject: [PATCH 37/38] update granite speech plus tests --- .../test_modeling_granite_speech_plus.py | 178 ++++-------------- 1 file changed, 39 insertions(+), 139 deletions(-) diff --git a/tests/models/granite_speech_plus/test_modeling_granite_speech_plus.py b/tests/models/granite_speech_plus/test_modeling_granite_speech_plus.py index 4108a4fbb79b..21f1d997efb4 100644 --- a/tests/models/granite_speech_plus/test_modeling_granite_speech_plus.py +++ b/tests/models/granite_speech_plus/test_modeling_granite_speech_plus.py @@ -15,18 +15,18 @@ import unittest -from parameterized import parameterized - -from transformers import AutoProcessor, GraniteSpeechPlusConfig, GraniteSpeechPlusForConditionalGeneration +from transformers import ( + AutoProcessor, + GraniteSpeechPlusConfig, + GraniteSpeechPlusEncoderConfig, + GraniteSpeechPlusForConditionalGeneration, +) from transformers.testing_utils import cleanup, require_torch, slow, torch_device -from transformers.utils import ModelOutput, is_datasets_available, is_torch_available +from transformers.utils import is_datasets_available, is_torch_available -from ...test_configuration_common import ConfigTester -from ..granite_speech.test_modeling_granite_speech import ( - GraniteSpeechForConditionalGenerationModelTest as _GraniteSpeechModelTestBase, -) from ..granite_speech.test_modeling_granite_speech import ( - GraniteSpeechForConditionalGenerationModelTester as _GraniteSpeechModelTesterBase, + GraniteSpeechForConditionalGenerationModelTest, + GraniteSpeechModelTester, ) @@ -35,155 +35,55 @@ if is_datasets_available(): from datasets import load_dataset -from transformers import set_seed - -class
GraniteSpeechPlusForConditionalGenerationModelTester(_GraniteSpeechModelTesterBase): +class GraniteSpeechPlusForConditionalGenerationModelTester(GraniteSpeechModelTester): """ - Plus variant that exercises the ``encoder_hidden_layers`` concat path. The projector's - ``encoder_hidden_size`` is scaled to match ``encoder_config.hidden_dim * (len(encoder_hidden_layers) + 1)``. + Plus variant that exercises the ``cat_hidden_layers`` concat path. The projector's + ``encoder_hidden_size`` is scaled to match ``encoder_config.hidden_dim * (len(cat_hidden_layers) + 1)``. """ - def __init__(self, parent, encoder_hidden_layers=(0,), **kwargs): - projector_config = kwargs.pop( - "projector_config", - { - "attention_probs_dropout_prob": 0.1, - "cross_attention_frequency": 1, - "encoder_hidden_size": 64, # 32 (hidden_dim) * (1 intermediate + 1 last) = 64 - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 32, - "initializer_range": 0.02, - "intermediate_size": 256, - "layer_norm_eps": 1e-12, - "max_position_embeddings": 2048, - "model_type": "blip_2_qformer", - "num_attention_heads": 4, - "num_hidden_layers": 2, - "use_qformer_text_input": False, - "vocab_size": 30522, - }, - ) - super().__init__(parent=parent, projector_config=projector_config, **kwargs) - self.encoder_hidden_layers = list(encoder_hidden_layers) - self.encoder_config["cat_hidden_layers"] = self.encoder_hidden_layers + config_class = GraniteSpeechPlusConfig + conditional_generation_class = GraniteSpeechPlusForConditionalGeneration + audio_config_class = GraniteSpeechPlusEncoderConfig - def get_config(self): - return GraniteSpeechPlusConfig( - encoder_config=self.encoder_config, - text_config=self.text_config, - projector_config=self.projector_config, - audio_token_index=self.audio_token_index, - tie_word_embeddings=self.tie_word_embeddings, - initializer_range=self.initializer_range, - has_lora_adapter=self.has_lora_adapter, - ) + def __init__(self, parent, cat_hidden_layers=(0,), **kwargs): + 
super().__init__(parent, **kwargs) + self.cat_hidden_layers = list(cat_hidden_layers) + # Projector encoder_hidden_size must equal hidden_dim * (len(cat_hidden_layers) + 1). + self.projector_config = { + "model_type": "blip_2_qformer", + "hidden_size": 32, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 256, + "encoder_hidden_size": 32 * (len(self.cat_hidden_layers) + 1), + } @require_torch -class GraniteSpeechPlusForConditionalGenerationModelTest(_GraniteSpeechModelTestBase): +class GraniteSpeechPlusForConditionalGenerationModelTest(GraniteSpeechForConditionalGenerationModelTest): """ Model tester for `GraniteSpeechPlusForConditionalGeneration`. """ - all_model_classes = (GraniteSpeechPlusForConditionalGeneration,) if is_torch_available() else () + model_tester_class = GraniteSpeechPlusForConditionalGenerationModelTester pipeline_model_mapping = {"any-to-any": GraniteSpeechPlusForConditionalGeneration} if is_torch_available() else {} - def setUp(self): - self.model_tester = GraniteSpeechPlusForConditionalGenerationModelTester(self) - self.config_tester = ConfigTester( - self, - config_class=GraniteSpeechPlusConfig, - has_text_modality=False, - ) + # The cat path changes the encoder output feature dim, so the generic shape assertion in + # `test_get_audio_features_output` (which assumes hidden_dim) does not apply. 
+ skip_test_audio_features_output_shape = True def test_encoder_hidden_layers_concat_shape(self): - """With ``encoder_hidden_layers`` set, get_audio_features concatenates the selected intermediate - hidden states with the final hidden state before the projector.""" + """``encoder_config.cat_hidden_layers`` concatenates selected intermediate hidden states with the final + hidden state along the feature dim before the projector.""" config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = GraniteSpeechPlusForConditionalGeneration(config).to( - self.model_tester.parent.device if hasattr(self.model_tester.parent, "device") else "cpu" - ) - model.eval() + model = GraniteSpeechPlusForConditionalGeneration(config).to(torch_device).eval() with torch.no_grad(): out = model.get_audio_features(inputs_dict["input_features"].to(next(model.parameters()).device)) - self.assertEqual(out.pooler_output.shape[0], inputs_dict["input_features"].shape[0]) - - @parameterized.expand([True, False, None]) - def test_get_audio_features_output(self, return_dict: bool | None): - for model_class in self.all_model_classes: - if not hasattr(model_class, "get_audio_features"): - continue - - config, inputs_dict = self._audio_features_prepare_config_and_inputs() - if return_dict is not None: - config.return_dict = return_dict - - model = model_class(config).eval() - model = model.to(torch_device) - - set_seed(42) - with torch.no_grad(): - outputs = model.get_audio_features(**inputs_dict) - - if return_dict in (True, None): - self.assertTrue( - isinstance(outputs, ModelOutput), "get_audio_features() must return a BaseModelOutputWithPooling" - ) - self.assertTrue( - hasattr(outputs, "last_hidden_state"), - "get_audio_features() must return a BaseModelOutputWithPooling with last_hidden_state", - ) - self.assertTrue( - hasattr(outputs, "pooler_output"), - "get_audio_features() must return a BaseModelOutputWithPooling with pooler_output", - ) - self.assertTrue( - 
hasattr(outputs, "hidden_states"), - "get_audio_features() must return a BaseModelOutputWithPooling with hidden_states", - ) - if self.has_attentions: - self.assertTrue( - hasattr(outputs, "attentions"), - "get_audio_features() must return a BaseModelOutputWithPooling with attentions", - ) - - if getattr(self, "skip_test_audio_features_output_shape", False): - return - - last_hidden_state_shape = outputs.last_hidden_state.shape - - if "input_features" in inputs_dict: - batch_size = inputs_dict["input_features"].shape[0] - else: - batch_size = inputs_dict["input_values"].shape[0] - self.assertEqual( - last_hidden_state_shape[0], - batch_size, - f"batch_size mismatch, full shape: {last_hidden_state_shape}", - ) - - audio_config = config.audio_config if hasattr(config, "audio_config") else config - hidden_size = None - if hasattr(audio_config, "projection_dim"): - hidden_size = audio_config.projection_dim - elif hasattr(audio_config, "hidden_size"): - hidden_size = audio_config.hidden_size - elif hasattr(audio_config, "encoder_config"): - hidden_size = audio_config.encoder_config.hidden_dim * ( - len(audio_config.encoder_config.cat_hidden_layers) + 1 - ) - elif hasattr(audio_config, "encoder_ffn_dim"): - hidden_size = audio_config.encoder_ffn_dim - self.assertEqual( - last_hidden_state_shape[-1], - hidden_size, - f"hidden_size mismatch, full shape: {last_hidden_state_shape}", - ) - - else: - self.assertIsInstance(outputs, tuple, "get_audio_features() must return a tuple if return_dict=False") + cat_factor = len(config.encoder_config.cat_hidden_layers) + 1 + expected_hidden_size = config.encoder_config.hidden_dim * cat_factor + self.assertEqual(out.last_hidden_state.shape[0], inputs_dict["input_features"].shape[0]) + self.assertEqual(out.last_hidden_state.shape[-1], expected_hidden_size) class GraniteSpeechPlusForConditionalGenerationIntegrationTest(unittest.TestCase): From 95da79839b74104c2ef10a9a09eb79017023edc4 Mon Sep 17 00:00:00 2001 From: eustlb 
<94853470+eustlb@users.noreply.github.com> Date: Mon, 11 May 2026 12:15:34 +0200 Subject: [PATCH 38/38] fix test --- utils/check_repo.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/utils/check_repo.py b/utils/check_repo.py index ed77fedb5745..5a7484409e31 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -832,8 +832,9 @@ def find_tested_models(test_file: str) -> set[str]: model_tested.add(tested_class) # Same as above, but for ALMModelTester. Audio-LMs typically only set `conditional_generation_class` - # (no base_model_class). - audio_class_match = re.search(r"class \w+\(ALMModelTester\)", content) + # (no base_model_class). `GraniteSpeechModelTester` is listed because `GraniteSpeechPlusForConditionalGenerationModelTester` + # uses `ALMModelTester` indirectly through it; in the future we may want to resolve inheritance properly. + audio_class_match = re.search(r"class \w+\((?:ALMModelTester|GraniteSpeechModelTester)\)", content) if audio_class_match is not None: audio_content = content[audio_class_match.start() :] for test_class_type in [