From 3562c7f19828113377e0e022516fb1bfe9ea8ee5 Mon Sep 17 00:00:00 2001
From: Tarek Ziade <tarek@ziade.org>
Date: Mon, 13 Apr 2026 08:29:37 +0200
Subject: [PATCH 01/38] audio tester

---
 tests/audio_tester.py                         | 322 ++++++++++++++++++
 .../test_modeling_audioflamingo3.py           | 166 ++-------
 .../test_modeling_granite_speech.py           | 277 ++++++---------
 .../qwen2_audio/test_modeling_qwen2_audio.py  | 150 +-------
 4 files changed, 444 insertions(+), 471 deletions(-)
 create mode 100644 tests/audio_tester.py

diff --git a/tests/audio_tester.py b/tests/audio_tester.py
new file mode 100644
index 000000000000..b2d900a2236d
--- /dev/null
+++ b/tests/audio_tester.py
@@ -0,0 +1,322 @@
+# Copyright 2026 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tempfile
+import unittest
+from inspect import signature
+
+from .test_configuration_common import ConfigTester
+from .test_modeling_common import (
+    GenerationTesterMixin,
+    ModelTesterMixin,
+    floats_tensor,
+    ids_tensor,
+    is_torch_available,
+    require_torch,
+    torch_device,
+)
+from .test_pipeline_mixin import PipelineTesterMixin
+
+
+if is_torch_available():
+    import torch
+
+
+class AudioModelTester:
+    # If the model follows standard naming conventions, only `config_class` and
+    # `conditional_generation_class` need to be set (others are optional).
+    config_class = None
+    conditional_generation_class = None
+    base_model_class = None
+    sequence_classification_class = None
+
+    # Key name for the audio sub-config in the main config constructor.
+    # Override to "encoder_config" for models like GraniteSpeech.
+    audio_config_key = "audio_config"
+
+    # Model attribute name for the audio encoder (used in SDPA dispatch tests).
+    # Set to None to skip audio encoder SDPA checking.
+    audio_tower_attr = "audio_tower"
+
+    # Arguments that should be passed to the config class even if not in its signature.
+    forced_config_args = ["pad_token_id"]
+
+    _required_attributes = ("config_class", "conditional_generation_class")
+
+    @property
+    def all_model_classes(self):
+        """List the configured model classes, leaving out any that were not set."""
+        candidates = (
+            self.base_model_class,
+            self.conditional_generation_class,
+            self.sequence_classification_class,
+        )
+        # Optional classes default to None at class level.
+        configured = [candidate for candidate in candidates if candidate is not None]
+        return configured
+
+    @property
+    def pipeline_model_mapping(self):
+        return {"any-to-any": self.conditional_generation_class}
+
+    def __init__(self, parent, **kwargs):
+        """Store `parent` and every keyword as an attribute, filling tiny-model defaults for unset ones."""
+        self.parent = parent
+        # Standard defaults
+        kwargs.setdefault("batch_size", 3)
+        kwargs.setdefault("seq_length", 25)
+        kwargs.setdefault("feat_seq_length", 60)
+        kwargs.setdefault("num_mel_bins", 80)
+        kwargs.setdefault("is_training", True)
+        kwargs.setdefault("use_labels", True)
+        kwargs.setdefault("pad_token_id", 1)
+        kwargs.setdefault("bos_token_id", 1)
+        kwargs.setdefault("eos_token_id", 2)
+        kwargs.setdefault("audio_token_id", kwargs.get("audio_token_index", 0))  # Follow the alias if only it is set
+        kwargs.setdefault("audio_token_index", kwargs["audio_token_id"])  # Alias for models that use this name
+        kwargs.setdefault("ignore_index", -100)
+        kwargs.setdefault("scope", None)
+
+        # Text config defaults (small Qwen2-style backbone)
+        kwargs.setdefault(
+            "text_config",
+            {
+                "model_type": "qwen2",
+                "intermediate_size": 36,
+                "initializer_range": 0.02,
+                "hidden_size": 32,
+                "max_position_embeddings": 52,
+                "num_hidden_layers": 2,
+                "num_attention_heads": 4,
+                "num_key_value_heads": 2,
+                "vocab_size": 99,
+                "pad_token_id": 1,
+            },
+        )
+
+        # Audio config defaults (small Whisper-style encoder)
+        kwargs.setdefault(
+            "audio_config",
+            {
+                "model_type": "qwen2_audio_encoder",
+                "d_model": 16,
+                "encoder_attention_heads": 4,
+                "encoder_ffn_dim": 16,
+                "encoder_layers": 2,
+                "num_mel_bins": 80,
+                "max_source_positions": 30,
+                "initializer_range": 0.02,
+            },
+        )
+
+        # Optional projector config (e.g. GraniteSpeech uses a Q-Former projector)
+        kwargs.setdefault("projector_config", None)
+
+        # Set all kwargs as instance attributes
+        for key, value in kwargs.items():
+            setattr(self, key, value)
+
+        # Derived from text config (needed by ModelTesterMixin)
+        self.vocab_size = self.text_config.get("vocab_size", 99)
+        self.hidden_size = self.text_config.get("hidden_size", 32)
+        self.num_hidden_layers = self.text_config.get("num_hidden_layers", 2)
+        self.num_attention_heads = self.text_config.get("num_attention_heads", 4)
+        self.encoder_seq_length = self.seq_length
+        # Fail fast when a subclass left a required class attribute unset.
+        for required_attribute in self._required_attributes:
+            if getattr(self, required_attribute) is None:
+                raise ValueError(
+                    f"You have inherited from AudioModelTester but did not set the {required_attribute} attribute."
+                )
+
+    # Because audio-LMs have some different standards in how they handle audio tokens, we need
+    # a few methods that can be overridden if required:
+
+    def create_audio_features(self):
+        """Create features via `floats_tensor([batch, mel bins, frames])`. Override for e.g. [B, T, features]."""
+        return floats_tensor([self.batch_size, self.num_mel_bins, self.feat_seq_length])
+
+    def create_attention_mask(self, input_ids):
+        """Create text attention mask. Override for models without a padding sentinel."""
+        attention_mask = torch.ones_like(input_ids, dtype=torch.long).to(torch_device)
+        attention_mask[:, :1] = 0  # Padding sentinel
+        return attention_mask
+
+    def get_num_audio_tokens(self, audio_features):
+        """Return the placeholder count derived from the last feature dimension. Override for other subsampling."""
+        # Default: 2-stage pooling (common for Whisper-style encoders)
+        input_length = (audio_features.shape[-1] - 1) // 2 + 1
+        return (input_length - 2) // 2 + 1
+
+    def place_audio_tokens(self, input_ids, config, num_audio_tokens):
+        """Return a copy of input_ids with `num_audio_tokens` placeholders starting at index 1."""
+        input_ids = input_ids.clone()  # `config` is unused here; it is available to overrides
+        input_ids[input_ids == self.audio_token_id] = self.pad_token_id  # Clear stray audio ids first
+        input_ids[:, 1 : 1 + num_audio_tokens] = self.audio_token_id
+        return input_ids
+
+    def get_audio_feature_key(self):
+        """Key name for audio features in the inputs dict."""
+        return "input_features"
+
+    def get_audio_mask_key(self):
+        """Key name for audio attention mask. Return None if no audio mask needed."""
+        return None
+
+    def create_audio_mask(self, audio_features):
+        """Create an all-ones long mask sized from tester attributes, not `audio_features`. Override as needed."""
+        return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.long).to(torch_device)
+
+    def get_additional_inputs(self, config, input_ids, audio_features):
+        """Return a dict of model-specific extra inputs to merge into the inputs dict (empty by default)."""
+        return {}
+
+    # End of overridable methods
+
+    @property
+    def config_args(self):
+        return list(signature(self.config_class.__init__).parameters.keys())
+
+    def get_config(self):
+        """Build the config from tester attributes matching the config signature, plus the sub-configs."""
+        sub_config_keys = {"self", "text_config", self.audio_config_key, "projector_config"}
+        # Invert `attribute_map` so a model-specific argument can fall back to its common-name attribute.
+        aliases = {target: common for common, target in getattr(self.config_class, "attribute_map", {}).items()}
+        config_kwargs = {}
+        for name in self.config_args + self.forced_config_args:
+            if name in sub_config_keys:
+                continue
+            source = name if hasattr(self, name) else aliases.get(name)
+            if source is not None and hasattr(self, source):
+                config_kwargs[name] = getattr(self, source)
+        config_kwargs["text_config"] = self.text_config
+        config_kwargs[self.audio_config_key] = self.audio_config
+        if self.projector_config is not None:
+            config_kwargs["projector_config"] = self.projector_config
+        return self.config_class(**config_kwargs)
+
+    def prepare_config_and_inputs_for_common(self):
+        """Return `(config, inputs_dict)` holding audio features, token ids and masks for the common tests."""
+        config = self.get_config()
+        audio_features = self.create_audio_features()
+        num_audio_tokens = self.get_num_audio_tokens(audio_features)
+        # Ids are offset by 2, presumably to stay clear of the default audio (0) and pad (1) token ids.
+        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2
+        input_ids = self.place_audio_tokens(input_ids, config, num_audio_tokens)
+        attention_mask = self.create_attention_mask(input_ids)
+
+        inputs_dict = {
+            self.get_audio_feature_key(): audio_features,
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+        }
+
+        audio_mask_key = self.get_audio_mask_key()
+        if audio_mask_key is not None:
+            inputs_dict[audio_mask_key] = self.create_audio_mask(audio_features)
+        inputs_dict.update(self.get_additional_inputs(config, input_ids, audio_features))
+        return config, inputs_dict
+
+
+@require_torch
+class AudioModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin):
+    """
+    Base test class for Audio-Language Models.
+
+    Subclasses should set:
+    - `model_tester_class`: The tester class (subclass of AudioModelTester)
+
+    Optional:
+    - `all_model_classes`: Override if not using default from model_tester
+    - `pipeline_model_mapping`: Override if not using default from model_tester
+    """
+
+    model_tester_class = None
+    all_model_classes = None
+    pipeline_model_mapping = None
+
+    # Audio-LMs are always composite
+    _is_composite = True
+
+    def setUp(self):
+        """Build the model and config testers, then fill in the defaults the subclass did not override."""
+        if self.model_tester_class is None:
+            raise ValueError(
+                "You have inherited from AudioModelTest but did not set the model_tester_class attribute."
+            )
+        self.model_tester = self.model_tester_class(self)
+        self.config_tester = ConfigTester(self, config_class=self.model_tester.config_class, has_text_modality=False)
+
+        if self.pipeline_model_mapping is None:
+            # Hand-picked model classes must come with a hand-picked pipeline mapping.
+            if self.all_model_classes is not None:
+                raise ValueError(
+                    "Tests that inherit from `AudioModelTest` and set `all_model_classes` must manually set "
+                    "`pipeline_model_mapping`."
+                )
+            self.pipeline_model_mapping = self.model_tester.pipeline_model_mapping
+        if self.all_model_classes is None:
+            self.all_model_classes = self.model_tester.all_model_classes
+
+    def test_config(self):
+        """Test config common functionality."""
+        self.config_tester.run_common_tests()
+
+    def test_sdpa_can_dispatch_composite_models(self):
+        """Verify SDPA toggles propagate correctly to audio and text sub-modules."""
+        if not self.has_attentions:
+            self.skipTest(reason="Model architecture does not support attentions")
+
+        if not self._is_composite:
+            self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
+
+        for model_class in self.all_model_classes:
+            config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+            model = model_class(config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+
+                # SDPA (default)
+                model_sdpa = model_class.from_pretrained(tmpdirname)
+                model_sdpa = model_sdpa.eval().to(torch_device)
+                # Inspect the reloaded model (not the in-memory one) so the save/load round trip is what is checked.
+                text_attn = "sdpa" if model_sdpa.language_model._supports_sdpa else "eager"
+
+                self.assertEqual(model_sdpa.config._attn_implementation, "sdpa")
+                self.assertEqual(model_sdpa.language_model.config._attn_implementation, text_attn)
+
+                audio_tower_attr = self.model_tester.audio_tower_attr
+                if audio_tower_attr is not None:
+                    audio_tower = getattr(model_sdpa, audio_tower_attr)
+                    audio_attn = "sdpa" if audio_tower._supports_sdpa else "eager"
+                    self.assertEqual(audio_tower.config._attn_implementation, audio_attn)
+
+                # Eager
+                model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
+                model_eager = model_eager.eval().to(torch_device)
+                self.assertEqual(model_eager.config._attn_implementation, "eager")
+                self.assertEqual(model_eager.language_model.config._attn_implementation, "eager")
+
+                if audio_tower_attr is not None:
+                    self.assertEqual(getattr(model_eager, audio_tower_attr).config._attn_implementation, "eager")
+
+                for _, submodule in model_eager.named_modules():
+                    class_name = submodule.__class__.__name__
+                    if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
+                        raise ValueError("The eager model should not have SDPA attention layers")
+
+    @unittest.skip("Audio-LMs have no separate base model without a head.")
+    def test_model_base_model_prefix(self):
+        pass
diff --git a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py
index 7301812e7032..8726443bbfca 100644
--- a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py
+++ b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py
@@ -15,7 +15,6 @@
 """Testing suite for the PyTorch AudioFlamingo3 model."""
 
 import json
-import tempfile
 import unittest
 from pathlib import Path
 
@@ -34,56 +33,21 @@
     torch_device,
 )
 
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
+from ...audio_tester import AudioModelTest, AudioModelTester
 
 
 if is_torch_available():
     import torch
 
 
-class AudioFlamingo3ModelTester:
-    """
-    Builds a tiny AudioFlamingo3 config and synthetic inputs that respect AF3's
-    post-pool token accounting: num <sound> tokens per sample == post-pool frame count.
-    """
+class AudioFlamingo3ModelTester(AudioModelTester):
+    config_class = AudioFlamingo3Config
+    conditional_generation_class = AudioFlamingo3ForConditionalGeneration
 
-    def __init__(
-        self,
-        parent,
-        audio_token_id=0,
-        seq_length=25,
-        feat_seq_length=60,
-        text_config=None,
-        audio_config=None,
-        is_training=True,
-    ):
-        self.parent = parent
-        self.audio_token_id = audio_token_id
-        self.seq_length = seq_length
-        self.feat_seq_length = feat_seq_length
-        self.is_training = is_training
-
-        # Small text backbone (Qwen2-ish)
-        if text_config is None:
-            text_config = {
-                "model_type": "qwen2",
-                "intermediate_size": 36,
-                "initializer_range": 0.02,
-                "hidden_size": 32,
-                "max_position_embeddings": 52,
-                "num_hidden_layers": 2,
-                "num_attention_heads": 4,
-                "num_key_value_heads": 2,
-                "use_labels": True,
-                "use_mrope": False,
-                "vocab_size": 99,
-                "pad_token_id": 1,  # Ensure pad token != audio token
-            }
-        # Small audio encoder (AF3 Whisper-style)
-        if audio_config is None:
-            audio_config = {
+    def __init__(self, parent, **kwargs):
+        kwargs.setdefault(
+            "audio_config",
+            {
                 "model_type": "audioflamingo3_encoder",
                 "hidden_size": 16,
                 "num_attention_heads": 4,
@@ -92,70 +56,24 @@ def __init__(
                 "num_mel_bins": 80,
                 "max_source_positions": 30,
                 "initializer_range": 0.02,
-            }
-
-        self.text_config = text_config
-        self.audio_config = audio_config
-
-        self.batch_size = 3
-        self.vocab_size = text_config["vocab_size"]
-        self.hidden_size = text_config["hidden_size"]
-        self.num_attention_heads = text_config["num_attention_heads"]
-        self.num_hidden_layers = text_config["num_hidden_layers"]
-        self.encoder_seq_length = seq_length
-
-    def get_config(self):
-        return AudioFlamingo3Config(
-            text_config=self.text_config,
-            audio_config=self.audio_config,
-            audio_token_id=self.audio_token_id,
+            },
         )
+        super().__init__(parent, **kwargs)
 
-    def prepare_config_and_inputs(self):
-        # (#windows == batch_size, n_mels, T_mel)
-        input_features_values = floats_tensor(
-            [self.batch_size, self.audio_config["num_mel_bins"], self.feat_seq_length]
-        )
-        config = self.get_config()
-        # Per-window mel validity (all ones => full length)
-        input_features_mask = torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device)
-        return config, input_features_values, input_features_mask
-
-    def _post_pool_tokens_per_window(self, T_mel):
-        # Mirror AF3 processor math:
-        pre = (T_mel - 1) // 2 + 1
-        post = (pre - 2) // 2 + 1
-        return post
-
-    def prepare_config_and_inputs_for_common(self):
-        config, input_features_values, input_features_mask = self.prepare_config_and_inputs()
-        # Every window has same T_mel here
-        num_audio_tokens_per_sample = self._post_pool_tokens_per_window(input_features_values.shape[-1])
-
-        # Build token ids with valid range and K <sound> tokens
-        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2
-        attention_mask = torch.ones_like(input_ids, dtype=torch.long, device=torch_device)
-        attention_mask[:, :1] = 0  # left padding sentinel
-
-        # Fill first K positions (after padding) with the audio token id, for each sample
-        input_ids[:, 1 : 1 + num_audio_tokens_per_sample] = config.audio_token_id
-
-        inputs_dict = {
-            "input_features": input_features_values,
-            "input_features_mask": input_features_mask,
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-        }
-        return config, inputs_dict
+    def get_audio_mask_key(self):
+        return "input_features_mask"
+
+    def create_audio_mask(self, audio_features):
+        return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device)
 
 
 @require_torch
-class AudioFlamingo3ForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
+class AudioFlamingo3ForConditionalGenerationModelTest(AudioModelTest, unittest.TestCase):
     """
     Model tester for `AudioFlamingo3ForConditionalGeneration`.
     """
 
-    all_model_classes = (AudioFlamingo3ForConditionalGeneration,) if is_torch_available() else ()
+    model_tester_class = AudioFlamingo3ModelTester
     # TODO: @eustlb, this is incorrect
     pipeline_model_mapping = (
         {
@@ -165,14 +83,10 @@ class AudioFlamingo3ForConditionalGenerationModelTest(ModelTesterMixin, Generati
         if is_torch_available()
         else {}
     )
-    _is_composite = True
-
-    def setUp(self):
-        self.model_tester = AudioFlamingo3ModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=AudioFlamingo3Config, has_text_modality=False)
 
     @unittest.skip(
-        reason="This test does not apply to AudioFlamingo3 since inputs_embeds corresponding to audio tokens are replaced when input features are provided."
+        reason="This test does not apply to AudioFlamingo3 since inputs_embeds corresponding to audio tokens "
+        "are replaced when input features are provided."
     )
     def test_inputs_embeds_matches_input_ids(self):
         pass
@@ -190,48 +104,6 @@ def test_sdpa_can_dispatch_on_flash(self):
     def test_flash_attn_2_inference_equivalence_right_padding(self):
         pass
 
-    @unittest.skip(reason="AudioFlamingo3 has no separate base model without a head.")
-    def test_model_base_model_prefix(self):
-        pass
-
-    def test_sdpa_can_dispatch_composite_models(self):
-        # AF3 is audio+text composite; verify SDPA toggles propagate to submodules.
-        if not self.has_attentions:
-            self.skipTest(reason="Model architecture does not support attentions")
-
-        if not self._is_composite:
-            self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
-
-        for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            model = model_class(config)
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-
-                # SDPA (default)
-                model_sdpa = model_class.from_pretrained(tmpdirname)
-                model_sdpa = model_sdpa.eval().to(torch_device)
-
-                text_attn = "sdpa" if model.language_model._supports_sdpa else "eager"
-                audio_attn = "sdpa" if model.audio_tower._supports_sdpa else "eager"
-
-                self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
-                self.assertTrue(model.language_model.config._attn_implementation == text_attn)
-                self.assertTrue(model.audio_tower.config._attn_implementation == audio_attn)
-
-                # Eager
-                model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
-                model_eager = model_eager.eval().to(torch_device)
-                self.assertTrue(model_eager.config._attn_implementation == "eager")
-                self.assertTrue(model_eager.language_model.config._attn_implementation == "eager")
-                self.assertTrue(model_eager.audio_tower.config._attn_implementation == "eager")
-
-                for _, submodule in model_eager.named_modules():
-                    class_name = submodule.__class__.__name__
-                    if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
-                        raise ValueError("The eager model should not have SDPA attention layers")
-
 
 @require_torch
 class AudioFlamingo3ForConditionalGenerationIntegrationTest(unittest.TestCase):
diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py
index c5e7aa3defcd..498f4fac0e12 100644
--- a/tests/models/granite_speech/test_modeling_granite_speech.py
+++ b/tests/models/granite_speech/test_modeling_granite_speech.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 """Testing suite for the IBM Granite Speech model."""
 
-import tempfile
 import unittest
 
 import pytest
@@ -35,14 +34,8 @@
     is_torch_available,
 )
 
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import (
-    ModelTesterMixin,
-    floats_tensor,
-    ids_tensor,
-)
-from ...test_pipeline_mixin import PipelineTesterMixin
+from ...audio_tester import AudioModelTest, AudioModelTester
+from ...test_modeling_common import floats_tensor
 
 
 if is_torch_available():
@@ -52,129 +45,101 @@
     from datasets import load_dataset
 
 
-class GraniteSpeechForConditionalGenerationModelTester:
-    def __init__(
-        self,
-        parent,
-        seq_length=7,
-        encoder_config={
-            "model_type": "granite_speech_encoder",
-            "context_size": 200,
-            "conv_expansion_factor": 2,
-            "conv_kernel_size": 15,
-            "dim_head": 32,
-            "dropout": 0.1,
-            "feedforward_mult": 4,
-            "hidden_dim": 32,
-            "input_dim": 160,
-            "num_heads": 4,
-            "num_layers": 2,
-            "output_dim": 42,
-        },
-        text_config={
-            "model_type": "granite",
-            "is_training": True,
-            "seq_length": 7,
-            "use_token_type_ids": False,
-            "use_labels": True,
-            "vocab_size": 99,
-            "hidden_size": 32,
-            "num_hidden_layers": 2,
-            "num_attention_heads": 4,
-            "intermediate_size": 37,
-            "hidden_act": "gelu",
-            "hidden_dropout_prob": 0.1,
-            "attention_probs_dropout_prob": 0.1,
-            "max_position_embeddings": 580,
-            "type_vocab_size": 16,
-            "type_sequence_label_size": 2,
-            "initializer_range": 0.02,
-            "num_labels": 3,
-            "num_choices": 4,
-            "pad_token_id": 1,
-        },
-        projector_config={
-            "attention_probs_dropout_prob": 0.1,
-            "cross_attention_frequency": 1,
-            "encoder_hidden_size": 32,
-            "hidden_act": "gelu",
-            "hidden_dropout_prob": 0.1,
-            "hidden_size": 32,
-            "initializer_range": 0.02,
-            "intermediate_size": 256,
-            "layer_norm_eps": 1e-12,
-            "max_position_embeddings": 2048,
-            "model_type": "blip_2_qformer",
-            "num_attention_heads": 4,
-            "num_hidden_layers": 2,
-            "use_qformer_text_input": False,
-            "vocab_size": 30522,
-        },
-        audio_token_index=0,
-        tie_word_embeddings=True,
-        initializer_range=0.02,
-        has_lora_adapter=True,
-        downsample_rate=5,
-        window_size=15,
-        is_training=True,
-    ):
-        self.parent = parent
-        self.encoder_config = encoder_config
-        self.text_config = text_config
-        self.projector_config = projector_config
-        self.audio_token_index = audio_token_index
-        self.tie_word_embeddings = tie_word_embeddings
-        self.initializer_range = initializer_range
-        self.has_lora_adapter = has_lora_adapter
-        self.downsample_rate = downsample_rate
-        self.window_size = window_size
-        self.is_training = is_training
-
-        # Dims for audio features
-        self.sequence_dim = 844
-        self.feature_dim = 160
-        self.num_attention_heads = text_config["num_attention_heads"]
-        self.num_hidden_layers = text_config["num_hidden_layers"]
-        self.hidden_size = text_config["hidden_size"]
-        self.batch_size = 3
-        self.pad_token_id = text_config["pad_token_id"]
-        self.seq_len = 7
-        self.num_audio_tokens = 2
-        self.seq_length = seq_length + self.num_audio_tokens
-
-    def get_config(self):
-        return GraniteSpeechConfig(
-            encoder_config=self.encoder_config,
-            text_config=self.text_config,
-            projector_config=self.projector_config,
-            audio_token_index=self.audio_token_index,
-            tie_word_embeddings=self.tie_word_embeddings,
-            initializer_range=self.initializer_range,
-            has_lora_adapter=self.has_lora_adapter,
+class GraniteSpeechModelTester(AudioModelTester):
+    config_class = GraniteSpeechConfig
+    conditional_generation_class = GraniteSpeechForConditionalGeneration
+    audio_config_key = "encoder_config"
+    audio_tower_attr = None  # Encoder SDPA not checked
+
+    def __init__(self, parent, **kwargs):
+        kwargs.setdefault("seq_length", 9)  # 7 text + 2 audio tokens
+        kwargs.setdefault("num_audio_tokens", 2)
+        kwargs.setdefault("sequence_dim", 844)
+        kwargs.setdefault("feature_dim", 160)
+        kwargs.setdefault("audio_token_index", 0)
+        kwargs.setdefault("tie_word_embeddings", True)
+        kwargs.setdefault("initializer_range", 0.02)
+        kwargs.setdefault("has_lora_adapter", True)
+        kwargs.setdefault("downsample_rate", 5)
+        kwargs.setdefault("window_size", 15)
+        kwargs.setdefault(
+            "text_config",
+            {
+                "model_type": "granite",
+                "is_training": True,
+                "seq_length": 7,
+                "use_token_type_ids": False,
+                "use_labels": True,
+                "vocab_size": 99,
+                "hidden_size": 32,
+                "num_hidden_layers": 2,
+                "num_attention_heads": 4,
+                "intermediate_size": 37,
+                "hidden_act": "gelu",
+                "hidden_dropout_prob": 0.1,
+                "attention_probs_dropout_prob": 0.1,
+                "max_position_embeddings": 580,
+                "type_vocab_size": 16,
+                "type_sequence_label_size": 2,
+                "initializer_range": 0.02,
+                "num_labels": 3,
+                "num_choices": 4,
+                "pad_token_id": 1,
+            },
         )
-
-    def prepare_config_and_inputs(self):
-        input_features = floats_tensor(
-            [self.batch_size, self.sequence_dim, self.feature_dim],
+        kwargs.setdefault(
+            "audio_config",
+            {
+                "model_type": "granite_speech_encoder",
+                "context_size": 200,
+                "conv_expansion_factor": 2,
+                "conv_kernel_size": 15,
+                "dim_head": 32,
+                "dropout": 0.1,
+                "feedforward_mult": 4,
+                "hidden_dim": 32,
+                "input_dim": 160,
+                "num_heads": 4,
+                "num_layers": 2,
+                "output_dim": 42,
+            },
+        )
+        kwargs.setdefault(
+            "projector_config",
+            {
+                "attention_probs_dropout_prob": 0.1,
+                "cross_attention_frequency": 1,
+                "encoder_hidden_size": 32,
+                "hidden_act": "gelu",
+                "hidden_dropout_prob": 0.1,
+                "hidden_size": 32,
+                "initializer_range": 0.02,
+                "intermediate_size": 256,
+                "layer_norm_eps": 1e-12,
+                "max_position_embeddings": 2048,
+                "model_type": "blip_2_qformer",
+                "num_attention_heads": 4,
+                "num_hidden_layers": 2,
+                "use_qformer_text_input": False,
+                "vocab_size": 30522,
+            },
         )
-        config = self.get_config()
-        return config, input_features
+        super().__init__(parent, **kwargs)
 
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_features = config_and_inputs
-        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2
-        attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)
-        input_ids[input_ids == config.audio_token_index] = self.pad_token_id
+    def create_audio_features(self):
+        return floats_tensor([self.batch_size, self.sequence_dim, self.feature_dim])
 
-        input_ids[:, : self.num_audio_tokens] = config.audio_token_index
+    def create_attention_mask(self, input_ids):
+        return torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)
 
-        inputs_dict = {
-            "input_features": input_features,
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-        }
-        return config, inputs_dict
+    def get_num_audio_tokens(self, audio_features):
+        return self.num_audio_tokens
+
+    def place_audio_tokens(self, input_ids, config, num_audio_tokens):
+        """Put the placeholders at the start of each row, using the audio token id stored on the config."""
+        input_ids = input_ids.masked_fill(input_ids == config.audio_token_index, self.pad_token_id)
+        input_ids[:, :num_audio_tokens] = config.audio_token_index
+        return input_ids
 
     def create_and_check_granite_speech_model_fp16_forward(self, config, input_ids, input_features, attention_mask):
         model = GraniteSpeechForConditionalGeneration(config=config)
@@ -211,27 +176,16 @@ def create_and_check_granite_speech_model_fp16_autocast_forward(
 
 
 @require_torch
-class GraniteSpeechForConditionalGenerationModelTest(
-    ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase
-):
+class GraniteSpeechForConditionalGenerationModelTest(AudioModelTest, unittest.TestCase):
     """
     Model tester for `GraniteSpeechForConditionalGeneration`.
     """
 
-    all_model_classes = (GraniteSpeechForConditionalGeneration,) if is_torch_available() else ()
+    model_tester_class = GraniteSpeechModelTester
     pipeline_model_mapping = {"any-to-any": GraniteSpeechForConditionalGeneration} if is_torch_available() else {}
-    _is_composite = True
-
-    def setUp(self):
-        self.model_tester = GraniteSpeechForConditionalGenerationModelTester(self)
-        self.config_tester = ConfigTester(
-            self,
-            config_class=GraniteSpeechConfig,
-            has_text_modality=False,
-        )
 
     def test_inputs_embeds(self):
-        # overwrite inputs_embeds tests because we need to delete "input features" for the audio model
+        # Overwrite inputs_embeds tests because we need to delete "input_features" for the audio model
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
@@ -251,53 +205,12 @@ def test_inputs_embeds(self):
             with torch.no_grad():
                 model(**inputs)
 
-    def test_sdpa_can_dispatch_composite_models(self):
-        # overwrite because Granite Speech is audio+text model (not vision+text)
-        if not self.has_attentions:
-            self.skipTest(reason="Model architecture does not support attentions")
-
-        if not self._is_composite:
-            self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
-
-        for model_class in self.all_model_classes:
-            # NOTE - currently we only enable alternate attention implementations on
-            # the encapsulated LLM; in the future, this should be added for the conformer
-            # encoder as well.
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            model = model_class(config)
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                model_sdpa = model_class.from_pretrained(tmpdirname)
-                model_sdpa = model_sdpa.eval().to(torch_device)
-
-                text_attn = "sdpa" if model.language_model._supports_sdpa else "eager"
-
-                # `None` as it is the requested one which will be assigned to each sub-config
-                # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present)
-                self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
-                self.assertTrue(model.language_model.config._attn_implementation == text_attn)
-
-                model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
-                model_eager = model_eager.eval().to(torch_device)
-                self.assertTrue(model_eager.config._attn_implementation == "eager")
-                self.assertTrue(model_eager.language_model.config._attn_implementation == "eager")
-
-                for name, submodule in model_eager.named_modules():
-                    class_name = submodule.__class__.__name__
-                    if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
-                        raise ValueError("The eager model should not have SDPA attention layers")
-
     @pytest.mark.generate
     @slow
     @unittest.skip(reason="Granite Speech doesn't support SDPA for all backbones")
     def test_eager_matches_sdpa_generate(self):
         pass
 
-    @unittest.skip(reason="GraniteSpeech has no separate base model without a head.")
-    def test_model_base_model_prefix(self):
-        pass
-
 
 class GraniteSpeechForConditionalGenerationIntegrationTest(unittest.TestCase):
     def setUp(self):
diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
index 4df16b9f6f4b..a1caaa4e7ae1 100644
--- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
+++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 """Testing suite for the PyTorch Qwen2Audio model."""
 
-import tempfile
 import unittest
 from io import BytesIO
 from urllib.request import urlopen
@@ -34,121 +33,29 @@
     torch_device,
 )
 
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
+from ...audio_tester import AudioModelTest, AudioModelTester
 
 
 if is_torch_available():
     import torch
 
 
-class Qwen2AudioModelTester:
-    def __init__(
-        self,
-        parent,
-        ignore_index=-100,
-        audio_token_index=0,
-        seq_length=25,
-        feat_seq_length=60,
-        text_config={
-            "model_type": "qwen2",
-            "intermediate_size": 36,
-            "initializer_range": 0.02,
-            "hidden_size": 32,
-            "max_position_embeddings": 52,
-            "num_hidden_layers": 2,
-            "num_attention_heads": 4,
-            "num_key_value_heads": 2,
-            "use_labels": True,
-            "use_mrope": False,
-            "vocab_size": 99,
-            "pad_token_id": 1,  # can't be the same as the audio token id
-        },
-        is_training=True,
-        audio_config={
-            "model_type": "qwen2_audio_encoder",
-            "d_model": 16,
-            "encoder_attention_heads": 4,
-            "encoder_ffn_dim": 16,
-            "encoder_layers": 2,
-            "num_mel_bins": 80,
-            "max_source_positions": 30,
-            "initializer_range": 0.02,
-        },
-    ):
-        self.parent = parent
-        self.ignore_index = ignore_index
-        self.audio_token_index = audio_token_index
-        self.text_config = text_config
-        self.audio_config = audio_config
-        self.seq_length = seq_length
-        self.feat_seq_length = feat_seq_length
-
-        self.num_hidden_layers = text_config["num_hidden_layers"]
-        self.vocab_size = text_config["vocab_size"]
-        self.hidden_size = text_config["hidden_size"]
-        self.num_attention_heads = text_config["num_attention_heads"]
-        self.is_training = is_training
-
-        self.batch_size = 3
-        self.encoder_seq_length = seq_length
-
-    def get_config(self):
-        return Qwen2AudioConfig(
-            text_config=self.text_config,
-            audio_config=self.audio_config,
-            ignore_index=self.ignore_index,
-            audio_token_index=self.audio_token_index,
-        )
+class Qwen2AudioModelTester(AudioModelTester):
+    config_class = Qwen2AudioConfig
+    conditional_generation_class = Qwen2AudioForConditionalGeneration
 
-    def prepare_config_and_inputs(self):
-        input_features_values = floats_tensor(
-            [
-                self.batch_size,
-                self.audio_config["num_mel_bins"],
-                self.feat_seq_length,
-            ]
-        )
-        config = self.get_config()
-        feature_attention_mask = torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.long).to(torch_device)
-        return config, input_features_values, feature_attention_mask
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_features_values, feature_attention_mask = config_and_inputs
-        input_length = (input_features_values.shape[-1] - 1) // 2 + 1
-        num_audio_tokens = (input_length - 2) // 2 + 1
-        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
-        attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)
-        attention_mask[:, :1] = 0
-        # we are giving 3 audios let's make sure we pass in 3 audios tokens
-        input_ids[:, 1 : 1 + num_audio_tokens] = config.audio_token_index
-        inputs_dict = {
-            "input_features": input_features_values,
-            "feature_attention_mask": feature_attention_mask,
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-        }
-        return config, inputs_dict
+    def get_audio_mask_key(self):
+        return "feature_attention_mask"
 
 
 @require_torch
-class Qwen2AudioForConditionalGenerationModelTest(
-    ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase
-):
+class Qwen2AudioForConditionalGenerationModelTest(AudioModelTest, unittest.TestCase):
     """
     Model tester for `Qwen2AudioForConditionalGeneration`.
     """
 
-    all_model_classes = (Qwen2AudioForConditionalGeneration,) if is_torch_available() else ()
+    model_tester_class = Qwen2AudioModelTester
     pipeline_model_mapping = {"any-to-any": Qwen2AudioForConditionalGeneration} if is_torch_available() else {}
-    _is_composite = True
-
-    def setUp(self):
-        self.model_tester = Qwen2AudioModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=Qwen2AudioConfig, has_text_modality=False)
 
     @unittest.skip(reason="Compile not yet supported because in Qwen2Audio models")
     @pytest.mark.torch_compile_test
@@ -159,47 +66,6 @@ def test_sdpa_can_compile_dynamic(self):
     def test_sdpa_can_dispatch_on_flash(self):
         pass
 
-    @unittest.skip(reason="Qwen2Audio has no separate base model without a head.")
-    def test_model_base_model_prefix(self):
-        pass
-
-    def test_sdpa_can_dispatch_composite_models(self):
-        # overwrite because Qwen2 is audio+text model (not vision+text)
-        if not self.has_attentions:
-            self.skipTest(reason="Model architecture does not support attentions")
-
-        if not self._is_composite:
-            self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
-
-        for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            model = model_class(config)
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                model_sdpa = model_class.from_pretrained(tmpdirname)
-                model_sdpa = model_sdpa.eval().to(torch_device)
-
-                text_attn = "sdpa" if model.language_model._supports_sdpa else "eager"
-                vision_attn = "sdpa" if model.audio_tower._supports_sdpa else "eager"
-
-                # `None` as it is the requested one which will be assigned to each sub-config
-                # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present)
-                self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
-                self.assertTrue(model.language_model.config._attn_implementation == text_attn)
-                self.assertTrue(model.audio_tower.config._attn_implementation == vision_attn)
-
-                model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
-                model_eager = model_eager.eval().to(torch_device)
-                self.assertTrue(model_eager.config._attn_implementation == "eager")
-                self.assertTrue(model_eager.language_model.config._attn_implementation == "eager")
-                self.assertTrue(model_eager.audio_tower.config._attn_implementation == "eager")
-
-                for name, submodule in model_eager.named_modules():
-                    class_name = submodule.__class__.__name__
-                    if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
-                        raise ValueError("The eager model should not have SDPA attention layers")
-
 
 @require_torch
 class Qwen2AudioForConditionalGenerationIntegrationTest(unittest.TestCase):

From 0817bdbd3c4332e07216d1e50e84893810f8af2b Mon Sep 17 00:00:00 2001
From: Tarek Ziade <tarek@ziade.org>
Date: Mon, 13 Apr 2026 08:57:12 +0200
Subject: [PATCH 02/38] tweak check repo for audio tester

---
 utils/check_repo.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/utils/check_repo.py b/utils/check_repo.py
index b1a3d158c716..0706e67236ee 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -776,6 +776,23 @@ def find_tested_models(test_file: str) -> set[str]:
                 continue
             model_tested.add(tested_class)
 
+    # Same as above, but for AudioModelTester. Audio-LMs typically only set `conditional_generation_class`
+    # (no base_model_class).
+    audio_class_match = re.search(r"class \w+\(AudioModelTester\)", content)
+    if audio_class_match is not None:
+        audio_content = content[audio_class_match.start() :]
+        for test_class_type in [
+            "config_class",
+            "conditional_generation_class",
+            "base_model_class",
+            "sequence_classification_class",
+        ]:
+            tested_class = re.findall(rf"{test_class_type}\s+=.*", audio_content)
+            if tested_class:
+                tested_class = tested_class[0].split("=")[1].strip()
+                if tested_class != "None":
+                    model_tested.add(tested_class)
+
     return model_tested
 
 
From 356c922ee0b3944d78c58c9753d8c1bc2d30ac7f Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Mon, 13 Apr 2026 14:06:53 +0200
Subject: [PATCH 03/38] audio -> ALM

---
 tests/{audio_tester.py => alm_tester.py}             | 12 ++++++------
 .../audioflamingo3/test_modeling_audioflamingo3.py   |  6 +++---
 .../granite_speech/test_modeling_granite_speech.py   |  6 +++---
 .../models/qwen2_audio/test_modeling_qwen2_audio.py  |  6 +++---
 utils/check_repo.py                                  |  4 ++--
 5 files changed, 17 insertions(+), 17 deletions(-)
 rename tests/{audio_tester.py => alm_tester.py} (96%)

diff --git a/tests/audio_tester.py b/tests/alm_tester.py
similarity index 96%
rename from tests/audio_tester.py
rename to tests/alm_tester.py
index b2d900a2236d..4c47cf7eb538 100644
--- a/tests/audio_tester.py
+++ b/tests/alm_tester.py
@@ -33,7 +33,7 @@
     import torch
 
 
-class AudioModelTester:
+class ALMModelTester:
     # If the model follows standard naming conventions, only `config_class` and
     # `conditional_generation_class` need to be set (others are optional).
     config_class = None
@@ -137,7 +137,7 @@ def __init__(self, parent, **kwargs):
         for required_attribute in self._required_attributes:
             if getattr(self, required_attribute) is None:
                 raise ValueError(
-                    f"You have inherited from AudioModelTester but did not set the {required_attribute} attribute."
+                    f"You have inherited from ALMModelTester but did not set the {required_attribute} attribute."
                 )
 
     # Because audio-LMs have some different standards in how they handle audio tokens, we need
@@ -230,12 +230,12 @@ def prepare_config_and_inputs_for_common(self):
 
 
 @require_torch
-class AudioModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin):
+class ALMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin):
     """
     Base test class for Audio-Language Models.
 
     Subclasses should set:
-    - `model_tester_class`: The tester class (subclass of AudioModelTester)
+    - `model_tester_class`: The tester class (subclass of ALMModelTester)
 
     Optional:
     - `all_model_classes`: Override if not using default from model_tester
@@ -252,7 +252,7 @@ class AudioModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
     def setUp(self):
         if self.model_tester_class is None:
             raise ValueError(
-                "You have inherited from AudioModelTest but did not set the model_tester_class attribute."
+                "You have inherited from ALMModelTest but did not set the model_tester_class attribute."
             )
         self.model_tester = self.model_tester_class(self)
         self.config_tester = ConfigTester(self, config_class=self.model_tester.config_class, has_text_modality=False)
@@ -260,7 +260,7 @@ def setUp(self):
         if self.pipeline_model_mapping is None:
             if self.all_model_classes is not None:
                 raise ValueError(
-                    "Tests that inherit from `AudioModelTest` and set `all_model_classes` must manually set "
+                    "Tests that inherit from `ALMModelTest` and set `all_model_classes` must manually set "
                     "`pipeline_model_mapping`."
                 )
             else:
diff --git a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py
index 8726443bbfca..86d82cf4294d 100644
--- a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py
+++ b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py
@@ -33,14 +33,14 @@
     torch_device,
 )
 
-from ...audio_tester import AudioModelTest, AudioModelTester
+from ...alm_tester import ALMModelTest, ALMModelTester
 
 
 if is_torch_available():
     import torch
 
 
-class AudioFlamingo3ModelTester(AudioModelTester):
+class AudioFlamingo3ModelTester(ALMModelTester):
     config_class = AudioFlamingo3Config
     conditional_generation_class = AudioFlamingo3ForConditionalGeneration
 
@@ -68,7 +68,7 @@ def create_audio_mask(self, audio_features):
 
 
 @require_torch
-class AudioFlamingo3ForConditionalGenerationModelTest(AudioModelTest, unittest.TestCase):
+class AudioFlamingo3ForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase):
     """
     Model tester for `AudioFlamingo3ForConditionalGeneration`.
     """
diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py
index 498f4fac0e12..4b0e91ddbd36 100644
--- a/tests/models/granite_speech/test_modeling_granite_speech.py
+++ b/tests/models/granite_speech/test_modeling_granite_speech.py
@@ -34,7 +34,7 @@
     is_torch_available,
 )
 
-from ...audio_tester import AudioModelTest, AudioModelTester
+from ...alm_tester import ALMModelTest, ALMModelTester
 from ...test_modeling_common import floats_tensor
 
 
@@ -45,7 +45,7 @@
     from datasets import load_dataset
 
 
-class GraniteSpeechModelTester(AudioModelTester):
+class GraniteSpeechModelTester(ALMModelTester):
     config_class = GraniteSpeechConfig
     conditional_generation_class = GraniteSpeechForConditionalGeneration
     audio_config_key = "encoder_config"
@@ -176,7 +176,7 @@ def create_and_check_granite_speech_model_fp16_autocast_forward(
 
 
 @require_torch
-class GraniteSpeechForConditionalGenerationModelTest(AudioModelTest, unittest.TestCase):
+class GraniteSpeechForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase):
     """
     Model tester for `GraniteSpeechForConditionalGeneration`.
     """
diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
index a1caaa4e7ae1..5733a4347568 100644
--- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
+++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
@@ -33,14 +33,14 @@
     torch_device,
 )
 
-from ...audio_tester import AudioModelTest, AudioModelTester
+from ...alm_tester import ALMModelTest, ALMModelTester
 
 
 if is_torch_available():
     import torch
 
 
-class Qwen2AudioModelTester(AudioModelTester):
+class Qwen2AudioModelTester(ALMModelTester):
     config_class = Qwen2AudioConfig
     conditional_generation_class = Qwen2AudioForConditionalGeneration
 
@@ -49,7 +49,7 @@ def get_audio_mask_key(self):
 
 
 @require_torch
-class Qwen2AudioForConditionalGenerationModelTest(AudioModelTest, unittest.TestCase):
+class Qwen2AudioForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase):
     """
     Model tester for `Qwen2AudioForConditionalGeneration`.
     """
diff --git a/utils/check_repo.py b/utils/check_repo.py
index 0706e67236ee..3199d6cf4b2f 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -776,9 +776,9 @@ def find_tested_models(test_file: str) -> set[str]:
                 continue
             model_tested.add(tested_class)
 
-    # Same as above, but for AudioModelTester. Audio-LMs typically only set `conditional_generation_class`
+    # Same as above, but for ALMModelTester. Audio-LMs typically only set `conditional_generation_class`
     # (no base_model_class).
-    audio_class_match = re.search(r"class \w+\(AudioModelTester\)", content)
+    audio_class_match = re.search(r"class \w+\(ALMModelTester\)", content)
     if audio_class_match is not None:
         audio_content = content[audio_class_match.start() :]
         for test_class_type in [

From 9663a8e56fe1c86b9833d251a90def0f4add31b8 Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Mon, 13 Apr 2026 17:38:32 +0200
Subject: [PATCH 04/38] ALMTester: no audio/text defaults; better input prep

---
 tests/alm_tester.py | 231 ++++++++++++++++++++++++++++----------------
 1 file changed, 146 insertions(+), 85 deletions(-)

diff --git a/tests/alm_tester.py b/tests/alm_tester.py
index 4c47cf7eb538..5fd50997f470 100644
--- a/tests/alm_tester.py
+++ b/tests/alm_tester.py
@@ -75,8 +75,11 @@ def __init__(self, parent, **kwargs):
 
         # Standard defaults
         kwargs.setdefault("batch_size", 3)
-        kwargs.setdefault("seq_length", 25)
-        kwargs.setdefault("feat_seq_length", 60)
+
+        # TODO: explain here specifically why these values are chosen
+        kwargs.setdefault("seq_length", 32)
+        kwargs.setdefault("feat_seq_length", 128)
+
         kwargs.setdefault("num_mel_bins", 80)
         kwargs.setdefault("is_training", True)
         kwargs.setdefault("use_labels", True)
@@ -84,42 +87,17 @@ def __init__(self, parent, **kwargs):
         kwargs.setdefault("bos_token_id", 1)
         kwargs.setdefault("eos_token_id", 2)
         kwargs.setdefault("audio_token_id", 0)
-        kwargs.setdefault("audio_token_index", 0)  # Alias for models that use this name
         kwargs.setdefault("ignore_index", -100)
         kwargs.setdefault("scope", None)
-
-        # Text config defaults (small Qwen2-style backbone)
-        kwargs.setdefault(
-            "text_config",
-            {
-                "model_type": "qwen2",
-                "intermediate_size": 36,
-                "initializer_range": 0.02,
-                "hidden_size": 32,
-                "max_position_embeddings": 52,
-                "num_hidden_layers": 2,
-                "num_attention_heads": 4,
-                "num_key_value_heads": 2,
-                "vocab_size": 99,
-                "pad_token_id": 1,
-            },
-        )
-
-        # Audio config defaults (small Whisper-style encoder)
-        kwargs.setdefault(
-            "audio_config",
-            {
-                "model_type": "qwen2_audio_encoder",
-                "d_model": 16,
-                "encoder_attention_heads": 4,
-                "encoder_ffn_dim": 16,
-                "encoder_layers": 2,
-                "num_mel_bins": 80,
-                "max_source_positions": 30,
-                "initializer_range": 0.02,
-            },
-        )
-
+        kwargs.setdefault("vocab_size", 99)
+        kwargs.setdefault("hidden_size", 32)
+        kwargs.setdefault("num_hidden_layers", 2)
+        kwargs.setdefault("num_attention_heads", 2)
+        kwargs.setdefault("num_key_value_heads", 2)
+        kwargs.setdefault("intermediate_size", 32)  # Keep this divisible by 8 for fp16/bf16/fp32 16-byte alignment
+        kwargs.setdefault("hidden_act", "gelu")
+        kwargs.setdefault("max_position_embeddings", 512)
+    
         # Optional projector config (e.g. GraniteSpeech uses a Q-Former projector)
         kwargs.setdefault("projector_config", None)
 
@@ -127,14 +105,20 @@ def __init__(self, parent, **kwargs):
         for key, value in kwargs.items():
             setattr(self, key, value)
 
-        # Derived from text config (needed by ModelTesterMixin)
-        self.vocab_size = self.text_config.get("vocab_size", 99)
-        self.hidden_size = self.text_config.get("hidden_size", 32)
-        self.num_hidden_layers = self.text_config.get("num_hidden_layers", 2)
-        self.num_attention_heads = self.text_config.get("num_attention_heads", 4)
-        self.encoder_seq_length = self.seq_length
-
-        for required_attribute in self._required_attributes:
+        # # Derived from text config (needed by ModelTesterMixin)
+        # self.vocab_size = self.text_config.get("vocab_size", 99)
+        # self.hidden_size = self.text_config.get("hidden_size", 32)
+        # self.num_hidden_layers = self.text_config.get("num_hidden_layers", 2)
+        # self.num_attention_heads = self.text_config.get("num_attention_heads", 4)
+        # self.encoder_seq_length = self.seq_length
+
+        for required_attribute in [
+            # "base_model_class", # TODO: @eustlb, there is a discrepancy here between ALMs/ VLMs. XXModel and XXForConditionalGeneration
+            "config_class",
+            "conditional_generation_class",
+            "text_config_class",
+            "audio_config_class",
+        ]:
             if getattr(self, required_attribute) is None:
                 raise ValueError(
                     f"You have inherited from ALMModelTester but did not set the {required_attribute} attribute."
@@ -148,22 +132,23 @@ def create_audio_features(self):
         return floats_tensor([self.batch_size, self.num_mel_bins, self.feat_seq_length])
 
     def create_attention_mask(self, input_ids):
-        """Create text attention mask. Override for models without a padding sentinel."""
-        attention_mask = torch.ones_like(input_ids, dtype=torch.long).to(torch_device)
-        attention_mask[:, :1] = 0  # Padding sentinel
-        return attention_mask
+        # TODO: check, this looks strange to force as default behavior
+        # Override for bidirectional attention models like Gemma3
+        return torch.tril(torch.ones_like(input_ids).to(torch_device))
 
-    def get_num_audio_tokens(self, audio_features):
-        """Compute number of audio placeholder tokens from features. Override for different subsampling."""
-        # Default: 2-stage pooling (common for Whisper-style encoders)
-        input_length = (audio_features.shape[-1] - 1) // 2 + 1
-        return (input_length - 2) // 2 + 1
+    def get_audio_embeds_mask(self, audio_embeds_mask):
+        """Derive the audio embeds mask from the audio mask. Must be overridden by subclasses."""
+        raise NotImplementedError("This method should be overridden in the subclass")
 
     def place_audio_tokens(self, input_ids, config, num_audio_tokens):
-        """Place audio placeholder tokens in input_ids. Override for different placement."""
+        """Place audio placeholder tokens at random positions in input_ids. Override for different placement."""
         input_ids = input_ids.clone()
         input_ids[input_ids == self.audio_token_id] = self.pad_token_id
-        input_ids[:, 1 : 1 + num_audio_tokens] = self.audio_token_id
+        for i in range(input_ids.shape[0]):
+            n = num_audio_tokens[i].item() if isinstance(num_audio_tokens, torch.Tensor) else num_audio_tokens
+            available_positions = torch.arange(1, input_ids.shape[1])  # skip position 0 (BOS)
+            perm = torch.randperm(len(available_positions))[:n]
+            input_ids[i, available_positions[perm]] = self.audio_token_id
         return input_ids
 
     def get_audio_feature_key(self):
@@ -174,9 +159,20 @@ def get_audio_mask_key(self):
         """Key name for audio attention mask. Return None if no audio mask needed."""
         return None
 
-    def create_audio_mask(self, audio_features):
-        """Create audio-level attention mask. Override for bool masks or different shapes."""
-        return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.long).to(torch_device)
+    def create_audio_mask(self):
+        """Create audio-level attention mask with contiguous valid regions per batch element.
+
+        Each element gets a random offset and length, producing masks like [0, 0, 1, 1, 1, 0, 0].
+        """
+        # Sample lengths in [1, feat_seq_length] and offsets in [0, feat_seq_length - length]
+        lengths = ids_tensor([self.batch_size], vocab_size=self.feat_seq_length).abs() + 1
+        lengths = lengths.clamp(max=self.feat_seq_length)
+        offsets = ids_tensor([self.batch_size], vocab_size=self.feat_seq_length).abs()
+        offsets = offsets % (self.feat_seq_length - lengths + 1)
+
+        positions = torch.arange(self.feat_seq_length, device=torch_device)[None, :]
+        audio_mask = ((positions >= offsets[:, None]) & (positions < offsets[:, None] + lengths[:, None])).long()
+        return audio_mask
 
     def get_additional_inputs(self, config, input_ids, audio_features):
         """Return dict of model-specific extra inputs (e.g. image_sizes for multi-modal)."""
@@ -184,50 +180,115 @@ def get_additional_inputs(self, config, input_ids, audio_features):
 
     # End of overridable methods
 
-    @property
-    def config_args(self):
-        return list(signature(self.config_class.__init__).parameters.keys())
-
-    def get_config(self):
-        kwargs = {}
-        skip_keys = {"self", "text_config", self.audio_config_key, "projector_config"}
-        attribute_map = getattr(self.config_class, "attribute_map", {})
-        model_name_to_common_name = {v: k for k, v in attribute_map.items()}
-        for k in self.config_args + self.forced_config_args:
-            if k in skip_keys:
-                continue
-            if hasattr(self, k) and k != "self":
-                kwargs[k] = getattr(self, k)
-            elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]):
-                kwargs[k] = getattr(self, model_name_to_common_name[k])
-        kwargs["text_config"] = self.text_config
-        kwargs[self.audio_config_key] = self.audio_config
-        if self.projector_config is not None:
-            kwargs["projector_config"] = self.projector_config
-        return self.config_class(**kwargs)
-
     def prepare_config_and_inputs_for_common(self):
-        config = self.get_config()
+        # TODO: add a clear diagram that explains input prep
+
         audio_features = self.create_audio_features()
-        num_audio_tokens = self.get_num_audio_tokens(audio_features)
+        audio_mask = self.create_audio_mask()
+        audio_embeds_mask = self.get_audio_embeds_mask(audio_mask)
 
-        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2
+        if audio_embeds_mask.shape[1] > self.seq_length:
+            raise ValueError(
+                f"`audio_embeds_mask` has more tokens per sequence than `seq_length` allows "
+                f"({audio_embeds_mask.shape[1]} > {self.seq_length}). "
+                "This likely indicates a mismatch between your feature extraction/configuration and your sequence length. "
+                "Please ensure `seq_length` is >= the number of audio embedding positions."
+            )
+         
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        special_tokens = [self.pad_token_id, self.bos_token_id, self.eos_token_id, self.audio_token_id]
+        for i in range(self.vocab_size):
+            if i not in special_tokens:
+                safe_token_id = i
+                break
+        else:
+            raise ValueError("vocab_size is too small and there is no token ID that is not a special token!")
+
+        # Avoid flaky tests, clear any special tokens in ids_tensor
+        # audio_token_id is handled separately by place_audio_tokens()
+        input_ids[input_ids == self.pad_token_id] = safe_token_id
+        input_ids[input_ids == self.eos_token_id] = safe_token_id
+
+        config = self.get_config()
+        num_audio_tokens = audio_embeds_mask.sum(dim=1)
         input_ids = self.place_audio_tokens(input_ids, config, num_audio_tokens)
         attention_mask = self.create_attention_mask(input_ids)
 
         inputs_dict = {
-            self.get_audio_feature_key(): audio_features,
             "input_ids": input_ids,
             "attention_mask": attention_mask,
+            self.get_audio_feature_key(): audio_features,
         }
 
         audio_mask_key = self.get_audio_mask_key()
         if audio_mask_key is not None:
-            inputs_dict[audio_mask_key] = self.create_audio_mask(audio_features)
+            inputs_dict[audio_mask_key] = audio_mask
 
         inputs_dict.update(self.get_additional_inputs(config, input_ids, audio_features))
         return config, inputs_dict
 
+    @property
+    def config_args(self):
+        return list(signature(self.config_class.__init__).parameters.keys())
+    
+    @property
+    def text_config_args(self):
+        args = list(signature(self.text_config_class.__init__).parameters.keys())
+        for token_arg in ["pad_token_id", "bos_token_id", "eos_token_id"]:  # Not always explicitly in the sig
+            if token_arg not in args:
+                args.append(token_arg)
+        return args
+
+    @property
+    def audio_config_args(self):
+        return list(signature(self.audio_config_class.__init__).parameters.keys())
+
+    def get_config(self):
+        kwargs = {}
+        attribute_map = getattr(self.config_class, "attribute_map", {})
+        model_name_to_common_name = {v: k for k, v in attribute_map.items()}
+        for k in self.config_args + self.forced_config_args:
+            if hasattr(self, k) and k != "self":
+                kwargs[k] = getattr(self, k)
+            elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]):
+                kwargs[k] = getattr(self, model_name_to_common_name[k])
+        kwargs["text_config"] = self.get_text_config()
+        kwargs["audio_config"] = self.get_audio_config()
+        return self.config_class(**kwargs)
+
+    def get_text_config(self):
+        kwargs = {}
+        attribute_map = getattr(self.text_config_class, "attribute_map", {})
+        model_name_to_common_name = {v: k for k, v in attribute_map.items()}
+        for k in self.text_config_args:
+            if hasattr(self, k) and k != "self":
+                kwargs[k] = getattr(self, k)
+            elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]):
+                kwargs[k] = getattr(self, model_name_to_common_name[k])
+        return self.text_config_class(**kwargs)
+
+    def get_audio_config(self):
+        kwargs = {}
+        attribute_map = getattr(self.audio_config_class, "attribute_map", {})
+        model_name_to_common_name = {v: k for k, v in attribute_map.items()}
+        for k in self.audio_config_args:
+            if hasattr(self, k) and k != "self":
+                kwargs[k] = getattr(self, k)
+            elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]):
+                kwargs[k] = getattr(self, model_name_to_common_name[k])
+        return self.audio_config_class(**kwargs)
+
+    def create_and_check_model(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = self.base_model_class(config=config)
+        model.to(torch_device)
+        model.eval()
+        model(input_ids, attention_mask=input_mask)
+        result = model(input_ids)
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+
 
 @require_torch
 class ALMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin):

From a599b1de9051e6f0f47867f05e08e5b07e2c7731 Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Sun, 19 Apr 2026 15:24:48 +0200
Subject: [PATCH 05/38] update test_sdpa_can_dispatch_composite_models to
 handle ALMs

---
 tests/alm_tester.py                           | 49 -------------------
 .../test_modeling_audioflamingo3.py           | 49 +++++++++----------
 tests/test_modeling_common.py                 | 26 ++++++----
 3 files changed, 39 insertions(+), 85 deletions(-)

diff --git a/tests/alm_tester.py b/tests/alm_tester.py
index 5fd50997f470..4223e9a87ca4 100644
--- a/tests/alm_tester.py
+++ b/tests/alm_tester.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import tempfile
 import unittest
 from inspect import signature
 
@@ -45,10 +44,6 @@ class ALMModelTester:
     # Override to "encoder_config" for models like GraniteSpeech.
     audio_config_key = "audio_config"
 
-    # Model attribute name for the audio encoder (used in SDPA dispatch tests).
-    # Set to None to skip audio encoder SDPA checking.
-    audio_tower_attr = "audio_tower"
-
     # Arguments that should be passed to the config class even if not in its signature.
     forced_config_args = ["pad_token_id"]
 
@@ -334,50 +329,6 @@ def test_config(self):
         """Test config common functionality."""
         self.config_tester.run_common_tests()
 
-    def test_sdpa_can_dispatch_composite_models(self):
-        """Verify SDPA toggles propagate correctly to audio and text sub-modules."""
-        if not self.has_attentions:
-            self.skipTest(reason="Model architecture does not support attentions")
-
-        if not self._is_composite:
-            self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
-
-        for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            model = model_class(config)
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-
-                # SDPA (default)
-                model_sdpa = model_class.from_pretrained(tmpdirname)
-                model_sdpa = model_sdpa.eval().to(torch_device)
-
-                text_attn = "sdpa" if model.language_model._supports_sdpa else "eager"
-
-                self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
-                self.assertTrue(model.language_model.config._attn_implementation == text_attn)
-
-                audio_tower_attr = self.model_tester.audio_tower_attr
-                if audio_tower_attr is not None:
-                    audio_tower = getattr(model, audio_tower_attr)
-                    audio_attn = "sdpa" if audio_tower._supports_sdpa else "eager"
-                    self.assertTrue(audio_tower.config._attn_implementation == audio_attn)
-
-                # Eager
-                model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
-                model_eager = model_eager.eval().to(torch_device)
-                self.assertTrue(model_eager.config._attn_implementation == "eager")
-                self.assertTrue(model_eager.language_model.config._attn_implementation == "eager")
-
-                if audio_tower_attr is not None:
-                    self.assertTrue(getattr(model_eager, audio_tower_attr).config._attn_implementation == "eager")
-
-                for _, submodule in model_eager.named_modules():
-                    class_name = submodule.__class__.__name__
-                    if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
-                        raise ValueError("The eager model should not have SDPA attention layers")
-
     @unittest.skip("Audio-LMs have no separate base model without a head.")
     def test_model_base_model_prefix(self):
         pass
diff --git a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py
index 86d82cf4294d..153c6ba11b52 100644
--- a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py
+++ b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py
@@ -22,6 +22,8 @@
 
 from transformers import (
     AudioFlamingo3Config,
+    AudioFlamingo3EncoderConfig,
+    Qwen2Config,
     AudioFlamingo3ForConditionalGeneration,
     AutoProcessor,
     is_torch_available,
@@ -43,29 +45,35 @@
 class AudioFlamingo3ModelTester(ALMModelTester):
     config_class = AudioFlamingo3Config
     conditional_generation_class = AudioFlamingo3ForConditionalGeneration
+    text_config_class = Qwen2Config
+    audio_config_class = AudioFlamingo3EncoderConfig
+
 
     def __init__(self, parent, **kwargs):
-        kwargs.setdefault(
-            "audio_config",
-            {
-                "model_type": "audioflamingo3_encoder",
-                "hidden_size": 16,
-                "num_attention_heads": 4,
-                "intermediate_size": 16,
-                "num_hidden_layers": 2,
-                "num_mel_bins": 80,
-                "max_source_positions": 30,
-                "initializer_range": 0.02,
-            },
-        )
+        # feat_seq_length → (L-1)//2+1 after conv2 → (·-2)//2+1 after avg_pool, so
+        # feat_seq_length=60 gives 15 audio embed tokens (fits inside seq_length=32 + BOS + text).
+        kwargs.setdefault("feat_seq_length", 60)
+        # Encoder adds a learned positional embedding of size max_source_positions to post-conv2 features,
+        # so it must equal (feat_seq_length - 1) // 2 + 1.
+        kwargs.setdefault("max_source_positions", (kwargs["feat_seq_length"] - 1) // 2 + 1)
         super().__init__(parent, **kwargs)
 
     def get_audio_mask_key(self):
         return "input_features_mask"
 
-    def create_audio_mask(self, audio_features):
+    def create_audio_mask(self):
         return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device)
 
+    def get_audio_embeds_mask(self, audio_mask):
+        # Mirrors AudioFlamingo3Encoder._get_feat_extract_output_lengths:
+        # conv2 (k=3,s=2,p=1) then avg_pool (k=2,s=2).
+        input_lengths = audio_mask.sum(-1)
+        input_lengths = (input_lengths - 1) // 2 + 1
+        output_lengths = (input_lengths - 2) // 2 + 1
+        max_len = int(output_lengths.max().item())
+        positions = torch.arange(max_len, device=audio_mask.device)[None, :]
+        return (positions < output_lengths[:, None]).long()
+
 
 @require_torch
 class AudioFlamingo3ForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase):
@@ -91,19 +99,6 @@ class AudioFlamingo3ForConditionalGenerationModelTest(ALMModelTest, unittest.Tes
     def test_inputs_embeds_matches_input_ids(self):
         pass
 
-    @unittest.skip(reason="Compile not yet supported for AudioFlamingo3 models")
-    @pytest.mark.torch_compile_test
-    def test_sdpa_can_compile_dynamic(self):
-        pass
-
-    @unittest.skip(reason="Compile not yet supported for AudioFlamingo3 models")
-    def test_sdpa_can_dispatch_on_flash(self):
-        pass
-
-    @unittest.skip(reason="AudioFlamingo3 tests avoid right-padding equivalence; fusion is in-place.")
-    def test_flash_attn_2_inference_equivalence_right_padding(self):
-        pass
-
 
 @require_torch
 class AudioFlamingo3ForConditionalGenerationIntegrationTest(unittest.TestCase):
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 24f278c24704..ac754f3d672a 100644
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -3584,30 +3584,38 @@ def test_sdpa_can_dispatch_composite_models(self):
                 model_sdpa = model_class.from_pretrained(tmpdirname)
                 model_sdpa = model_sdpa.base_model
 
-                vision_model_names = {"visual", "image_tower", "vision_tower", "vision_model"}
+                modality_tower_names = {
+                    "visual",
+                    "image_tower",
+                    "vision_tower",
+                    "vision_model",
+                    "audio_tower",
+                    "audio_model",
+                }
                 language_model_names = {"language_model", "model", "text_model"}
-                vision_model_name = [name for name in vision_model_names if hasattr(model_sdpa, name)]
-                vision_model_name = vision_model_name[0] if len(vision_model_name) > 0 else None
+                modality_tower_name = [name for name in modality_tower_names if hasattr(model_sdpa, name)]
+                modality_tower_name = modality_tower_name[0] if len(modality_tower_name) > 0 else None
                 language_model_name = [name for name in language_model_names if hasattr(model_sdpa, name)]
                 language_model_name = language_model_name[0] if len(language_model_name) > 0 else None
-                if language_model_name is None or vision_model_name is None:
+                if language_model_name is None or modality_tower_name is None:
                     self.skipTest(
-                        reason="Model does not have both vision and language sub-models, cannot test composite SDPA dispatch"
+                        reason="Model does not have both a non-text modality tower and a language sub-model, "
+                        "cannot test composite SDPA dispatch"
                     )
-                vision_model_sdpa = getattr(model_sdpa, vision_model_name)
+                modality_tower_sdpa = getattr(model_sdpa, modality_tower_name)
                 language_model_sdpa = getattr(model_sdpa, language_model_name)
                 text_attn = "sdpa" if language_model_sdpa._supports_sdpa else "eager"
-                vision_attn = "sdpa" if vision_model_sdpa._supports_sdpa else "eager"
+                modality_attn = "sdpa" if modality_tower_sdpa._supports_sdpa else "eager"
 
                 # `None` as it is the requested one which will be assigned to each sub-config
                 # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present)
                 self.assertTrue(language_model_sdpa.config._attn_implementation == text_attn)
-                self.assertTrue(vision_model_sdpa.config._attn_implementation == vision_attn)
+                self.assertTrue(modality_tower_sdpa.config._attn_implementation == modality_attn)
 
                 model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
                 model_eager = model_eager.base_model
                 self.assertTrue(getattr(model_eager, language_model_name).config._attn_implementation == "eager")
-                self.assertTrue(getattr(model_eager, vision_model_name).config._attn_implementation == "eager")
+                self.assertTrue(getattr(model_eager, modality_tower_name).config._attn_implementation == "eager")
 
                 for name, submodule in model_eager.named_modules():
                     class_name = submodule.__class__.__name__

From a7d54dc554c80c19013c4ce7d04fa12748c23b9f Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Mon, 20 Apr 2026 10:43:46 +0200
Subject: [PATCH 06/38] propagate to other model classes

---
 tests/alm_tester.py                           |  13 +-
 tests/models/glmasr/test_modeling_glmasr.py   | 170 +++-------------
 .../test_modeling_granite_speech.py           |  94 +++------
 .../test_modeling_musicflamingo.py            | 183 ++++-------------
 .../qwen2_audio/test_modeling_qwen2_audio.py  |  34 ++++
 tests/models/voxtral/test_modeling_voxtral.py | 167 +++-------------
 .../test_modeling_voxtral_realtime.py         | 189 +++++++-----------
 7 files changed, 232 insertions(+), 618 deletions(-)

diff --git a/tests/alm_tester.py b/tests/alm_tester.py
index 4223e9a87ca4..4c104e6dd49d 100644
--- a/tests/alm_tester.py
+++ b/tests/alm_tester.py
@@ -136,14 +136,17 @@ def get_audio_embeds_mask(self, audio_embeds_mask):
         raise NotImplementedError("This method should be overridden in the subclass")
 
     def place_audio_tokens(self, input_ids, config, num_audio_tokens):
-        """Place audio placeholder tokens at random positions in input_ids. Override for different placement."""
+        """Place audio placeholder tokens contiguously after BOS. Override for different placement.
+
+        Deterministic placement (position 0 reserved for BOS; audio tokens at [1:1+n]) keeps
+        the tail of each sequence text-only, which downstream tests (e.g. resize_token_embeddings
+        overwriting column -2) rely on.
+        """
         input_ids = input_ids.clone()
         input_ids[input_ids == self.audio_token_id] = self.pad_token_id
         for i in range(input_ids.shape[0]):
             n = num_audio_tokens[i].item() if isinstance(num_audio_tokens, torch.Tensor) else num_audio_tokens
-            available_positions = torch.arange(1, input_ids.shape[1])  # skip position 0 (BOS)
-            perm = torch.randperm(len(available_positions))[:n]
-            input_ids[i, available_positions[perm]] = self.audio_token_id
+            input_ids[i, 1 : 1 + int(n)] = self.audio_token_id
         return input_ids
 
     def get_audio_feature_key(self):
@@ -249,7 +252,7 @@ def get_config(self):
             elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]):
                 kwargs[k] = getattr(self, model_name_to_common_name[k])
         kwargs["text_config"] = self.get_text_config()
-        kwargs["audio_config"] = self.get_audio_config()
+        kwargs[self.audio_config_key] = self.get_audio_config()
         return self.config_class(**kwargs)
 
     def get_text_config(self):
diff --git a/tests/models/glmasr/test_modeling_glmasr.py b/tests/models/glmasr/test_modeling_glmasr.py
index 744e268e74c7..8b93ad64337d 100644
--- a/tests/models/glmasr/test_modeling_glmasr.py
+++ b/tests/models/glmasr/test_modeling_glmasr.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 """Testing suite for the PyTorch glmasr model."""
 
-import tempfile
 import unittest
 
 import pytest
@@ -22,8 +21,10 @@
     AutoProcessor,
     GlmAsrConfig,
     GlmAsrForConditionalGeneration,
+    LlamaConfig,
     is_torch_available,
 )
+from transformers.models.glmasr.configuration_glmasr import GlmAsrEncoderConfig
 from transformers.testing_utils import (
     cleanup,
     require_torch,
@@ -31,123 +32,53 @@
     torch_device,
 )
 
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
+from ...alm_tester import ALMModelTest, ALMModelTester
 
 
 if is_torch_available():
     import torch
 
 
-class GlmAsrModelTester:
-    def __init__(
-        self,
-        parent,
-        ignore_index=-100,
-        audio_token_id=0,
-        seq_length=35,
-        feat_seq_length=64,
-        text_config={
-            "model_type": "llama",
-            "intermediate_size": 64,
-            "initializer_range": 0.02,
-            "hidden_size": 16,
-            "max_position_embeddings": 52,
-            "num_hidden_layers": 2,
-            "num_attention_heads": 4,
-            "use_labels": True,
-            "use_mrope": False,
-            "vocab_size": 99,
-            "head_dim": 8,
-            "pad_token_id": 1,  # can't be the same as the audio token id
-        },
-        is_training=True,
-        audio_config={
-            "model_type": "glmasr_encoder",
-            "hidden_size": 128,
-            "num_attention_heads": 2,
-            "intermediate_size": 512,
-            "num_hidden_layers": 2,
-            "num_mel_bins": 128,
-            "max_source_positions": 32,
-            "initializer_range": 0.02,
-        },
-    ):
-        self.parent = parent
-        self.ignore_index = ignore_index
-        self.audio_token_id = audio_token_id
-        self.text_config = text_config
-        self.audio_config = audio_config
-        self.seq_length = seq_length
-        self.feat_seq_length = feat_seq_length
-
-        self.num_hidden_layers = text_config["num_hidden_layers"]
-        self.vocab_size = text_config["vocab_size"]
-        self.hidden_size = text_config["hidden_size"]
-        self.num_attention_heads = text_config["num_attention_heads"]
-        self.is_training = is_training
-
-        self.batch_size = 3
-        self.encoder_seq_length = seq_length
-
-    def get_config(self):
-        return GlmAsrConfig(
-            text_config=self.text_config,
-            audio_config=self.audio_config,
-            ignore_index=self.ignore_index,
-            audio_token_id=self.audio_token_id,
-        )
+class GlmAsrModelTester(ALMModelTester):
+    config_class = GlmAsrConfig
+    conditional_generation_class = GlmAsrForConditionalGeneration
+    text_config_class = LlamaConfig
+    audio_config_class = GlmAsrEncoderConfig
 
-    def prepare_config_and_inputs(self):
-        input_features_values = floats_tensor(
-            [
-                self.batch_size,
-                self.audio_config["num_mel_bins"],
-                self.feat_seq_length,
-            ]
-        )
-        config = self.get_config()
-        input_features_mask = torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device)
-        return config, input_features_values, input_features_mask
+    def __init__(self, parent, **kwargs):
+        # feat_seq_length=64 → conv2 (s=2): post_conv=32 → merge_factor=4: 8 audio embed tokens.
+        kwargs.setdefault("feat_seq_length", 64)
+        kwargs.setdefault("seq_length", 35)
+        kwargs.setdefault("head_dim", 8)
+        super().__init__(parent, **kwargs)
 
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_features_values, input_features_mask = config_and_inputs
-        num_audio_tokens_per_batch_idx = 8
+    def get_audio_mask_key(self):
+        return "input_features_mask"
 
-        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
-        attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)
-        attention_mask[:, :1] = 0
+    def create_audio_mask(self):
+        return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device)
 
-        input_ids[:, 1 : 1 + num_audio_tokens_per_batch_idx] = config.audio_token_id
-        inputs_dict = {
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-            "input_features": input_features_values,
-            "input_features_mask": input_features_mask,
-        }
-        return config, inputs_dict
+    def get_audio_embeds_mask(self, audio_mask):
+        # conv1 (s=1) preserves length; conv2 (s=2, k=3, p=1) halves; merge_factor=4 post-projector.
+        audio_lengths = audio_mask.sum(-1)
+        for padding, kernel_size, stride in [(1, 3, 1), (1, 3, 2)]:
+            audio_lengths = (audio_lengths + 2 * padding - (kernel_size - 1) - 1) // stride + 1
+        merge_factor = 4
+        post_lengths = (audio_lengths - merge_factor) // merge_factor + 1
+        max_len = int(post_lengths.max().item())
+        positions = torch.arange(max_len, device=audio_mask.device)[None, :]
+        return (positions < post_lengths[:, None]).long()
 
 
 @require_torch
-class GlmAsrForConditionalGenerationModelTest(
-    ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase
-):
+class GlmAsrForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase):
     """
     Model tester for `GlmAsrForConditionalGeneration`.
     """
 
-    all_model_classes = (GlmAsrForConditionalGeneration,) if is_torch_available() else ()
+    model_tester_class = GlmAsrModelTester
     pipeline_model_mapping = {"audio-text-to-text": GlmAsrForConditionalGeneration} if is_torch_available() else {}
 
-    _is_composite = True
-
-    def setUp(self):
-        self.model_tester = GlmAsrModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=GlmAsrConfig, has_text_modality=False)
-
     @unittest.skip(
         reason="This test does not apply to GlmAsr since inputs_embeds corresponding to audio tokens are replaced when input features are provided."
     )
@@ -167,47 +98,6 @@ def test_sdpa_can_dispatch_on_flash(self):
     def test_flash_attn_2_inference_equivalence_right_padding(self):
         pass
 
-    @unittest.skip(reason="GlmAsr has no separate base model without a head.")
-    def test_model_base_model_prefix(self):
-        pass
-
-    def test_sdpa_can_dispatch_composite_models(self):
-        # GlmAsr is audio+text composite; verify SDPA toggles propagate to submodules.
-        if not self.has_attentions:
-            self.skipTest(reason="Model architecture does not support attentions")
-
-        if not self._is_composite:
-            self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
-
-        for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            model = model_class(config)
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                # SDPA (default)
-                model_sdpa = model_class.from_pretrained(tmpdirname)
-                model_sdpa = model_sdpa.eval().to(torch_device)
-
-                text_attn = "sdpa" if model.language_model._supports_sdpa else "eager"
-                audio_attn = "sdpa" if model.audio_tower._supports_sdpa else "eager"
-
-                self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
-                self.assertTrue(model.language_model.config._attn_implementation == text_attn)
-                self.assertTrue(model.audio_tower.config._attn_implementation == audio_attn)
-
-                # Eager
-                model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
-                model_eager = model_eager.eval().to(torch_device)
-                self.assertTrue(model_eager.config._attn_implementation == "eager")
-                self.assertTrue(model_eager.language_model.config._attn_implementation == "eager")
-                self.assertTrue(model_eager.audio_tower.config._attn_implementation == "eager")
-
-                for _, submodule in model_eager.named_modules():
-                    class_name = submodule.__class__.__name__
-                    if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
-                        raise ValueError("The eager model should not have SDPA attention layers")
-
 
 @require_torch
 class GlmAsrForConditionalGenerationIntegrationTest(unittest.TestCase):
diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py
index 4b0e91ddbd36..f7c76cb4093e 100644
--- a/tests/models/granite_speech/test_modeling_granite_speech.py
+++ b/tests/models/granite_speech/test_modeling_granite_speech.py
@@ -19,7 +19,9 @@
 
 from transformers import (
     AutoProcessor,
+    GraniteConfig,
     GraniteSpeechConfig,
+    GraniteSpeechEncoderConfig,
     GraniteSpeechForConditionalGeneration,
 )
 from transformers.testing_utils import (
@@ -48,80 +50,39 @@
 class GraniteSpeechModelTester(ALMModelTester):
     config_class = GraniteSpeechConfig
     conditional_generation_class = GraniteSpeechForConditionalGeneration
+    text_config_class = GraniteConfig
+    audio_config_class = GraniteSpeechEncoderConfig
     audio_config_key = "encoder_config"
-    audio_tower_attr = None  # Encoder SDPA not checked
 
     def __init__(self, parent, **kwargs):
         kwargs.setdefault("seq_length", 9)  # 7 text + 2 audio tokens
         kwargs.setdefault("num_audio_tokens", 2)
         kwargs.setdefault("sequence_dim", 844)
         kwargs.setdefault("feature_dim", 160)
-        kwargs.setdefault("audio_token_index", 0)
-        kwargs.setdefault("tie_word_embeddings", True)
-        kwargs.setdefault("initializer_range", 0.02)
         kwargs.setdefault("has_lora_adapter", True)
         kwargs.setdefault("downsample_rate", 5)
         kwargs.setdefault("window_size", 15)
+        # GraniteSpeechEncoderConfig fields (no attribute_map, so set explicitly).
+        kwargs.setdefault("input_dim", 160)
+        kwargs.setdefault("num_layers", 2)
+        kwargs.setdefault("hidden_dim", 32)
+        kwargs.setdefault("num_heads", 4)
+        kwargs.setdefault("dim_head", 8)
+        kwargs.setdefault("feedforward_mult", 4)
+        kwargs.setdefault("context_size", 200)
+        kwargs.setdefault("conv_kernel_size", 15)
+        kwargs.setdefault("conv_expansion_factor", 2)
+        kwargs.setdefault("output_dim", 42)
+        # Q-Former projector config (passed through as a dict; ALM's get_config forwards unknowns).
         kwargs.setdefault(
-            "text_config",
+            "projector_config",
             {
-                "model_type": "granite",
-                "is_training": True,
-                "seq_length": 7,
-                "use_token_type_ids": False,
-                "use_labels": True,
-                "vocab_size": 99,
+                "model_type": "blip_2_qformer",
                 "hidden_size": 32,
                 "num_hidden_layers": 2,
                 "num_attention_heads": 4,
-                "intermediate_size": 37,
-                "hidden_act": "gelu",
-                "hidden_dropout_prob": 0.1,
-                "attention_probs_dropout_prob": 0.1,
-                "max_position_embeddings": 580,
-                "type_vocab_size": 16,
-                "type_sequence_label_size": 2,
-                "initializer_range": 0.02,
-                "num_labels": 3,
-                "num_choices": 4,
-                "pad_token_id": 1,
-            },
-        )
-        kwargs.setdefault(
-            "audio_config",
-            {
-                "model_type": "granite_speech_encoder",
-                "context_size": 200,
-                "conv_expansion_factor": 2,
-                "conv_kernel_size": 15,
-                "dim_head": 32,
-                "dropout": 0.1,
-                "feedforward_mult": 4,
-                "hidden_dim": 32,
-                "input_dim": 160,
-                "num_heads": 4,
-                "num_layers": 2,
-                "output_dim": 42,
-            },
-        )
-        kwargs.setdefault(
-            "projector_config",
-            {
-                "attention_probs_dropout_prob": 0.1,
-                "cross_attention_frequency": 1,
-                "encoder_hidden_size": 32,
-                "hidden_act": "gelu",
-                "hidden_dropout_prob": 0.1,
-                "hidden_size": 32,
-                "initializer_range": 0.02,
                 "intermediate_size": 256,
-                "layer_norm_eps": 1e-12,
-                "max_position_embeddings": 2048,
-                "model_type": "blip_2_qformer",
-                "num_attention_heads": 4,
-                "num_hidden_layers": 2,
-                "use_qformer_text_input": False,
-                "vocab_size": 30522,
+                "encoder_hidden_size": 32,
             },
         )
         super().__init__(parent, **kwargs)
@@ -129,17 +90,16 @@ def __init__(self, parent, **kwargs):
     def create_audio_features(self):
         return floats_tensor([self.batch_size, self.sequence_dim, self.feature_dim])
 
-    def create_attention_mask(self, input_ids):
-        return torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)
+    def create_audio_mask(self):
+        # Granite's encoder is fed the raw features; mask is all-ones over sequence_dim.
+        return torch.ones([self.batch_size, self.sequence_dim], dtype=torch.bool).to(torch_device)
 
-    def get_num_audio_tokens(self, audio_features):
-        return self.num_audio_tokens
+    def get_audio_embeds_mask(self, audio_mask):
+        # Projector produces `num_audio_tokens` embeds per sample (fixed by window_size/downsample_rate).
+        return torch.ones([self.batch_size, self.num_audio_tokens], dtype=torch.long).to(torch_device)
 
-    def place_audio_tokens(self, input_ids, config, num_audio_tokens):
-        input_ids = input_ids.clone()
-        input_ids[input_ids == self.audio_token_id] = self.pad_token_id
-        input_ids[:, :num_audio_tokens] = self.audio_token_id
-        return input_ids
+    def create_attention_mask(self, input_ids):
+        return torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)
 
     def create_and_check_granite_speech_model_fp16_forward(self, config, input_ids, input_features, attention_mask):
         model = GraniteSpeechForConditionalGeneration(config=config)
diff --git a/tests/models/musicflamingo/test_modeling_musicflamingo.py b/tests/models/musicflamingo/test_modeling_musicflamingo.py
index 8c3b0ce549c8..9b8153705582 100644
--- a/tests/models/musicflamingo/test_modeling_musicflamingo.py
+++ b/tests/models/musicflamingo/test_modeling_musicflamingo.py
@@ -16,16 +16,17 @@
 
 import json
 import os
-import tempfile
 import unittest
 from pathlib import Path
 
 import pytest
 
 from transformers import (
+    AudioFlamingo3EncoderConfig,
     AutoProcessor,
     MusicFlamingoConfig,
     MusicFlamingoForConditionalGeneration,
+    Qwen2Config,
     is_torch_available,
 )
 from transformers.testing_utils import (
@@ -37,129 +38,60 @@
     torch_device,
 )
 
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
+from ...alm_tester import ALMModelTest, ALMModelTester
+from ...test_modeling_common import ids_tensor
 
 
 if is_torch_available():
     import torch
 
 
-class MusicFlamingoModelTester:
+class MusicFlamingoModelTester(ALMModelTester):
     """
     Builds a tiny MusicFlamingo config and synthetic inputs that respect MusicFlamingo's
     post-pool token accounting: num <sound> tokens per sample == post-pool frame count.
     """
 
-    def __init__(
-        self,
-        parent,
-        audio_token_id=0,
-        seq_length=25,
-        feat_seq_length=60,
-        text_config=None,
-        audio_config=None,
-        is_training=True,
-    ):
-        self.parent = parent
-        self.audio_token_id = audio_token_id
-        self.seq_length = seq_length
-        self.feat_seq_length = feat_seq_length
-        self.is_training = is_training
-
-        # Small text backbone (Qwen2-ish)
-        if text_config is None:
-            text_config = {
-                "model_type": "qwen2",
-                "intermediate_size": 36,
-                "initializer_range": 0.02,
-                "hidden_size": 32,
-                "max_position_embeddings": 52,
-                "num_hidden_layers": 2,
-                "num_attention_heads": 4,
-                "num_key_value_heads": 2,
-                "use_labels": True,
-                "use_mrope": False,
-                "vocab_size": 99,
-                "pad_token_id": 1,  # Ensure pad token != audio token
-            }
-        # Small audio encoder (MusicFlamingo Whisper-style)
-        if audio_config is None:
-            audio_config = {
-                "model_type": "musicflamingo_encoder",
-                "hidden_size": 16,
-                "num_attention_heads": 4,
-                "intermediate_size": 16,
-                "num_hidden_layers": 2,
-                "num_mel_bins": 80,
-                "max_source_positions": 30,
-                "initializer_range": 0.02,
-            }
+    config_class = MusicFlamingoConfig
+    conditional_generation_class = MusicFlamingoForConditionalGeneration
+    text_config_class = Qwen2Config
+    audio_config_class = AudioFlamingo3EncoderConfig
 
-        self.text_config = text_config
-        self.audio_config = audio_config
+    def __init__(self, parent, **kwargs):
+        # feat_seq_length=60 → (60-1)//2+1=30 (= max_source_positions below) → (30-2)//2+1=15 audio embed tokens.
+        kwargs.setdefault("feat_seq_length", 60)
+        kwargs.setdefault("max_source_positions", (kwargs["feat_seq_length"] - 1) // 2 + 1)
+        super().__init__(parent, **kwargs)
 
-        self.batch_size = 3
-        self.vocab_size = text_config["vocab_size"]
-        self.hidden_size = text_config["hidden_size"]
-        self.num_attention_heads = text_config["num_attention_heads"]
-        self.num_hidden_layers = text_config["num_hidden_layers"]
-        self.encoder_seq_length = seq_length
+    def get_audio_mask_key(self):
+        return "input_features_mask"
 
-    def get_config(self):
-        return MusicFlamingoConfig(
-            text_config=self.text_config,
-            audio_config=self.audio_config,
-            audio_token_id=self.audio_token_id,
-            rope_parameters={"rope_type": "default", "rope_theta": 2048, "partial_rotary_factor": 0.5},
-        )
+    def create_audio_mask(self):
+        return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device)
 
-    def prepare_config_and_inputs(self):
-        # (#windows == batch_size, n_mels, T_mel)
-        input_features_values = floats_tensor(
-            [self.batch_size, self.audio_config["num_mel_bins"], self.feat_seq_length]
-        )
-        config = self.get_config()
-        # Per-window mel validity (all ones => full length)
-        input_features_mask = torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device)
-        return config, input_features_values, input_features_mask
-
-    def _post_pool_tokens_per_window(self, T_mel):
-        # Mirror MusicFlamingo processor math:
-        pre = (T_mel - 1) // 2 + 1
-        post = (pre - 2) // 2 + 1
-        return post
-
-    def prepare_config_and_inputs_for_common(self):
-        config, input_features_values, input_features_mask = self.prepare_config_and_inputs()
-        # Every window has same T_mel here
-        num_audio_tokens_per_sample = self._post_pool_tokens_per_window(input_features_values.shape[-1])
-
-        # Build token ids with valid range and K <sound> tokens
-        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2
-        attention_mask = torch.ones_like(input_ids, dtype=torch.long, device=torch_device)
-        attention_mask[:, :1] = 0  # left padding sentinel
-
-        # Fill first K positions (after padding) with the audio token id, for each sample
-        input_ids[:, 1 : 1 + num_audio_tokens_per_sample] = config.audio_token_id
-
-        inputs_dict = {
-            "input_features": input_features_values,
-            "input_features_mask": input_features_mask,
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-        }
-        return config, inputs_dict
+    def get_audio_embeds_mask(self, audio_mask):
+        # AudioFlamingo3Encoder._get_feat_extract_output_lengths: conv2 (k=3,s=2) then avg_pool (k=2,s=2).
+        input_lengths = audio_mask.sum(-1)
+        input_lengths = (input_lengths - 1) // 2 + 1
+        output_lengths = (input_lengths - 2) // 2 + 1
+        max_len = int(output_lengths.max().item())
+        positions = torch.arange(max_len, device=audio_mask.device)[None, :]
+        return (positions < output_lengths[:, None]).long()
+
+    def get_config(self):
+        # MusicFlamingoConfig requires rope_parameters.
+        config = super().get_config()
+        config.rope_parameters = {"rope_type": "default", "rope_theta": 2048, "partial_rotary_factor": 0.5}
+        return config
 
 
 @require_torch
-class MusicFlamingoForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
+class MusicFlamingoForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase):
     """
     Model tester for `MusicFlamingoForConditionalGeneration`.
     """
 
-    all_model_classes = (MusicFlamingoForConditionalGeneration,) if is_torch_available() else ()
+    model_tester_class = MusicFlamingoModelTester
     pipeline_model_mapping = (
         {
             "text-to-speech": MusicFlamingoForConditionalGeneration,
@@ -168,11 +100,6 @@ class MusicFlamingoForConditionalGenerationModelTest(ModelTesterMixin, Generatio
         if is_torch_available()
         else {}
     )
-    _is_composite = True
-
-    def setUp(self):
-        self.model_tester = MusicFlamingoModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=MusicFlamingoConfig, has_text_modality=False)
 
     def test_rotary_window_axis_resets_per_audio(self):
         config = self.model_tester.get_config()
@@ -246,48 +173,6 @@ def test_sdpa_can_dispatch_on_flash(self):
     def test_flash_attn_2_inference_equivalence_right_padding(self):
         pass
 
-    @unittest.skip(reason="MusicFlamingo has no separate base model without a head.")
-    def test_model_base_model_prefix(self):
-        pass
-
-    def test_sdpa_can_dispatch_composite_models(self):
-        # MusicFlamingo is audio+text composite; verify SDPA toggles propagate to submodules.
-        if not self.has_attentions:
-            self.skipTest(reason="Model architecture does not support attentions")
-
-        if not self._is_composite:
-            self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
-
-        for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            model = model_class(config)
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-
-                # SDPA (default)
-                model_sdpa = model_class.from_pretrained(tmpdirname)
-                model_sdpa = model_sdpa.eval().to(torch_device)
-
-                text_attn = "sdpa" if model.language_model._supports_sdpa else "eager"
-                audio_attn = "sdpa" if model.audio_tower._supports_sdpa else "eager"
-
-                self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
-                self.assertTrue(model.language_model.config._attn_implementation == text_attn)
-                self.assertTrue(model.audio_tower.config._attn_implementation == audio_attn)
-
-                # Eager
-                model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
-                model_eager = model_eager.eval().to(torch_device)
-                self.assertTrue(model_eager.config._attn_implementation == "eager")
-                self.assertTrue(model_eager.language_model.config._attn_implementation == "eager")
-                self.assertTrue(model_eager.audio_tower.config._attn_implementation == "eager")
-
-                for _, submodule in model_eager.named_modules():
-                    class_name = submodule.__class__.__name__
-                    if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
-                        raise ValueError("The eager model should not have SDPA attention layers")
-
 
 @require_torch
 class MusicFlamingoForConditionalGenerationIntegrationTest(unittest.TestCase):
diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
index 5733a4347568..1130220301ea 100644
--- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
+++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
@@ -23,7 +23,9 @@
 from transformers import (
     AutoProcessor,
     Qwen2AudioConfig,
+    Qwen2AudioEncoderConfig,
     Qwen2AudioForConditionalGeneration,
+    Qwen2Config,
     is_torch_available,
 )
 from transformers.testing_utils import (
@@ -43,10 +45,36 @@
 class Qwen2AudioModelTester(ALMModelTester):
     config_class = Qwen2AudioConfig
     conditional_generation_class = Qwen2AudioForConditionalGeneration
+    text_config_class = Qwen2Config
+    audio_config_class = Qwen2AudioEncoderConfig
+
+    def __init__(self, parent, **kwargs):
+        # feat_seq_length=60 → after conv2 s=2: 30 → after avg_pool s=2: 15 audio embed tokens.
+        kwargs.setdefault("feat_seq_length", 60)
+        # Encoder asserts input_features.shape[-1] == 2 * max_source_positions (conv1.stride * conv2.stride == 2).
+        kwargs.setdefault("max_source_positions", kwargs["feat_seq_length"] // 2)
+        # Qwen2AudioEncoderConfig only maps `num_hidden_layers`; override remaining size knobs explicitly.
+        kwargs.setdefault("d_model", 32)
+        kwargs.setdefault("encoder_attention_heads", 2)
+        kwargs.setdefault("encoder_ffn_dim", 32)
+        super().__init__(parent, **kwargs)
 
     def get_audio_mask_key(self):
         return "feature_attention_mask"
 
+    def create_audio_mask(self):
+        # Qwen2Audio expects full-length mel input; mask with all 1s.
+        return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device)
+
+    def get_audio_embeds_mask(self, audio_mask):
+        # Mirrors Qwen2AudioEncoder._get_feat_extract_output_lengths: conv2 (k=3,s=2,p=1) then avg_pool (k=2,s=2).
+        input_lengths = audio_mask.sum(-1)
+        input_lengths = (input_lengths - 1) // 2 + 1
+        output_lengths = (input_lengths - 2) // 2 + 1
+        max_len = int(output_lengths.max().item())
+        positions = torch.arange(max_len, device=audio_mask.device)[None, :]
+        return (positions < output_lengths[:, None]).long()
+
 
 @require_torch
 class Qwen2AudioForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase):
@@ -66,6 +94,12 @@ def test_sdpa_can_compile_dynamic(self):
     def test_sdpa_can_dispatch_on_flash(self):
         pass
 
+    @unittest.skip(
+        reason="inputs_embeds is the audio-fused path; can't match raw token-only embeddings."
+    )
+    def test_inputs_embeds_matches_input_ids(self):
+        pass
+
 
 @require_torch
 class Qwen2AudioForConditionalGenerationIntegrationTest(unittest.TestCase):
diff --git a/tests/models/voxtral/test_modeling_voxtral.py b/tests/models/voxtral/test_modeling_voxtral.py
index 0cff2a66779b..adc8b1bdc767 100644
--- a/tests/models/voxtral/test_modeling_voxtral.py
+++ b/tests/models/voxtral/test_modeling_voxtral.py
@@ -13,12 +13,13 @@
 # limitations under the License.
 """Testing suite for the PyTorch Voxtral model."""
 
-import tempfile
 import unittest
 
 from transformers import (
     AutoProcessor,
+    LlamaConfig,
     VoxtralConfig,
+    VoxtralEncoderConfig,
     VoxtralForConditionalGeneration,
     is_torch_available,
 )
@@ -30,126 +31,53 @@
     torch_device,
 )
 
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
+from ...alm_tester import ALMModelTest, ALMModelTester
 
 
 if is_torch_available():
     import torch
 
 
-class VoxtralModelTester:
-    def __init__(
-        self,
-        parent,
-        ignore_index=-100,
-        audio_token_id=0,
-        seq_length=35,
-        feat_seq_length=60,
-        text_config={
-            "model_type": "llama",
-            "intermediate_size": 36,
-            "initializer_range": 0.02,
-            "hidden_size": 32,
-            "max_position_embeddings": 52,
-            "num_hidden_layers": 2,
-            "num_attention_heads": 4,
-            "num_key_value_heads": 2,
-            "use_labels": True,
-            "use_mrope": False,
-            "vocab_size": 99,
-            "head_dim": 8,
-            "pad_token_id": 1,  # can't be the same as the audio token id
-        },
-        is_training=True,
-        audio_config={
-            "model_type": "voxtral_encoder",
-            "hidden_size": 16,
-            "num_attention_heads": 4,
-            "intermediate_size": 16,
-            "num_hidden_layers": 2,
-            "num_mel_bins": 80,
-            "max_source_positions": 30,
-            "initializer_range": 0.02,
-        },
-    ):
-        self.parent = parent
-        self.ignore_index = ignore_index
-        self.audio_token_id = audio_token_id
-        self.text_config = text_config
-        self.audio_config = audio_config
-        self.seq_length = seq_length
-        self.feat_seq_length = feat_seq_length
-
-        self.num_hidden_layers = text_config["num_hidden_layers"]
-        self.vocab_size = text_config["vocab_size"]
-        self.hidden_size = text_config["hidden_size"]
-        self.num_attention_heads = text_config["num_attention_heads"]
-        self.is_training = is_training
-
-        self.batch_size = 3
-        self.encoder_seq_length = seq_length
-
-    def get_config(self):
-        return VoxtralConfig(
-            text_config=self.text_config,
-            audio_config=self.audio_config,
-            ignore_index=self.ignore_index,
-            audio_token_id=self.audio_token_id,
-        )
-
-    def prepare_config_and_inputs(self):
-        input_features_values = floats_tensor(
-            [
-                self.batch_size,
-                self.audio_config["num_mel_bins"],
-                self.feat_seq_length,
-            ]
-        )
-        config = self.get_config()
-        return config, input_features_values
+class VoxtralModelTester(ALMModelTester):
+    config_class = VoxtralConfig
+    conditional_generation_class = VoxtralForConditionalGeneration
+    text_config_class = LlamaConfig
+    audio_config_class = VoxtralEncoderConfig
 
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_features_values = config_and_inputs
-        num_audio_tokens_per_batch_idx = 30
+    def __init__(self, parent, **kwargs):
+        # seq_length 35 = BOS + 30 audio + 4 text (keeps column -2 text-only for resize test).
+        kwargs.setdefault("seq_length", 35)
+        # feat_seq_length 60 → conv2(s=2) → 30 audio embeds (Voxtral's encoder does not apply avg_pool
+        # in the forward; projector reshapes to B*30 embeddings).
+        kwargs.setdefault("feat_seq_length", 60)
+        # Encoder asserts input_features.shape[-1] == max_source_positions * 2.
+        kwargs.setdefault("max_source_positions", kwargs["feat_seq_length"] // 2)
+        # Llama needs head_dim; 8 matches the hand-written text_config this tester replaces.
+        kwargs.setdefault("head_dim", 8)
+        super().__init__(parent, **kwargs)
 
-        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
-        attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)
-        attention_mask[:, :1] = 0
+    def get_audio_embeds_mask(self, audio_mask):
+        # Voxtral encoder only applies conv2 (stride 2), no avg_pool; mask is all ones, so use feat_seq_length.
+        output_length = (self.feat_seq_length - 1) // 2 + 1
+        return torch.ones([self.batch_size, output_length], dtype=torch.long).to(torch_device)
 
-        input_ids[:, 1 : 1 + num_audio_tokens_per_batch_idx] = config.audio_token_id
-        inputs_dict = {
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-            "input_features": input_features_values,
-        }
-        return config, inputs_dict
+    def create_audio_mask(self):
+        return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device)
 
 
 @require_torch
-class VoxtralForConditionalGenerationModelTest(
-    ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase
-):
+class VoxtralForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase):
     """
     Model tester for `VoxtralForConditionalGeneration`.
     """
 
-    all_model_classes = (VoxtralForConditionalGeneration,) if is_torch_available() else ()
+    model_tester_class = VoxtralModelTester
     pipeline_model_mapping = (
         {"text-to-speech": VoxtralForConditionalGeneration, "any-to-any": VoxtralForConditionalGeneration}
         if is_torch_available()
         else {}
     )
 
-    _is_composite = True
-
-    def setUp(self):
-        self.model_tester = VoxtralModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=VoxtralConfig, has_text_modality=False)
-
     @unittest.skip(
         reason="This test does not apply to Voxtral since inputs_embeds corresponding to audio tokens are replaced when input features are provided."
     )
@@ -192,47 +120,6 @@ def test_flash_attention_3_padding_matches_padding_free_with_position_ids(self):
     def test_flash_attention_3_padding_matches_padding_free_with_position_ids_and_fa_kwargs(self):
         pass
 
-    @unittest.skip(reason="Voxtral has no separate base model without a head.")
-    def test_model_base_model_prefix(self):
-        pass
-
-    def test_sdpa_can_dispatch_composite_models(self):
-        # overwrite because Voxtral is audio+text model (not vision+text)
-        if not self.has_attentions:
-            self.skipTest(reason="Model architecture does not support attentions")
-
-        if not self._is_composite:
-            self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
-
-        for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            model = model_class(config)
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                model_sdpa = model_class.from_pretrained(tmpdirname)
-                model_sdpa = model_sdpa.eval().to(torch_device)
-
-                text_attn = "sdpa" if model.language_model._supports_sdpa else "eager"
-                vision_attn = "sdpa" if model.audio_tower._supports_sdpa else "eager"
-
-                # `None` as it is the requested one which will be assigned to each sub-config
-                # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present)
-                self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
-                self.assertTrue(model.language_model.config._attn_implementation == text_attn)
-                self.assertTrue(model.audio_tower.config._attn_implementation == vision_attn)
-
-                model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
-                model_eager = model_eager.eval().to(torch_device)
-                self.assertTrue(model_eager.config._attn_implementation == "eager")
-                self.assertTrue(model_eager.language_model.config._attn_implementation == "eager")
-                self.assertTrue(model_eager.audio_tower.config._attn_implementation == "eager")
-
-                for name, submodule in model_eager.named_modules():
-                    class_name = submodule.__class__.__name__
-                    if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
-                        raise ValueError("The eager model should not have SDPA attention layers")
-
 
 @require_torch
 class VoxtralForConditionalGenerationIntegrationTest(unittest.TestCase):
diff --git a/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py b/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py
index 9aa817f3cba6..4d5b464236b2 100644
--- a/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py
+++ b/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py
@@ -24,6 +24,10 @@
     is_torch_available,
 )
 from transformers.audio_utils import load_audio
+from transformers.models.voxtral_realtime.configuration_voxtral_realtime import (
+    VoxtralRealtimeEncoderConfig,
+    VoxtralRealtimeTextConfig,
+)
 from transformers.testing_utils import (
     cleanup,
     require_torch,
@@ -31,10 +35,8 @@
     torch_device,
 )
 
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
+from ...alm_tester import ALMModelTest, ALMModelTester
+from ...test_modeling_common import floats_tensor, ids_tensor
 
 
 if is_datasets_available():
@@ -44,136 +46,89 @@
     import torch
 
 
-class VoxtralRealtimeModelTester:
-    def __init__(
-        self,
-        parent,
-        ignore_index=-100,
-        audio_token_id=0,
-        seq_length=5,
-        feat_seq_length=40,
-        text_config={
-            "model_type": "voxtral_realtime_text",
-            "intermediate_size": 36,
-            "initializer_range": 0.02,
-            "hidden_size": 32,
-            "max_position_embeddings": 52,
-            "num_hidden_layers": 2,
-            "num_attention_heads": 4,
-            "num_key_value_heads": 2,
-            "use_labels": True,
-            "vocab_size": 99,
-            "head_dim": 8,
-            "pad_token_id": 1,  # can't be the same as the audio token id
-            "hidden_act": "silu",
-            "rms_norm_eps": 1e-6,
-            "attention_dropout": 0.0,
-            "rope_parameters": {
-                "rope_type": "default",
-                "rope_theta": 10000.0,
-            },
-        },
-        is_training=True,
-        audio_config={
-            "model_type": "voxtral_realtime_encoder",
-            "hidden_size": 16,
-            "num_attention_heads": 4,
-            "num_key_value_heads": 2,
-            "intermediate_size": 64,
-            "encoder_layers": 2,
-            "num_mel_bins": 80,
-            "max_position_embeddings": 100,
-            "initializer_range": 0.02,
-            "rms_norm_eps": 1e-6,
-            "activation_function": "silu",
-            "activation_dropout": 0.0,
-            "attention_dropout": 0.0,
-            "head_dim": 4,
-            "rope_parameters": {
-                "rope_type": "default",
-                "rope_theta": 10000.0,
-            },
-        },
-    ):
-        self.parent = parent
-        self.ignore_index = ignore_index
-        self.audio_token_id = audio_token_id
-        self.text_config = text_config
-        self.audio_config = audio_config
-        self.seq_length = seq_length
-        self.feat_seq_length = feat_seq_length
-
-        self.num_hidden_layers = text_config["num_hidden_layers"]
-        self.vocab_size = text_config["vocab_size"]
-        self.hidden_size = text_config["hidden_size"]
-        self.num_attention_heads = text_config["num_attention_heads"]
-        self.is_training = is_training
-
-        self.batch_size = 3
-        self.encoder_seq_length = seq_length
-        self._max_new_tokens = None  # this is used to set
-
-    def get_config(self):
-        return VoxtralRealtimeConfig(
-            text_config=self.text_config,
-            audio_config=self.audio_config,
-            ignore_index=self.ignore_index,
-            audio_token_id=self.audio_token_id,
-        )
-
-    def prepare_config_and_inputs(self):
-        if self._max_new_tokens is not None:
-            feat_seq_length = self.feat_seq_length + self._max_new_tokens * 8
-        else:
-            feat_seq_length = self.feat_seq_length
-
-        input_features_values = floats_tensor(
-            [
-                self.batch_size,
-                self.audio_config["num_mel_bins"],
-                feat_seq_length,
-            ]
-        )
-        config = self.get_config()
-        return config, input_features_values
+class VoxtralRealtimeModelTester(ALMModelTester):
+    config_class = VoxtralRealtimeConfig
+    conditional_generation_class = VoxtralRealtimeForConditionalGeneration
+    text_config_class = VoxtralRealtimeTextConfig
+    audio_config_class = VoxtralRealtimeEncoderConfig
+
+    def __init__(self, parent, **kwargs):
+        # VoxtralRealtime does additive audio/text fusion: seq_length must equal num_audio_embeds.
+        # With audio_length_per_tok=8 (config default), num_audio_embeds = feat_seq_length // 8.
+        kwargs.setdefault("seq_length", 32)
+        kwargs.setdefault("feat_seq_length", kwargs["seq_length"] * 8)
+        # Audio encoder uses RoPE; max position must cover post-conv length (feat_seq_length // 2).
+        kwargs.setdefault("max_position_embeddings", kwargs["feat_seq_length"])
+        kwargs.setdefault("head_dim", 8)
+        kwargs.setdefault("rms_norm_eps", 1e-6)
+        kwargs.setdefault("activation_function", "silu")
+        kwargs.setdefault("hidden_act", "silu")
+        super().__init__(parent, **kwargs)
+        self._max_new_tokens = None
+
+    def get_audio_embeds_mask(self, audio_mask):
+        # Causal conv2 (stride 2, left-pad 1): post_conv_len = feat_seq_length // 2.
+        # Projector reshapes by downsample_factor=4 → post_conv_len // downsample_factor embeds.
+        downsample_factor = 4
+        effective_feat = self.feat_seq_length + (self._max_new_tokens or 0) * 8
+        post_conv_len = effective_feat // 2
+        output_length = post_conv_len // downsample_factor
+        return torch.ones([self.batch_size, output_length], dtype=torch.long).to(torch_device)
+
+    def create_audio_features(self):
+        effective_feat = self.feat_seq_length + (self._max_new_tokens or 0) * 8
+        return floats_tensor([self.batch_size, self.num_mel_bins, effective_feat])
+
+    def create_audio_mask(self):
+        effective_feat = self.feat_seq_length + (self._max_new_tokens or 0) * 8
+        return torch.ones([self.batch_size, effective_feat], dtype=torch.bool).to(torch_device)
+
+    def place_audio_tokens(self, input_ids, config, num_audio_tokens):
+        # VoxtralRealtime fuses audio additively over the whole sequence; no placeholder token required.
+        input_ids = input_ids.clone()
+        input_ids[input_ids == self.audio_token_id] = self.pad_token_id
+        return input_ids
 
     def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_features_values = config_and_inputs
-        num_audio_tokens_per_batch_idx = 30
+        # Custom pipeline: input_ids at seq_length, audio covers seq_length (+ max_new_tokens extras
+        # during generation so the model can slice future-token audio per decode step). We do not run
+        # the base-class `audio_embeds_mask.shape[1] <= seq_length` invariant because, for this model,
+        # audio embeds legitimately exceed input length during generation.
+        audio_features = self.create_audio_features()
+        audio_mask = self.create_audio_mask()
+
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+        special_tokens = [self.pad_token_id, self.bos_token_id, self.eos_token_id, self.audio_token_id]
+        for safe_id in range(self.vocab_size):
+            if safe_id not in special_tokens:
+                break
+        else:
+            raise ValueError("vocab_size too small for a non-special safe token.")
+        input_ids[input_ids == self.pad_token_id] = safe_id
+        input_ids[input_ids == self.eos_token_id] = safe_id
 
-        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
-        attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)
-        attention_mask[:, :1] = 0
+        config = self.get_config()
+        # No placeholders are inserted for this model; place_audio_tokens only remaps stray audio-token ids to pad.
+        input_ids = self.place_audio_tokens(input_ids, config, torch.tensor([self.seq_length] * self.batch_size))
+        attention_mask = self.create_attention_mask(input_ids)
 
-        input_ids[:, 1 : 1 + num_audio_tokens_per_batch_idx] = config.audio_token_id
-        inputs_dict = {
+        return config, {
             "input_ids": input_ids,
             "attention_mask": attention_mask,
-            "input_features": input_features_values,
+            "input_features": audio_features,
         }
-        return config, inputs_dict
 
 
 @require_torch
-class VoxtralRealtimeForConditionalGenerationModelTest(
-    ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase
-):
+class VoxtralRealtimeForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase):
     """
     Model tester for `VoxtralRealtimeForConditionalGeneration`.
     """
 
     additional_model_inputs = ["input_features"]
-
-    all_model_classes = (VoxtralRealtimeForConditionalGeneration,) if is_torch_available() else ()
+    model_tester_class = VoxtralRealtimeModelTester
     pipeline_model_mapping = {"any-to-any": VoxtralRealtimeForConditionalGeneration} if is_torch_available() else {}
 
-    _is_composite = True
-
-    def setUp(self):
-        self.model_tester = VoxtralRealtimeModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=VoxtralRealtimeConfig, has_text_modality=False)
-
     def _with_max_new_tokens(max_new_tokens):
         def decorator(test_func):
             @functools.wraps(test_func)

From a302c3ecf6923a176c1dfff562e267aa157c09e0 Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Mon, 20 Apr 2026 16:46:03 +0200
Subject: [PATCH 07/38] cleaner

---
 tests/models/audioflamingo3/test_modeling_audioflamingo3.py   | 3 ++-
 tests/models/glmasr/test_modeling_glmasr.py                   | 3 ---
 tests/models/granite_speech/test_modeling_granite_speech.py   | 4 ----
 tests/models/musicflamingo/test_modeling_musicflamingo.py     | 2 ++
 tests/models/qwen2_audio/test_modeling_qwen2_audio.py         | 4 +++-
 tests/models/voxtral/test_modeling_voxtral.py                 | 3 ---
 .../models/voxtral_realtime/test_modeling_voxtral_realtime.py | 4 ----
 7 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py
index 153c6ba11b52..0d3dd954dda2 100644
--- a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py
+++ b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py
@@ -48,7 +48,6 @@ class AudioFlamingo3ModelTester(ALMModelTester):
     text_config_class = Qwen2Config
     audio_config_class = AudioFlamingo3EncoderConfig
 
-
     def __init__(self, parent, **kwargs):
         # feat_seq_length → (L-1)//2+1 after conv2 → (·-2)//2+1 after avg_pool, so
         # feat_seq_length=60 gives 15 audio embed tokens (fits inside seq_length=32 + BOS + text).
@@ -62,6 +61,8 @@ def get_audio_mask_key(self):
         return "input_features_mask"
 
     def create_audio_mask(self):
+        # Full-length mask matches real processor output and lets the audio encoder dispatch to Flash
+        # Attention (which rejects non-null attn_masks) on `test_sdpa_can_dispatch_on_flash`.
         return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device)
 
     def get_audio_embeds_mask(self, audio_mask):
diff --git a/tests/models/glmasr/test_modeling_glmasr.py b/tests/models/glmasr/test_modeling_glmasr.py
index 8b93ad64337d..59d8e5969523 100644
--- a/tests/models/glmasr/test_modeling_glmasr.py
+++ b/tests/models/glmasr/test_modeling_glmasr.py
@@ -55,9 +55,6 @@ def __init__(self, parent, **kwargs):
     def get_audio_mask_key(self):
         return "input_features_mask"
 
-    def create_audio_mask(self):
-        return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device)
-
     def get_audio_embeds_mask(self, audio_mask):
         # conv1 (s=1) preserves length; conv2 (s=2, k=3, p=1) halves; merge_factor=4 post-projector.
         audio_lengths = audio_mask.sum(-1)
diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py
index f7c76cb4093e..61b6d4db53d8 100644
--- a/tests/models/granite_speech/test_modeling_granite_speech.py
+++ b/tests/models/granite_speech/test_modeling_granite_speech.py
@@ -90,10 +90,6 @@ def __init__(self, parent, **kwargs):
     def create_audio_features(self):
         return floats_tensor([self.batch_size, self.sequence_dim, self.feature_dim])
 
-    def create_audio_mask(self):
-        # Granite's encoder is fed the raw features; mask is all-ones over sequence_dim.
-        return torch.ones([self.batch_size, self.sequence_dim], dtype=torch.bool).to(torch_device)
-
     def get_audio_embeds_mask(self, audio_mask):
         # Projector produces `num_audio_tokens` embeds per sample (fixed by window_size/downsample_rate).
         return torch.ones([self.batch_size, self.num_audio_tokens], dtype=torch.long).to(torch_device)
diff --git a/tests/models/musicflamingo/test_modeling_musicflamingo.py b/tests/models/musicflamingo/test_modeling_musicflamingo.py
index 9b8153705582..25e714fc30ec 100644
--- a/tests/models/musicflamingo/test_modeling_musicflamingo.py
+++ b/tests/models/musicflamingo/test_modeling_musicflamingo.py
@@ -67,6 +67,8 @@ def get_audio_mask_key(self):
         return "input_features_mask"
 
     def create_audio_mask(self):
+        # Deterministic full-length mask — base default uses unseeded Python `random`, which makes
+        # multi-call generation-comparison tests (e.g. assisted decoding vs greedy) flaky.
         return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device)
 
     def get_audio_embeds_mask(self, audio_mask):
diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
index 1130220301ea..7e45ecfc4150 100644
--- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
+++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
@@ -63,7 +63,9 @@ def get_audio_mask_key(self):
         return "feature_attention_mask"
 
     def create_audio_mask(self):
-        # Qwen2Audio expects full-length mel input; mask with all 1s.
+        # Deterministic full-length mask: the base default randomizes via Python's `random`, which isn't
+        # re-seeded per test call and desynchronizes the two `prepare_config_and_inputs_for_common`
+        # invocations inside generation-comparison tests (e.g. test_greedy_generate_dict_outputs).
         return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device)
 
     def get_audio_embeds_mask(self, audio_mask):
diff --git a/tests/models/voxtral/test_modeling_voxtral.py b/tests/models/voxtral/test_modeling_voxtral.py
index adc8b1bdc767..4f0c604ce05f 100644
--- a/tests/models/voxtral/test_modeling_voxtral.py
+++ b/tests/models/voxtral/test_modeling_voxtral.py
@@ -61,9 +61,6 @@ def get_audio_embeds_mask(self, audio_mask):
         output_length = (self.feat_seq_length - 1) // 2 + 1
         return torch.ones([self.batch_size, output_length], dtype=torch.long).to(torch_device)
 
-    def create_audio_mask(self):
-        return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device)
-
 
 @require_torch
 class VoxtralForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase):
diff --git a/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py b/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py
index 4d5b464236b2..f9699479aac9 100644
--- a/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py
+++ b/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py
@@ -79,10 +79,6 @@ def create_audio_features(self):
         effective_feat = self.feat_seq_length + (self._max_new_tokens or 0) * 8
         return floats_tensor([self.batch_size, self.num_mel_bins, effective_feat])
 
-    def create_audio_mask(self):
-        effective_feat = self.feat_seq_length + (self._max_new_tokens or 0) * 8
-        return torch.ones([self.batch_size, effective_feat], dtype=torch.bool).to(torch_device)
-
     def place_audio_tokens(self, input_ids, config, num_audio_tokens):
         # VoxtralRealtime fuses audio additively over the whole sequence; no placeholder token required.
         input_ids = input_ids.clone()

From 8fcba58d5f4377f8cc86a20626706121d2936ff8 Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Mon, 20 Apr 2026 17:28:54 +0200
Subject: [PATCH 08/38] updates

---
 tests/alm_tester.py | 45 +++++++++++++++++++++++----------------------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/tests/alm_tester.py b/tests/alm_tester.py
index 4c104e6dd49d..385382a13dc2 100644
--- a/tests/alm_tester.py
+++ b/tests/alm_tester.py
@@ -35,19 +35,21 @@
 class ALMModelTester:
     # If the model follows standard naming conventions, only `config_class` and
     # `conditional_generation_class` need to be set (others are optional).
+    # base_model_class = None, this should be added when #45534 is merged
     config_class = None
+    text_config_class = None
+    audio_config_class = None
     conditional_generation_class = None
-    base_model_class = None
     sequence_classification_class = None
-
-    # Key name for the audio sub-config in the main config constructor.
-    # Override to "encoder_config" for models like GraniteSpeech.
-    audio_config_key = "audio_config"
+    # These attributes are required after the initialization phase of the tester.
+    _required_attributes = ("config_class", "conditional_generation_class")
 
     # Arguments that should be passed to the config class even if not in its signature.
     forced_config_args = ["pad_token_id"]
 
-    _required_attributes = ("config_class", "conditional_generation_class")
+    # Key name for the audio sub-config in the main config constructor.
+    # Override to "encoder_config" for models like GraniteSpeech.
+    audio_config_key = "audio_config"
 
     @property
     def all_model_classes(self):
@@ -63,7 +65,13 @@ def all_model_classes(self):
 
     @property
     def pipeline_model_mapping(self):
-        return {"any-to-any": self.conditional_generation_class}
+        # TODO: @eustlb, we don't have pipeline testing for audio-text-to-text
+        mapping = {
+            "feature-extraction": self.base_model_class,
+            # "audio-text-to-text": self.conditional_generation_class,
+        }
+        # TODO: should we add automatic-speech-recognition with a special flag?
+        return mapping
 
     def __init__(self, parent, **kwargs):
         self.parent = parent
@@ -92,21 +100,11 @@ def __init__(self, parent, **kwargs):
         kwargs.setdefault("intermediate_size", 32)  # Keep this divisible by 8 for fp16/bf16/fp32 16-bytes alignment
         kwargs.setdefault("hidden_act", "gelu")
         kwargs.setdefault("max_position_embeddings", 512)
-    
-        # Optional projector config (e.g. GraniteSpeech uses a Q-Former projector)
-        kwargs.setdefault("projector_config", None)
 
         # Set all kwargs as instance attributes
         for key, value in kwargs.items():
             setattr(self, key, value)
 
-        # # Derived from text config (needed by ModelTesterMixin)
-        # self.vocab_size = self.text_config.get("vocab_size", 99)
-        # self.hidden_size = self.text_config.get("hidden_size", 32)
-        # self.num_hidden_layers = self.text_config.get("num_hidden_layers", 2)
-        # self.num_attention_heads = self.text_config.get("num_attention_heads", 4)
-        # self.encoder_seq_length = self.seq_length
-
         for required_attribute in [
             # "base_model_class", # TODO: @eustlb, there is a discrepancy here between ALMs/ VLMs. XXModel and XXForConditionalGeneration
             "config_class",
@@ -192,7 +190,7 @@ def prepare_config_and_inputs_for_common(self):
                 "This likely indicates a mismatch between your feature extraction/configuration and your sequence length. "
                 "Please ensure `seq_length` is >= the number of audio embedding positions."
             )
-         
+
         input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
 
         special_tokens = [self.pad_token_id, self.bos_token_id, self.eos_token_id, self.audio_token_id]
@@ -229,7 +227,7 @@ def prepare_config_and_inputs_for_common(self):
     @property
     def config_args(self):
         return list(signature(self.config_class.__init__).parameters.keys())
-    
+
     @property
     def text_config_args(self):
         args = list(signature(self.text_config_class.__init__).parameters.keys())
@@ -310,9 +308,7 @@ class ALMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin)
 
     def setUp(self):
         if self.model_tester_class is None:
-            raise ValueError(
-                "You have inherited from ALMModelTest but did not set the model_tester_class attribute."
-            )
+            raise ValueError("You have inherited from ALMModelTest but did not set the model_tester_class attribute.")
         self.model_tester = self.model_tester_class(self)
         self.config_tester = ConfigTester(self, config_class=self.model_tester.config_class, has_text_modality=False)
 
@@ -332,6 +328,11 @@ def test_config(self):
         """Test config common functionality."""
         self.config_tester.run_common_tests()
 
+    # TODO: @eustlb, remove this once #45534 is merged
     @unittest.skip("Audio-LMs have no separate base model without a head.")
     def test_model_base_model_prefix(self):
         pass
+
+    # TODO: @eustlb, add this
+    # def test_mismatching_num_audio_tokens(self):
+    #     pass

From 66acc9ed86067a8d52faa9ed80cbb5e964f1d0d5 Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Mon, 20 Apr 2026 22:04:43 +0200
Subject: [PATCH 09/38] audio_mask_key + updates

---
 .../configuration_granite_speech.py           |  5 +++
 tests/alm_tester.py                           | 14 +++-----
 .../test_modeling_audioflamingo3.py           |  7 ++--
 tests/models/glmasr/test_modeling_glmasr.py   |  3 +-
 .../test_modeling_granite_speech.py           | 33 +++++++------------
 .../test_modeling_musicflamingo.py            |  3 +-
 .../qwen2_audio/test_modeling_qwen2_audio.py  |  7 ++--
 .../test_modeling_voxtral_realtime.py         |  1 -
 8 files changed, 27 insertions(+), 46 deletions(-)

diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py
index d02ac9998696..7d922992a10f 100644
--- a/src/transformers/models/granite_speech/configuration_granite_speech.py
+++ b/src/transformers/models/granite_speech/configuration_granite_speech.py
@@ -53,6 +53,11 @@ class GraniteSpeechEncoderConfig(PreTrainedConfig):
     ```"""
 
     model_type = "granite_speech_encoder"
+    attribute_map = {
+        "hidden_size": "hidden_dim",
+        "num_hidden_layers": "num_layers",
+        "num_attention_heads": "num_heads",
+    }
 
     input_dim: int = 160
     num_layers: int = 10
diff --git a/tests/alm_tester.py b/tests/alm_tester.py
index 385382a13dc2..5ab4b76ce95b 100644
--- a/tests/alm_tester.py
+++ b/tests/alm_tester.py
@@ -35,7 +35,7 @@
 class ALMModelTester:
     # If the model follows standard naming conventions, only `config_class` and
     # `conditional_generation_class` need to be set (others are optional).
-    # base_model_class = None, this should be added when #45534 is merged
+    base_model_class = None, # this should be added for most models when #45534 is merged
     config_class = None
     text_config_class = None
     audio_config_class = None
@@ -50,6 +50,7 @@ class ALMModelTester:
     # Key name for the audio sub-config in the main config constructor.
     # Override to "encoder_config" for models like GraniteSpeech.
     audio_config_key = "audio_config"
+    audio_mask_key = None  # to be set if audio-related mask has to be passed to the model's forward
 
     @property
     def all_model_classes(self):
@@ -149,11 +150,7 @@ def place_audio_tokens(self, input_ids, config, num_audio_tokens):
 
     def get_audio_feature_key(self):
         """Key name for audio features in the inputs dict."""
-        return "input_features"
-
-    def get_audio_mask_key(self):
-        """Key name for audio attention mask. Return None if no audio mask needed."""
-        return None
+        return "input_features" 
 
     def create_audio_mask(self):
         """Create audio-level attention mask with contiguous valid regions per batch element.
@@ -217,9 +214,8 @@ def prepare_config_and_inputs_for_common(self):
             self.get_audio_feature_key(): audio_features,
         }
 
-        audio_mask_key = self.get_audio_mask_key()
-        if audio_mask_key is not None:
-            inputs_dict[audio_mask_key] = audio_mask
+        if self.audio_mask_key is not None:
+            inputs_dict[self.audio_mask_key] = audio_mask
 
         inputs_dict.update(self.get_additional_inputs(config, input_ids, audio_features))
         return config, inputs_dict
diff --git a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py
index 0d3dd954dda2..db17a400cab8 100644
--- a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py
+++ b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py
@@ -18,14 +18,12 @@
 import unittest
 from pathlib import Path
 
-import pytest
-
 from transformers import (
     AudioFlamingo3Config,
     AudioFlamingo3EncoderConfig,
-    Qwen2Config,
     AudioFlamingo3ForConditionalGeneration,
     AutoProcessor,
+    Qwen2Config,
     is_torch_available,
 )
 from transformers.testing_utils import (
@@ -57,8 +55,7 @@ def __init__(self, parent, **kwargs):
         kwargs.setdefault("max_source_positions", (kwargs["feat_seq_length"] - 1) // 2 + 1)
         super().__init__(parent, **kwargs)
 
-    def get_audio_mask_key(self):
-        return "input_features_mask"
+    audio_mask_key = "input_features_mask"
 
     def create_audio_mask(self):
         # Full-length mask matches real processor output and lets the audio encoder dispatch to Flash
diff --git a/tests/models/glmasr/test_modeling_glmasr.py b/tests/models/glmasr/test_modeling_glmasr.py
index 59d8e5969523..5606f1c75fac 100644
--- a/tests/models/glmasr/test_modeling_glmasr.py
+++ b/tests/models/glmasr/test_modeling_glmasr.py
@@ -52,8 +52,7 @@ def __init__(self, parent, **kwargs):
         kwargs.setdefault("head_dim", 8)
         super().__init__(parent, **kwargs)
 
-    def get_audio_mask_key(self):
-        return "input_features_mask"
+    audio_mask_key = "input_features_mask"
 
     def get_audio_embeds_mask(self, audio_mask):
         # conv1 (s=1) preserves length; conv2 (s=2, k=3, p=1) halves; merge_factor=4 post-projector.
diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py
index 61b6d4db53d8..dd36955f469a 100644
--- a/tests/models/granite_speech/test_modeling_granite_speech.py
+++ b/tests/models/granite_speech/test_modeling_granite_speech.py
@@ -56,35 +56,24 @@ class GraniteSpeechModelTester(ALMModelTester):
 
     def __init__(self, parent, **kwargs):
         kwargs.setdefault("seq_length", 9)  # 7 text + 2 audio tokens
+
         kwargs.setdefault("num_audio_tokens", 2)
         kwargs.setdefault("sequence_dim", 844)
         kwargs.setdefault("feature_dim", 160)
         kwargs.setdefault("has_lora_adapter", True)
         kwargs.setdefault("downsample_rate", 5)
         kwargs.setdefault("window_size", 15)
-        # GraniteSpeechEncoderConfig fields (no attribute_map, so set explicitly).
-        kwargs.setdefault("input_dim", 160)
-        kwargs.setdefault("num_layers", 2)
-        kwargs.setdefault("hidden_dim", 32)
-        kwargs.setdefault("num_heads", 4)
         kwargs.setdefault("dim_head", 8)
-        kwargs.setdefault("feedforward_mult", 4)
-        kwargs.setdefault("context_size", 200)
-        kwargs.setdefault("conv_kernel_size", 15)
-        kwargs.setdefault("conv_expansion_factor", 2)
-        kwargs.setdefault("output_dim", 42)
-        # Q-Former projector config (passed through as a dict; ALM's get_config forwards unknowns).
-        kwargs.setdefault(
-            "projector_config",
-            {
-                "model_type": "blip_2_qformer",
-                "hidden_size": 32,
-                "num_hidden_layers": 2,
-                "num_attention_heads": 4,
-                "intermediate_size": 256,
-                "encoder_hidden_size": 32,
-            },
-        )
+ 
+        kwargs["projector_config"] = {
+            "model_type": "blip_2_qformer",
+            "hidden_size": 32,
+            "num_hidden_layers": 2,
+            "num_attention_heads": 4,
+            "intermediate_size": 256,
+            "encoder_hidden_size": 32,
+        }
+   
         super().__init__(parent, **kwargs)
 
     def create_audio_features(self):
diff --git a/tests/models/musicflamingo/test_modeling_musicflamingo.py b/tests/models/musicflamingo/test_modeling_musicflamingo.py
index 25e714fc30ec..19da6506d1ba 100644
--- a/tests/models/musicflamingo/test_modeling_musicflamingo.py
+++ b/tests/models/musicflamingo/test_modeling_musicflamingo.py
@@ -63,8 +63,7 @@ def __init__(self, parent, **kwargs):
         kwargs.setdefault("max_source_positions", (kwargs["feat_seq_length"] - 1) // 2 + 1)
         super().__init__(parent, **kwargs)
 
-    def get_audio_mask_key(self):
-        return "input_features_mask"
+    audio_mask_key = "input_features_mask"
 
     def create_audio_mask(self):
         # Deterministic full-length mask — base default uses unseeded Python `random`, which makes
diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
index 7e45ecfc4150..b3010fa82539 100644
--- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
+++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
@@ -59,8 +59,7 @@ def __init__(self, parent, **kwargs):
         kwargs.setdefault("encoder_ffn_dim", 32)
         super().__init__(parent, **kwargs)
 
-    def get_audio_mask_key(self):
-        return "feature_attention_mask"
+    audio_mask_key = "feature_attention_mask"
 
     def create_audio_mask(self):
         # Deterministic full-length mask: the base default randomizes via Python's `random`, which isn't
@@ -96,9 +95,7 @@ def test_sdpa_can_compile_dynamic(self):
     def test_sdpa_can_dispatch_on_flash(self):
         pass
 
-    @unittest.skip(
-        reason="inputs_embeds is the audio-fused path; can't match raw token-only embeddings."
-    )
+    @unittest.skip(reason="inputs_embeds is the audio-fused path; can't match raw token-only embeddings.")
     def test_inputs_embeds_matches_input_ids(self):
         pass
 
diff --git a/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py b/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py
index f9699479aac9..86682cd558a0 100644
--- a/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py
+++ b/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py
@@ -91,7 +91,6 @@ def prepare_config_and_inputs_for_common(self):
         # the base-class `audio_embeds_mask.shape[1] <= seq_length` invariant because, for this model,
         # audio embeds legitimately exceed input length during generation.
         audio_features = self.create_audio_features()
-        audio_mask = self.create_audio_mask()
 
         input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
         special_tokens = [self.pad_token_id, self.bos_token_id, self.eos_token_id, self.audio_token_id]

From 63ca77e01e50951d999c5614214260e74e5234de Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Mon, 20 Apr 2026 22:08:14 +0200
Subject: [PATCH 10/38] typo

---
 tests/alm_tester.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/alm_tester.py b/tests/alm_tester.py
index 5ab4b76ce95b..94e480e74b72 100644
--- a/tests/alm_tester.py
+++ b/tests/alm_tester.py
@@ -35,7 +35,7 @@
 class ALMModelTester:
     # If the model follows standard naming conventions, only `config_class` and
     # `conditional_generation_class` need to be set (others are optional).
-    base_model_class = None, # this should be added for most models when #45534 is merged
+    base_model_class = None  # this should be added for most models when #45534 is merged
     config_class = None
     text_config_class = None
     audio_config_class = None

From 7588135e2623f693052ea709c390fbf2651a56f6 Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Mon, 20 Apr 2026 22:41:02 +0200
Subject: [PATCH 11/38] simplify granite speech

---
 .../configuration_granite_speech.py           |  7 ++++++-
 .../test_modeling_granite_speech.py           | 21 +++++++------------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py
index 7d922992a10f..dbdda02ccdb9 100644
--- a/src/transformers/models/granite_speech/configuration_granite_speech.py
+++ b/src/transformers/models/granite_speech/configuration_granite_speech.py
@@ -57,6 +57,7 @@ class GraniteSpeechEncoderConfig(PreTrainedConfig):
         "hidden_size": "hidden_dim",
         "num_hidden_layers": "num_layers",
         "num_attention_heads": "num_heads",
+        "num_mel_bins": "input_dim",
     }
 
     input_dim: int = 160
@@ -64,7 +65,7 @@ class GraniteSpeechEncoderConfig(PreTrainedConfig):
     hidden_dim: int = 1024
     feedforward_mult: int = 4
     num_heads: int = 8
-    dim_head: int = 128
+    dim_head: int | None = None
     output_dim: int = 42
     context_size: int = 200
     max_pos_emb: int = 512
@@ -72,6 +73,10 @@ class GraniteSpeechEncoderConfig(PreTrainedConfig):
     conv_kernel_size: int = 15
     conv_expansion_factor: int = 2
 
+    def __post_init__(self, **kwargs):
+        super().__post_init__(**kwargs)
+        if self.dim_head is None:
+            self.dim_head = self.hidden_dim // self.num_heads
 
 @auto_docstring(checkpoint="ibm-granite/granite-speech-3.3-2b")
 @strict
diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py
index dd36955f469a..18f07fc71bef 100644
--- a/tests/models/granite_speech/test_modeling_granite_speech.py
+++ b/tests/models/granite_speech/test_modeling_granite_speech.py
@@ -55,16 +55,6 @@ class GraniteSpeechModelTester(ALMModelTester):
     audio_config_key = "encoder_config"
 
     def __init__(self, parent, **kwargs):
-        kwargs.setdefault("seq_length", 9)  # 7 text + 2 audio tokens
-
-        kwargs.setdefault("num_audio_tokens", 2)
-        kwargs.setdefault("sequence_dim", 844)
-        kwargs.setdefault("feature_dim", 160)
-        kwargs.setdefault("has_lora_adapter", True)
-        kwargs.setdefault("downsample_rate", 5)
-        kwargs.setdefault("window_size", 15)
-        kwargs.setdefault("dim_head", 8)
- 
         kwargs["projector_config"] = {
             "model_type": "blip_2_qformer",
             "hidden_size": 32,
@@ -77,11 +67,16 @@ def __init__(self, parent, **kwargs):
         super().__init__(parent, **kwargs)
 
     def create_audio_features(self):
-        return floats_tensor([self.batch_size, self.sequence_dim, self.feature_dim])
+        # GraniteSpeech expects [B, seq_len, features] (time-first), unlike the standard [B, features, seq_len]
+        return floats_tensor([self.batch_size, self.feat_seq_length, self.num_mel_bins])
 
     def get_audio_embeds_mask(self, audio_mask):
-        # Projector produces `num_audio_tokens` embeds per sample (fixed by window_size/downsample_rate).
-        return torch.ones([self.batch_size, self.num_audio_tokens], dtype=torch.long).to(torch_device)
+        # Projector: ceil(feat_seq_length / window_size) * (window_size // downsample_rate) tokens per sample.
+        import math
+
+        nblocks = math.ceil(self.feat_seq_length / self.window_size)
+        num_audio_tokens = nblocks * (self.window_size // self.downsample_rate)
+        return torch.ones([self.batch_size, num_audio_tokens], dtype=torch.long).to(torch_device)
 
     def create_attention_mask(self, input_ids):
         return torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)

From 41fed1c820e2745e7c5c9f9bfb5dbfa2aca751a6 Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Mon, 20 Apr 2026 22:45:29 +0200
Subject: [PATCH 12/38] nits

---
 tests/models/audioflamingo3/test_modeling_audioflamingo3.py | 3 +--
 tests/models/glmasr/test_modeling_glmasr.py                 | 3 +--
 tests/models/musicflamingo/test_modeling_musicflamingo.py   | 3 +--
 tests/models/qwen2_audio/test_modeling_qwen2_audio.py       | 3 +--
 4 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py
index db17a400cab8..9629fe3ba086 100644
--- a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py
+++ b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py
@@ -45,6 +45,7 @@ class AudioFlamingo3ModelTester(ALMModelTester):
     conditional_generation_class = AudioFlamingo3ForConditionalGeneration
     text_config_class = Qwen2Config
     audio_config_class = AudioFlamingo3EncoderConfig
+    audio_mask_key = "input_features_mask"
 
     def __init__(self, parent, **kwargs):
         # feat_seq_length → (L-1)//2+1 after conv2 → (·-2)//2+1 after avg_pool, so
@@ -55,8 +56,6 @@ def __init__(self, parent, **kwargs):
         kwargs.setdefault("max_source_positions", (kwargs["feat_seq_length"] - 1) // 2 + 1)
         super().__init__(parent, **kwargs)
 
-    audio_mask_key = "input_features_mask"
-
     def create_audio_mask(self):
         # Full-length mask matches real processor output and lets the audio encoder dispatch to Flash
         # Attention (which rejects non-null attn_masks) on `test_sdpa_can_dispatch_on_flash`.
diff --git a/tests/models/glmasr/test_modeling_glmasr.py b/tests/models/glmasr/test_modeling_glmasr.py
index 5606f1c75fac..76e4cd5cc6b5 100644
--- a/tests/models/glmasr/test_modeling_glmasr.py
+++ b/tests/models/glmasr/test_modeling_glmasr.py
@@ -44,6 +44,7 @@ class GlmAsrModelTester(ALMModelTester):
     conditional_generation_class = GlmAsrForConditionalGeneration
     text_config_class = LlamaConfig
     audio_config_class = GlmAsrEncoderConfig
+    audio_mask_key = "input_features_mask"
 
     def __init__(self, parent, **kwargs):
         # feat_seq_length=64 → conv2 (s=2): post_conv=32 → merge_factor=4: 8 audio embed tokens.
@@ -52,8 +53,6 @@ def __init__(self, parent, **kwargs):
         kwargs.setdefault("head_dim", 8)
         super().__init__(parent, **kwargs)
 
-    audio_mask_key = "input_features_mask"
-
     def get_audio_embeds_mask(self, audio_mask):
         # conv1 (s=1) preserves length; conv2 (s=2, k=3, p=1) halves; merge_factor=4 post-projector.
         audio_lengths = audio_mask.sum(-1)
diff --git a/tests/models/musicflamingo/test_modeling_musicflamingo.py b/tests/models/musicflamingo/test_modeling_musicflamingo.py
index 19da6506d1ba..6996ff4ccb71 100644
--- a/tests/models/musicflamingo/test_modeling_musicflamingo.py
+++ b/tests/models/musicflamingo/test_modeling_musicflamingo.py
@@ -56,6 +56,7 @@ class MusicFlamingoModelTester(ALMModelTester):
     conditional_generation_class = MusicFlamingoForConditionalGeneration
     text_config_class = Qwen2Config
     audio_config_class = AudioFlamingo3EncoderConfig
+    audio_mask_key = "input_features_mask"
 
     def __init__(self, parent, **kwargs):
         # feat_seq_length=60 → (60-1)//2+1=30 → (30-2)//2+1=15 audio embed tokens.
@@ -63,8 +64,6 @@ def __init__(self, parent, **kwargs):
         kwargs.setdefault("max_source_positions", (kwargs["feat_seq_length"] - 1) // 2 + 1)
         super().__init__(parent, **kwargs)
 
-    audio_mask_key = "input_features_mask"
-
     def create_audio_mask(self):
         # Deterministic full-length mask — base default uses unseeded Python `random`, which makes
         # multi-call generation-comparison tests (e.g. assisted decoding vs greedy) flaky.
diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
index b3010fa82539..ade43ffabf39 100644
--- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
+++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
@@ -47,6 +47,7 @@ class Qwen2AudioModelTester(ALMModelTester):
     conditional_generation_class = Qwen2AudioForConditionalGeneration
     text_config_class = Qwen2Config
     audio_config_class = Qwen2AudioEncoderConfig
+    audio_mask_key = "feature_attention_mask"
 
     def __init__(self, parent, **kwargs):
         # feat_seq_length=60 → after conv2 s=2: 30 → after avg_pool s=2: 15 audio embed tokens.
@@ -59,8 +60,6 @@ def __init__(self, parent, **kwargs):
         kwargs.setdefault("encoder_ffn_dim", 32)
         super().__init__(parent, **kwargs)
 
-    audio_mask_key = "feature_attention_mask"
-
     def create_audio_mask(self):
         # Deterministic full-length mask: the base default randomizes via Python's `random`, which isn't
         # re-seeded per test call and desynchronizes the two `prepare_config_and_inputs_for_common`

From e5971c7fab1a7e33ee64c15d62475e7cedf8224b Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Mon, 20 Apr 2026 23:05:07 +0200
Subject: [PATCH 13/38] some more cleaning

---
 .../models/qwen2_audio/configuration_qwen2_audio.py        | 7 ++++++-
 tests/models/glmasr/test_modeling_glmasr.py                | 3 ---
 tests/models/qwen2_audio/test_modeling_qwen2_audio.py      | 4 ----
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py b/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py
index a617f33e6177..6aec9eace900 100644
--- a/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py
+++ b/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py
@@ -42,7 +42,12 @@ class Qwen2AudioEncoderConfig(PreTrainedConfig):
     ```"""
 
     model_type = "qwen2_audio_encoder"
-    attribute_map = {"num_hidden_layers": "encoder_layers"}
+    attribute_map = {
+        "num_hidden_layers": "encoder_layers",
+        "hidden_size": "d_model",
+        "num_attention_heads": "encoder_attention_heads",
+        "intermediate_size": "encoder_ffn_dim",
+    }
 
     num_mel_bins: int = 128
     encoder_layers: int = 32
diff --git a/tests/models/glmasr/test_modeling_glmasr.py b/tests/models/glmasr/test_modeling_glmasr.py
index 76e4cd5cc6b5..0b2aae719d19 100644
--- a/tests/models/glmasr/test_modeling_glmasr.py
+++ b/tests/models/glmasr/test_modeling_glmasr.py
@@ -47,9 +47,6 @@ class GlmAsrModelTester(ALMModelTester):
     audio_mask_key = "input_features_mask"
 
     def __init__(self, parent, **kwargs):
-        # feat_seq_length=64 → conv2 (s=2): post_conv=32 → merge_factor=4: 8 audio embed tokens.
-        kwargs.setdefault("feat_seq_length", 64)
-        kwargs.setdefault("seq_length", 35)
         kwargs.setdefault("head_dim", 8)
         super().__init__(parent, **kwargs)
 
diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
index ade43ffabf39..fc73d6dca607 100644
--- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
+++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
@@ -54,10 +54,6 @@ def __init__(self, parent, **kwargs):
         kwargs.setdefault("feat_seq_length", 60)
         # Encoder asserts input_features.shape[-1] == max_source_positions * conv1.stride * conv2.stride == 2 * max_source_positions.
         kwargs.setdefault("max_source_positions", kwargs["feat_seq_length"] // 2)
-        # Qwen2AudioEncoderConfig only maps `num_hidden_layers`; override remaining size knobs explicitly.
-        kwargs.setdefault("d_model", 32)
-        kwargs.setdefault("encoder_attention_heads", 2)
-        kwargs.setdefault("encoder_ffn_dim", 32)
         super().__init__(parent, **kwargs)
 
     def create_audio_mask(self):

From 59703ddd3eab7cb978272dd7d83190620df02c20 Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Tue, 21 Apr 2026 17:57:32 +0200
Subject: [PATCH 14/38] add test_mismatching_num_audio_tokens

---
 tests/alm_tester.py | 87 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 75 insertions(+), 12 deletions(-)

diff --git a/tests/alm_tester.py b/tests/alm_tester.py
index 94e480e74b72..340aee77df5c 100644
--- a/tests/alm_tester.py
+++ b/tests/alm_tester.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import copy
 import unittest
 from inspect import signature
 
@@ -145,12 +146,18 @@ def place_audio_tokens(self, input_ids, config, num_audio_tokens):
         input_ids[input_ids == self.audio_token_id] = self.pad_token_id
         for i in range(input_ids.shape[0]):
             n = num_audio_tokens[i].item() if isinstance(num_audio_tokens, torch.Tensor) else num_audio_tokens
+            if 1 + int(n) > self.seq_length:
+                raise ValueError(
+                    f"Cannot place {int(n)} audio tokens after BOS in a sequence of length {self.seq_length}. "
+                    "This likely indicates a mismatch between your feature extraction/configuration and your sequence length. "
+                    "Please ensure `seq_length` is >= the number of audio embedding positions + 1."
+                )
             input_ids[i, 1 : 1 + int(n)] = self.audio_token_id
         return input_ids
 
     def get_audio_feature_key(self):
         """Key name for audio features in the inputs dict."""
-        return "input_features" 
+        return "input_features"
 
     def create_audio_mask(self):
         """Create audio-level attention mask with contiguous valid regions per batch element.
@@ -180,14 +187,6 @@ def prepare_config_and_inputs_for_common(self):
         audio_mask = self.create_audio_mask()
         audio_embeds_mask = self.get_audio_embeds_mask(audio_mask)
 
-        if audio_embeds_mask.shape[1] > self.seq_length:
-            raise ValueError(
-                f"`audio_embeds_mask` has more tokens per sequence than `seq_length` allows "
-                f"({audio_embeds_mask.shape[1]} > {self.seq_length}). "
-                "This likely indicates a mismatch between your feature extraction/configuration and your sequence length. "
-                "Please ensure `seq_length` is >= the number of audio embedding positions."
-            )
-
         input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
 
         special_tokens = [self.pad_token_id, self.bos_token_id, self.eos_token_id, self.audio_token_id]
@@ -329,6 +328,70 @@ def test_config(self):
     def test_model_base_model_prefix(self):
         pass
 
-    # TODO: @eustlb, add this
-    # def test_mismatching_num_audio_tokens(self):
-    #     pass
+    def test_mismatching_num_audio_tokens(self):
+        """
+        Tests that ALMs raise an error with an explicit message saying what is wrong
+        when the number of audios doesn't match the number of audio tokens in the text.
+        Also tests multi-audio cases where one prompt has multiple audio tokens.
+        """
+        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        audio_feature_key = self.model_tester.get_audio_feature_key()
+        audio_mask_key = self.model_tester.audio_mask_key
+
+        for model_class in self.all_model_classes:
+            model = model_class(config).to(torch_device)
+            model.eval()
+            curr_input_dict = copy.deepcopy(input_dict)
+            _ = model(**curr_input_dict)  # successful forward with no modifications
+
+            # Test 1: keep only the last audio but leave all the audio tokens in the text
+            curr_input_dict[audio_feature_key] = curr_input_dict[audio_feature_key][-1:, ...]
+            if audio_mask_key is not None:
+                curr_input_dict[audio_mask_key] = curr_input_dict[audio_mask_key][-1:, ...]
+            with self.assertRaises(ValueError):
+                _ = model(**curr_input_dict)
+
+            # Test 2: add one audio but leave the audio tokens in the text
+            curr_input_dict = copy.deepcopy(input_dict)
+            curr_input_dict[audio_feature_key] = torch.cat(
+                [curr_input_dict[audio_feature_key], curr_input_dict[audio_feature_key][:1, ...]], dim=0
+            )
+            if audio_mask_key is not None:
+                curr_input_dict[audio_mask_key] = torch.cat(
+                    [curr_input_dict[audio_mask_key], curr_input_dict[audio_mask_key][:1, ...]], dim=0
+                )
+            with self.assertRaises(ValueError):
+                _ = model(**curr_input_dict)
+
+            # Test 3: duplicate the text along the seq dim so each prompt has twice as many
+            # audio tokens, while leaving the audio features unchanged -> mismatch
+            curr_input_dict = copy.deepcopy(input_dict)
+            curr_input_dict["input_ids"] = torch.cat(
+                [curr_input_dict["input_ids"], curr_input_dict["input_ids"]], dim=1
+            )
+            curr_input_dict["attention_mask"] = torch.cat(
+                [curr_input_dict["attention_mask"], curr_input_dict["attention_mask"]], dim=1
+            )
+            with self.assertRaises(ValueError):
+                _ = model(**curr_input_dict)
+
+            # Test 4: multi-audio valid case. A prompt may contain multiple audio segments;
+            # all audio segments are concatenated along the batch dim on the audio side.
+            # Duplicating input_ids along seq dim (-> [audios, audios] per prompt) and the
+            # audio features along batch dim (-> batch_size * 2) must forward successfully.
+            curr_input_dict = copy.deepcopy(input_dict)
+            curr_input_dict["input_ids"] = torch.cat(
+                [curr_input_dict["input_ids"], curr_input_dict["input_ids"]], dim=1
+            )
+            curr_input_dict["attention_mask"] = torch.cat(
+                [curr_input_dict["attention_mask"], curr_input_dict["attention_mask"]], dim=1
+            )
+            curr_input_dict[audio_feature_key] = torch.cat(
+                [curr_input_dict[audio_feature_key], curr_input_dict[audio_feature_key]], dim=0
+            )
+            if audio_mask_key is not None:
+                curr_input_dict[audio_mask_key] = torch.cat(
+                    [curr_input_dict[audio_mask_key], curr_input_dict[audio_mask_key]], dim=0
+                )
+            _ = model(**curr_input_dict)
+

From 6a67f32b5d4e58b55fab9858fea6afd41573deea Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Tue, 21 Apr 2026 18:00:14 +0200
Subject: [PATCH 15/38] add get_placeholder_mask

---
 .../audioflamingo3/modeling_audioflamingo3.py | 32 ++++++++++++++---
 .../audioflamingo3/modular_audioflamingo3.py  |  6 ++--
 .../models/glmasr/modeling_glmasr.py          | 32 ++++++++++++++---
 .../granite_speech/modeling_granite_speech.py | 36 ++++++++++++++-----
 .../models/voxtral/modeling_voxtral.py        | 32 ++++++++++++++---
 .../models/voxtral/modular_voxtral.py         | 32 ++++++++++++++---
 6 files changed, 142 insertions(+), 28 deletions(-)

diff --git a/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py b/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py
index 1fbbc733c308..43028ab1c74c 100644
--- a/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py
+++ b/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py
@@ -34,7 +34,7 @@
 from ...modeling_outputs import BaseModelOutputWithPooling, CausalLMOutputWithPast
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_compilable_check
 from ...utils.generic import merge_with_config_defaults
 from ...utils.output_capturing import capture_outputs
 from ..auto import AutoModel, AutoModelForCausalLM
@@ -473,6 +473,30 @@ def get_audio_features(
 
         return audio_output
 
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_audio_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_audio_mask = special_audio_mask.all(-1)
+        else:
+            special_audio_mask = input_ids == self.config.audio_token_id
+
+        n_audio_tokens = special_audio_mask.sum()
+        n_audio_features = audio_features.shape[0]
+        special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        torch_compilable_check(
+            inputs_embeds[special_audio_mask].numel() == audio_features.numel(),
+            f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}",
+        )
+        return special_audio_mask
+
     @can_return_tuple
     @auto_docstring
     def forward(
@@ -559,10 +583,10 @@ def forward(
             audio_embeds = self.get_audio_features(input_features, input_features_mask, return_dict=True).pooler_output
 
             # replace text-audio token placeholders with audio embeddings
-            audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1)
-            inputs_embeds = inputs_embeds.masked_scatter(
-                audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device)
+            special_audio_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, audio_features=audio_embeds
             )
+            inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_embeds.to(inputs_embeds.device))
 
         outputs: CausalLMOutputWithPast = self.language_model(
             inputs_embeds=inputs_embeds,
diff --git a/src/transformers/models/audioflamingo3/modular_audioflamingo3.py b/src/transformers/models/audioflamingo3/modular_audioflamingo3.py
index c325bc85300e..20cf2189bffd 100644
--- a/src/transformers/models/audioflamingo3/modular_audioflamingo3.py
+++ b/src/transformers/models/audioflamingo3/modular_audioflamingo3.py
@@ -269,10 +269,10 @@ def forward(
             audio_embeds = self.get_audio_features(input_features, input_features_mask, return_dict=True).pooler_output
 
             # replace text-audio token placeholders with audio embeddings
-            audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1)
-            inputs_embeds = inputs_embeds.masked_scatter(
-                audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device)
+            special_audio_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, audio_features=audio_embeds
             )
+            inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_embeds.to(inputs_embeds.device))
 
         outputs: CausalLMOutputWithPast = self.language_model(
             inputs_embeds=inputs_embeds,
diff --git a/src/transformers/models/glmasr/modeling_glmasr.py b/src/transformers/models/glmasr/modeling_glmasr.py
index aff96cad3217..8b15a9241522 100644
--- a/src/transformers/models/glmasr/modeling_glmasr.py
+++ b/src/transformers/models/glmasr/modeling_glmasr.py
@@ -30,7 +30,7 @@
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, is_torch_available
+from ...utils import TransformersKwargs, auto_docstring, is_torch_available, torch_compilable_check
 from ...utils.generic import can_return_tuple, maybe_autocast, merge_with_config_defaults
 from ...utils.output_capturing import capture_outputs
 from ..auto import AutoModel, AutoModelForCausalLM
@@ -425,6 +425,30 @@ def get_audio_features(
 
         return audio_outputs
 
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_audio_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_audio_mask = special_audio_mask.all(-1)
+        else:
+            special_audio_mask = input_ids == self.config.audio_token_id
+
+        n_audio_tokens = special_audio_mask.sum()
+        n_audio_features = audio_features.shape[0]
+        special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        torch_compilable_check(
+            inputs_embeds[special_audio_mask].numel() == audio_features.numel(),
+            f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}",
+        )
+        return special_audio_mask
+
     @can_return_tuple
     @auto_docstring
     def forward(
@@ -477,10 +501,10 @@ def forward(
             audio_embeds = self.get_audio_features(input_features, input_features_mask, return_dict=True).pooler_output
 
             # replace text-audio token placeholders with audio embeddings
-            audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1)
-            inputs_embeds = inputs_embeds.masked_scatter(
-                audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device)
+            special_audio_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, audio_features=audio_embeds
             )
+            inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_embeds.to(inputs_embeds.device))
 
         outputs: CausalLMOutputWithPast = self.language_model(
             inputs_embeds=inputs_embeds,
diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py
index 0fbc1d1035bf..b417f844b428 100644
--- a/src/transformers/models/granite_speech/modeling_granite_speech.py
+++ b/src/transformers/models/granite_speech/modeling_granite_speech.py
@@ -514,6 +514,30 @@ def prepare_inputs_for_generation(
             model_inputs["input_features"] = input_features
         return model_inputs
 
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_audio_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_audio_mask = special_audio_mask.all(-1)
+        else:
+            special_audio_mask = input_ids == self.config.audio_token_id
+
+        n_audio_tokens = special_audio_mask.sum()
+        n_audio_features = audio_features.shape[0]
+        special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        torch_compilable_check(
+            inputs_embeds[special_audio_mask].numel() == audio_features.numel(),
+            f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}",
+        )
+        return special_audio_mask
+
     def get_merged_audio_embeddings(
         self, input_ids: torch.Tensor, audio_features: torch.Tensor, input_features_mask: torch.Tensor | None = None
     ) -> torch.Tensor:
@@ -534,20 +558,14 @@ def get_merged_audio_embeddings(
         llm_input_ids = torch.where(is_audio_index, 0, input_ids)
         inputs_embeds = self.language_model.get_input_embeddings()(llm_input_ids)  # [bsz, # features, hidden size]
 
-        # Mask the audio features into the text embeddings
-        special_audio_mask = is_audio_index.unsqueeze(-1)
         audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype)
         if input_features_mask is not None:
-            torch_compilable_check(
-                not torch.all(is_audio_index.int().sum(dim=1) != input_features_mask.int().sum(dim=1)),
-                "Number of audio tokens does not match number of audio features",
-            )
             audio_features = audio_features[input_features_mask]
 
-        inputs_embeds = inputs_embeds.masked_scatter(
-            special_audio_mask,
-            audio_features,
+        special_audio_mask = self.get_placeholder_mask(
+            input_ids, inputs_embeds=inputs_embeds, audio_features=audio_features
         )
+        inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_features)
         return inputs_embeds
 
     def generate(self, *args, **kwargs) -> torch.LongTensor:
diff --git a/src/transformers/models/voxtral/modeling_voxtral.py b/src/transformers/models/voxtral/modeling_voxtral.py
index 76da78cc558f..54466321b79e 100644
--- a/src/transformers/models/voxtral/modeling_voxtral.py
+++ b/src/transformers/models/voxtral/modeling_voxtral.py
@@ -32,7 +32,7 @@
 from ...modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling, CausalLMOutputWithPast
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_compilable_check
 from ...utils.generic import merge_with_config_defaults
 from ...utils.output_capturing import capture_outputs
 from ..auto import AutoModel, AutoModelForCausalLM
@@ -418,6 +418,30 @@ def get_audio_features(
 
         return audio_outputs
 
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_audio_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_audio_mask = special_audio_mask.all(-1)
+        else:
+            special_audio_mask = input_ids == self.config.audio_token_id
+
+        n_audio_tokens = special_audio_mask.sum()
+        n_audio_features = audio_features.shape[0]
+        special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        torch_compilable_check(
+            inputs_embeds[special_audio_mask].numel() == audio_features.numel(),
+            f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}",
+        )
+        return special_audio_mask
+
     @can_return_tuple
     @auto_docstring
     def forward(
@@ -473,10 +497,10 @@ def forward(
             audio_embeds = self.get_audio_features(input_features, return_dict=True).pooler_output
 
             # replace text-audio token placeholders with audio embeddings
-            audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1)
-            inputs_embeds = inputs_embeds.masked_scatter(
-                audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device)
+            special_audio_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, audio_features=audio_embeds
             )
+            inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_embeds.to(inputs_embeds.device))
 
         outputs: BaseModelOutputWithPast = self.language_model(
             attention_mask=attention_mask,
diff --git a/src/transformers/models/voxtral/modular_voxtral.py b/src/transformers/models/voxtral/modular_voxtral.py
index c7b2c53e16d4..02e8e2806a0f 100644
--- a/src/transformers/models/voxtral/modular_voxtral.py
+++ b/src/transformers/models/voxtral/modular_voxtral.py
@@ -25,7 +25,7 @@
     CausalLMOutputWithPast,
 )
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, torch_compilable_check
 from ...utils.generic import merge_with_config_defaults
 from ...utils.output_capturing import capture_outputs
 from ..auto import AutoModel, AutoModelForCausalLM
@@ -187,6 +187,30 @@ def get_audio_features(
 
         return audio_outputs
 
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_audio_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_audio_mask = special_audio_mask.all(-1)
+        else:
+            special_audio_mask = input_ids == self.config.audio_token_id
+
+        n_audio_tokens = special_audio_mask.sum()
+        n_audio_features = audio_features.shape[0]
+        special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        torch_compilable_check(
+            inputs_embeds[special_audio_mask].numel() == audio_features.numel(),
+            f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}",
+        )
+        return special_audio_mask
+
     @can_return_tuple
     @auto_docstring
     def forward(
@@ -242,10 +266,10 @@ def forward(
             audio_embeds = self.get_audio_features(input_features, return_dict=True).pooler_output
 
             # replace text-audio token placeholders with audio embeddings
-            audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1)
-            inputs_embeds = inputs_embeds.masked_scatter(
-                audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device)
+            special_audio_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, audio_features=audio_embeds
             )
+            inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_embeds.to(inputs_embeds.device))
 
         outputs: BaseModelOutputWithPast = self.language_model(
             attention_mask=attention_mask,

From b59f9583755fba2afa5e9effd1103c180b34b341 Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Tue, 21 Apr 2026 18:36:56 +0200
Subject: [PATCH 16/38] specific to musicflamingo

---
 .../musicflamingo/modeling_musicflamingo.py   | 39 +++++++++++++++++--
 .../musicflamingo/modular_musicflamingo.py    | 15 +++++--
 2 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/src/transformers/models/musicflamingo/modeling_musicflamingo.py b/src/transformers/models/musicflamingo/modeling_musicflamingo.py
index adec95bbf3e1..3ebfc929f6a8 100644
--- a/src/transformers/models/musicflamingo/modeling_musicflamingo.py
+++ b/src/transformers/models/musicflamingo/modeling_musicflamingo.py
@@ -33,7 +33,7 @@
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
 from ...modeling_utils import PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_available
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_available, torch_compilable_check
 from ..auto import AutoModel, AutoModelForCausalLM
 from .configuration_musicflamingo import MusicFlamingoConfig
 
@@ -268,6 +268,30 @@ def get_audio_features(
 
         return audio_output
 
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_audio_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_audio_mask = special_audio_mask.all(-1)
+        else:
+            special_audio_mask = input_ids == self.config.audio_token_id
+
+        n_audio_tokens = special_audio_mask.sum()
+        n_audio_features = audio_features.shape[0]
+        special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        torch_compilable_check(
+            inputs_embeds[special_audio_mask].numel() == audio_features.numel(),
+            f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}",
+        )
+        return special_audio_mask
+
     @can_return_tuple
     @auto_docstring
     def forward(
@@ -344,10 +368,10 @@ def forward(
             ).pooler_output
 
             # replace text-audio token placeholders with audio embeddings
-            audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1)
-            inputs_embeds = inputs_embeds.masked_scatter(
-                audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device)
+            special_audio_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, audio_features=audio_embeds
             )
+            inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_embeds.to(inputs_embeds.device))
 
         outputs: CausalLMOutputWithPast = self.language_model(
             inputs_embeds=inputs_embeds,
@@ -387,6 +411,13 @@ def _build_audio_timestamps(
         _, ends = torch.where(diff == -1)
         sample_lengths = (ends - starts).to(torch.long)
 
+        n_audio_tokens = audio_token_mask.sum()
+        n_audio_features = post_lengths.sum()
+        torch_compilable_check(
+            n_audio_tokens == n_audio_features,
+            f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}",
+        )
+
         # Account for 4x downsampling in audio encoder (conv2 and avg pooling)
         audio_embed_frame_step = self.config.audio_frame_step * 4
         frame_offsets = (
diff --git a/src/transformers/models/musicflamingo/modular_musicflamingo.py b/src/transformers/models/musicflamingo/modular_musicflamingo.py
index 7d98d0ffdeab..e16ae28f6c68 100644
--- a/src/transformers/models/musicflamingo/modular_musicflamingo.py
+++ b/src/transformers/models/musicflamingo/modular_musicflamingo.py
@@ -25,7 +25,7 @@
 from ...modeling_outputs import BaseModelOutputWithPooling, CausalLMOutputWithPast
 from ...modeling_utils import PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_available
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_available, torch_compilable_check
 from ..audioflamingo3.configuration_audioflamingo3 import AudioFlamingo3Config
 from ..audioflamingo3.modeling_audioflamingo3 import (
     AudioFlamingo3ForConditionalGeneration,
@@ -274,6 +274,13 @@ def _build_audio_timestamps(
         _, ends = torch.where(diff == -1)
         sample_lengths = (ends - starts).to(torch.long)
 
+        n_audio_tokens = audio_token_mask.sum()
+        n_audio_features = post_lengths.sum()
+        torch_compilable_check(
+            n_audio_tokens == n_audio_features,
+            f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}",
+        )
+
         # Account for 4x downsampling in audio encoder (conv2 and avg pooling)
         audio_embed_frame_step = self.config.audio_frame_step * 4
         frame_offsets = (
@@ -408,10 +415,10 @@ def forward(
             ).pooler_output
 
             # replace text-audio token placeholders with audio embeddings
-            audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1)
-            inputs_embeds = inputs_embeds.masked_scatter(
-                audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device)
+            special_audio_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, audio_features=audio_embeds
             )
+            inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_embeds.to(inputs_embeds.device))
 
         outputs: CausalLMOutputWithPast = self.language_model(
             inputs_embeds=inputs_embeds,

From bb986b6631c08b9c7e269978ba27acc5d3568e86 Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Tue, 21 Apr 2026 18:37:09 +0200
Subject: [PATCH 17/38] granite speech fix

---
 tests/models/granite_speech/test_modeling_granite_speech.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py
index 18f07fc71bef..3493fde4a267 100644
--- a/tests/models/granite_speech/test_modeling_granite_speech.py
+++ b/tests/models/granite_speech/test_modeling_granite_speech.py
@@ -74,8 +74,9 @@ def get_audio_embeds_mask(self, audio_mask):
         # Projector: ceil(feat_seq_length / window_size) * (window_size // downsample_rate) tokens per sample.
         import math
 
-        nblocks = math.ceil(self.feat_seq_length / self.window_size)
-        num_audio_tokens = nblocks * (self.window_size // self.downsample_rate)
+        config = self.get_config()
+        nblocks = math.ceil(self.feat_seq_length / config.window_size)
+        num_audio_tokens = nblocks * (config.window_size // config.downsample_rate)
         return torch.ones([self.batch_size, num_audio_tokens], dtype=torch.long).to(torch_device)
 
     def create_attention_mask(self, input_ids):

From 670c68c238afa8643764f9db30f61f1bdb77147a Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Wed, 22 Apr 2026 11:54:35 +0200
Subject: [PATCH 18/38] let's factorise alm/vlm testers

---
 tests/alm_tester.py        | 218 ++++----------------------------
 tests/multimodal_tester.py | 253 +++++++++++++++++++++++++++++++++++++
 tests/vlm_tester.py        | 222 +++-----------------------------
 3 files changed, 296 insertions(+), 397 deletions(-)
 create mode 100644 tests/multimodal_tester.py

diff --git a/tests/alm_tester.py b/tests/alm_tester.py
index 340aee77df5c..fd16623994ea 100644
--- a/tests/alm_tester.py
+++ b/tests/alm_tester.py
@@ -16,54 +16,27 @@
 import unittest
 from inspect import signature
 
-from .test_configuration_common import ConfigTester
+from .multimodal_tester import MultiModalModelTest, MultiModalModelTester
 from .test_modeling_common import (
-    GenerationTesterMixin,
-    ModelTesterMixin,
     floats_tensor,
     ids_tensor,
     is_torch_available,
-    require_torch,
     torch_device,
 )
-from .test_pipeline_mixin import PipelineTesterMixin
 
 
 if is_torch_available():
     import torch
 
 
-class ALMModelTester:
-    # If the model follows standard naming conventions, only `config_class` and
-    # `conditional_generation_class` need to be set (others are optional).
-    base_model_class = None  # this should be added for most models when #45534 is merged
-    config_class = None
-    text_config_class = None
+class ALMModelTester(MultiModalModelTester):
     audio_config_class = None
-    conditional_generation_class = None
-    sequence_classification_class = None
-    # These attributes are required after the initialization phase of the tester.
-    _required_attributes = ("config_class", "conditional_generation_class")
-
-    # Arguments that should be passed to the config class even if not in its signature.
-    forced_config_args = ["pad_token_id"]
-
-    # Key name for the audio sub-config in the main config constructor.
-    # Override to "encoder_config" for models like GraniteSpeech.
     audio_config_key = "audio_config"
-    audio_mask_key = None  # to be set if audio-related mask has to be passed to the model's forward
-
-    @property
-    def all_model_classes(self):
-        return [
-            model_class
-            for model_class in (
-                self.base_model_class,
-                self.conditional_generation_class,
-                self.sequence_classification_class,
-            )
-            if model_class is not None
-        ]
+    # Name under which the audio mask is passed to the model's forward (e.g. "feature_attention_mask"
+    # for Qwen2Audio). Leave as `None` if the model does not consume a separate audio-level mask;
+    # `_prepare_modality_inputs` then skips adding it to the inputs dict.
+    audio_mask_key = None
+    _required_attributes = MultiModalModelTester._required_attributes + ("audio_config_class",)
 
     @property
     def pipeline_model_mapping(self):
@@ -76,61 +49,22 @@ def pipeline_model_mapping(self):
         return mapping
 
     def __init__(self, parent, **kwargs):
-        self.parent = parent
-
         # Standard defaults
-        kwargs.setdefault("batch_size", 3)
-
-        # TODO: explain here specifically why these values are chosen
         kwargs.setdefault("seq_length", 32)
         kwargs.setdefault("feat_seq_length", 128)
 
         kwargs.setdefault("num_mel_bins", 80)
-        kwargs.setdefault("is_training", True)
-        kwargs.setdefault("use_labels", True)
         kwargs.setdefault("pad_token_id", 1)
-        kwargs.setdefault("bos_token_id", 1)
-        kwargs.setdefault("eos_token_id", 2)
         kwargs.setdefault("audio_token_id", 0)
-        kwargs.setdefault("ignore_index", -100)
-        kwargs.setdefault("scope", None)
-        kwargs.setdefault("vocab_size", 99)
-        kwargs.setdefault("hidden_size", 32)
-        kwargs.setdefault("num_hidden_layers", 2)
-        kwargs.setdefault("num_attention_heads", 2)
-        kwargs.setdefault("num_key_value_heads", 2)
-        kwargs.setdefault("intermediate_size", 32)  # Keep this divisible by 8 for fp16/bf16/fp32 16-bytes alignment
-        kwargs.setdefault("hidden_act", "gelu")
-        kwargs.setdefault("max_position_embeddings", 512)
-
-        # Set all kwargs as instance attributes
-        for key, value in kwargs.items():
-            setattr(self, key, value)
-
-        for required_attribute in [
-            # "base_model_class", # TODO: @eustlb, there is a discrepancy here between ALMs/ VLMs. XXModel and XXForConditionalGeneration
-            "config_class",
-            "conditional_generation_class",
-            "text_config_class",
-            "audio_config_class",
-        ]:
-            if getattr(self, required_attribute) is None:
-                raise ValueError(
-                    f"You have inherited from ALMModelTester but did not set the {required_attribute} attribute."
-                )
 
-    # Because audio-LMs have some different standards in how they handle audio tokens, we need
-    # a few methods that can be overridden if required:
+        super().__init__(parent, **kwargs)
+
+    # -- Overridable ALM-specific hooks ------------------------------------------------------
 
     def create_audio_features(self):
         """Create audio feature tensor. Override for different shapes (e.g. [B, T, features])."""
         return floats_tensor([self.batch_size, self.num_mel_bins, self.feat_seq_length])
 
-    def create_attention_mask(self, input_ids):
-        # TODO: check, this looks strange to force as default behavior
-        # Override for bidirectional attention models like Gemma3
-        return torch.tril(torch.ones_like(input_ids).to(torch_device))
-
     def get_audio_embeds_mask(self, audio_embeds_mask):
         """Get audio embeds mask from audio mask. Override for different shapes."""
         raise NotImplementedError("This method should be overridden in the subclass")
@@ -174,115 +108,39 @@ def create_audio_mask(self):
         audio_mask = ((positions >= offsets[:, None]) & (positions < offsets[:, None] + lengths[:, None])).long()
         return audio_mask
 
-    def get_additional_inputs(self, config, input_ids, audio_features):
-        """Return dict of model-specific extra inputs (e.g. image_sizes for multi-modal)."""
-        return {}
+    # -- Hooks consumed by the shared base ---------------------------------------------------
 
-    # End of overridable methods
+    def _special_token_ids(self):
+        return super()._special_token_ids() | {self.audio_token_id}
 
-    def prepare_config_and_inputs_for_common(self):
-        # TODO: add a clear diagram that explains input prep
+    def _build_modality_sub_configs(self):
+        return {self.audio_config_key: self.get_audio_config()}
 
+    def _prepare_modality_inputs(self, input_ids, config):
+        # TODO: add a clear diagram that explains input preparation
         audio_features = self.create_audio_features()
         audio_mask = self.create_audio_mask()
         audio_embeds_mask = self.get_audio_embeds_mask(audio_mask)
-
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        special_tokens = [self.pad_token_id, self.bos_token_id, self.eos_token_id, self.audio_token_id]
-        for i in range(self.vocab_size):
-            if i not in special_tokens:
-                safe_token_id = i
-                break
-        else:
-            raise ValueError("vocab_size is too small and there is no token ID that is not a special token!")
-
-        # Avoid flaky tests, clear any special tokens in ids_tensor
-        # audio_token_id is handled separately by place_audio_tokens()
-        input_ids[input_ids == self.pad_token_id] = safe_token_id
-        input_ids[input_ids == self.eos_token_id] = safe_token_id
-
-        config = self.get_config()
         num_audio_tokens = audio_embeds_mask.sum(dim=1)
         input_ids = self.place_audio_tokens(input_ids, config, num_audio_tokens)
-        attention_mask = self.create_attention_mask(input_ids)
-
-        inputs_dict = {
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-            self.get_audio_feature_key(): audio_features,
-        }
 
+        modality_inputs = {self.get_audio_feature_key(): audio_features}
         if self.audio_mask_key is not None:
-            inputs_dict[self.audio_mask_key] = audio_mask
-
-        inputs_dict.update(self.get_additional_inputs(config, input_ids, audio_features))
-        return config, inputs_dict
-
-    @property
-    def config_args(self):
-        return list(signature(self.config_class.__init__).parameters.keys())
+            modality_inputs[self.audio_mask_key] = audio_mask
+        return input_ids, modality_inputs, audio_features
 
-    @property
-    def text_config_args(self):
-        args = list(signature(self.text_config_class.__init__).parameters.keys())
-        for token_arg in ["pad_token_id", "bos_token_id", "eos_token_id"]:  # Not always explicitly in the sig
-            if token_arg not in args:
-                args.append(token_arg)
-        return args
+    # -- Audio sub-config construction -------------------------------------------------------
 
     @property
     def audio_config_args(self):
         return list(signature(self.audio_config_class.__init__).parameters.keys())
 
-    def get_config(self):
-        kwargs = {}
-        attribute_map = getattr(self.config_class, "attribute_map", {})
-        model_name_to_common_name = {v: k for k, v in attribute_map.items()}
-        for k in self.config_args + self.forced_config_args:
-            if hasattr(self, k) and k != "self":
-                kwargs[k] = getattr(self, k)
-            elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]):
-                kwargs[k] = getattr(self, model_name_to_common_name[k])
-        kwargs["text_config"] = self.get_text_config()
-        kwargs[self.audio_config_key] = self.get_audio_config()
-        return self.config_class(**kwargs)
-
-    def get_text_config(self):
-        kwargs = {}
-        attribute_map = getattr(self.text_config_class, "attribute_map", {})
-        model_name_to_common_name = {v: k for k, v in attribute_map.items()}
-        for k in self.text_config_args:
-            if hasattr(self, k) and k != "self":
-                kwargs[k] = getattr(self, k)
-            elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]):
-                kwargs[k] = getattr(self, model_name_to_common_name[k])
-        return self.text_config_class(**kwargs)
-
     def get_audio_config(self):
-        kwargs = {}
-        attribute_map = getattr(self.audio_config_class, "attribute_map", {})
-        model_name_to_common_name = {v: k for k, v in attribute_map.items()}
-        for k in self.audio_config_args:
-            if hasattr(self, k) and k != "self":
-                kwargs[k] = getattr(self, k)
-            elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]):
-                kwargs[k] = getattr(self, model_name_to_common_name[k])
+        kwargs = self._collect_kwargs(self.audio_config_args, self.audio_config_class)
         return self.audio_config_class(**kwargs)
 
-    def create_and_check_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = self.base_model_class(config=config)
-        model.to(torch_device)
-        model.eval()
-        model(input_ids, attention_mask=input_mask)
-        result = model(input_ids)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
 
-
-@require_torch
-class ALMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin):
+class ALMModelTest(MultiModalModelTest):
     """
     Base test class for Audio-Language Models.
 
@@ -294,35 +152,6 @@ class ALMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin)
     - `pipeline_model_mapping`: Override if not using default from model_tester
     """
 
-    model_tester_class = None
-    all_model_classes = None
-    pipeline_model_mapping = None
-
-    # Audio-LMs are always composite
-    _is_composite = True
-
-    def setUp(self):
-        if self.model_tester_class is None:
-            raise ValueError("You have inherited from ALMModelTest but did not set the model_tester_class attribute.")
-        self.model_tester = self.model_tester_class(self)
-        self.config_tester = ConfigTester(self, config_class=self.model_tester.config_class, has_text_modality=False)
-
-        if self.pipeline_model_mapping is None:
-            if self.all_model_classes is not None:
-                raise ValueError(
-                    "Tests that inherit from `ALMModelTest` and set `all_model_classes` must manually set "
-                    "`pipeline_model_mapping`."
-                )
-            else:
-                self.pipeline_model_mapping = self.model_tester.pipeline_model_mapping
-
-        if self.all_model_classes is None:
-            self.all_model_classes = self.model_tester.all_model_classes
-
-    def test_config(self):
-        """Test config common functionality."""
-        self.config_tester.run_common_tests()
-
     # TODO: @eustlb, remove this once #45534 is merged
     @unittest.skip("Audio-LMs have no separate base model without a head.")
     def test_model_base_model_prefix(self):
@@ -394,4 +223,3 @@ def test_mismatching_num_audio_tokens(self):
                     [curr_input_dict[audio_mask_key], curr_input_dict[audio_mask_key]], dim=0
                 )
             _ = model(**curr_input_dict)
-
diff --git a/tests/multimodal_tester.py b/tests/multimodal_tester.py
new file mode 100644
index 000000000000..1a52a5be303c
--- /dev/null
+++ b/tests/multimodal_tester.py
@@ -0,0 +1,253 @@
+# Copyright 2026 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from inspect import signature
+
+from .test_configuration_common import ConfigTester
+from .test_modeling_common import (
+    GenerationTesterMixin,
+    ModelTesterMixin,
+    ids_tensor,
+    is_torch_available,
+    require_torch,
+    torch_device,
+)
+from .test_pipeline_mixin import PipelineTesterMixin
+
+
+if is_torch_available():
+    import torch
+
+
+class MultiModalModelTester:
+    """Shared tester base for VLM (vision-language) and ALM (audio-language).
+
+    Concrete subclasses (e.g. `VLMModelTester`, `ALMModelTester`) supply:
+      - the modality-specific sub-config class (`vision_config_class` for VLMs, `audio_config_class` for ALMs, ...),
+      - the modality-specific defaults and helper methods,
+      - the hooks `_build_modality_sub_configs` and `_prepare_modality_inputs`,
+      - optionally an extended `_special_token_ids` and `pipeline_model_mapping`.
+
+    This tester provides shared logic for evaluating and verifying models that combine text with other modalities,
+    centering on the needs of vision-language (VLM) and audio-language (ALM) models.
+    """
+
+    # Subclasses must set every attribute listed in `_required_attributes` below; the other class
+    # attributes (e.g. `base_model_class`, `sequence_classification_class`) are optional and may stay `None`.
+    base_model_class = None
+    config_class = None
+    text_config_class = None
+    conditional_generation_class = None
+    sequence_classification_class = None
+
+    # Required attributes after the initialization phase of the tester. Subclasses extend.
+    _required_attributes = ("config_class", "text_config_class", "conditional_generation_class")
+
+    # Arguments that should be passed to the config class even if not in its signature
+    forced_config_args = ["pad_token_id"]
+
+    @property
+    def all_model_classes(self):
+        # Test classes (`XXXModelTest`) should only set `all_model_classes` themselves when the model has a class
+        # that does not fit any of the common slots listed below.
+        return [
+            model_class
+            for model_class in (
+                self.base_model_class,
+                self.conditional_generation_class,
+                self.sequence_classification_class,
+            )
+            if model_class is not None
+        ]
+
+    def __init__(self, parent, **kwargs):
+        self.parent = parent
+
+        # Text-side defaults shared by every multimodal tester. `pad_token_id` and `seq_length` get no default
+        # here: subclasses are expected to `setdefault` them (and their modality kwargs) *before* calling super.
+        kwargs.setdefault("batch_size", 3)
+        kwargs.setdefault("is_training", True)
+        kwargs.setdefault("use_input_mask", True)
+        kwargs.setdefault("use_labels", True)
+        kwargs.setdefault("vocab_size", 99)
+        kwargs.setdefault("hidden_size", 32)
+        kwargs.setdefault("num_hidden_layers", 2)
+        kwargs.setdefault("num_attention_heads", 2)
+        kwargs.setdefault("num_key_value_heads", 2)
+        kwargs.setdefault("intermediate_size", 32)  # Keep this divisible by 8 for fp16/bf16/fp32 16-bytes alignment
+        kwargs.setdefault("hidden_act", "gelu")
+        kwargs.setdefault("max_position_embeddings", 512)
+        kwargs.setdefault("bos_token_id", 1)
+        kwargs.setdefault("eos_token_id", 2)
+        kwargs.setdefault("ignore_index", -100)
+        kwargs.setdefault("scope", None)
+
+        for key, value in kwargs.items():
+            setattr(self, key, value)
+
+        self._check_required_attributes()
+
+    def _check_required_attributes(self):
+        for required_attribute in self._required_attributes:
+            if getattr(self, required_attribute, None) is None:
+                raise ValueError(
+                    f"You have inherited from {type(self).__name__} but did not set the {required_attribute} attribute."
+                )
+
+    # -- Overridable modality hooks -----------------------------------------------------------
+
+    def create_attention_mask(self, input_ids):
+        """Default causal (lower-triangular) attention mask. Override for bidirectional models like Gemma3."""
+        return torch.tril(torch.ones_like(input_ids).to(torch_device))
+
+    def get_additional_inputs(self, config, input_ids, modality_tensor):
+        """Model-specific extra inputs (e.g. LlavaNext `image_sizes`, Qwen3VL `mm_token_type_ids`)."""
+        return {}
+
+    def _special_token_ids(self):
+        """Special token ids that must never appear as random text tokens. Subclasses add modality tokens."""
+        return {self.pad_token_id, self.bos_token_id, self.eos_token_id}
+
+    def _build_modality_sub_configs(self):
+        """Return the {sub-config-key: sub-config-instance} entries for the main config constructor."""
+        raise NotImplementedError
+
+    def _prepare_modality_inputs(self, input_ids, config):
+        """Create modality features, place modality placeholder tokens in ``input_ids``, and return:
+
+        (input_ids_with_placeholders, modality_inputs_dict, modality_tensor_for_additional_inputs)
+        """
+        raise NotImplementedError
+
+    # -- End of overridable hooks -------------------------------------------------------------
+
+    def _safe_token_id(self):
+        """Smallest token ID that is not a special token. Used to scrub random ids_tensor outputs."""
+        special_tokens = self._special_token_ids()
+        for i in range(self.vocab_size):
+            if i not in special_tokens:
+                return i
+        raise ValueError("vocab_size is too small and there is no token ID that is not a special token!")
+
+    def prepare_config_and_inputs_for_common(self):
+        config = self.get_config()
+
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        # Avoid flaky tests by scrubbing any accidental special tokens produced by ids_tensor.
+        # Modality placeholder tokens are scrubbed and placed by `_prepare_modality_inputs`.
+        safe_token_id = self._safe_token_id()
+        input_ids[input_ids == self.pad_token_id] = safe_token_id
+        input_ids[input_ids == self.eos_token_id] = safe_token_id
+
+        input_ids, modality_inputs, modality_tensor = self._prepare_modality_inputs(input_ids, config)
+
+        # Create attention mask with final input_ids (after modality placeholders are placed) — important
+        # for models that derive padding from token values.
+        attention_mask = self.create_attention_mask(input_ids) if self.use_input_mask else None
+
+        inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask}
+        inputs_dict.update(modality_inputs)
+        inputs_dict.update(self.get_additional_inputs(config, input_ids, modality_tensor))
+        return config, inputs_dict
+
+    # -- Config construction helpers ----------------------------------------------------------
+
+    @property
+    def config_args(self):
+        return list(signature(self.config_class.__init__).parameters.keys())
+
+    @property
+    def text_config_args(self):
+        args = list(signature(self.text_config_class.__init__).parameters.keys())
+        for token_arg in ["pad_token_id", "bos_token_id", "eos_token_id"]:  # Not always explicitly in the sig
+            if token_arg not in args:
+                args.append(token_arg)
+        return args
+
+    def _collect_kwargs(self, sig_keys, config_class):
+        """Collect kwargs for ``config_class`` by matching ``sig_keys`` (and its ``attribute_map``) against ``self``."""
+        attribute_map = getattr(config_class, "attribute_map", {})
+        model_name_to_common_name = {v: k for k, v in attribute_map.items()}
+        kwargs = {}
+        for k in sig_keys:
+            if hasattr(self, k) and k != "self":
+                kwargs[k] = getattr(self, k)
+            elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]):
+                kwargs[k] = getattr(self, model_name_to_common_name[k])
+        return kwargs
+
+    def get_config(self):
+        kwargs = self._collect_kwargs(self.config_args + self.forced_config_args, self.config_class)
+        kwargs["text_config"] = self.get_text_config()
+        kwargs.update(self._build_modality_sub_configs())
+        return self.config_class(**kwargs)
+
+    def get_text_config(self):
+        kwargs = self._collect_kwargs(self.text_config_args, self.text_config_class)
+        return self.text_config_class(**kwargs)
+
+    def create_and_check_model(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = self.base_model_class(config=config)
+        model.to(torch_device)
+        model.eval()
+        model(input_ids, attention_mask=input_mask)
+        result = model(input_ids)
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+
+
+@require_torch
+class MultiModalModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin):
+    """Shared test-class base for multimodal model families.
+
+    Subclasses must set:
+      - ``model_tester_class``: The tester class (subclass of ``MultiModalModelTester``)
+
+    Optional:
+      - ``all_model_classes``: override if not using the default from the model tester
+      - ``pipeline_model_mapping``: override if not using the default from the model tester
+    """
+
+    model_tester_class = None
+    all_model_classes = None
+    pipeline_model_mapping = None
+
+    # Multimodal models are always composite
+    _is_composite = True
+
+    def setUp(self):
+        if self.model_tester_class is None:
+            raise ValueError(
+                f"You have inherited from {type(self).__name__} but did not set the model_tester_class attribute."
+            )
+        self.model_tester = self.model_tester_class(self)
+        self.config_tester = ConfigTester(self, config_class=self.model_tester.config_class, has_text_modality=False)
+
+        if self.pipeline_model_mapping is None:
+            if self.all_model_classes is not None:
+                raise ValueError(
+                    f"Tests that inherit from `{type(self).__name__}` and set `all_model_classes` must manually set "
+                    "`pipeline_model_mapping`."
+                )
+            else:
+                self.pipeline_model_mapping = self.model_tester.pipeline_model_mapping
+
+        if self.all_model_classes is None:
+            self.all_model_classes = self.model_tester.all_model_classes
+
+    def test_config(self):
+        """Test config common functionality."""
+        self.config_tester.run_common_tests()
diff --git a/tests/vlm_tester.py b/tests/vlm_tester.py
index c40b42785836..7a435028c5e4 100644
--- a/tests/vlm_tester.py
+++ b/tests/vlm_tester.py
@@ -16,90 +16,42 @@
 import unittest
 from inspect import signature
 
-from .test_configuration_common import ConfigTester
+from .multimodal_tester import MultiModalModelTest, MultiModalModelTester
 from .test_modeling_common import (
-    GenerationTesterMixin,
-    ModelTesterMixin,
     floats_tensor,
-    ids_tensor,
     is_torch_available,
-    require_torch,
     torch_device,
 )
-from .test_pipeline_mixin import PipelineTesterMixin
 
 
 if is_torch_available():
     import torch
 
 
-class VLMModelTester:
-    # If the model follows the standard naming conventions, only `base_model_class` needs to be set (the others are
-    # inferred from available public classes).
-    base_model_class = None
-    config_class = None
-    text_config_class = None
+class VLMModelTester(MultiModalModelTester):
     vision_config_class = None
-    conditional_generation_class = None
-    sequence_classification_class = None
-    # These attributes are required after the initialization phase of the tester.
-    _required_attributes = ("base_model_class", "config_class", "conditional_generation_class")
-
-    # Arguments that should be passed to the config class even if not in its signature
-    forced_config_args = ["pad_token_id"]
-
-    @property
-    def all_model_classes(self):
-        # Models that set `all_model_classes` in their `XXXModelTest` class must have a new class that doesn't fit
-        # any of the common classes.
-        return [
-            model_class
-            for model_class in (
-                self.base_model_class,
-                self.conditional_generation_class,
-                self.sequence_classification_class,
-            )
-            if model_class is not None
-        ]
+    _required_attributes = MultiModalModelTester._required_attributes + ("base_model_class", "vision_config_class")
 
     @property
     def pipeline_model_mapping(self):
-        mapping = {
+        return {
             "feature-extraction": self.base_model_class,
             "image-text-to-text": self.conditional_generation_class,
         }
-        return mapping
 
     def __init__(self, parent, **kwargs):
-        self.parent = parent
-
         # Standard defaults
-        kwargs.setdefault("batch_size", 3)
-        kwargs.setdefault("is_training", True)
-        kwargs.setdefault("use_input_mask", True)
         kwargs.setdefault("use_token_type_ids", False)
-        kwargs.setdefault("use_labels", True)
-        kwargs.setdefault("vocab_size", 99)
-        kwargs.setdefault("hidden_size", 32)
-        kwargs.setdefault("num_hidden_layers", 2)
-        kwargs.setdefault("num_attention_heads", 2)
-        kwargs.setdefault("num_key_value_heads", 2)
-        kwargs.setdefault("intermediate_size", 32)  # Keep this divisible by 8 for fp16/bf16/fp32 16-bytes alignment
-        kwargs.setdefault("hidden_act", "gelu")
         kwargs.setdefault("hidden_dropout_prob", 0.1)
         kwargs.setdefault("attention_probs_dropout_prob", 0.1)
-        kwargs.setdefault("max_position_embeddings", 512)
         kwargs.setdefault("type_vocab_size", 16)
         kwargs.setdefault("type_sequence_label_size", 2)
         kwargs.setdefault("initializer_range", 0.02)
         kwargs.setdefault("num_labels", 3)
         kwargs.setdefault("num_choices", 4)
         kwargs.setdefault("pad_token_id", 0)
-        kwargs.setdefault("bos_token_id", 1)
-        kwargs.setdefault("eos_token_id", 2)
         kwargs.setdefault("image_token_id", 3)
         kwargs.setdefault("is_decoder", False)
-        kwargs.setdefault("scope", None)
         kwargs.setdefault("expert_interval", 1)
         kwargs.setdefault("moe_layer_start_index", 0)
         kwargs.setdefault("moe_intermediate_size", 12)
@@ -108,54 +60,29 @@ def __init__(self, parent, **kwargs):
         kwargs.setdefault("moe_num_shared_experts", 2)
         kwargs.setdefault("num_experts_per_tok", 2)
         kwargs.setdefault("num_experts", 8)
-        kwargs.setdefault("mamba_n_groups", 1)
-        kwargs.setdefault("mamba_n_heads", 16)
-        kwargs.setdefault("mamba_d_state", 16)
-        kwargs.setdefault("mamba_d_conv", 4)
-        kwargs.setdefault("mamba_expand", 2)
-        kwargs.setdefault("mamba_chunk_size", 16)
         kwargs.setdefault("image_size", 8)
         kwargs.setdefault("patch_size", 4)
         kwargs.setdefault("num_channels", 3)
         kwargs.setdefault("projection_dim", 32)
         kwargs.setdefault("projector_hidden_act", "gelu")
-        kwargs.setdefault("ignore_index", -100)
         kwargs.setdefault("vision_feature_select_strategy", "default")
         kwargs.setdefault("vision_feature_layer", -1)
         kwargs.setdefault("tie_word_embeddings", False)
-
-        # Computed defaults (can still be overridden in derived classes)
-        kwargs.setdefault("head_dim", kwargs["hidden_size"] // kwargs["num_attention_heads"])
         kwargs.setdefault("num_image_tokens", (kwargs["image_size"] // kwargs["patch_size"]) ** 2)
         kwargs.setdefault("seq_length", 7 + kwargs["num_image_tokens"])
 
-        # Set all kwargs as instance attributes
-        for key, value in kwargs.items():
-            setattr(self, key, value)
+        super().__init__(parent, **kwargs)
 
-        for required_attribute in [
-            "base_model_class",
-            "config_class",
-            "conditional_generation_class",
-            "text_config_class",
-            "vision_config_class",
-        ]:
-            if getattr(self, required_attribute) is None:
-                raise ValueError(
-                    f"You have inherited from VLMModelTester but did not set the {required_attribute} attribute."
-                )
+        # Computed default depending on base-class defaults for hidden_size / num_attention_heads.
+        if not hasattr(self, "head_dim"):
+            self.head_dim = self.hidden_size // self.num_attention_heads
 
-    # Because VLMs have some different standards in how they handle image tokens, we need a few methods
-    # that can be overridden if required:
+    # -- Overridable VLM-specific hooks ------------------------------------------------------
 
     def create_pixel_values(self):
         # Override to 5D for patch-based models
         return floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size], scale=1.0)
 
-    def create_attention_mask(self, input_ids):
-        # Override for bidirectional attention models like Gemma3
-        return torch.tril(torch.ones_like(input_ids).to(torch_device))
-
     def place_image_tokens(self, input_ids, config):
         # Override if the image tokens shouldn't be placed at the start of the test sequence
         image_token_id = getattr(config, "image_token_id", self.image_token_id)
@@ -166,111 +93,31 @@ def place_image_tokens(self, input_ids, config):
         input_ids[:, : self.num_image_tokens] = image_token_id
         return input_ids
 
-    def get_additional_inputs(self, config, input_ids, pixel_values):
-        # Override for model-specific inputs like LlavaNext's image_sizes
-        return {}
+    # -- Hooks consumed by the shared base ---------------------------------------------------
 
-    # End of overridable methods
+    def _special_token_ids(self):
+        return super()._special_token_ids() | {self.image_token_id}
 
-    def prepare_config_and_inputs_for_common(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-        pixel_values = self.create_pixel_values()
-
-        config = self.get_config()
-
-        special_tokens = [self.pad_token_id, self.bos_token_id, self.eos_token_id, self.image_token_id]
-        for i in range(self.vocab_size):
-            if i not in special_tokens:
-                # The smallest token ID that is not a special token
-                safe_token_id = i
-                break
-        else:
-            raise ValueError("vocab_size is too small and there is no token ID that is not a special token!")
-
-        # Avoid flaky tests, clear any special tokens in ids_tensor
-        # image_token_id is handled separately by place_image_tokens()
-        input_ids[input_ids == self.pad_token_id] = safe_token_id
-        input_ids[input_ids == self.eos_token_id] = safe_token_id
+    def _build_modality_sub_configs(self):
+        return {"vision_config": self.get_vision_config()}
 
+    def _prepare_modality_inputs(self, input_ids, config):
+        pixel_values = self.create_pixel_values()
         input_ids = self.place_image_tokens(input_ids, config)
+        return input_ids, {"pixel_values": pixel_values}, pixel_values
 
-        # Create attention mask with final input_ids (after image tokens are placed)
-        # This is important for models that use padding masks based on token values
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = self.create_attention_mask(input_ids)
-
-        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask, "pixel_values": pixel_values}
-
-        additional_inputs = self.get_additional_inputs(config, input_ids, pixel_values)
-        inputs_dict.update(additional_inputs)
-
-        return config, inputs_dict
-
-    @property
-    def config_args(self):
-        return list(signature(self.config_class.__init__).parameters.keys())
-
-    @property
-    def text_config_args(self):
-        args = list(signature(self.text_config_class.__init__).parameters.keys())
-        for token_arg in ["pad_token_id", "bos_token_id", "eos_token_id"]:  # Not always explicitly in the sig
-            if token_arg not in args:
-                args.append(token_arg)
-        return args
+    # -- Vision sub-config construction ------------------------------------------------------
 
     @property
     def vision_config_args(self):
         return list(signature(self.vision_config_class.__init__).parameters.keys())
 
-    def get_config(self):
-        kwargs = {}
-        attribute_map = getattr(self.config_class, "attribute_map", {})
-        model_name_to_common_name = {v: k for k, v in attribute_map.items()}
-        for k in self.config_args + self.forced_config_args:
-            if hasattr(self, k) and k != "self":
-                kwargs[k] = getattr(self, k)
-            elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]):
-                kwargs[k] = getattr(self, model_name_to_common_name[k])
-        kwargs["text_config"] = self.get_text_config()
-        kwargs["vision_config"] = self.get_vision_config()
-        return self.config_class(**kwargs)
-
-    def get_text_config(self):
-        kwargs = {}
-        attribute_map = getattr(self.text_config_class, "attribute_map", {})
-        model_name_to_common_name = {v: k for k, v in attribute_map.items()}
-        for k in self.text_config_args:
-            if hasattr(self, k) and k != "self":
-                kwargs[k] = getattr(self, k)
-            elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]):
-                kwargs[k] = getattr(self, model_name_to_common_name[k])
-        return self.text_config_class(**kwargs)
-
     def get_vision_config(self):
-        kwargs = {}
-        attribute_map = getattr(self.vision_config_class, "attribute_map", {})
-        model_name_to_common_name = {v: k for k, v in attribute_map.items()}
-        for k in self.vision_config_args:
-            if hasattr(self, k) and k != "self":
-                kwargs[k] = getattr(self, k)
-            elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]):
-                kwargs[k] = getattr(self, model_name_to_common_name[k])
+        kwargs = self._collect_kwargs(self.vision_config_args, self.vision_config_class)
         return self.vision_config_class(**kwargs)
 
-    def create_and_check_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = self.base_model_class(config=config)
-        model.to(torch_device)
-        model.eval()
-        model(input_ids, attention_mask=input_mask)
-        result = model(input_ids)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
 
-@require_torch
-class VLMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin):
+class VLMModelTest(MultiModalModelTest):
     """
     Base test class for Vision-Language Models.
 
@@ -282,35 +129,6 @@ class VLMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin)
     - `pipeline_model_mapping`: Override if not using default from model_tester
     """
 
-    model_tester_class = None
-    all_model_classes = None
-    pipeline_model_mapping = None
-
-    # VLMs are always composite
-    _is_composite = True
-
-    def setUp(self):
-        if self.model_tester_class is None:
-            raise ValueError("You have inherited from VLMModelTest but did not set the model_tester_class attribute.")
-        self.model_tester = self.model_tester_class(self)
-        self.config_tester = ConfigTester(self, config_class=self.model_tester.config_class, has_text_modality=False)
-
-        if self.pipeline_model_mapping is None:
-            if self.all_model_classes is not None:
-                raise ValueError(
-                    "Tests that inherit from `VLMModelTest` and set `all_model_classes` must manually set "
-                    "`pipeline_model_mapping`."
-                )
-            else:
-                self.pipeline_model_mapping = self.model_tester.pipeline_model_mapping
-
-        if self.all_model_classes is None:
-            self.all_model_classes = self.model_tester.all_model_classes
-
-    def test_config(self):
-        """Test config common functionality."""
-        self.config_tester.run_common_tests()
-
     def test_mismatching_num_image_tokens(self):
         """
         Tests that VLMs throw an error with explicit message saying what is wrong

From c9534432c615de97e7d15c9c437e95af07866495 Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Wed, 22 Apr 2026 12:11:13 +0200
Subject: [PATCH 19/38] make fix-repo

---
 .../configuration_granite_speech.py           |  1 +
 .../configuration_qwen2_5_omni.py             |  7 +++-
 .../configuration_qwen3_omni_moe.py           |  7 +++-
 .../vibevoice_asr/modeling_vibevoice_asr.py   | 32 +++++++++++++++++-
 .../modeling_voxtral_realtime.py              | 33 ++++++++++++++++++-
 .../test_modeling_granite_speech.py           |  2 +-
 6 files changed, 77 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py
index dbdda02ccdb9..e5532b3bf880 100644
--- a/src/transformers/models/granite_speech/configuration_granite_speech.py
+++ b/src/transformers/models/granite_speech/configuration_granite_speech.py
@@ -78,6 +78,7 @@ def __post_init__(self, **kwargs):
         if self.dim_head is None:
             self.dim_head = self.hidden_dim // self.num_heads
 
+
 @auto_docstring(checkpoint="ibm-granite/granite-speech-3.3-2b")
 @strict
 class GraniteSpeechConfig(PreTrainedConfig):
diff --git a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py
index 1564d2b36de9..081823bf222f 100644
--- a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py
@@ -99,7 +99,12 @@ class Qwen2_5OmniAudioEncoderConfig(PreTrainedConfig):
     ```"""
 
     model_type = "qwen2_5_omni_audio_encoder"
-    attribute_map = {"num_hidden_layers": "encoder_layers"}
+    attribute_map = {
+        "num_hidden_layers": "encoder_layers",
+        "hidden_size": "d_model",
+        "num_attention_heads": "encoder_attention_heads",
+        "intermediate_size": "encoder_ffn_dim",
+    }
 
     num_mel_bins: int = 128
     encoder_layers: int = 32
diff --git a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py
index 1ba13364401a..482030541e33 100644
--- a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py
+++ b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py
@@ -47,7 +47,12 @@ class Qwen3OmniMoeAudioEncoderConfig(PreTrainedConfig):
     """
 
     model_type = "qwen3_omni_moe_audio_encoder"
-    attribute_map = {"num_hidden_layers": "encoder_layers"}
+    attribute_map = {
+        "num_hidden_layers": "encoder_layers",
+        "hidden_size": "d_model",
+        "num_attention_heads": "encoder_attention_heads",
+        "intermediate_size": "encoder_ffn_dim",
+    }
 
     num_mel_bins: int = 128
     encoder_layers: int = 32
diff --git a/src/transformers/models/vibevoice_asr/modeling_vibevoice_asr.py b/src/transformers/models/vibevoice_asr/modeling_vibevoice_asr.py
index 703bb6ca5130..5a1cb1b8895e 100644
--- a/src/transformers/models/vibevoice_asr/modeling_vibevoice_asr.py
+++ b/src/transformers/models/vibevoice_asr/modeling_vibevoice_asr.py
@@ -28,7 +28,13 @@
 from ...modeling_outputs import BaseModelOutputWithPooling, CausalLMOutputWithPast
 from ...modeling_utils import PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling
+from ...utils import (
+    TransformersKwargs,
+    auto_docstring,
+    can_return_tuple,
+    is_torchdynamo_compiling,
+    torch_compilable_check,
+)
 from ..auto import AutoModel, AutoModelForCausalLM
 from .configuration_vibevoice_asr import VibeVoiceAsrConfig
 
@@ -362,6 +368,30 @@ def get_audio_features(
 
         return BaseModelOutputWithPooling(last_hidden_state=acoustic_latents, pooler_output=combined_features)
 
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_audio_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_audio_mask = special_audio_mask.all(-1)
+        else:
+            special_audio_mask = input_ids == self.config.audio_token_id
+
+        n_audio_tokens = special_audio_mask.sum()
+        n_audio_features = audio_features.shape[0]
+        special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        torch_compilable_check(
+            inputs_embeds[special_audio_mask].numel() == audio_features.numel(),
+            f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}",
+        )
+        return special_audio_mask
+
     @can_return_tuple
     @auto_docstring
     def forward(
diff --git a/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py b/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py
index 07325b0ea559..dbecd9a6f530 100644
--- a/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py
+++ b/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py
@@ -39,7 +39,14 @@
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
+from ...utils import (
+    TransformersKwargs,
+    auto_docstring,
+    can_return_tuple,
+    is_torchdynamo_compiling,
+    logging,
+    torch_compilable_check,
+)
 from ...utils.generic import maybe_autocast, merge_with_config_defaults
 from ...utils.output_capturing import capture_outputs
 from ..auto import AutoModel
@@ -1007,6 +1014,30 @@ def get_audio_features(
 
         return audio_outputs
 
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_audio_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_audio_mask = special_audio_mask.all(-1)
+        else:
+            special_audio_mask = input_ids == self.config.audio_token_id
+
+        n_audio_tokens = special_audio_mask.sum()
+        n_audio_features = audio_features.shape[0]
+        special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        torch_compilable_check(
+            inputs_embeds[special_audio_mask].numel() == audio_features.numel(),
+            f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}",
+        )
+        return special_audio_mask
+
     @can_return_tuple
     @auto_docstring
     def forward(
diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py
index 3493fde4a267..f54350185c43 100644
--- a/tests/models/granite_speech/test_modeling_granite_speech.py
+++ b/tests/models/granite_speech/test_modeling_granite_speech.py
@@ -63,7 +63,7 @@ def __init__(self, parent, **kwargs):
             "intermediate_size": 256,
             "encoder_hidden_size": 32,
         }
-   
+
         super().__init__(parent, **kwargs)
 
     def create_audio_features(self):

From 874040992375d09ff521abc400c3f32d80a1c8f0 Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Wed, 22 Apr 2026 15:56:38 +0200
Subject: [PATCH 20/38] unskip test_sdpa_can_dispatch_on_flash on qwen2_audio

---
 tests/models/qwen2_audio/test_modeling_qwen2_audio.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
index fc73d6dca607..669b5a4287a9 100644
--- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
+++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
@@ -86,10 +86,6 @@ class Qwen2AudioForConditionalGenerationModelTest(ALMModelTest, unittest.TestCas
     def test_sdpa_can_compile_dynamic(self):
         pass
 
-    @unittest.skip(reason="Compile not yet supported because in Qwen2Audio models")
-    def test_sdpa_can_dispatch_on_flash(self):
-        pass
-
     @unittest.skip(reason="inputs_embeds is the audio-fused path; can't match raw token-only embeddings.")
     def test_inputs_embeds_matches_input_ids(self):
         pass

From dde65f61fa3bf84988411c25f3737c1f02ba08e2 Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Wed, 22 Apr 2026 16:24:31 +0200
Subject: [PATCH 21/38] should not be skipped

---
 tests/models/glmasr/test_modeling_glmasr.py       | 15 ---------------
 .../musicflamingo/test_modeling_musicflamingo.py  | 15 ---------------
 .../qwen2_audio/test_modeling_qwen2_audio.py      |  5 -----
 .../vibevoice_asr/test_modeling_vibevoice_asr.py  | 14 --------------
 .../test_modeling_voxtral_realtime.py             |  4 ----
 5 files changed, 53 deletions(-)

diff --git a/tests/models/glmasr/test_modeling_glmasr.py b/tests/models/glmasr/test_modeling_glmasr.py
index 0b2aae719d19..b19e91a61209 100644
--- a/tests/models/glmasr/test_modeling_glmasr.py
+++ b/tests/models/glmasr/test_modeling_glmasr.py
@@ -15,8 +15,6 @@
 
 import unittest
 
-import pytest
-
 from transformers import (
     AutoProcessor,
     GlmAsrConfig,
@@ -77,19 +75,6 @@ class GlmAsrForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase):
     def test_inputs_embeds_matches_input_ids(self):
         pass
 
-    @unittest.skip(reason="Compile not yet supported for GlmAsr models")
-    @pytest.mark.torch_compile_test
-    def test_sdpa_can_compile_dynamic(self):
-        pass
-
-    @unittest.skip(reason="Compile not yet supported for GlmAsr models")
-    def test_sdpa_can_dispatch_on_flash(self):
-        pass
-
-    @unittest.skip(reason="GlmAsr tests avoid right-padding equivalence; fusion is in-place.")
-    def test_flash_attn_2_inference_equivalence_right_padding(self):
-        pass
-
 
 @require_torch
 class GlmAsrForConditionalGenerationIntegrationTest(unittest.TestCase):
diff --git a/tests/models/musicflamingo/test_modeling_musicflamingo.py b/tests/models/musicflamingo/test_modeling_musicflamingo.py
index 6996ff4ccb71..2615af219ff5 100644
--- a/tests/models/musicflamingo/test_modeling_musicflamingo.py
+++ b/tests/models/musicflamingo/test_modeling_musicflamingo.py
@@ -19,8 +19,6 @@
 import unittest
 from pathlib import Path
 
-import pytest
-
 from transformers import (
     AudioFlamingo3EncoderConfig,
     AutoProcessor,
@@ -160,19 +158,6 @@ def test_build_audio_timestamps_reconstructs_windows_from_input_ids(self):
     def test_inputs_embeds_matches_input_ids(self):
         pass
 
-    @unittest.skip(reason="Compile not yet supported for MusicFlamingo models")
-    @pytest.mark.torch_compile_test
-    def test_sdpa_can_compile_dynamic(self):
-        pass
-
-    @unittest.skip(reason="Compile not yet supported for MusicFlamingo models")
-    def test_sdpa_can_dispatch_on_flash(self):
-        pass
-
-    @unittest.skip(reason="MusicFlamingo tests avoid right-padding equivalence; fusion is in-place.")
-    def test_flash_attn_2_inference_equivalence_right_padding(self):
-        pass
-
 
 @require_torch
 class MusicFlamingoForConditionalGenerationIntegrationTest(unittest.TestCase):
diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
index 669b5a4287a9..869e8ff93753 100644
--- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
+++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
@@ -81,11 +81,6 @@ class Qwen2AudioForConditionalGenerationModelTest(ALMModelTest, unittest.TestCas
     model_tester_class = Qwen2AudioModelTester
     pipeline_model_mapping = {"any-to-any": Qwen2AudioForConditionalGeneration} if is_torch_available() else {}
 
-    @unittest.skip(reason="Compile not yet supported because in Qwen2Audio models")
-    @pytest.mark.torch_compile_test
-    def test_sdpa_can_compile_dynamic(self):
-        pass
-
     @unittest.skip(reason="inputs_embeds is the audio-fused path; can't match raw token-only embeddings.")
     def test_inputs_embeds_matches_input_ids(self):
         pass
diff --git a/tests/models/vibevoice_asr/test_modeling_vibevoice_asr.py b/tests/models/vibevoice_asr/test_modeling_vibevoice_asr.py
index be0ece165e36..fc8bb11568ea 100644
--- a/tests/models/vibevoice_asr/test_modeling_vibevoice_asr.py
+++ b/tests/models/vibevoice_asr/test_modeling_vibevoice_asr.py
@@ -17,7 +17,6 @@
 import unittest
 from pathlib import Path
 
-import pytest
 from parameterized import parameterized
 
 from transformers import (
@@ -150,19 +149,6 @@ def setUp(self):
     def test_inputs_embeds_matches_input_ids(self):
         pass
 
-    @unittest.skip(reason="Compile not yet supported for VibeVoiceAsr models")
-    @pytest.mark.torch_compile_test
-    def test_sdpa_can_compile_dynamic(self):
-        pass
-
-    @unittest.skip(reason="Compile not yet supported for VibeVoiceAsr models")
-    def test_sdpa_can_dispatch_on_flash(self):
-        pass
-
-    @unittest.skip(reason="VibeVoiceAsr tests avoid right-padding equivalence; fusion is in-place.")
-    def test_flash_attn_2_inference_equivalence_right_padding(self):
-        pass
-
     @unittest.skip(reason="VibeVoiceAsr has no separate base model without a head.")
     def test_model_base_model_prefix(self):
         pass
diff --git a/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py b/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py
index 86682cd558a0..24bf9ccbd706 100644
--- a/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py
+++ b/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py
@@ -159,10 +159,6 @@ def test_generate_compile_model_forward_fullgraph(self):
     def test_generate_with_and_without_position_ids(self):
         super().test_generate_with_and_without_position_ids()
 
-    @unittest.skip(reason="VoxtralRealtime does not have a base model")
-    def test_model_base_model_prefix(self):
-        pass
-
     @unittest.skip(
         reason="This test does not apply to VoxtralRealtime since input_features must be provided along input_ids"
     )

From 19b37c5adad555adb650fb9863fc0e3dc3b6d272 Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Wed, 22 Apr 2026 16:33:02 +0200
Subject: [PATCH 22/38] make fix-repo

---
 tests/models/qwen2_audio/test_modeling_qwen2_audio.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
index 869e8ff93753..1557217fdd63 100644
--- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
+++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
@@ -18,7 +18,6 @@
 from urllib.request import urlopen
 
 import librosa
-import pytest
 
 from transformers import (
     AutoProcessor,

From b47621a9fb02efeb51869df863e54356b1173671 Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Wed, 22 Apr 2026 17:51:47 +0200
Subject: [PATCH 23/38] test_mismatching_num_audio_tokens should be skipped for
 voxtral_realtime

---
 src/transformers/models/esm/configuration_esm.py           | 4 ++--
 .../voxtral_realtime/test_modeling_voxtral_realtime.py     | 7 +++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/esm/configuration_esm.py b/src/transformers/models/esm/configuration_esm.py
index a00dcf8b39e3..7875d88ecee8 100644
--- a/src/transformers/models/esm/configuration_esm.py
+++ b/src/transformers/models/esm/configuration_esm.py
@@ -159,12 +159,12 @@ class EsmConfig(PreTrainedConfig):
     mask_token_id (`int`, *optional*):
         The index of the mask token in the vocabulary. This must be included in the config because of the
         "mask-dropout" scaling trick, which will scale the inputs depending on the number of masked tokens.
+    rope_theta (`float`, defaults to 10000.0):
+        The base period of the RoPE embeddings. Only used when `position_embedding_type` is set to `"rotary"`.
     position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
         Type of position embedding. Choose either `"absolute"` or "rotary"`.
     emb_layer_norm_before (`bool`, *optional*):
         Whether to apply layer normalization after embeddings but before the main stem of the network.
-    rope_theta (`float`, defaults to 10000.0):
-        The base period of the RoPE embeddings. Only used when `position_embedding_type` is set to `"rotary"`.
     token_dropout (`bool`, defaults to `False`):
         When this is enabled, masked tokens are treated as if they had been dropped out by input dropout.
     is_folding_model (`bool`, defaults to `False`):
diff --git a/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py b/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py
index 24bf9ccbd706..150d7a894104 100644
--- a/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py
+++ b/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py
@@ -159,6 +159,13 @@ def test_generate_compile_model_forward_fullgraph(self):
     def test_generate_with_and_without_position_ids(self):
         super().test_generate_with_and_without_position_ids()
 
+    @unittest.skip(
+        reason="This test does not apply to VoxtralRealtime: audio tokens are not replaced in inputs_embeds, "
+        "audio and text embeddings are summed instead."
+    )
+    def test_mismatching_num_audio_tokens(self):
+        pass
+
     @unittest.skip(
         reason="This test does not apply to VoxtralRealtime since input_features must be provided along input_ids"
     )

From b9d30be1262245c8e658dfdd3e8624660a10e660 Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Mon, 27 Apr 2026 14:48:59 +0900
Subject: [PATCH 24/38] nit: fix MultiModalModelTester docstring wording

---
 tests/multimodal_tester.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/multimodal_tester.py b/tests/multimodal_tester.py
index 1a52a5be303c..41c1be171dd7 100644
--- a/tests/multimodal_tester.py
+++ b/tests/multimodal_tester.py
@@ -31,7 +31,7 @@
 
 
 class MultiModalModelTester:
-    """Shared tester base for VLM (vision-language) and ALM (audio-language).
+    """Shared tester base for VLM (vision-language) and ALM (audio-language) models.
 
     Concrete subclasses (e.g. `VLMModelTester`, `ALMModelTester`) supply:
       - the modality-specific sub-config class (`vision_config_class` for VLMs, `audio_config_class` for ALMs, ...),

From 8d2e4b7623b88cafa969de8d63baddf3346eadeb Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Mon, 27 Apr 2026 14:57:30 +0900
Subject: [PATCH 25/38] _special_token_ids as property, all scrubbed in
 prepare_config_and_inputs_for_common

---
 tests/alm_tester.py        | 3 ++-
 tests/multimodal_tester.py | 7 ++++---
 tests/vlm_tester.py        | 3 ++-
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/tests/alm_tester.py b/tests/alm_tester.py
index fd16623994ea..25647221c3a5 100644
--- a/tests/alm_tester.py
+++ b/tests/alm_tester.py
@@ -110,8 +110,9 @@ def create_audio_mask(self):
 
     # -- Hooks consumed by the shared base ---------------------------------------------------
 
+    @property
     def _special_token_ids(self):
-        return super()._special_token_ids() | {self.audio_token_id}
+        return super()._special_token_ids | {self.audio_token_id}
 
     def _build_modality_sub_configs(self):
         return {self.audio_config_key: self.get_audio_config()}
diff --git a/tests/multimodal_tester.py b/tests/multimodal_tester.py
index 41c1be171dd7..72de0834bf55 100644
--- a/tests/multimodal_tester.py
+++ b/tests/multimodal_tester.py
@@ -115,6 +115,7 @@ def get_additional_inputs(self, config, input_ids, modality_tensor):
         """Model-specific extra inputs (e.g. LlavaNext `image_sizes`, Qwen3VL `mm_token_type_ids`)."""
         return {}
 
+    @property
     def _special_token_ids(self):
         """Special token ids that must never appear as random text tokens. Subclasses add modality tokens."""
         return {self.pad_token_id, self.bos_token_id, self.eos_token_id}
@@ -134,7 +135,7 @@ def _prepare_modality_inputs(self, input_ids, config):
 
     def _safe_token_id(self):
         """Smallest token ID that is not a special token. Used to scrub random ids_tensor outputs."""
-        special_tokens = self._special_token_ids()
+        special_tokens = self._special_token_ids
         for i in range(self.vocab_size):
             if i not in special_tokens:
                 return i
@@ -148,8 +149,8 @@ def prepare_config_and_inputs_for_common(self):
         # Avoid flaky tests by scrubbing any accidental special tokens produced by ids_tensor.
         # Modality placeholder tokens are scrubbed and placed by `_prepare_modality_inputs`.
         safe_token_id = self._safe_token_id()
-        input_ids[input_ids == self.pad_token_id] = safe_token_id
-        input_ids[input_ids == self.eos_token_id] = safe_token_id
+        for token_id in self._special_token_ids:
+            input_ids[input_ids == token_id] = safe_token_id
 
         input_ids, modality_inputs, modality_tensor = self._prepare_modality_inputs(input_ids, config)
 
diff --git a/tests/vlm_tester.py b/tests/vlm_tester.py
index 7a435028c5e4..31914ebfc95d 100644
--- a/tests/vlm_tester.py
+++ b/tests/vlm_tester.py
@@ -95,8 +95,9 @@ def place_image_tokens(self, input_ids, config):
 
     # -- Hooks consumed by the shared base ---------------------------------------------------
 
+    @property
     def _special_token_ids(self):
-        return super()._special_token_ids() | {self.image_token_id}
+        return super()._special_token_ids | {self.image_token_id}
 
     def _build_modality_sub_configs(self):
         return {"vision_config": self.get_vision_config()}

From cbd526f24f9fb976e5916f208e9693e86715d8f7 Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Mon, 27 Apr 2026 14:59:59 +0900
Subject: [PATCH 26/38] MoE params in common class

---
 tests/multimodal_tester.py | 8 ++++++++
 tests/vlm_tester.py        | 8 --------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/multimodal_tester.py b/tests/multimodal_tester.py
index 72de0834bf55..66c9ab12ddca 100644
--- a/tests/multimodal_tester.py
+++ b/tests/multimodal_tester.py
@@ -90,6 +90,14 @@ def __init__(self, parent, **kwargs):
         kwargs.setdefault("max_position_embeddings", 512)
         kwargs.setdefault("bos_token_id", 1)
         kwargs.setdefault("eos_token_id", 2)
+        kwargs.setdefault("expert_interval", 1)
+        kwargs.setdefault("moe_layer_start_index", 0)
+        kwargs.setdefault("moe_intermediate_size", 12)
+        kwargs.setdefault("shared_expert_intermediate_size", 36)
+        kwargs.setdefault("shared_expert_gate", True)
+        kwargs.setdefault("moe_num_shared_experts", 2)
+        kwargs.setdefault("num_experts_per_tok", 2)
+        kwargs.setdefault("num_experts", 8)
         kwargs.setdefault("ignore_index", -100)
         kwargs.setdefault("scope", None)
 
diff --git a/tests/vlm_tester.py b/tests/vlm_tester.py
index 31914ebfc95d..685dc09facd4 100644
--- a/tests/vlm_tester.py
+++ b/tests/vlm_tester.py
@@ -52,14 +52,6 @@ def __init__(self, parent, **kwargs):
         kwargs.setdefault("pad_token_id", 0)
         kwargs.setdefault("image_token_id", 3)
         kwargs.setdefault("is_decoder", False)
-        kwargs.setdefault("expert_interval", 1)
-        kwargs.setdefault("moe_layer_start_index", 0)
-        kwargs.setdefault("moe_intermediate_size", 12)
-        kwargs.setdefault("shared_expert_intermediate_size", 36)
-        kwargs.setdefault("shared_expert_gate", True)
-        kwargs.setdefault("moe_num_shared_experts", 2)
-        kwargs.setdefault("num_experts_per_tok", 2)
-        kwargs.setdefault("num_experts", 8)
         kwargs.setdefault("image_size", 8)
         kwargs.setdefault("patch_size", 4)
         kwargs.setdefault("num_channels", 3)

From 12dfcd04bedab5f12a635ceb6e6536e033d78b2c Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Mon, 27 Apr 2026 16:17:26 +0900
Subject: [PATCH 27/38] add _TEXT_MODEL_TESTER_DEFAULTS to avoid divergence

---
 src/transformers/testing_utils.py | 28 +++++++++++++++
 tests/causal_lm_tester.py         | 60 +++++++------------------------
 tests/multimodal_tester.py        | 33 ++++++-----------
 3 files changed, 51 insertions(+), 70 deletions(-)

diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 863242a695c6..908337fd4fd4 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -228,6 +228,34 @@
     "conditional_generation_class": "ForConditionalGeneration",
 }
 
+# Shared text-model defaults for CausalLMModelTester and MultiModalModelTester.
+_TEXT_MODEL_TESTER_DEFAULTS = {
+    "batch_size": 13,
+    "seq_length": 7,
+    "is_training": True,
+    "use_input_mask": True,
+    "use_labels": True,
+    "vocab_size": 99,
+    "hidden_size": 32,
+    "num_hidden_layers": 2,
+    "num_attention_heads": 2,
+    "num_key_value_heads": 2,
+    "intermediate_size": 32,
+    "hidden_act": "gelu",
+    "max_position_embeddings": 512,
+    "pad_token_id": 0,
+    "bos_token_id": 1,
+    "eos_token_id": 2,
+    "expert_interval": 1,
+    "moe_layer_start_index": 0,
+    "moe_intermediate_size": 16,
+    "shared_expert_intermediate_size": 36,
+    "shared_expert_gate": True,
+    "moe_num_shared_experts": 2,
+    "num_experts_per_tok": 2,
+    "num_experts": 8,
+}
+
 
 if is_torch_available():
     import torch
diff --git a/tests/causal_lm_tester.py b/tests/causal_lm_tester.py
index b3398f13c393..6b94a520d4f2 100644
--- a/tests/causal_lm_tester.py
+++ b/tests/causal_lm_tester.py
@@ -22,6 +22,7 @@
 from transformers.models.auto.auto_factory import getattribute_from_module
 from transformers.testing_utils import (
     _COMMON_MODEL_NAMES_MAP,
+    _TEXT_MODEL_TESTER_DEFAULTS,
     is_flaky,
     require_flash_attn,
     require_torch_accelerator,
@@ -166,84 +167,43 @@ def pipeline_model_mapping(self):
     def __init__(
         self,
         parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
         use_token_type_ids=False,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=2,
-        num_key_value_heads=2,
-        intermediate_size=32,
-        hidden_act="gelu",
         hidden_dropout_prob=0.1,
         attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
         type_vocab_size=16,
         type_sequence_label_size=2,
         initializer_range=0.02,
         num_labels=3,
         num_choices=4,
-        pad_token_id=0,
-        bos_token_id=1,
-        eos_token_id=2,
         is_decoder=False,
         scope=None,
-        expert_interval=1,
-        moe_layer_start_index=0,
-        moe_intermediate_size=16,
-        shared_expert_intermediate_size=36,
-        shared_expert_gate=True,
-        moe_num_shared_experts=2,
-        num_experts_per_tok=2,
-        num_experts=8,
         mamba_n_groups=1,
         mamba_n_heads=16,
         mamba_d_state=16,
         mamba_d_conv=4,
         mamba_expand=2,
         mamba_chunk_size=16,
+        **kwargs,
     ):
         self._verify_and_infer_model_attributes()
         self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
+
+        # Apply shared text-model defaults, then let caller kwargs override
+        for key, default in _TEXT_MODEL_TESTER_DEFAULTS.items():
+            setattr(self, key, kwargs.pop(key, default))
+
+        # CausalLM-specific defaults (not shared with multimodal testers)
         self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.num_key_value_heads = num_key_value_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
         self.hidden_dropout_prob = hidden_dropout_prob
         self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
         self.type_vocab_size = type_vocab_size
         self.type_sequence_label_size = type_sequence_label_size
         self.initializer_range = initializer_range
         self.num_labels = num_labels
         self.num_choices = num_choices
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
         self.scope = scope
         self.head_dim = self.hidden_size // self.num_attention_heads
         self.is_decoder = is_decoder
-        self.expert_interval = expert_interval
-        self.moe_layer_start_index = moe_layer_start_index
-        self.moe_intermediate_size = moe_intermediate_size
-        self.shared_expert_intermediate_size = shared_expert_intermediate_size
-        self.shared_expert_gate = shared_expert_gate
-        self.moe_num_shared_experts = moe_num_shared_experts
-        self.num_experts_per_tok = num_experts_per_tok
-        self.num_experts = num_experts
         self.mamba_n_groups = mamba_n_groups
         self.mamba_n_heads = mamba_n_heads
         self.mamba_d_state = mamba_d_state
@@ -252,6 +212,10 @@ def __init__(
         self.mamba_chunk_size = mamba_chunk_size
         self.tie_word_embeddings = False
 
+        # Any remaining kwargs become attributes (for model-specific params)
+        for key, value in kwargs.items():
+            setattr(self, key, value)
+
     def prepare_config_and_inputs(self):
         input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
 
diff --git a/tests/multimodal_tester.py b/tests/multimodal_tester.py
index 66c9ab12ddca..7c1e0ea6a75f 100644
--- a/tests/multimodal_tester.py
+++ b/tests/multimodal_tester.py
@@ -15,6 +15,8 @@
 from inspect import signature
 
 from .test_configuration_common import ConfigTester
+from transformers.testing_utils import _TEXT_MODEL_TESTER_DEFAULTS
+
 from .test_modeling_common import (
     GenerationTesterMixin,
     ModelTesterMixin,
@@ -74,30 +76,17 @@ def all_model_classes(self):
     def __init__(self, parent, **kwargs):
         self.parent = parent
 
-        # Text-side defaults shared by every multimodal tester. Subclasses are expected to `setdefault`
-        # their modality-specific kwargs (and any differing values such as `pad_token_id`) *before* calling super.
+        # Multimodal-specific overrides of shared defaults (applied before the shared
+        # defaults so they take precedence, but after any subclass setdefault calls).
         kwargs.setdefault("batch_size", 3)
-        kwargs.setdefault("is_training", True)
-        kwargs.setdefault("use_input_mask", True)
-        kwargs.setdefault("use_labels", True)
-        kwargs.setdefault("vocab_size", 99)
-        kwargs.setdefault("hidden_size", 32)
-        kwargs.setdefault("num_hidden_layers", 2)
-        kwargs.setdefault("num_attention_heads", 2)
-        kwargs.setdefault("num_key_value_heads", 2)
-        kwargs.setdefault("intermediate_size", 32)  # Keep this divisible by 8 for fp16/bf16/fp32 16-bytes alignment
-        kwargs.setdefault("hidden_act", "gelu")
-        kwargs.setdefault("max_position_embeddings", 512)
-        kwargs.setdefault("bos_token_id", 1)
-        kwargs.setdefault("eos_token_id", 2)
-        kwargs.setdefault("expert_interval", 1)
-        kwargs.setdefault("moe_layer_start_index", 0)
         kwargs.setdefault("moe_intermediate_size", 12)
-        kwargs.setdefault("shared_expert_intermediate_size", 36)
-        kwargs.setdefault("shared_expert_gate", True)
-        kwargs.setdefault("moe_num_shared_experts", 2)
-        kwargs.setdefault("num_experts_per_tok", 2)
-        kwargs.setdefault("num_experts", 8)
+
+        # Apply shared text-model defaults for anything not already set.
+        # Subclasses are expected to `setdefault` their modality-specific kwargs
+        # (and any differing values such as `pad_token_id`) *before* calling super.
+        for key, default in _TEXT_MODEL_TESTER_DEFAULTS.items():
+            kwargs.setdefault(key, default)
+
         kwargs.setdefault("ignore_index", -100)
         kwargs.setdefault("scope", None)
 

From 95b1f20296aa97dac7a1b2c10c44e9254231a01a Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Mon, 27 Apr 2026 16:28:22 +0900
Subject: [PATCH 28/38] nit: drop redundant pad_token_id default in vlm_tester

---
 tests/vlm_tester.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/vlm_tester.py b/tests/vlm_tester.py
index 685dc09facd4..d8cae2e215f6 100644
--- a/tests/vlm_tester.py
+++ b/tests/vlm_tester.py
@@ -49,7 +49,6 @@ def __init__(self, parent, **kwargs):
         kwargs.setdefault("initializer_range", 0.02)
         kwargs.setdefault("num_labels", 3)
         kwargs.setdefault("num_choices", 4)
-        kwargs.setdefault("pad_token_id", 0)
         kwargs.setdefault("image_token_id", 3)
         kwargs.setdefault("is_decoder", False)
         kwargs.setdefault("image_size", 8)

From c2aa666ec2790f78f03c7c41366f96513928432e Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Mon, 27 Apr 2026 16:36:31 +0900
Subject: [PATCH 29/38] clearer inits

---
 tests/alm_tester.py | 7 ++++---
 tests/vlm_tester.py | 7 +++++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/tests/alm_tester.py b/tests/alm_tester.py
index 25647221c3a5..fe339188cf52 100644
--- a/tests/alm_tester.py
+++ b/tests/alm_tester.py
@@ -49,12 +49,13 @@ def pipeline_model_mapping(self):
         return mapping
 
     def __init__(self, parent, **kwargs):
-        # Standard defaults
+        # Overrides of _TEXT_MODEL_TESTER_DEFAULTS
         kwargs.setdefault("seq_length", 32)
-        kwargs.setdefault("feat_seq_length", 128)
+        kwargs.setdefault("pad_token_id", 1)
 
+        # ALM-specific defaults
+        kwargs.setdefault("feat_seq_length", 128)
         kwargs.setdefault("num_mel_bins", 80)
-        kwargs.setdefault("pad_token_id", 1)
         kwargs.setdefault("audio_token_id", 0)
 
         super().__init__(parent, **kwargs)
diff --git a/tests/vlm_tester.py b/tests/vlm_tester.py
index d8cae2e215f6..be175032b34d 100644
--- a/tests/vlm_tester.py
+++ b/tests/vlm_tester.py
@@ -40,7 +40,11 @@ def pipeline_model_mapping(self):
         }
 
     def __init__(self, parent, **kwargs):
-        # Standard defaults
+        # Overrides of _TEXT_MODEL_TESTER_DEFAULTS
+        kwargs.setdefault("seq_length", 7 + kwargs.get("num_image_tokens", (kwargs.get("image_size", 8) // kwargs.get("patch_size", 4)) ** 2))
+        kwargs.setdefault("pad_token_id", 0)
+
+        # VLM-specific defaults
         kwargs.setdefault("use_token_type_ids", False)
         kwargs.setdefault("hidden_dropout_prob", 0.1)
         kwargs.setdefault("attention_probs_dropout_prob", 0.1)
@@ -60,7 +64,6 @@ def __init__(self, parent, **kwargs):
         kwargs.setdefault("vision_feature_layer", -1)
         kwargs.setdefault("tie_word_embeddings", False)
         kwargs.setdefault("num_image_tokens", (kwargs["image_size"] // kwargs["patch_size"]) ** 2)
-        kwargs.setdefault("seq_length", 7 + kwargs["num_image_tokens"])
 
         super().__init__(parent, **kwargs)
 

From 5e36c9f87d717d43497b4ba9a73481c6f29d1a65 Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Mon, 27 Apr 2026 16:44:22 +0900
Subject: [PATCH 30/38] _prepare_modality_inputs return dict

---
 tests/alm_tester.py                                 |  2 +-
 tests/models/gemma3/test_modeling_gemma3.py         |  2 +-
 tests/models/llava_next/test_modeling_llava_next.py |  2 +-
 tests/models/qwen3_vl/test_modeling_qwen3_vl.py     |  2 +-
 .../qwen3_vl_moe/test_modeling_qwen3_vl_moe.py      |  2 +-
 tests/multimodal_tester.py                          | 13 ++++++++-----
 tests/vlm_tester.py                                 |  2 +-
 7 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/tests/alm_tester.py b/tests/alm_tester.py
index fe339188cf52..b51cc4f11880 100644
--- a/tests/alm_tester.py
+++ b/tests/alm_tester.py
@@ -129,7 +129,7 @@ def _prepare_modality_inputs(self, input_ids, config):
         modality_inputs = {self.get_audio_feature_key(): audio_features}
         if self.audio_mask_key is not None:
             modality_inputs[self.audio_mask_key] = audio_mask
-        return input_ids, modality_inputs, audio_features
+        return input_ids, modality_inputs
 
     # -- Audio sub-config construction -------------------------------------------------------
 
diff --git a/tests/models/gemma3/test_modeling_gemma3.py b/tests/models/gemma3/test_modeling_gemma3.py
index fe65a3f83bcf..02a7004d73e3 100644
--- a/tests/models/gemma3/test_modeling_gemma3.py
+++ b/tests/models/gemma3/test_modeling_gemma3.py
@@ -281,7 +281,7 @@ def create_attention_mask(self, input_ids):
         # Gemma3 uses padding mask for bidirectional attention on image tokens
         return input_ids.ne(self.pad_token_id).to(torch_device)
 
-    def get_additional_inputs(self, config, input_ids, pixel_values):
+    def get_additional_inputs(self, config, input_ids, modality_inputs):
         # Gemma3 requires specific token_type_ids for bidirectional attention on image tokens
         token_type_ids = torch.zeros_like(input_ids)
         token_type_ids[input_ids == config.image_token_id] = 1
diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py
index a5bd146fcc6d..6f3c2aa03751 100644
--- a/tests/models/llava_next/test_modeling_llava_next.py
+++ b/tests/models/llava_next/test_modeling_llava_next.py
@@ -84,7 +84,7 @@ def create_pixel_values(self):
             ]
         )
 
-    def get_additional_inputs(self, config, input_ids, pixel_values):
+    def get_additional_inputs(self, config, input_ids, modality_inputs):
         """LlavaNext requires image_sizes tensor"""
         return {
             "image_sizes": torch.tensor([[self.image_size, self.image_size]] * self.batch_size),
diff --git a/tests/models/qwen3_vl/test_modeling_qwen3_vl.py b/tests/models/qwen3_vl/test_modeling_qwen3_vl.py
index 9874ce4a8203..d80cb3819486 100644
--- a/tests/models/qwen3_vl/test_modeling_qwen3_vl.py
+++ b/tests/models/qwen3_vl/test_modeling_qwen3_vl.py
@@ -107,7 +107,7 @@ def place_image_tokens(self, input_ids, config):
         input_ids[:, 0] = self.vision_start_token_id
         return input_ids
 
-    def get_additional_inputs(self, config, input_ids, pixel_values):
+    def get_additional_inputs(self, config, input_ids, modality_inputs):
         mm_token_type_ids = torch.zeros_like(input_ids)
         mm_token_type_ids[input_ids == self.image_token_id] = 1
         return {
diff --git a/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py b/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py
index 0b0523de3b71..03a93ef1d7fd 100644
--- a/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py
+++ b/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py
@@ -106,7 +106,7 @@ def place_image_tokens(self, input_ids, config):
         input_ids[:, 0] = self.vision_start_token_id
         return input_ids
 
-    def get_additional_inputs(self, config, input_ids, pixel_values):
+    def get_additional_inputs(self, config, input_ids, modality_inputs):
         # Qwen3VL requires image_grid_thw tensor
         mm_token_type_ids = torch.zeros_like(input_ids)
         mm_token_type_ids[input_ids == self.image_token_id] = 1
diff --git a/tests/multimodal_tester.py b/tests/multimodal_tester.py
index 7c1e0ea6a75f..3a91f536f429 100644
--- a/tests/multimodal_tester.py
+++ b/tests/multimodal_tester.py
@@ -108,8 +108,11 @@ def create_attention_mask(self, input_ids):
         """Default causal (lower-triangular) attention mask. Override for bidirectional models like Gemma3."""
         return torch.tril(torch.ones_like(input_ids).to(torch_device))
 
-    def get_additional_inputs(self, config, input_ids, modality_tensor):
-        """Model-specific extra inputs (e.g. LlavaNext `image_sizes`, Qwen3VL `mm_token_type_ids`)."""
+    def get_additional_inputs(self, config, input_ids, modality_inputs):
+        """Model-specific extra inputs (e.g. LlavaNext `image_sizes`, Qwen3VL `mm_token_type_ids`).
+
+        ``modality_inputs`` is the full dict returned by ``_prepare_modality_inputs``.
+        """
         return {}
 
     @property
@@ -124,7 +127,7 @@ def _build_modality_sub_configs(self):
     def _prepare_modality_inputs(self, input_ids, config):
         """Create modality features, place modality placeholder tokens in ``input_ids``, and return:
 
-        (input_ids_with_placeholders, modality_inputs_dict, modality_tensor_for_additional_inputs)
+        (input_ids_with_placeholders, modality_inputs_dict)
         """
         raise NotImplementedError
 
@@ -149,7 +152,7 @@ def prepare_config_and_inputs_for_common(self):
         for token_id in self._special_token_ids:
             input_ids[input_ids == token_id] = safe_token_id
 
-        input_ids, modality_inputs, modality_tensor = self._prepare_modality_inputs(input_ids, config)
+        input_ids, modality_inputs = self._prepare_modality_inputs(input_ids, config)
 
         # Create attention mask with final input_ids (after modality placeholders are placed) — important
         # for models that derive padding from token values.
@@ -157,7 +160,7 @@ def prepare_config_and_inputs_for_common(self):
 
         inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask}
         inputs_dict.update(modality_inputs)
-        inputs_dict.update(self.get_additional_inputs(config, input_ids, modality_tensor))
+        inputs_dict.update(self.get_additional_inputs(config, input_ids, modality_inputs))
         return config, inputs_dict
 
     # -- Config construction helpers ----------------------------------------------------------
diff --git a/tests/vlm_tester.py b/tests/vlm_tester.py
index be175032b34d..ba08097e048a 100644
--- a/tests/vlm_tester.py
+++ b/tests/vlm_tester.py
@@ -99,7 +99,7 @@ def _build_modality_sub_configs(self):
     def _prepare_modality_inputs(self, input_ids, config):
         pixel_values = self.create_pixel_values()
         input_ids = self.place_image_tokens(input_ids, config)
-        return input_ids, {"pixel_values": pixel_values}, pixel_values
+        return input_ids, {"pixel_values": pixel_values}
 
     # -- Vision sub-config construction ------------------------------------------------------
 

From 184227cb20e4034b175933e5307c852d84e60f22 Mon Sep 17 00:00:00 2001
From: Tarek Ziade <tarek@ziade.org>
Date: Mon, 4 May 2026 15:13:17 +0200
Subject: [PATCH 31/38] format

---
 tests/multimodal_tester.py | 2 +-
 tests/vlm_tester.py        | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/multimodal_tester.py b/tests/multimodal_tester.py
index 3a91f536f429..22559876689b 100644
--- a/tests/multimodal_tester.py
+++ b/tests/multimodal_tester.py
@@ -14,9 +14,9 @@
 
 from inspect import signature
 
-from .test_configuration_common import ConfigTester
 from transformers.testing_utils import _TEXT_MODEL_TESTER_DEFAULTS
 
+from .test_configuration_common import ConfigTester
 from .test_modeling_common import (
     GenerationTesterMixin,
     ModelTesterMixin,
diff --git a/tests/vlm_tester.py b/tests/vlm_tester.py
index ba08097e048a..bce23b71e142 100644
--- a/tests/vlm_tester.py
+++ b/tests/vlm_tester.py
@@ -41,7 +41,10 @@ def pipeline_model_mapping(self):
 
     def __init__(self, parent, **kwargs):
         # Overrides of _TEXT_MODEL_TESTER_DEFAULTS
-        kwargs.setdefault("seq_length", 7 + kwargs.get("num_image_tokens", (kwargs.get("image_size", 8) // kwargs.get("patch_size", 4)) ** 2))
+        kwargs.setdefault(
+            "seq_length",
+            7 + kwargs.get("num_image_tokens", (kwargs.get("image_size", 8) // kwargs.get("patch_size", 4)) ** 2),
+        )
         kwargs.setdefault("pad_token_id", 0)
 
         # VLM-specific defaults

From d77fbb95d8a0c7539a6ee4ff6266bf53f04eed0d Mon Sep 17 00:00:00 2001
From: Tarek Ziade <tarek@ziade.org>
Date: Mon, 4 May 2026 15:14:41 +0200
Subject: [PATCH 32/38] split line for readability

---
 tests/vlm_tester.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/vlm_tester.py b/tests/vlm_tester.py
index bce23b71e142..05be8bdfa8f1 100644
--- a/tests/vlm_tester.py
+++ b/tests/vlm_tester.py
@@ -43,7 +43,11 @@ def __init__(self, parent, **kwargs):
         # Overrides of _TEXT_MODEL_TESTER_DEFAULTS
         kwargs.setdefault(
             "seq_length",
-            7 + kwargs.get("num_image_tokens", (kwargs.get("image_size", 8) // kwargs.get("patch_size", 4)) ** 2),
+            7
+            + kwargs.get(
+                "num_image_tokens",
+                (kwargs.get("image_size", 8) // kwargs.get("patch_size", 4)) ** 2,
+            ),
         )
         kwargs.setdefault("pad_token_id", 0)
 

From 902dbba3d740812b3c75903a0fad3c62b0ba6581 Mon Sep 17 00:00:00 2001
From: Tarek Ziade <tarek@ziade.org>
Date: Mon, 4 May 2026 15:22:47 +0200
Subject: [PATCH 33/38] ran python utils/check_modular_conversion.py
 --fix_and_overwrite

---
 .../configuration_granite_speech_plus.py      | 13 ++++++-
 .../modeling_granite_speech_plus.py           | 36 ++++++++++++++-----
 2 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/src/transformers/models/granite_speech_plus/configuration_granite_speech_plus.py b/src/transformers/models/granite_speech_plus/configuration_granite_speech_plus.py
index c17c3f7391f9..1eec538091a4 100644
--- a/src/transformers/models/granite_speech_plus/configuration_granite_speech_plus.py
+++ b/src/transformers/models/granite_speech_plus/configuration_granite_speech_plus.py
@@ -62,13 +62,19 @@ class GraniteSpeechPlusEncoderConfig(PreTrainedConfig):
     ```"""
 
     model_type = "granite_speech_plus_encoder"
+    attribute_map = {
+        "hidden_size": "hidden_dim",
+        "num_hidden_layers": "num_layers",
+        "num_attention_heads": "num_heads",
+        "num_mel_bins": "input_dim",
+    }
 
     input_dim: int = 160
     num_layers: int = 10
     hidden_dim: int = 1024
     feedforward_mult: int = 4
     num_heads: int = 8
-    dim_head: int = 128
+    dim_head: int | None = None
     output_dim: int = 42
     context_size: int = 200
     max_pos_emb: int = 512
@@ -78,6 +84,11 @@ class GraniteSpeechPlusEncoderConfig(PreTrainedConfig):
 
     cat_hidden_layers: list[int] | None = None
 
+    def __post_init__(self, **kwargs):
+        super().__post_init__(**kwargs)
+        if self.dim_head is None:
+            self.dim_head = self.hidden_dim // self.num_heads
+
 
 @auto_docstring(checkpoint="ibm-granite/granite-speech-4.1-2b-plus")
 @strict
diff --git a/src/transformers/models/granite_speech_plus/modeling_granite_speech_plus.py b/src/transformers/models/granite_speech_plus/modeling_granite_speech_plus.py
index 11020d261498..6293d9eb5941 100644
--- a/src/transformers/models/granite_speech_plus/modeling_granite_speech_plus.py
+++ b/src/transformers/models/granite_speech_plus/modeling_granite_speech_plus.py
@@ -537,6 +537,30 @@ def prepare_inputs_for_generation(
             model_inputs["input_features"] = input_features
         return model_inputs
 
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_audio_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_audio_mask = special_audio_mask.all(-1)
+        else:
+            special_audio_mask = input_ids == self.config.audio_token_id
+
+        n_audio_tokens = special_audio_mask.sum()
+        n_audio_features = audio_features.shape[0]
+        special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        torch_compilable_check(
+            inputs_embeds[special_audio_mask].numel() == audio_features.numel(),
+            f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}",
+        )
+        return special_audio_mask
+
     def get_merged_audio_embeddings(
         self, input_ids: torch.Tensor, audio_features: torch.Tensor, input_features_mask: torch.Tensor | None = None
     ) -> torch.Tensor:
@@ -557,20 +581,14 @@ def get_merged_audio_embeddings(
         llm_input_ids = torch.where(is_audio_index, 0, input_ids)
         inputs_embeds = self.language_model.get_input_embeddings()(llm_input_ids)  # [bsz, # features, hidden size]
 
-        # Mask the audio features into the text embeddings
-        special_audio_mask = is_audio_index.unsqueeze(-1)
         audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype)
         if input_features_mask is not None:
-            torch_compilable_check(
-                not torch.all(is_audio_index.int().sum(dim=1) != input_features_mask.int().sum(dim=1)),
-                "Number of audio tokens does not match number of audio features",
-            )
             audio_features = audio_features[input_features_mask]
 
-        inputs_embeds = inputs_embeds.masked_scatter(
-            special_audio_mask,
-            audio_features,
+        special_audio_mask = self.get_placeholder_mask(
+            input_ids, inputs_embeds=inputs_embeds, audio_features=audio_features
         )
+        inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_features)
         return inputs_embeds
 
     def generate(self, *args, **kwargs) -> torch.LongTensor:

From dcdead1df6de9f3146da691d2536fbd9d26b8a5e Mon Sep 17 00:00:00 2001
From: Tarek Ziade <tarek@ziade.org>
Date: Tue, 5 May 2026 12:37:11 +0200
Subject: [PATCH 34/38] testing auto cancel


From 628343dc23997c29d3f16bc33bb858b430819a1f Mon Sep 17 00:00:00 2001
From: Tarek Ziade <tarek@ziade.org>
Date: Tue, 5 May 2026 12:37:44 +0200
Subject: [PATCH 35/38] testing auto cancel - part 2


From c1a47720b70f00df9977e5f21ae35ce40712e93f Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Mon, 11 May 2026 11:14:04 +0200
Subject: [PATCH 36/38] remove comment

---
 tests/alm_tester.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/alm_tester.py b/tests/alm_tester.py
index b51cc4f11880..c34d4d45524c 100644
--- a/tests/alm_tester.py
+++ b/tests/alm_tester.py
@@ -119,7 +119,6 @@ def _build_modality_sub_configs(self):
         return {self.audio_config_key: self.get_audio_config()}
 
     def _prepare_modality_inputs(self, input_ids, config):
-        # TODO: add a clear diagram that explains input prep ?
         audio_features = self.create_audio_features()
         audio_mask = self.create_audio_mask()
         audio_embeds_mask = self.get_audio_embeds_mask(audio_mask)

From 9322315383d3be3f9efeff8018315319b8519cd4 Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Mon, 11 May 2026 11:34:20 +0200
Subject: [PATCH 37/38] update granite speech plus tests

---
 .../test_modeling_granite_speech_plus.py      | 178 ++++--------------
 1 file changed, 39 insertions(+), 139 deletions(-)

diff --git a/tests/models/granite_speech_plus/test_modeling_granite_speech_plus.py b/tests/models/granite_speech_plus/test_modeling_granite_speech_plus.py
index 4108a4fbb79b..21f1d997efb4 100644
--- a/tests/models/granite_speech_plus/test_modeling_granite_speech_plus.py
+++ b/tests/models/granite_speech_plus/test_modeling_granite_speech_plus.py
@@ -15,18 +15,18 @@
 
 import unittest
 
-from parameterized import parameterized
-
-from transformers import AutoProcessor, GraniteSpeechPlusConfig, GraniteSpeechPlusForConditionalGeneration
+from transformers import (
+    AutoProcessor,
+    GraniteSpeechPlusConfig,
+    GraniteSpeechPlusEncoderConfig,
+    GraniteSpeechPlusForConditionalGeneration,
+)
 from transformers.testing_utils import cleanup, require_torch, slow, torch_device
-from transformers.utils import ModelOutput, is_datasets_available, is_torch_available
+from transformers.utils import is_datasets_available, is_torch_available
 
-from ...test_configuration_common import ConfigTester
-from ..granite_speech.test_modeling_granite_speech import (
-    GraniteSpeechForConditionalGenerationModelTest as _GraniteSpeechModelTestBase,
-)
 from ..granite_speech.test_modeling_granite_speech import (
-    GraniteSpeechForConditionalGenerationModelTester as _GraniteSpeechModelTesterBase,
+    GraniteSpeechForConditionalGenerationModelTest,
+    GraniteSpeechModelTester,
 )
 
 
@@ -35,155 +35,55 @@
 if is_datasets_available():
     from datasets import load_dataset
 
-from transformers import set_seed
-
 
-class GraniteSpeechPlusForConditionalGenerationModelTester(_GraniteSpeechModelTesterBase):
+class GraniteSpeechPlusForConditionalGenerationModelTester(GraniteSpeechModelTester):
     """
-    Plus variant that exercises the ``encoder_hidden_layers`` concat path. The projector's
-    ``encoder_hidden_size`` is scaled to match ``encoder_config.hidden_dim * (len(encoder_hidden_layers) + 1)``.
+    Plus variant that exercises the ``cat_hidden_layers`` concat path. The projector's
+    ``encoder_hidden_size`` is scaled to match ``encoder_config.hidden_dim * (len(cat_hidden_layers) + 1)``.
     """
 
-    def __init__(self, parent, encoder_hidden_layers=(0,), **kwargs):
-        projector_config = kwargs.pop(
-            "projector_config",
-            {
-                "attention_probs_dropout_prob": 0.1,
-                "cross_attention_frequency": 1,
-                "encoder_hidden_size": 64,  # 32 (hidden_dim) * (1 intermediate + 1 last) = 64
-                "hidden_act": "gelu",
-                "hidden_dropout_prob": 0.1,
-                "hidden_size": 32,
-                "initializer_range": 0.02,
-                "intermediate_size": 256,
-                "layer_norm_eps": 1e-12,
-                "max_position_embeddings": 2048,
-                "model_type": "blip_2_qformer",
-                "num_attention_heads": 4,
-                "num_hidden_layers": 2,
-                "use_qformer_text_input": False,
-                "vocab_size": 30522,
-            },
-        )
-        super().__init__(parent=parent, projector_config=projector_config, **kwargs)
-        self.encoder_hidden_layers = list(encoder_hidden_layers)
-        self.encoder_config["cat_hidden_layers"] = self.encoder_hidden_layers
+    config_class = GraniteSpeechPlusConfig
+    conditional_generation_class = GraniteSpeechPlusForConditionalGeneration
+    audio_config_class = GraniteSpeechPlusEncoderConfig
 
-    def get_config(self):
-        return GraniteSpeechPlusConfig(
-            encoder_config=self.encoder_config,
-            text_config=self.text_config,
-            projector_config=self.projector_config,
-            audio_token_index=self.audio_token_index,
-            tie_word_embeddings=self.tie_word_embeddings,
-            initializer_range=self.initializer_range,
-            has_lora_adapter=self.has_lora_adapter,
-        )
+    def __init__(self, parent, cat_hidden_layers=(0,), **kwargs):
+        super().__init__(parent, **kwargs)
+        self.cat_hidden_layers = list(cat_hidden_layers)
+        # Projector encoder_hidden_size must equal hidden_dim * (len(cat_hidden_layers) + 1).
+        self.projector_config = {
+            "model_type": "blip_2_qformer",
+            "hidden_size": 32,
+            "num_hidden_layers": 2,
+            "num_attention_heads": 4,
+            "intermediate_size": 256,
+            "encoder_hidden_size": 32 * (len(self.cat_hidden_layers) + 1),
+        }
 
 
 @require_torch
-class GraniteSpeechPlusForConditionalGenerationModelTest(_GraniteSpeechModelTestBase):
+class GraniteSpeechPlusForConditionalGenerationModelTest(GraniteSpeechForConditionalGenerationModelTest):
     """
     Model tester for `GraniteSpeechPlusForConditionalGeneration`.
     """
 
-    all_model_classes = (GraniteSpeechPlusForConditionalGeneration,) if is_torch_available() else ()
+    model_tester_class = GraniteSpeechPlusForConditionalGenerationModelTester
     pipeline_model_mapping = {"any-to-any": GraniteSpeechPlusForConditionalGeneration} if is_torch_available() else {}
 
-    def setUp(self):
-        self.model_tester = GraniteSpeechPlusForConditionalGenerationModelTester(self)
-        self.config_tester = ConfigTester(
-            self,
-            config_class=GraniteSpeechPlusConfig,
-            has_text_modality=False,
-        )
+    # The cat path changes the encoder output feature dim, so the generic shape assertion in
+    # `test_get_audio_features_output` (which assumes hidden_dim) does not apply.
+    skip_test_audio_features_output_shape = True
 
     def test_encoder_hidden_layers_concat_shape(self):
-        """With ``encoder_hidden_layers`` set, get_audio_features concatenates the selected intermediate
-        hidden states with the final hidden state before the projector."""
+        """``encoder_config.cat_hidden_layers`` concatenates selected intermediate hidden states with the final
+        hidden state along the feature dim before the projector."""
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        model = GraniteSpeechPlusForConditionalGeneration(config).to(
-            self.model_tester.parent.device if hasattr(self.model_tester.parent, "device") else "cpu"
-        )
-        model.eval()
+        model = GraniteSpeechPlusForConditionalGeneration(config).to(torch_device).eval()
         with torch.no_grad():
             out = model.get_audio_features(inputs_dict["input_features"].to(next(model.parameters()).device))
-        self.assertEqual(out.pooler_output.shape[0], inputs_dict["input_features"].shape[0])
-
-    @parameterized.expand([True, False, None])
-    def test_get_audio_features_output(self, return_dict: bool | None):
-        for model_class in self.all_model_classes:
-            if not hasattr(model_class, "get_audio_features"):
-                continue
-
-            config, inputs_dict = self._audio_features_prepare_config_and_inputs()
-            if return_dict is not None:
-                config.return_dict = return_dict
-
-            model = model_class(config).eval()
-            model = model.to(torch_device)
-
-            set_seed(42)
-            with torch.no_grad():
-                outputs = model.get_audio_features(**inputs_dict)
-
-            if return_dict in (True, None):
-                self.assertTrue(
-                    isinstance(outputs, ModelOutput), "get_audio_features() must return a BaseModelOutputWithPooling"
-                )
-                self.assertTrue(
-                    hasattr(outputs, "last_hidden_state"),
-                    "get_audio_features() must return a BaseModelOutputWithPooling with last_hidden_state",
-                )
-                self.assertTrue(
-                    hasattr(outputs, "pooler_output"),
-                    "get_audio_features() must return a BaseModelOutputWithPooling with pooler_output",
-                )
-                self.assertTrue(
-                    hasattr(outputs, "hidden_states"),
-                    "get_audio_features() must return a BaseModelOutputWithPooling with hidden_states",
-                )
-                if self.has_attentions:
-                    self.assertTrue(
-                        hasattr(outputs, "attentions"),
-                        "get_audio_features() must return a BaseModelOutputWithPooling with attentions",
-                    )
-
-                if getattr(self, "skip_test_audio_features_output_shape", False):
-                    return
-
-                last_hidden_state_shape = outputs.last_hidden_state.shape
-
-                if "input_features" in inputs_dict:
-                    batch_size = inputs_dict["input_features"].shape[0]
-                else:
-                    batch_size = inputs_dict["input_values"].shape[0]
-                self.assertEqual(
-                    last_hidden_state_shape[0],
-                    batch_size,
-                    f"batch_size mismatch, full shape: {last_hidden_state_shape}",
-                )
-
-                audio_config = config.audio_config if hasattr(config, "audio_config") else config
-                hidden_size = None
-                if hasattr(audio_config, "projection_dim"):
-                    hidden_size = audio_config.projection_dim
-                elif hasattr(audio_config, "hidden_size"):
-                    hidden_size = audio_config.hidden_size
-                elif hasattr(audio_config, "encoder_config"):
-                    hidden_size = audio_config.encoder_config.hidden_dim * (
-                        len(audio_config.encoder_config.cat_hidden_layers) + 1
-                    )
-                elif hasattr(audio_config, "encoder_ffn_dim"):
-                    hidden_size = audio_config.encoder_ffn_dim
-                self.assertEqual(
-                    last_hidden_state_shape[-1],
-                    hidden_size,
-                    f"hidden_size mismatch, full shape: {last_hidden_state_shape}",
-                )
-
-            else:
-                self.assertIsInstance(outputs, tuple, "get_audio_features() must return a tuple if return_dict=False")
+        cat_factor = len(config.encoder_config.cat_hidden_layers) + 1
+        expected_hidden_size = config.encoder_config.hidden_dim * cat_factor
+        self.assertEqual(out.last_hidden_state.shape[0], inputs_dict["input_features"].shape[0])
+        self.assertEqual(out.last_hidden_state.shape[-1], expected_hidden_size)
 
 
 class GraniteSpeechPlusForConditionalGenerationIntegrationTest(unittest.TestCase):

From 95da79839b74104c2ef10a9a09eb79017023edc4 Mon Sep 17 00:00:00 2001
From: eustlb <94853470+eustlb@users.noreply.github.com>
Date: Mon, 11 May 2026 12:15:34 +0200
Subject: [PATCH 38/38] fix test

---
 utils/check_repo.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/utils/check_repo.py b/utils/check_repo.py
index ed77fedb5745..5a7484409e31 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -832,8 +832,9 @@ def find_tested_models(test_file: str) -> set[str]:
             model_tested.add(tested_class)
 
     # Same as above, but for ALMModelTester. Audio-LMs typically only set `conditional_generation_class`
-    # (no base_model_class).
-    audio_class_match = re.search(r"class \w+\(ALMModelTester\)", content)
+    # (no base_model_class). `GraniteSpeechModelTester` is listed because `GraniteSpeechPlusForConditionalGenerationModelTester`
+    # uses `ALMModelTester` indirectly through it; in the future we may want to resolve inheritance properly.
+    audio_class_match = re.search(r"class \w+\((?:ALMModelTester|GraniteSpeechModelTester)\)", content)
     if audio_class_match is not None:
         audio_content = content[audio_class_match.start() :]
         for test_class_type in [