From 61971d23072c4814a60b6afe3cc7c21b18c7e3a1 Mon Sep 17 00:00:00 2001 From: Harshal Janjani Date: Wed, 8 Apr 2026 17:55:10 +0000 Subject: [PATCH 1/3] chore: Add vLLM compat for audio models --- .../models/audioflamingo3/modeling_audioflamingo3.py | 1 + .../models/audioflamingo3/modular_audioflamingo3.py | 1 + src/transformers/models/auto/modeling_auto.py | 1 + src/transformers/models/glmasr/modeling_glmasr.py | 1 + src/transformers/models/glmasr/modular_glmasr.py | 2 ++ .../models/granite_speech/modeling_granite_speech.py | 2 ++ .../feature_extraction_vibevoice_acoustic_tokenizer.py | 1 + src/transformers/models/vibevoice_asr/modeling_vibevoice_asr.py | 1 + src/transformers/models/vibevoice_asr/modular_vibevoice_asr.py | 2 ++ 9 files changed, 12 insertions(+) diff --git a/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py b/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py index 1fbbc733c308..888b3b1c29c3 100644 --- a/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py +++ b/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py @@ -408,6 +408,7 @@ def forward(self, audio_features): ) class AudioFlamingo3ForConditionalGeneration(AudioFlamingo3PreTrainedModel, GenerationMixin): _keep_in_fp32_modules_strict = None + _supports_attention_backend = True _tp_plan = None _pp_plan = None diff --git a/src/transformers/models/audioflamingo3/modular_audioflamingo3.py b/src/transformers/models/audioflamingo3/modular_audioflamingo3.py index c325bc85300e..bbe4090b06ea 100644 --- a/src/transformers/models/audioflamingo3/modular_audioflamingo3.py +++ b/src/transformers/models/audioflamingo3/modular_audioflamingo3.py @@ -142,6 +142,7 @@ def __init__(self, config: AudioFlamingo3Config): """ ) class AudioFlamingo3ForConditionalGeneration(VoxtralForConditionalGeneration): + _supports_attention_backend = True _tp_plan = None _pp_plan = None _keep_in_fp32_modules_strict = None diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index d4cb17cddfa6..d73f5069361f 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -210,6 +210,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("gpt_oss", "GptOssModel"), ("gptj", "GPTJModel"), ("granite", "GraniteModel"), + ("granite_speech", "GraniteSpeechForConditionalGeneration"), ("granitemoe", "GraniteMoeModel"), ("granitemoehybrid", "GraniteMoeHybridModel"), ("granitemoeshared", "GraniteMoeSharedModel"), diff --git a/src/transformers/models/glmasr/modeling_glmasr.py b/src/transformers/models/glmasr/modeling_glmasr.py index aff96cad3217..9430e8a91018 100644 --- a/src/transformers/models/glmasr/modeling_glmasr.py +++ b/src/transformers/models/glmasr/modeling_glmasr.py @@ -356,6 +356,7 @@ def forward(self, audio_features): ) class GlmAsrForConditionalGeneration(GlmAsrPreTrainedModel, GenerationMixin): _keep_in_fp32_modules_strict = None + _supports_attention_backend = True _tp_plan = None _pp_plan = None diff --git a/src/transformers/models/glmasr/modular_glmasr.py b/src/transformers/models/glmasr/modular_glmasr.py index ff0b8b6062a4..2c6085eb3a18 100644 --- a/src/transformers/models/glmasr/modular_glmasr.py +++ b/src/transformers/models/glmasr/modular_glmasr.py @@ -357,6 +357,8 @@ def __init__(self, config: GlmAsrConfig): """ ) class GlmAsrForConditionalGeneration(AudioFlamingo3ForConditionalGeneration): + _supports_attention_backend = True + @can_return_tuple @auto_docstring( custom_intro="Compute audio embeddings from log-mel input features using the audio encoder and multi-modal projector." diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 0fbc1d1035bf..03024afe8337 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -327,6 +327,8 @@ def forward( """ ) class GraniteSpeechForConditionalGeneration(GraniteSpeechPreTrainedModel, GenerationMixin): + _supports_attention_backend = True + def __init__(self, config: GraniteSpeechConfig): super().__init__(config) # NOTE: It doesn't matter when we initialize from config, but we should be careful diff --git a/src/transformers/models/vibevoice_acoustic_tokenizer/feature_extraction_vibevoice_acoustic_tokenizer.py b/src/transformers/models/vibevoice_acoustic_tokenizer/feature_extraction_vibevoice_acoustic_tokenizer.py index 9990852d83cf..859dc58e5873 100644 --- a/src/transformers/models/vibevoice_acoustic_tokenizer/feature_extraction_vibevoice_acoustic_tokenizer.py +++ b/src/transformers/models/vibevoice_acoustic_tokenizer/feature_extraction_vibevoice_acoustic_tokenizer.py @@ -73,6 +73,7 @@ def __call__( max_length: int | None = None, return_attention_mask: bool | None = True, return_tensors: str | None = "pt", + **kwargs, ) -> BatchFeature: """ Args: diff --git a/src/transformers/models/vibevoice_asr/modeling_vibevoice_asr.py b/src/transformers/models/vibevoice_asr/modeling_vibevoice_asr.py index 703bb6ca5130..3d26d0fbe9f3 100644 --- a/src/transformers/models/vibevoice_asr/modeling_vibevoice_asr.py +++ b/src/transformers/models/vibevoice_asr/modeling_vibevoice_asr.py @@ -256,6 +256,7 @@ def _init_weights(self, module): ) class VibeVoiceAsrForConditionalGeneration(VibeVoiceAsrPreTrainedModel, GenerationMixin): _keep_in_fp32_modules_strict = None + _supports_attention_backend = True _tp_plan = None _pp_plan = None diff --git a/src/transformers/models/vibevoice_asr/modular_vibevoice_asr.py b/src/transformers/models/vibevoice_asr/modular_vibevoice_asr.py index 95606aea8023..7d29824dee35 100644 --- a/src/transformers/models/vibevoice_asr/modular_vibevoice_asr.py +++ b/src/transformers/models/vibevoice_asr/modular_vibevoice_asr.py @@ -167,6 +167,8 @@ class VibeVoiceAsrPreTrainedModel(VibeVoiceAcousticTokenizerPreTrainedModel): """ ) class VibeVoiceAsrForConditionalGeneration(AudioFlamingo3ForConditionalGeneration): + _supports_attention_backend = True + def __init__(self, config: VibeVoiceAsrConfig): super().__init__(config) self.acoustic_tokenizer_encoder = AutoModel.from_config(config.acoustic_tokenizer_encoder_config) From 6c3b855586f2ef3f0ee730b886ec2326255fee89 Mon Sep 17 00:00:00 2001 From: Harshal Janjani Date: Thu, 9 Apr 2026 08:20:38 +0000 Subject: [PATCH 2/3] fix: Fix ci/circleci: check_repository_consistency --- src/transformers/models/musicflamingo/modeling_musicflamingo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/musicflamingo/modeling_musicflamingo.py b/src/transformers/models/musicflamingo/modeling_musicflamingo.py index adec95bbf3e1..4ec4215a2989 100644 --- a/src/transformers/models/musicflamingo/modeling_musicflamingo.py +++ b/src/transformers/models/musicflamingo/modeling_musicflamingo.py @@ -200,6 +200,7 @@ def apply_rotary_time_emb(hidden_states, cos, sin): ) class MusicFlamingoForConditionalGeneration(MusicFlamingoPreTrainedModel, GenerationMixin): _keep_in_fp32_modules_strict = None + _supports_attention_backend = True _tp_plan = None _pp_plan = None From 5c1e3285cc00a867ad179c3b14a21d8bc88b5eb2 Mon Sep 17 00:00:00 2001 From: Harshal Janjani Date: Tue, 14 Apr 2026 15:29:01 +0000 Subject: [PATCH 3/3] nit: Skip incompatible test --- tests/models/granite_speech/test_modeling_granite_speech.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py index c5e7aa3defcd..95c6c443d6f0 100644 --- a/tests/models/granite_speech/test_modeling_granite_speech.py +++ b/tests/models/granite_speech/test_modeling_granite_speech.py @@ -230,6 +230,12 @@ def setUp(self): has_text_modality=False, ) + @unittest.skip( + reason="This test does not apply to GraniteSpeech since inputs_embeds corresponding to audio tokens are replaced when input features are provided." + ) + def test_inputs_embeds_matches_input_ids(self): + pass + def test_inputs_embeds(self): # overwrite inputs_embeds tests because we need to delete "input features" for the audio model config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()