diff --git a/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py b/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py index 1fbbc733c308..888b3b1c29c3 100644 --- a/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py +++ b/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py @@ -408,6 +408,7 @@ def forward(self, audio_features): ) class AudioFlamingo3ForConditionalGeneration(AudioFlamingo3PreTrainedModel, GenerationMixin): _keep_in_fp32_modules_strict = None + _supports_attention_backend = True _tp_plan = None _pp_plan = None diff --git a/src/transformers/models/audioflamingo3/modular_audioflamingo3.py b/src/transformers/models/audioflamingo3/modular_audioflamingo3.py index c325bc85300e..bbe4090b06ea 100644 --- a/src/transformers/models/audioflamingo3/modular_audioflamingo3.py +++ b/src/transformers/models/audioflamingo3/modular_audioflamingo3.py @@ -142,6 +142,7 @@ def __init__(self, config: AudioFlamingo3Config): """ ) class AudioFlamingo3ForConditionalGeneration(VoxtralForConditionalGeneration): + _supports_attention_backend = True _tp_plan = None _pp_plan = None _keep_in_fp32_modules_strict = None diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 50bbd5721413..bd6e6f3b212c 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -210,6 +210,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("gpt_oss", "GptOssModel"), ("gptj", "GPTJModel"), ("granite", "GraniteModel"), + ("granite_speech", "GraniteSpeechForConditionalGeneration"), ("granitemoe", "GraniteMoeModel"), ("granitemoehybrid", "GraniteMoeHybridModel"), ("granitemoeshared", "GraniteMoeSharedModel"), diff --git a/src/transformers/models/glmasr/modeling_glmasr.py b/src/transformers/models/glmasr/modeling_glmasr.py index aff96cad3217..9430e8a91018 100644 --- a/src/transformers/models/glmasr/modeling_glmasr.py +++ b/src/transformers/models/glmasr/modeling_glmasr.py @@ -356,6 +356,7 @@ def forward(self, audio_features): ) class GlmAsrForConditionalGeneration(GlmAsrPreTrainedModel, GenerationMixin): _keep_in_fp32_modules_strict = None + _supports_attention_backend = True _tp_plan = None _pp_plan = None diff --git a/src/transformers/models/glmasr/modular_glmasr.py b/src/transformers/models/glmasr/modular_glmasr.py index ff0b8b6062a4..2c6085eb3a18 100644 --- a/src/transformers/models/glmasr/modular_glmasr.py +++ b/src/transformers/models/glmasr/modular_glmasr.py @@ -357,6 +357,8 @@ def __init__(self, config: GlmAsrConfig): """ ) class GlmAsrForConditionalGeneration(AudioFlamingo3ForConditionalGeneration): + _supports_attention_backend = True + @can_return_tuple @auto_docstring( custom_intro="Compute audio embeddings from log-mel input features using the audio encoder and multi-modal projector." diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 0fbc1d1035bf..03024afe8337 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -327,6 +327,8 @@ def forward( """ ) class GraniteSpeechForConditionalGeneration(GraniteSpeechPreTrainedModel, GenerationMixin): + _supports_attention_backend = True + def __init__(self, config: GraniteSpeechConfig): super().__init__(config) # NOTE: It doesn't matter when we initialize from config, but we should be careful diff --git a/src/transformers/models/musicflamingo/modeling_musicflamingo.py b/src/transformers/models/musicflamingo/modeling_musicflamingo.py index adec95bbf3e1..4ec4215a2989 100644 --- a/src/transformers/models/musicflamingo/modeling_musicflamingo.py +++ b/src/transformers/models/musicflamingo/modeling_musicflamingo.py @@ -200,6 +200,7 @@ def apply_rotary_time_emb(hidden_states, cos, sin): ) class MusicFlamingoForConditionalGeneration(MusicFlamingoPreTrainedModel, GenerationMixin): _keep_in_fp32_modules_strict = None + _supports_attention_backend = True _tp_plan = None _pp_plan = None diff --git a/src/transformers/models/vibevoice_acoustic_tokenizer/feature_extraction_vibevoice_acoustic_tokenizer.py b/src/transformers/models/vibevoice_acoustic_tokenizer/feature_extraction_vibevoice_acoustic_tokenizer.py index 9990852d83cf..859dc58e5873 100644 --- a/src/transformers/models/vibevoice_acoustic_tokenizer/feature_extraction_vibevoice_acoustic_tokenizer.py +++ b/src/transformers/models/vibevoice_acoustic_tokenizer/feature_extraction_vibevoice_acoustic_tokenizer.py @@ -73,6 +73,7 @@ def __call__( max_length: int | None = None, return_attention_mask: bool | None = True, return_tensors: str | None = "pt", + **kwargs, ) -> BatchFeature: """ Args: diff --git a/src/transformers/models/vibevoice_asr/modeling_vibevoice_asr.py b/src/transformers/models/vibevoice_asr/modeling_vibevoice_asr.py index 703bb6ca5130..3d26d0fbe9f3 100644 --- a/src/transformers/models/vibevoice_asr/modeling_vibevoice_asr.py +++ b/src/transformers/models/vibevoice_asr/modeling_vibevoice_asr.py @@ -256,6 +256,7 @@ def _init_weights(self, module): ) class VibeVoiceAsrForConditionalGeneration(VibeVoiceAsrPreTrainedModel, GenerationMixin): _keep_in_fp32_modules_strict = None + _supports_attention_backend = True _tp_plan = None _pp_plan = None diff --git a/src/transformers/models/vibevoice_asr/modular_vibevoice_asr.py b/src/transformers/models/vibevoice_asr/modular_vibevoice_asr.py index 95606aea8023..7d29824dee35 100644 --- a/src/transformers/models/vibevoice_asr/modular_vibevoice_asr.py +++ b/src/transformers/models/vibevoice_asr/modular_vibevoice_asr.py @@ -167,6 +167,8 @@ class VibeVoiceAsrPreTrainedModel(VibeVoiceAcousticTokenizerPreTrainedModel): """ ) class VibeVoiceAsrForConditionalGeneration(AudioFlamingo3ForConditionalGeneration): + _supports_attention_backend = True + def __init__(self, config: VibeVoiceAsrConfig): super().__init__(config) self.acoustic_tokenizer_encoder = AutoModel.from_config(config.acoustic_tokenizer_encoder_config) diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py index c5e7aa3defcd..95c6c443d6f0 100644 --- a/tests/models/granite_speech/test_modeling_granite_speech.py +++ b/tests/models/granite_speech/test_modeling_granite_speech.py @@ -230,6 +230,12 @@ def setUp(self): has_text_modality=False, ) + @unittest.skip( + reason="This test does not apply to GraniteSpeech since inputs_embeds corresponding to audio tokens are replaced when input features are provided." + ) + def test_inputs_embeds_matches_input_ids(self): + pass + def test_inputs_embeds(self): # overwrite inputs_embeds tests because we need to delete "input features" for the audio model config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()