diff --git a/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py b/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py index 888b3b1c29c3..6f18fcc437ad 100644 --- a/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py +++ b/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py @@ -34,7 +34,7 @@ from ...modeling_outputs import BaseModelOutputWithPooling, CausalLMOutputWithPast from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack -from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_compilable_check from ...utils.generic import merge_with_config_defaults from ...utils.output_capturing import capture_outputs from ..auto import AutoModel, AutoModelForCausalLM @@ -474,6 +474,30 @@ def get_audio_features( return audio_output + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_audio_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_audio_mask = special_audio_mask.all(-1) + else: + special_audio_mask = input_ids == self.config.audio_token_id + + n_audio_tokens = special_audio_mask.sum() + n_audio_features = audio_features.shape[0] + special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + torch_compilable_check( + inputs_embeds[special_audio_mask].numel() == audio_features.numel(), + f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}", + ) + return special_audio_mask + @can_return_tuple @auto_docstring def forward( @@ -560,10 +584,10 @@ def forward( audio_embeds = self.get_audio_features(input_features, input_features_mask, return_dict=True).pooler_output # replace text-audio token placeholders with audio embeddings - audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1) - inputs_embeds = inputs_embeds.masked_scatter( - audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device) + special_audio_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, audio_features=audio_embeds ) + inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_embeds.to(inputs_embeds.device)) outputs: CausalLMOutputWithPast = self.language_model( inputs_embeds=inputs_embeds, diff --git a/src/transformers/models/audioflamingo3/modular_audioflamingo3.py b/src/transformers/models/audioflamingo3/modular_audioflamingo3.py index bbe4090b06ea..dfb2c1f54d35 100644 --- a/src/transformers/models/audioflamingo3/modular_audioflamingo3.py +++ b/src/transformers/models/audioflamingo3/modular_audioflamingo3.py @@ -270,10 +270,10 @@ def forward( audio_embeds = self.get_audio_features(input_features, input_features_mask, return_dict=True).pooler_output # replace text-audio token placeholders with audio embeddings - audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1) - inputs_embeds = inputs_embeds.masked_scatter( - audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device) + special_audio_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, audio_features=audio_embeds ) + inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_embeds.to(inputs_embeds.device)) outputs: CausalLMOutputWithPast = self.language_model( inputs_embeds=inputs_embeds, diff --git a/src/transformers/models/glmasr/modeling_glmasr.py b/src/transformers/models/glmasr/modeling_glmasr.py index 9430e8a91018..f2c68e56df71 100644 --- a/src/transformers/models/glmasr/modeling_glmasr.py +++ b/src/transformers/models/glmasr/modeling_glmasr.py @@ -30,7 +30,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack -from ...utils import TransformersKwargs, auto_docstring, is_torch_available +from ...utils import TransformersKwargs, auto_docstring, is_torch_available, torch_compilable_check from ...utils.generic import can_return_tuple, maybe_autocast, merge_with_config_defaults from ...utils.output_capturing import capture_outputs from ..auto import AutoModel, AutoModelForCausalLM @@ -426,6 +426,30 @@ def get_audio_features( return audio_outputs + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_audio_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_audio_mask = special_audio_mask.all(-1) + else: + special_audio_mask = input_ids == self.config.audio_token_id + + n_audio_tokens = special_audio_mask.sum() + n_audio_features = audio_features.shape[0] + special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + torch_compilable_check( + inputs_embeds[special_audio_mask].numel() == audio_features.numel(), + f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}", + ) + return special_audio_mask + @can_return_tuple @auto_docstring def forward( @@ -478,10 +502,10 @@ def forward( audio_embeds = self.get_audio_features(input_features, input_features_mask, return_dict=True).pooler_output # replace text-audio token placeholders with audio embeddings - audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1) - inputs_embeds = inputs_embeds.masked_scatter( - audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device) + special_audio_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, audio_features=audio_embeds ) + inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_embeds.to(inputs_embeds.device)) outputs: CausalLMOutputWithPast = self.language_model( inputs_embeds=inputs_embeds, diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py index d02ac9998696..e5532b3bf880 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -53,13 +53,19 @@ class GraniteSpeechEncoderConfig(PreTrainedConfig): ```""" model_type = "granite_speech_encoder" + attribute_map = { + "hidden_size": "hidden_dim", + "num_hidden_layers": "num_layers", + "num_attention_heads": "num_heads", + "num_mel_bins": "input_dim", + } input_dim: int = 160 num_layers: int = 10 hidden_dim: int = 1024 feedforward_mult: int = 4 num_heads: int = 8 - dim_head: int = 128 + dim_head: int | None = None output_dim: int = 42 context_size: int = 200 max_pos_emb: int = 512 @@ -67,6 +73,11 @@ class GraniteSpeechEncoderConfig(PreTrainedConfig): conv_kernel_size: int = 15 conv_expansion_factor: int = 2 + def __post_init__(self, **kwargs): + super().__post_init__(**kwargs) + if self.dim_head is None: + self.dim_head = self.hidden_dim // self.num_heads + @auto_docstring(checkpoint="ibm-granite/granite-speech-3.3-2b") @strict diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index c7fad419e4c4..d7ba0c94c950 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -516,6 +516,30 @@ def prepare_inputs_for_generation( model_inputs["input_features"] = input_features return model_inputs + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_audio_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_audio_mask = special_audio_mask.all(-1) + else: + special_audio_mask = input_ids == self.config.audio_token_id + + n_audio_tokens = special_audio_mask.sum() + n_audio_features = audio_features.shape[0] + special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + torch_compilable_check( + inputs_embeds[special_audio_mask].numel() == audio_features.numel(), + f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}", + ) + return special_audio_mask + def get_merged_audio_embeddings( self, input_ids: torch.Tensor, audio_features: torch.Tensor, input_features_mask: torch.Tensor | None = None ) -> torch.Tensor: @@ -536,20 +560,14 @@ def get_merged_audio_embeddings( llm_input_ids = torch.where(is_audio_index, 0, input_ids) inputs_embeds = self.language_model.get_input_embeddings()(llm_input_ids) # [bsz, # features, hidden size] - # Mask the audio features into the text embeddings - special_audio_mask = is_audio_index.unsqueeze(-1) audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype) if input_features_mask is not None: - torch_compilable_check( - not torch.all(is_audio_index.int().sum(dim=1) != input_features_mask.int().sum(dim=1)), - "Number of audio tokens does not match number of audio features", - ) audio_features = audio_features[input_features_mask] - inputs_embeds = inputs_embeds.masked_scatter( - special_audio_mask, - audio_features, + special_audio_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, audio_features=audio_features ) + inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_features) return inputs_embeds def generate(self, *args, **kwargs) -> torch.LongTensor: diff --git a/src/transformers/models/granite_speech_plus/configuration_granite_speech_plus.py b/src/transformers/models/granite_speech_plus/configuration_granite_speech_plus.py index c17c3f7391f9..1eec538091a4 100644 --- a/src/transformers/models/granite_speech_plus/configuration_granite_speech_plus.py +++ b/src/transformers/models/granite_speech_plus/configuration_granite_speech_plus.py @@ -62,13 +62,19 @@ class GraniteSpeechPlusEncoderConfig(PreTrainedConfig): ```""" model_type = "granite_speech_plus_encoder" + attribute_map = { + "hidden_size": "hidden_dim", + "num_hidden_layers": "num_layers", + "num_attention_heads": "num_heads", + "num_mel_bins": "input_dim", + } input_dim: int = 160 num_layers: int = 10 hidden_dim: int = 1024 feedforward_mult: int = 4 num_heads: int = 8 - dim_head: int = 128 + dim_head: int | None = None output_dim: int = 42 context_size: int = 200 max_pos_emb: int = 512 @@ -78,6 +84,11 @@ class GraniteSpeechPlusEncoderConfig(PreTrainedConfig): cat_hidden_layers: list[int] | None = None + def __post_init__(self, **kwargs): + super().__post_init__(**kwargs) + if self.dim_head is None: + self.dim_head = self.hidden_dim // self.num_heads + @auto_docstring(checkpoint="ibm-granite/granite-speech-4.1-2b-plus") @strict diff --git a/src/transformers/models/granite_speech_plus/modeling_granite_speech_plus.py b/src/transformers/models/granite_speech_plus/modeling_granite_speech_plus.py index 9ff819aa3b6c..d16fb52290ca 100644 --- a/src/transformers/models/granite_speech_plus/modeling_granite_speech_plus.py +++ b/src/transformers/models/granite_speech_plus/modeling_granite_speech_plus.py @@ -537,6 +537,30 @@ def prepare_inputs_for_generation( model_inputs["input_features"] = input_features return model_inputs + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_audio_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_audio_mask = special_audio_mask.all(-1) + else: + special_audio_mask = input_ids == self.config.audio_token_id + + n_audio_tokens = special_audio_mask.sum() + n_audio_features = audio_features.shape[0] + special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + torch_compilable_check( + inputs_embeds[special_audio_mask].numel() == audio_features.numel(), + f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}", + ) + return special_audio_mask + def get_merged_audio_embeddings( self, input_ids: torch.Tensor, audio_features: torch.Tensor, input_features_mask: torch.Tensor | None = None ) -> torch.Tensor: @@ -557,20 +581,14 @@ def get_merged_audio_embeddings( llm_input_ids = torch.where(is_audio_index, 0, input_ids) inputs_embeds = self.language_model.get_input_embeddings()(llm_input_ids) # [bsz, # features, hidden size] - # Mask the audio features into the text embeddings - special_audio_mask = is_audio_index.unsqueeze(-1) audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype) if input_features_mask is not None: - torch_compilable_check( - not torch.all(is_audio_index.int().sum(dim=1) != input_features_mask.int().sum(dim=1)), - "Number of audio tokens does not match number of audio features", - ) audio_features = audio_features[input_features_mask] - inputs_embeds = inputs_embeds.masked_scatter( - special_audio_mask, - audio_features, + special_audio_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, audio_features=audio_features ) + inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_features) return inputs_embeds def generate(self, *args, **kwargs) -> torch.LongTensor: diff --git a/src/transformers/models/musicflamingo/modeling_musicflamingo.py b/src/transformers/models/musicflamingo/modeling_musicflamingo.py index 4ec4215a2989..a9e05470662d 100644 --- a/src/transformers/models/musicflamingo/modeling_musicflamingo.py +++ b/src/transformers/models/musicflamingo/modeling_musicflamingo.py @@ -33,7 +33,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack -from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_available +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_available, torch_compilable_check from ..auto import AutoModel, AutoModelForCausalLM from .configuration_musicflamingo import MusicFlamingoConfig @@ -269,6 +269,30 @@ def get_audio_features( return audio_output + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_audio_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_audio_mask = special_audio_mask.all(-1) + else: + special_audio_mask = input_ids == self.config.audio_token_id + + n_audio_tokens = special_audio_mask.sum() + n_audio_features = audio_features.shape[0] + special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + torch_compilable_check( + inputs_embeds[special_audio_mask].numel() == audio_features.numel(), + f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}", + ) + return special_audio_mask + @can_return_tuple @auto_docstring def forward( @@ -345,10 +369,10 @@ def forward( ).pooler_output # replace text-audio token placeholders with audio embeddings - audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1) - inputs_embeds = inputs_embeds.masked_scatter( - audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device) + special_audio_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, audio_features=audio_embeds ) + inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_embeds.to(inputs_embeds.device)) outputs: CausalLMOutputWithPast = self.language_model( inputs_embeds=inputs_embeds, @@ -388,6 +412,13 @@ def _build_audio_timestamps( _, ends = torch.where(diff == -1) sample_lengths = (ends - starts).to(torch.long) + n_audio_tokens = audio_token_mask.sum() + n_audio_features = post_lengths.sum() + torch_compilable_check( + n_audio_tokens == n_audio_features, + f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}", + ) + # Account for 4x downsampling in audio encoder (conv2 and avg pooling) audio_embed_frame_step = self.config.audio_frame_step * 4 frame_offsets = ( diff --git a/src/transformers/models/musicflamingo/modular_musicflamingo.py b/src/transformers/models/musicflamingo/modular_musicflamingo.py index 7d98d0ffdeab..e16ae28f6c68 100644 --- a/src/transformers/models/musicflamingo/modular_musicflamingo.py +++ b/src/transformers/models/musicflamingo/modular_musicflamingo.py @@ -25,7 +25,7 @@ from ...modeling_outputs import BaseModelOutputWithPooling, CausalLMOutputWithPast from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack -from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_available +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_available, torch_compilable_check from ..audioflamingo3.configuration_audioflamingo3 import AudioFlamingo3Config from ..audioflamingo3.modeling_audioflamingo3 import ( AudioFlamingo3ForConditionalGeneration, @@ -274,6 +274,13 @@ def _build_audio_timestamps( _, ends = torch.where(diff == -1) sample_lengths = (ends - starts).to(torch.long) + n_audio_tokens = audio_token_mask.sum() + n_audio_features = post_lengths.sum() + torch_compilable_check( + n_audio_tokens == n_audio_features, + f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}", + ) + # Account for 4x downsampling in audio encoder (conv2 and avg pooling) audio_embed_frame_step = self.config.audio_frame_step * 4 frame_offsets = ( @@ -408,10 +415,10 @@ def forward( ).pooler_output # replace text-audio token placeholders with audio embeddings - audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1) - inputs_embeds = inputs_embeds.masked_scatter( - audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device) + special_audio_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, audio_features=audio_embeds ) + inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_embeds.to(inputs_embeds.device)) outputs: CausalLMOutputWithPast = self.language_model( inputs_embeds=inputs_embeds, diff --git a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py index 1564d2b36de9..081823bf222f 100644 --- a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py @@ -99,7 +99,12 @@ class Qwen2_5OmniAudioEncoderConfig(PreTrainedConfig): ```""" model_type = "qwen2_5_omni_audio_encoder" - attribute_map = {"num_hidden_layers": "encoder_layers"} + attribute_map = { + "num_hidden_layers": "encoder_layers", + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + "intermediate_size": "encoder_ffn_dim", + } num_mel_bins: int = 128 encoder_layers: int = 32 diff --git a/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py b/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py index a617f33e6177..6aec9eace900 100644 --- a/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py +++ b/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py @@ -42,7 +42,12 @@ class Qwen2AudioEncoderConfig(PreTrainedConfig): ```""" model_type = "qwen2_audio_encoder" - attribute_map = {"num_hidden_layers": "encoder_layers"} + attribute_map = { + "num_hidden_layers": "encoder_layers", + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + "intermediate_size": "encoder_ffn_dim", + } num_mel_bins: int = 128 encoder_layers: int = 32 diff --git a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py index 23413471c2b2..4e9bdd35f21f 100644 --- a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py @@ -47,7 +47,12 @@ class Qwen3OmniMoeAudioEncoderConfig(PreTrainedConfig): """ model_type = "qwen3_omni_moe_audio_encoder" - attribute_map = {"num_hidden_layers": "encoder_layers"} + attribute_map = { + "num_hidden_layers": "encoder_layers", + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + "intermediate_size": "encoder_ffn_dim", + } num_mel_bins: int = 128 encoder_layers: int = 32 diff --git a/src/transformers/models/vibevoice_asr/modeling_vibevoice_asr.py b/src/transformers/models/vibevoice_asr/modeling_vibevoice_asr.py index 3d26d0fbe9f3..b66dd15b2cb1 100644 --- a/src/transformers/models/vibevoice_asr/modeling_vibevoice_asr.py +++ b/src/transformers/models/vibevoice_asr/modeling_vibevoice_asr.py @@ -28,7 +28,13 @@ from ...modeling_outputs import BaseModelOutputWithPooling, CausalLMOutputWithPast from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack -from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling +from ...utils import ( + TransformersKwargs, + auto_docstring, + can_return_tuple, + is_torchdynamo_compiling, + torch_compilable_check, +) from ..auto import AutoModel, AutoModelForCausalLM from .configuration_vibevoice_asr import VibeVoiceAsrConfig @@ -363,6 +369,30 @@ def get_audio_features( return BaseModelOutputWithPooling(last_hidden_state=acoustic_latents, pooler_output=combined_features) + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_audio_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_audio_mask = special_audio_mask.all(-1) + else: + special_audio_mask = input_ids == self.config.audio_token_id + + n_audio_tokens = special_audio_mask.sum() + n_audio_features = audio_features.shape[0] + special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + torch_compilable_check( + inputs_embeds[special_audio_mask].numel() == audio_features.numel(), + f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}", + ) + return special_audio_mask + @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/voxtral/modeling_voxtral.py b/src/transformers/models/voxtral/modeling_voxtral.py index 76da78cc558f..54466321b79e 100644 --- a/src/transformers/models/voxtral/modeling_voxtral.py +++ b/src/transformers/models/voxtral/modeling_voxtral.py @@ -32,7 +32,7 @@ from ...modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling, CausalLMOutputWithPast from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack -from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_compilable_check from ...utils.generic import merge_with_config_defaults from ...utils.output_capturing import capture_outputs from ..auto import AutoModel, AutoModelForCausalLM @@ -418,6 +418,30 @@ def get_audio_features( return audio_outputs + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_audio_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_audio_mask = special_audio_mask.all(-1) + else: + special_audio_mask = input_ids == self.config.audio_token_id + + n_audio_tokens = special_audio_mask.sum() + n_audio_features = audio_features.shape[0] + special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + torch_compilable_check( + inputs_embeds[special_audio_mask].numel() == audio_features.numel(), + f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}", + ) + return special_audio_mask + @can_return_tuple @auto_docstring def forward( @@ -473,10 +497,10 @@ def forward( audio_embeds = self.get_audio_features(input_features, return_dict=True).pooler_output # replace text-audio token placeholders with audio embeddings - audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1) - inputs_embeds = inputs_embeds.masked_scatter( - audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device) + special_audio_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, audio_features=audio_embeds ) + inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_embeds.to(inputs_embeds.device)) outputs: BaseModelOutputWithPast = self.language_model( attention_mask=attention_mask, diff --git a/src/transformers/models/voxtral/modular_voxtral.py b/src/transformers/models/voxtral/modular_voxtral.py index c7b2c53e16d4..02e8e2806a0f 100644 --- a/src/transformers/models/voxtral/modular_voxtral.py +++ b/src/transformers/models/voxtral/modular_voxtral.py @@ -25,7 +25,7 @@ CausalLMOutputWithPast, ) from ...processing_utils import Unpack -from ...utils import TransformersKwargs, auto_docstring, can_return_tuple +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, torch_compilable_check from ...utils.generic import merge_with_config_defaults from ...utils.output_capturing import capture_outputs from ..auto import AutoModel, AutoModelForCausalLM @@ -187,6 +187,30 @@ def get_audio_features( return audio_outputs + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_audio_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_audio_mask = special_audio_mask.all(-1) + else: + special_audio_mask = input_ids == self.config.audio_token_id + + n_audio_tokens = special_audio_mask.sum() + n_audio_features = audio_features.shape[0] + special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + torch_compilable_check( + inputs_embeds[special_audio_mask].numel() == audio_features.numel(), + f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}", + ) + return special_audio_mask + @can_return_tuple @auto_docstring def forward( @@ -242,10 +266,10 @@ def forward( audio_embeds = self.get_audio_features(input_features, return_dict=True).pooler_output # replace text-audio token placeholders with audio embeddings - audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1) - inputs_embeds = inputs_embeds.masked_scatter( - audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device) + special_audio_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, audio_features=audio_embeds ) + inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_embeds.to(inputs_embeds.device)) outputs: BaseModelOutputWithPast = self.language_model( attention_mask=attention_mask, diff --git a/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py b/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py index 07325b0ea559..dbecd9a6f530 100644 --- a/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py +++ b/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py @@ -39,7 +39,14 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack -from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging +from ...utils import ( + TransformersKwargs, + auto_docstring, + can_return_tuple, + is_torchdynamo_compiling, + logging, + torch_compilable_check, +) from ...utils.generic import maybe_autocast, merge_with_config_defaults from ...utils.output_capturing import capture_outputs from ..auto import AutoModel @@ -1007,6 +1014,30 @@ def get_audio_features( return audio_outputs + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, audio_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_audio_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_audio_mask = special_audio_mask.all(-1) + else: + special_audio_mask = input_ids == self.config.audio_token_id + + n_audio_tokens = special_audio_mask.sum() + n_audio_features = audio_features.shape[0] + special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + torch_compilable_check( + inputs_embeds[special_audio_mask].numel() == audio_features.numel(), + f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {n_audio_features}", + ) + return special_audio_mask + @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 863242a695c6..908337fd4fd4 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -228,6 +228,34 @@ "conditional_generation_class": "ForConditionalGeneration", } +# Shared text-model defaults for CausalLMModelTester and MultiModalModelTester. +_TEXT_MODEL_TESTER_DEFAULTS = { + "batch_size": 13, + "seq_length": 7, + "is_training": True, + "use_input_mask": True, + "use_labels": True, + "vocab_size": 99, + "hidden_size": 32, + "num_hidden_layers": 2, + "num_attention_heads": 2, + "num_key_value_heads": 2, + "intermediate_size": 32, + "hidden_act": "gelu", + "max_position_embeddings": 512, + "pad_token_id": 0, + "bos_token_id": 1, + "eos_token_id": 2, + "expert_interval": 1, + "moe_layer_start_index": 0, + "moe_intermediate_size": 16, + "shared_expert_intermediate_size": 36, + "shared_expert_gate": True, + "moe_num_shared_experts": 2, + "num_experts_per_tok": 2, + "num_experts": 8, +} + if is_torch_available(): import torch diff --git a/tests/alm_tester.py b/tests/alm_tester.py new file mode 100644 index 000000000000..c34d4d45524c --- /dev/null +++ b/tests/alm_tester.py @@ -0,0 +1,226 @@ +# Copyright 2026 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import unittest +from inspect import signature + +from .multimodal_tester import MultiModalModelTest, MultiModalModelTester +from .test_modeling_common import ( + floats_tensor, + ids_tensor, + is_torch_available, + torch_device, +) + + +if is_torch_available(): + import torch + + +class ALMModelTester(MultiModalModelTester): + audio_config_class = None + audio_config_key = "audio_config" + # Name under which the audio mask is passed to the model's forward (e.g. "feature_attention_mask" + # for Qwen2Audio). Leave as `None` if the model does not consume a separate audio-level mask; + # `_prepare_modality_inputs` then skips adding it to the inputs dict. + audio_mask_key = None + _required_attributes = MultiModalModelTester._required_attributes + ("audio_config_class",) + + @property + def pipeline_model_mapping(self): + # TODO: @eustlb, we don't have pipeline testing for audio-text-to-text + mapping = { + "feature-extraction": self.base_model_class, + # "audio-text-to-text": self.conditional_generation_class, + } + # TODO: should we add automatic-speech-recognition with a special flag? + return mapping + + def __init__(self, parent, **kwargs): + # Overrides of _TEXT_MODEL_TESTER_DEFAULTS + kwargs.setdefault("seq_length", 32) + kwargs.setdefault("pad_token_id", 1) + + # ALM-specific defaults + kwargs.setdefault("feat_seq_length", 128) + kwargs.setdefault("num_mel_bins", 80) + kwargs.setdefault("audio_token_id", 0) + + super().__init__(parent, **kwargs) + + # -- Overridable ALM-specific hooks ------------------------------------------------------ + + def create_audio_features(self): + """Create audio feature tensor. Override for different shapes (e.g. [B, T, features]).""" + return floats_tensor([self.batch_size, self.num_mel_bins, self.feat_seq_length]) + + def get_audio_embeds_mask(self, audio_embeds_mask): + """Get audio embeds mask from audio mask. Override for different shapes.""" + raise NotImplementedError("This method should be overridden in the subclass") + + def place_audio_tokens(self, input_ids, config, num_audio_tokens): + """Place audio placeholder tokens contiguously after BOS. Override for different placement. + + Deterministic placement (position 0 reserved for BOS; audio tokens at [1:1+n]) keeps + the tail of each sequence text-only, which downstream tests (e.g. resize_token_embeddings + overwriting column -2) rely on. + """ + input_ids = input_ids.clone() + input_ids[input_ids == self.audio_token_id] = self.pad_token_id + for i in range(input_ids.shape[0]): + n = num_audio_tokens[i].item() if isinstance(num_audio_tokens, torch.Tensor) else num_audio_tokens + if 1 + int(n) > self.seq_length: + raise ValueError( + f"Cannot place {int(n)} audio tokens after BOS in a sequence of length {self.seq_length}. " + "This likely indicates a mismatch between your feature extraction/configuration and your sequence length. " + "Please ensure `seq_length` is >= the number of audio embedding positions + 1." + ) + input_ids[i, 1 : 1 + int(n)] = self.audio_token_id + return input_ids + + def get_audio_feature_key(self): + """Key name for audio features in the inputs dict.""" + return "input_features" + + def create_audio_mask(self): + """Create audio-level attention mask with contiguous valid regions per batch element. + + Each element gets a random offset and length, producing masks like [0, 0, 1, 1, 1, 0, 0]. + """ + # Sample lengths in [1, feat_seq_length] and offsets in [0, feat_seq_length - length] + lengths = ids_tensor([self.batch_size], vocab_size=self.feat_seq_length).abs() + 1 + lengths = lengths.clamp(max=self.feat_seq_length) + offsets = ids_tensor([self.batch_size], vocab_size=self.feat_seq_length).abs() + offsets = offsets % (self.feat_seq_length - lengths + 1) + + positions = torch.arange(self.feat_seq_length, device=torch_device)[None, :] + audio_mask = ((positions >= offsets[:, None]) & (positions < offsets[:, None] + lengths[:, None])).long() + return audio_mask + + # -- Hooks consumed by the shared base --------------------------------------------------- + + @property + def _special_token_ids(self): + return super()._special_token_ids | {self.audio_token_id} + + def _build_modality_sub_configs(self): + return {self.audio_config_key: self.get_audio_config()} + + def _prepare_modality_inputs(self, input_ids, config): + audio_features = self.create_audio_features() + audio_mask = self.create_audio_mask() + audio_embeds_mask = self.get_audio_embeds_mask(audio_mask) + num_audio_tokens = audio_embeds_mask.sum(dim=1) + input_ids = self.place_audio_tokens(input_ids, config, num_audio_tokens) + + modality_inputs = {self.get_audio_feature_key(): audio_features} + if self.audio_mask_key is not None: + modality_inputs[self.audio_mask_key] = audio_mask + return input_ids, modality_inputs + + # -- Audio sub-config construction ------------------------------------------------------- + + @property + def audio_config_args(self): + return list(signature(self.audio_config_class.__init__).parameters.keys()) + + def get_audio_config(self): + kwargs = self._collect_kwargs(self.audio_config_args, self.audio_config_class) + return self.audio_config_class(**kwargs) + + +class ALMModelTest(MultiModalModelTest): + """ + Base test class for Audio-Language Models. + + Subclasses should set: + - `model_tester_class`: The tester class (subclass of ALMModelTester) + + Optional: + - `all_model_classes`: Override if not using default from model_tester + - `pipeline_model_mapping`: Override if not using default from model_tester + """ + + # TODO: @eustlb, remove this once #45534 is merged + @unittest.skip("Audio-LMs have no separate base model without a head.") + def test_model_base_model_prefix(self): + pass + + def test_mismatching_num_audio_tokens(self): + """ + Tests that ALMs throw an error with explicit message saying what is wrong + when number of audios don't match number of audio tokens in the text. + Also we need to test multi-audio cases when one prompt has multiple audio tokens. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + audio_feature_key = self.model_tester.get_audio_feature_key() + audio_mask_key = self.model_tester.audio_mask_key + + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + model.eval() + curr_input_dict = copy.deepcopy(input_dict) + _ = model(**curr_input_dict) # successful forward with no modifications + + # Test 1: remove one audio but leave the audio tokens in the text + curr_input_dict[audio_feature_key] = curr_input_dict[audio_feature_key][-1:, ...] + if audio_mask_key is not None: + curr_input_dict[audio_mask_key] = curr_input_dict[audio_mask_key][-1:, ...] + with self.assertRaises(ValueError): + _ = model(**curr_input_dict) + + # Test 2: add one audio but leave the audio tokens in the text + curr_input_dict = copy.deepcopy(input_dict) + curr_input_dict[audio_feature_key] = torch.cat( + [curr_input_dict[audio_feature_key], curr_input_dict[audio_feature_key][:1, ...]], dim=0 + ) + if audio_mask_key is not None: + curr_input_dict[audio_mask_key] = torch.cat( + [curr_input_dict[audio_mask_key], curr_input_dict[audio_mask_key][:1, ...]], dim=0 + ) + with self.assertRaises(ValueError): + _ = model(**curr_input_dict) + + # Test 3: duplicate the text along the seq dim so each prompt has twice as many + # audio tokens, while leaving the audio features unchanged -> mismatch + curr_input_dict = copy.deepcopy(input_dict) + curr_input_dict["input_ids"] = torch.cat( + [curr_input_dict["input_ids"], curr_input_dict["input_ids"]], dim=1 + ) + curr_input_dict["attention_mask"] = torch.cat( + [curr_input_dict["attention_mask"], curr_input_dict["attention_mask"]], dim=1 + ) + with self.assertRaises(ValueError): + _ = model(**curr_input_dict) + + # Test 4: multi-audio valid case. A prompt may contain multiple audio segments; + # all audio segments are concatenated along the batch dim on the audio side. + # Duplicating input_ids along seq dim (-> [audios, audios] per prompt) and the + # audio features along batch dim (-> batch_size * 2) must forward successfully. + curr_input_dict = copy.deepcopy(input_dict) + curr_input_dict["input_ids"] = torch.cat( + [curr_input_dict["input_ids"], curr_input_dict["input_ids"]], dim=1 + ) + curr_input_dict["attention_mask"] = torch.cat( + [curr_input_dict["attention_mask"], curr_input_dict["attention_mask"]], dim=1 + ) + curr_input_dict[audio_feature_key] = torch.cat( + [curr_input_dict[audio_feature_key], curr_input_dict[audio_feature_key]], dim=0 + ) + if audio_mask_key is not None: + curr_input_dict[audio_mask_key] = torch.cat( + [curr_input_dict[audio_mask_key], curr_input_dict[audio_mask_key]], dim=0 + ) + _ = model(**curr_input_dict) diff --git a/tests/causal_lm_tester.py b/tests/causal_lm_tester.py index b3398f13c393..6b94a520d4f2 100644 --- a/tests/causal_lm_tester.py +++ b/tests/causal_lm_tester.py @@ -22,6 +22,7 @@ from transformers.models.auto.auto_factory import getattribute_from_module from transformers.testing_utils import ( _COMMON_MODEL_NAMES_MAP, + _TEXT_MODEL_TESTER_DEFAULTS, is_flaky, require_flash_attn, require_torch_accelerator, @@ -166,84 +167,43 @@ def pipeline_model_mapping(self): def __init__( self, parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=2, - num_key_value_heads=2, - intermediate_size=32, - hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, - max_position_embeddings=512, type_vocab_size=16, type_sequence_label_size=2, initializer_range=0.02, num_labels=3, num_choices=4, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, is_decoder=False, scope=None, - expert_interval=1, - moe_layer_start_index=0, - moe_intermediate_size=16, - shared_expert_intermediate_size=36, - shared_expert_gate=True, - moe_num_shared_experts=2, - num_experts_per_tok=2, - num_experts=8, mamba_n_groups=1, mamba_n_heads=16, mamba_d_state=16, mamba_d_conv=4, mamba_expand=2, mamba_chunk_size=16, + **kwargs, ): self._verify_and_infer_model_attributes() self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask + + # Apply shared text-model defaults, then let caller kwargs override + for key, default in _TEXT_MODEL_TESTER_DEFAULTS.items(): + setattr(self, key, kwargs.pop(key, default)) + + # CausalLM-specific defaults (not shared with multimodal testers) self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.type_sequence_label_size = type_sequence_label_size self.initializer_range = initializer_range self.num_labels = num_labels self.num_choices = num_choices - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id self.scope = scope self.head_dim = self.hidden_size // self.num_attention_heads self.is_decoder = is_decoder - self.expert_interval = expert_interval - self.moe_layer_start_index = moe_layer_start_index - self.moe_intermediate_size = moe_intermediate_size - self.shared_expert_intermediate_size = shared_expert_intermediate_size - self.shared_expert_gate = shared_expert_gate - self.moe_num_shared_experts = moe_num_shared_experts - self.num_experts_per_tok = num_experts_per_tok - self.num_experts = num_experts self.mamba_n_groups = mamba_n_groups self.mamba_n_heads = mamba_n_heads self.mamba_d_state = mamba_d_state @@ -252,6 +212,10 @@ def __init__( self.mamba_chunk_size = mamba_chunk_size self.tie_word_embeddings = False + # Any remaining kwargs become attributes (for model-specific params) + for key, value in kwargs.items(): + setattr(self, key, value) + def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) diff --git a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py index 7301812e7032..9629fe3ba086 100644 --- a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py +++ b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py @@ -15,16 +15,15 @@ """Testing suite for the PyTorch AudioFlamingo3 model.""" import json -import tempfile import unittest from pathlib import Path -import pytest - from transformers import ( AudioFlamingo3Config, + AudioFlamingo3EncoderConfig, AudioFlamingo3ForConditionalGeneration, AutoProcessor, + Qwen2Config, is_torch_available, ) from transformers.testing_utils import ( @@ -34,128 +33,52 @@ torch_device, ) -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...alm_tester import ALMModelTest, ALMModelTester if is_torch_available(): import torch -class AudioFlamingo3ModelTester: - """ - Builds a tiny AudioFlamingo3 config and synthetic inputs that respect AF3's - post-pool token accounting: num tokens per sample == post-pool frame count. - """ - - def __init__( - self, - parent, - audio_token_id=0, - seq_length=25, - feat_seq_length=60, - text_config=None, - audio_config=None, - is_training=True, - ): - self.parent = parent - self.audio_token_id = audio_token_id - self.seq_length = seq_length - self.feat_seq_length = feat_seq_length - self.is_training = is_training - - # Small text backbone (Qwen2-ish) - if text_config is None: - text_config = { - "model_type": "qwen2", - "intermediate_size": 36, - "initializer_range": 0.02, - "hidden_size": 32, - "max_position_embeddings": 52, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "num_key_value_heads": 2, - "use_labels": True, - "use_mrope": False, - "vocab_size": 99, - "pad_token_id": 1, # Ensure pad token != audio token - } - # Small audio encoder (AF3 Whisper-style) - if audio_config is None: - audio_config = { - "model_type": "audioflamingo3_encoder", - "hidden_size": 16, - "num_attention_heads": 4, - "intermediate_size": 16, - "num_hidden_layers": 2, - "num_mel_bins": 80, - "max_source_positions": 30, - "initializer_range": 0.02, - } - - self.text_config = text_config - self.audio_config = audio_config - - self.batch_size = 3 - self.vocab_size = text_config["vocab_size"] - self.hidden_size = text_config["hidden_size"] - self.num_attention_heads = text_config["num_attention_heads"] - self.num_hidden_layers = text_config["num_hidden_layers"] - self.encoder_seq_length = seq_length - - def get_config(self): - return AudioFlamingo3Config( - text_config=self.text_config, - audio_config=self.audio_config, - audio_token_id=self.audio_token_id, - ) - - def prepare_config_and_inputs(self): - # (#windows == batch_size, n_mels, T_mel) - input_features_values = floats_tensor( - [self.batch_size, self.audio_config["num_mel_bins"], self.feat_seq_length] - ) - config = self.get_config() - # Per-window mel validity (all ones => full length) - input_features_mask = torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device) - return config, input_features_values, input_features_mask - - def _post_pool_tokens_per_window(self, T_mel): - # Mirror AF3 processor math: - pre = (T_mel - 1) // 2 + 1 - post = (pre - 2) // 2 + 1 - return post - - def prepare_config_and_inputs_for_common(self): - config, input_features_values, input_features_mask = self.prepare_config_and_inputs() - # Every window has same T_mel here - num_audio_tokens_per_sample = self._post_pool_tokens_per_window(input_features_values.shape[-1]) - - # Build token ids with valid range and K tokens - input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2 - attention_mask = torch.ones_like(input_ids, dtype=torch.long, device=torch_device) - attention_mask[:, :1] = 0 # left padding sentinel - - # Fill first K positions (after padding) with the audio token id, for each sample - input_ids[:, 1 : 1 + num_audio_tokens_per_sample] = config.audio_token_id - - inputs_dict = { - "input_features": input_features_values, - "input_features_mask": input_features_mask, - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict +class AudioFlamingo3ModelTester(ALMModelTester): + config_class = AudioFlamingo3Config + conditional_generation_class = AudioFlamingo3ForConditionalGeneration + text_config_class = Qwen2Config + audio_config_class = AudioFlamingo3EncoderConfig + audio_mask_key = "input_features_mask" + + def __init__(self, parent, **kwargs): + # feat_seq_length → (L-1)//2+1 after conv2 → (·-2)//2+1 after avg_pool, so + # feat_seq_length=60 gives 15 audio embed tokens (fits inside seq_length=32 + BOS + text). + kwargs.setdefault("feat_seq_length", 60) + # Encoder adds a learned positional embedding of size max_source_positions to post-conv2 features, + # so it must equal (feat_seq_length - 1) // 2 + 1. + kwargs.setdefault("max_source_positions", (kwargs["feat_seq_length"] - 1) // 2 + 1) + super().__init__(parent, **kwargs) + + def create_audio_mask(self): + # Full-length mask matches real processor output and lets the audio encoder dispatch to Flash + # Attention (which rejects non-null attn_masks) on `test_sdpa_can_dispatch_on_flash`. + return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device) + + def get_audio_embeds_mask(self, audio_mask): + # Mirrors AudioFlamingo3Encoder._get_feat_extract_output_lengths: + # conv2 (k=3,s=2,p=1) then avg_pool (k=2,s=2). + input_lengths = audio_mask.sum(-1) + input_lengths = (input_lengths - 1) // 2 + 1 + output_lengths = (input_lengths - 2) // 2 + 1 + max_len = int(output_lengths.max().item()) + positions = torch.arange(max_len, device=audio_mask.device)[None, :] + return (positions < output_lengths[:, None]).long() @require_torch -class AudioFlamingo3ForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): +class AudioFlamingo3ForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase): """ Model tester for `AudioFlamingo3ForConditionalGeneration`. """ - all_model_classes = (AudioFlamingo3ForConditionalGeneration,) if is_torch_available() else () + model_tester_class = AudioFlamingo3ModelTester # TODO: @eustlb, this is incorrect pipeline_model_mapping = ( { @@ -165,73 +88,14 @@ class AudioFlamingo3ForConditionalGenerationModelTest(ModelTesterMixin, Generati if is_torch_available() else {} ) - _is_composite = True - - def setUp(self): - self.model_tester = AudioFlamingo3ModelTester(self) - self.config_tester = ConfigTester(self, config_class=AudioFlamingo3Config, has_text_modality=False) @unittest.skip( - reason="This test does not apply to AudioFlamingo3 since inputs_embeds corresponding to audio tokens are replaced when input features are provided." + reason="This test does not apply to AudioFlamingo3 since inputs_embeds corresponding to audio tokens " + "are replaced when input features are provided." ) def test_inputs_embeds_matches_input_ids(self): pass - @unittest.skip(reason="Compile not yet supported for AudioFlamingo3 models") - @pytest.mark.torch_compile_test - def test_sdpa_can_compile_dynamic(self): - pass - - @unittest.skip(reason="Compile not yet supported for AudioFlamingo3 models") - def test_sdpa_can_dispatch_on_flash(self): - pass - - @unittest.skip(reason="AudioFlamingo3 tests avoid right-padding equivalence; fusion is in-place.") - def test_flash_attn_2_inference_equivalence_right_padding(self): - pass - - @unittest.skip(reason="AudioFlamingo3 has no separate base model without a head.") - def test_model_base_model_prefix(self): - pass - - def test_sdpa_can_dispatch_composite_models(self): - # AF3 is audio+text composite; verify SDPA toggles propagate to submodules. - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - if not self._is_composite: - self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - # SDPA (default) - model_sdpa = model_class.from_pretrained(tmpdirname) - model_sdpa = model_sdpa.eval().to(torch_device) - - text_attn = "sdpa" if model.language_model._supports_sdpa else "eager" - audio_attn = "sdpa" if model.audio_tower._supports_sdpa else "eager" - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - self.assertTrue(model.language_model.config._attn_implementation == text_attn) - self.assertTrue(model.audio_tower.config._attn_implementation == audio_attn) - - # Eager - model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") - model_eager = model_eager.eval().to(torch_device) - self.assertTrue(model_eager.config._attn_implementation == "eager") - self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") - self.assertTrue(model_eager.audio_tower.config._attn_implementation == "eager") - - for _, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - @require_torch class AudioFlamingo3ForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/gemma3/test_modeling_gemma3.py b/tests/models/gemma3/test_modeling_gemma3.py index 288c41eed6fb..d5a648eed6f3 100644 --- a/tests/models/gemma3/test_modeling_gemma3.py +++ b/tests/models/gemma3/test_modeling_gemma3.py @@ -281,7 +281,7 @@ def create_attention_mask(self, input_ids): # Gemma3 uses padding mask for bidirectional attention on image tokens return input_ids.ne(self.pad_token_id).to(torch_device) - def get_additional_inputs(self, config, input_ids, pixel_values): + def get_additional_inputs(self, config, input_ids, modality_inputs): # Gemma3 requires specific token_type_ids for bidirectional attention on image tokens token_type_ids = torch.zeros_like(input_ids) token_type_ids[input_ids == config.image_token_id] = 1 diff --git a/tests/models/glmasr/test_modeling_glmasr.py b/tests/models/glmasr/test_modeling_glmasr.py index 744e268e74c7..b19e91a61209 100644 --- a/tests/models/glmasr/test_modeling_glmasr.py +++ b/tests/models/glmasr/test_modeling_glmasr.py @@ -13,17 +13,16 @@ # limitations under the License. """Testing suite for the PyTorch glmasr model.""" -import tempfile import unittest -import pytest - from transformers import ( AutoProcessor, GlmAsrConfig, GlmAsrForConditionalGeneration, + LlamaConfig, is_torch_available, ) +from transformers.models.glmasr.configuration_glmasr import GlmAsrEncoderConfig from transformers.testing_utils import ( cleanup, require_torch, @@ -31,183 +30,51 @@ torch_device, ) -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -from ...test_pipeline_mixin import PipelineTesterMixin +from ...alm_tester import ALMModelTest, ALMModelTester if is_torch_available(): import torch -class GlmAsrModelTester: - def __init__( - self, - parent, - ignore_index=-100, - audio_token_id=0, - seq_length=35, - feat_seq_length=64, - text_config={ - "model_type": "llama", - "intermediate_size": 64, - "initializer_range": 0.02, - "hidden_size": 16, - "max_position_embeddings": 52, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "use_labels": True, - "use_mrope": False, - "vocab_size": 99, - "head_dim": 8, - "pad_token_id": 1, # can't be the same as the audio token id - }, - is_training=True, - audio_config={ - "model_type": "glmasr_encoder", - "hidden_size": 128, - "num_attention_heads": 2, - "intermediate_size": 512, - "num_hidden_layers": 2, - "num_mel_bins": 128, - "max_source_positions": 32, - "initializer_range": 0.02, - }, - ): - self.parent = parent - self.ignore_index = ignore_index - self.audio_token_id = audio_token_id - self.text_config = text_config - self.audio_config = audio_config - self.seq_length = seq_length - self.feat_seq_length = feat_seq_length - - self.num_hidden_layers = text_config["num_hidden_layers"] - self.vocab_size = text_config["vocab_size"] - self.hidden_size = text_config["hidden_size"] - self.num_attention_heads = text_config["num_attention_heads"] - self.is_training = is_training - - self.batch_size = 3 - self.encoder_seq_length = seq_length - - def get_config(self): - return GlmAsrConfig( - text_config=self.text_config, - audio_config=self.audio_config, - ignore_index=self.ignore_index, - audio_token_id=self.audio_token_id, - ) - - def prepare_config_and_inputs(self): - input_features_values = floats_tensor( - [ - self.batch_size, - self.audio_config["num_mel_bins"], - self.feat_seq_length, - ] - ) - config = self.get_config() - input_features_mask = torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device) - return config, input_features_values, input_features_mask +class GlmAsrModelTester(ALMModelTester): + config_class = GlmAsrConfig + conditional_generation_class = GlmAsrForConditionalGeneration + text_config_class = LlamaConfig + audio_config_class = GlmAsrEncoderConfig + audio_mask_key = "input_features_mask" - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_features_values, input_features_mask = config_and_inputs - num_audio_tokens_per_batch_idx = 8 + def __init__(self, parent, **kwargs): + kwargs.setdefault("head_dim", 8) + super().__init__(parent, **kwargs) - input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1 - attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) - attention_mask[:, :1] = 0 - - input_ids[:, 1 : 1 + num_audio_tokens_per_batch_idx] = config.audio_token_id - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "input_features": input_features_values, - "input_features_mask": input_features_mask, - } - return config, inputs_dict + def get_audio_embeds_mask(self, audio_mask): + # conv1 (s=1) preserves length; conv2 (s=2, k=3, p=1) halves; merge_factor=4 post-projector. + audio_lengths = audio_mask.sum(-1) + for padding, kernel_size, stride in [(1, 3, 1), (1, 3, 2)]: + audio_lengths = (audio_lengths + 2 * padding - (kernel_size - 1) - 1) // stride + 1 + merge_factor = 4 + post_lengths = (audio_lengths - merge_factor) // merge_factor + 1 + max_len = int(post_lengths.max().item()) + positions = torch.arange(max_len, device=audio_mask.device)[None, :] + return (positions < post_lengths[:, None]).long() @require_torch -class GlmAsrForConditionalGenerationModelTest( - ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase -): +class GlmAsrForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase): """ Model tester for `GlmAsrForConditionalGeneration`. """ - all_model_classes = (GlmAsrForConditionalGeneration,) if is_torch_available() else () + model_tester_class = GlmAsrModelTester pipeline_model_mapping = {"audio-text-to-text": GlmAsrForConditionalGeneration} if is_torch_available() else {} - _is_composite = True - - def setUp(self): - self.model_tester = GlmAsrModelTester(self) - self.config_tester = ConfigTester(self, config_class=GlmAsrConfig, has_text_modality=False) - @unittest.skip( reason="This test does not apply to GlmAsr since inputs_embeds corresponding to audio tokens are replaced when input features are provided." ) def test_inputs_embeds_matches_input_ids(self): pass - @unittest.skip(reason="Compile not yet supported for GlmAsr models") - @pytest.mark.torch_compile_test - def test_sdpa_can_compile_dynamic(self): - pass - - @unittest.skip(reason="Compile not yet supported for GlmAsr models") - def test_sdpa_can_dispatch_on_flash(self): - pass - - @unittest.skip(reason="GlmAsr tests avoid right-padding equivalence; fusion is in-place.") - def test_flash_attn_2_inference_equivalence_right_padding(self): - pass - - @unittest.skip(reason="GlmAsr has no separate base model without a head.") - def test_model_base_model_prefix(self): - pass - - def test_sdpa_can_dispatch_composite_models(self): - # GlmAsr is audio+text composite; verify SDPA toggles propagate to submodules. - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - if not self._is_composite: - self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - # SDPA (default) - model_sdpa = model_class.from_pretrained(tmpdirname) - model_sdpa = model_sdpa.eval().to(torch_device) - - text_attn = "sdpa" if model.language_model._supports_sdpa else "eager" - audio_attn = "sdpa" if model.audio_tower._supports_sdpa else "eager" - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - self.assertTrue(model.language_model.config._attn_implementation == text_attn) - self.assertTrue(model.audio_tower.config._attn_implementation == audio_attn) - - # Eager - model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") - model_eager = model_eager.eval().to(torch_device) - self.assertTrue(model_eager.config._attn_implementation == "eager") - self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") - self.assertTrue(model_eager.audio_tower.config._attn_implementation == "eager") - - for _, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - @require_torch class GlmAsrForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py index 95c6c443d6f0..e4ecebbcb0ee 100644 --- a/tests/models/granite_speech/test_modeling_granite_speech.py +++ b/tests/models/granite_speech/test_modeling_granite_speech.py @@ -13,14 +13,15 @@ # limitations under the License. """Testing suite for the IBM Granite Speech model.""" -import tempfile import unittest import pytest from transformers import ( AutoProcessor, + GraniteConfig, GraniteSpeechConfig, + GraniteSpeechEncoderConfig, GraniteSpeechForConditionalGeneration, ) from transformers.testing_utils import ( @@ -35,14 +36,8 @@ is_torch_available, ) -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - floats_tensor, - ids_tensor, -) -from ...test_pipeline_mixin import PipelineTesterMixin +from ...alm_tester import ALMModelTest, ALMModelTester +from ...test_modeling_common import floats_tensor if is_torch_available(): @@ -52,129 +47,40 @@ from datasets import load_dataset -class GraniteSpeechForConditionalGenerationModelTester: - def __init__( - self, - parent, - seq_length=7, - encoder_config={ - "model_type": "granite_speech_encoder", - "context_size": 200, - "conv_expansion_factor": 2, - "conv_kernel_size": 15, - "dim_head": 32, - "dropout": 0.1, - "feedforward_mult": 4, - "hidden_dim": 32, - "input_dim": 160, - "num_heads": 4, - "num_layers": 2, - "output_dim": 42, - }, - text_config={ - "model_type": "granite", - "is_training": True, - "seq_length": 7, - "use_token_type_ids": False, - "use_labels": True, - "vocab_size": 99, +class GraniteSpeechModelTester(ALMModelTester): + config_class = GraniteSpeechConfig + conditional_generation_class = GraniteSpeechForConditionalGeneration + text_config_class = GraniteConfig + audio_config_class = GraniteSpeechEncoderConfig + audio_config_key = "encoder_config" + + def __init__(self, parent, **kwargs): + kwargs["projector_config"] = { + "model_type": "blip_2_qformer", "hidden_size": 32, "num_hidden_layers": 2, "num_attention_heads": 4, - "intermediate_size": 37, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 580, - "type_vocab_size": 16, - "type_sequence_label_size": 2, - "initializer_range": 0.02, - "num_labels": 3, - "num_choices": 4, - "pad_token_id": 1, - }, - projector_config={ - "attention_probs_dropout_prob": 0.1, - "cross_attention_frequency": 1, - "encoder_hidden_size": 32, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 32, - "initializer_range": 0.02, "intermediate_size": 256, - "layer_norm_eps": 1e-12, - "max_position_embeddings": 2048, - "model_type": "blip_2_qformer", - "num_attention_heads": 4, - "num_hidden_layers": 2, - "use_qformer_text_input": False, - "vocab_size": 30522, - }, - audio_token_index=0, - tie_word_embeddings=True, - initializer_range=0.02, - has_lora_adapter=True, - downsample_rate=5, - window_size=15, - is_training=True, - ): - self.parent = parent - self.encoder_config = encoder_config - self.text_config = text_config - self.projector_config = projector_config - self.audio_token_index = audio_token_index - self.tie_word_embeddings = tie_word_embeddings - self.initializer_range = initializer_range - self.has_lora_adapter = has_lora_adapter - self.downsample_rate = downsample_rate - self.window_size = window_size - self.is_training = is_training - - # Dims for audio features - self.sequence_dim = 844 - self.feature_dim = 160 - self.num_attention_heads = text_config["num_attention_heads"] - self.num_hidden_layers = text_config["num_hidden_layers"] - self.hidden_size = text_config["hidden_size"] - self.batch_size = 3 - self.pad_token_id = text_config["pad_token_id"] - self.seq_len = 7 - self.num_audio_tokens = 2 - self.seq_length = seq_length + self.num_audio_tokens - - def get_config(self): - return GraniteSpeechConfig( - encoder_config=self.encoder_config, - text_config=self.text_config, - projector_config=self.projector_config, - audio_token_index=self.audio_token_index, - tie_word_embeddings=self.tie_word_embeddings, - initializer_range=self.initializer_range, - has_lora_adapter=self.has_lora_adapter, - ) + "encoder_hidden_size": 32, + } - def prepare_config_and_inputs(self): - input_features = floats_tensor( - [self.batch_size, self.sequence_dim, self.feature_dim], - ) - config = self.get_config() - return config, input_features + super().__init__(parent, **kwargs) - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_features = config_and_inputs - input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2 - attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) - input_ids[input_ids == config.audio_token_index] = self.pad_token_id + def create_audio_features(self): + # GraniteSpeech expects [B, seq_len, features] (time-first), unlike the standard [B, features, seq_len] + return floats_tensor([self.batch_size, self.feat_seq_length, self.num_mel_bins]) - input_ids[:, : self.num_audio_tokens] = config.audio_token_index + def get_audio_embeds_mask(self, audio_mask): + # Projector: ceil(feat_seq_length / window_size) * (window_size // downsample_rate) tokens per sample. + import math - inputs_dict = { - "input_features": input_features, - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict + config = self.get_config() + nblocks = math.ceil(self.feat_seq_length / config.window_size) + num_audio_tokens = nblocks * (config.window_size // config.downsample_rate) + return torch.ones([self.batch_size, num_audio_tokens], dtype=torch.long).to(torch_device) + + def create_attention_mask(self, input_ids): + return torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) def create_and_check_granite_speech_model_fp16_forward(self, config, input_ids, input_features, attention_mask): model = GraniteSpeechForConditionalGeneration(config=config) @@ -211,24 +117,13 @@ def create_and_check_granite_speech_model_fp16_autocast_forward( @require_torch -class GraniteSpeechForConditionalGenerationModelTest( - ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase -): +class GraniteSpeechForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase): """ Model tester for `GraniteSpeechForConditionalGeneration`. """ - all_model_classes = (GraniteSpeechForConditionalGeneration,) if is_torch_available() else () + model_tester_class = GraniteSpeechModelTester pipeline_model_mapping = {"any-to-any": GraniteSpeechForConditionalGeneration} if is_torch_available() else {} - _is_composite = True - - def setUp(self): - self.model_tester = GraniteSpeechForConditionalGenerationModelTester(self) - self.config_tester = ConfigTester( - self, - config_class=GraniteSpeechConfig, - has_text_modality=False, - ) @unittest.skip( reason="This test does not apply to GraniteSpeech since inputs_embeds corresponding to audio tokens are replaced when input features are provided." @@ -237,7 +132,7 @@ def test_inputs_embeds_matches_input_ids(self): pass def test_inputs_embeds(self): - # overwrite inputs_embeds tests because we need to delete "input features" for the audio model + # Overwrite inputs_embeds tests because we need to delete "input_features" for the audio model config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: @@ -257,53 +152,12 @@ def test_inputs_embeds(self): with torch.no_grad(): model(**inputs) - def test_sdpa_can_dispatch_composite_models(self): - # overwrite because Granite Speech is audio+text model (not vision+text) - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - if not self._is_composite: - self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") - - for model_class in self.all_model_classes: - # NOTE - currently we only enable alternate attention implementations on - # the encapsulated LLM; in the future, this should be added for the conformer - # encoder as well. - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model_sdpa = model_class.from_pretrained(tmpdirname) - model_sdpa = model_sdpa.eval().to(torch_device) - - text_attn = "sdpa" if model.language_model._supports_sdpa else "eager" - - # `None` as it is the requested one which will be assigned to each sub-config - # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - self.assertTrue(model.language_model.config._attn_implementation == text_attn) - - model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") - model_eager = model_eager.eval().to(torch_device) - self.assertTrue(model_eager.config._attn_implementation == "eager") - self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - @pytest.mark.generate @slow @unittest.skip(reason="Granite Speech doesn't support SDPA for all backbones") def test_eager_matches_sdpa_generate(self): pass - @unittest.skip(reason="GraniteSpeech has no separate base model without a head.") - def test_model_base_model_prefix(self): - pass - class GraniteSpeechForConditionalGenerationIntegrationTest(unittest.TestCase): def setUp(self): diff --git a/tests/models/granite_speech_plus/test_modeling_granite_speech_plus.py b/tests/models/granite_speech_plus/test_modeling_granite_speech_plus.py index 4108a4fbb79b..21f1d997efb4 100644 --- a/tests/models/granite_speech_plus/test_modeling_granite_speech_plus.py +++ b/tests/models/granite_speech_plus/test_modeling_granite_speech_plus.py @@ -15,18 +15,18 @@ import unittest -from parameterized import parameterized - -from transformers import AutoProcessor, GraniteSpeechPlusConfig, GraniteSpeechPlusForConditionalGeneration +from transformers import ( + AutoProcessor, + GraniteSpeechPlusConfig, + GraniteSpeechPlusEncoderConfig, + GraniteSpeechPlusForConditionalGeneration, +) from transformers.testing_utils import cleanup, require_torch, slow, torch_device -from transformers.utils import ModelOutput, is_datasets_available, is_torch_available +from transformers.utils import is_datasets_available, is_torch_available -from ...test_configuration_common import ConfigTester -from ..granite_speech.test_modeling_granite_speech import ( - GraniteSpeechForConditionalGenerationModelTest as _GraniteSpeechModelTestBase, -) from ..granite_speech.test_modeling_granite_speech import ( - GraniteSpeechForConditionalGenerationModelTester as _GraniteSpeechModelTesterBase, + GraniteSpeechForConditionalGenerationModelTest, + GraniteSpeechModelTester, ) @@ -35,155 +35,55 @@ if is_datasets_available(): from datasets import load_dataset -from transformers import set_seed - -class GraniteSpeechPlusForConditionalGenerationModelTester(_GraniteSpeechModelTesterBase): +class GraniteSpeechPlusForConditionalGenerationModelTester(GraniteSpeechModelTester): """ - Plus variant that exercises the ``encoder_hidden_layers`` concat path. The projector's - ``encoder_hidden_size`` is scaled to match ``encoder_config.hidden_dim * (len(encoder_hidden_layers) + 1)``. + Plus variant that exercises the ``cat_hidden_layers`` concat path. The projector's + ``encoder_hidden_size`` is scaled to match ``encoder_config.hidden_dim * (len(cat_hidden_layers) + 1)``. """ - def __init__(self, parent, encoder_hidden_layers=(0,), **kwargs): - projector_config = kwargs.pop( - "projector_config", - { - "attention_probs_dropout_prob": 0.1, - "cross_attention_frequency": 1, - "encoder_hidden_size": 64, # 32 (hidden_dim) * (1 intermediate + 1 last) = 64 - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 32, - "initializer_range": 0.02, - "intermediate_size": 256, - "layer_norm_eps": 1e-12, - "max_position_embeddings": 2048, - "model_type": "blip_2_qformer", - "num_attention_heads": 4, - "num_hidden_layers": 2, - "use_qformer_text_input": False, - "vocab_size": 30522, - }, - ) - super().__init__(parent=parent, projector_config=projector_config, **kwargs) - self.encoder_hidden_layers = list(encoder_hidden_layers) - self.encoder_config["cat_hidden_layers"] = self.encoder_hidden_layers + config_class = GraniteSpeechPlusConfig + conditional_generation_class = GraniteSpeechPlusForConditionalGeneration + audio_config_class = GraniteSpeechPlusEncoderConfig - def get_config(self): - return GraniteSpeechPlusConfig( - encoder_config=self.encoder_config, - text_config=self.text_config, - projector_config=self.projector_config, - audio_token_index=self.audio_token_index, - tie_word_embeddings=self.tie_word_embeddings, - initializer_range=self.initializer_range, - has_lora_adapter=self.has_lora_adapter, - ) + def __init__(self, parent, cat_hidden_layers=(0,), **kwargs): + super().__init__(parent, **kwargs) + self.cat_hidden_layers = list(cat_hidden_layers) + # Projector encoder_hidden_size must equal hidden_dim * (len(cat_hidden_layers) + 1). + self.projector_config = { + "model_type": "blip_2_qformer", + "hidden_size": 32, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 256, + "encoder_hidden_size": 32 * (len(self.cat_hidden_layers) + 1), + } @require_torch -class GraniteSpeechPlusForConditionalGenerationModelTest(_GraniteSpeechModelTestBase): +class GraniteSpeechPlusForConditionalGenerationModelTest(GraniteSpeechForConditionalGenerationModelTest): """ Model tester for `GraniteSpeechPlusForConditionalGeneration`. """ - all_model_classes = (GraniteSpeechPlusForConditionalGeneration,) if is_torch_available() else () + model_tester_class = GraniteSpeechPlusForConditionalGenerationModelTester pipeline_model_mapping = {"any-to-any": GraniteSpeechPlusForConditionalGeneration} if is_torch_available() else {} - def setUp(self): - self.model_tester = GraniteSpeechPlusForConditionalGenerationModelTester(self) - self.config_tester = ConfigTester( - self, - config_class=GraniteSpeechPlusConfig, - has_text_modality=False, - ) + # The cat path changes the encoder output feature dim, so the generic shape assertion in + # `test_get_audio_features_output` (which assumes hidden_dim) does not apply. + skip_test_audio_features_output_shape = True def test_encoder_hidden_layers_concat_shape(self): - """With ``encoder_hidden_layers`` set, get_audio_features concatenates the selected intermediate - hidden states with the final hidden state before the projector.""" + """``encoder_config.cat_hidden_layers`` concatenates selected intermediate hidden states with the final + hidden state along the feature dim before the projector.""" config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = GraniteSpeechPlusForConditionalGeneration(config).to( - self.model_tester.parent.device if hasattr(self.model_tester.parent, "device") else "cpu" - ) - model.eval() + model = GraniteSpeechPlusForConditionalGeneration(config).to(torch_device).eval() with torch.no_grad(): out = model.get_audio_features(inputs_dict["input_features"].to(next(model.parameters()).device)) - self.assertEqual(out.pooler_output.shape[0], inputs_dict["input_features"].shape[0]) - - @parameterized.expand([True, False, None]) - def test_get_audio_features_output(self, return_dict: bool | None): - for model_class in self.all_model_classes: - if not hasattr(model_class, "get_audio_features"): - continue - - config, inputs_dict = self._audio_features_prepare_config_and_inputs() - if return_dict is not None: - config.return_dict = return_dict - - model = model_class(config).eval() - model = model.to(torch_device) - - set_seed(42) - with torch.no_grad(): - outputs = model.get_audio_features(**inputs_dict) - - if return_dict in (True, None): - self.assertTrue( - isinstance(outputs, ModelOutput), "get_audio_features() must return a BaseModelOutputWithPooling" - ) - self.assertTrue( - hasattr(outputs, "last_hidden_state"), - "get_audio_features() must return a BaseModelOutputWithPooling with last_hidden_state", - ) - self.assertTrue( - hasattr(outputs, "pooler_output"), - "get_audio_features() must return a BaseModelOutputWithPooling with pooler_output", - ) - self.assertTrue( - hasattr(outputs, "hidden_states"), - "get_audio_features() must return a BaseModelOutputWithPooling with hidden_states", - ) - if self.has_attentions: - self.assertTrue( - hasattr(outputs, "attentions"), - "get_audio_features() must return a BaseModelOutputWithPooling with attentions", - ) - - if getattr(self, "skip_test_audio_features_output_shape", False): - return - - last_hidden_state_shape = outputs.last_hidden_state.shape - - if "input_features" in inputs_dict: - batch_size = inputs_dict["input_features"].shape[0] - else: - batch_size = inputs_dict["input_values"].shape[0] - self.assertEqual( - last_hidden_state_shape[0], - batch_size, - f"batch_size mismatch, full shape: {last_hidden_state_shape}", - ) - - audio_config = config.audio_config if hasattr(config, "audio_config") else config - hidden_size = None - if hasattr(audio_config, "projection_dim"): - hidden_size = audio_config.projection_dim - elif hasattr(audio_config, "hidden_size"): - hidden_size = audio_config.hidden_size - elif hasattr(audio_config, "encoder_config"): - hidden_size = audio_config.encoder_config.hidden_dim * ( - len(audio_config.encoder_config.cat_hidden_layers) + 1 - ) - elif hasattr(audio_config, "encoder_ffn_dim"): - hidden_size = audio_config.encoder_ffn_dim - self.assertEqual( - last_hidden_state_shape[-1], - hidden_size, - f"hidden_size mismatch, full shape: {last_hidden_state_shape}", - ) - - else: - self.assertIsInstance(outputs, tuple, "get_audio_features() must return a tuple if return_dict=False") + cat_factor = len(config.encoder_config.cat_hidden_layers) + 1 + expected_hidden_size = config.encoder_config.hidden_dim * cat_factor + self.assertEqual(out.last_hidden_state.shape[0], inputs_dict["input_features"].shape[0]) + self.assertEqual(out.last_hidden_state.shape[-1], expected_hidden_size) class GraniteSpeechPlusForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py index 7e7b40e4eaba..60d50830ab74 100644 --- a/tests/models/llava_next/test_modeling_llava_next.py +++ b/tests/models/llava_next/test_modeling_llava_next.py @@ -84,7 +84,7 @@ def create_pixel_values(self): ] ) - def get_additional_inputs(self, config, input_ids, pixel_values): + def get_additional_inputs(self, config, input_ids, modality_inputs): """LlavaNext requires image_sizes tensor""" return { "image_sizes": torch.tensor([[self.image_size, self.image_size]] * self.batch_size), diff --git a/tests/models/musicflamingo/test_modeling_musicflamingo.py b/tests/models/musicflamingo/test_modeling_musicflamingo.py index 8c3b0ce549c8..2615af219ff5 100644 --- a/tests/models/musicflamingo/test_modeling_musicflamingo.py +++ b/tests/models/musicflamingo/test_modeling_musicflamingo.py @@ -16,16 +16,15 @@ import json import os -import tempfile import unittest from pathlib import Path -import pytest - from transformers import ( + AudioFlamingo3EncoderConfig, AutoProcessor, MusicFlamingoConfig, MusicFlamingoForConditionalGeneration, + Qwen2Config, is_torch_available, ) from transformers.testing_utils import ( @@ -37,129 +36,60 @@ torch_device, ) -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...alm_tester import ALMModelTest, ALMModelTester +from ...test_modeling_common import ids_tensor if is_torch_available(): import torch -class MusicFlamingoModelTester: +class MusicFlamingoModelTester(ALMModelTester): """ Builds a tiny MusicFlamingo config and synthetic inputs that respect MusicFlamingo's post-pool token accounting: num tokens per sample == post-pool frame count. """ - def __init__( - self, - parent, - audio_token_id=0, - seq_length=25, - feat_seq_length=60, - text_config=None, - audio_config=None, - is_training=True, - ): - self.parent = parent - self.audio_token_id = audio_token_id - self.seq_length = seq_length - self.feat_seq_length = feat_seq_length - self.is_training = is_training - - # Small text backbone (Qwen2-ish) - if text_config is None: - text_config = { - "model_type": "qwen2", - "intermediate_size": 36, - "initializer_range": 0.02, - "hidden_size": 32, - "max_position_embeddings": 52, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "num_key_value_heads": 2, - "use_labels": True, - "use_mrope": False, - "vocab_size": 99, - "pad_token_id": 1, # Ensure pad token != audio token - } - # Small audio encoder (MusicFlamingo Whisper-style) - if audio_config is None: - audio_config = { - "model_type": "musicflamingo_encoder", - "hidden_size": 16, - "num_attention_heads": 4, - "intermediate_size": 16, - "num_hidden_layers": 2, - "num_mel_bins": 80, - "max_source_positions": 30, - "initializer_range": 0.02, - } - - self.text_config = text_config - self.audio_config = audio_config - - self.batch_size = 3 - self.vocab_size = text_config["vocab_size"] - self.hidden_size = text_config["hidden_size"] - self.num_attention_heads = text_config["num_attention_heads"] - self.num_hidden_layers = text_config["num_hidden_layers"] - self.encoder_seq_length = seq_length + config_class = MusicFlamingoConfig + conditional_generation_class = MusicFlamingoForConditionalGeneration + text_config_class = Qwen2Config + audio_config_class = AudioFlamingo3EncoderConfig + audio_mask_key = "input_features_mask" + + def __init__(self, parent, **kwargs): + # feat_seq_length=60 → (60-1)//2+1=30 → (30-2)//2+1=15 audio embed tokens. + kwargs.setdefault("feat_seq_length", 60) + kwargs.setdefault("max_source_positions", (kwargs["feat_seq_length"] - 1) // 2 + 1) + super().__init__(parent, **kwargs) + + def create_audio_mask(self): + # Deterministic full-length mask — base default uses unseeded Python `random`, which makes + # multi-call generation-comparison tests (e.g. assisted decoding vs greedy) flaky. + return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device) + + def get_audio_embeds_mask(self, audio_mask): + # AudioFlamingo3Encoder._get_feat_extract_output_lengths: conv2 (k=3,s=2) then avg_pool (k=2,s=2). + input_lengths = audio_mask.sum(-1) + input_lengths = (input_lengths - 1) // 2 + 1 + output_lengths = (input_lengths - 2) // 2 + 1 + max_len = int(output_lengths.max().item()) + positions = torch.arange(max_len, device=audio_mask.device)[None, :] + return (positions < output_lengths[:, None]).long() def get_config(self): - return MusicFlamingoConfig( - text_config=self.text_config, - audio_config=self.audio_config, - audio_token_id=self.audio_token_id, - rope_parameters={"rope_type": "default", "rope_theta": 2048, "partial_rotary_factor": 0.5}, - ) - - def prepare_config_and_inputs(self): - # (#windows == batch_size, n_mels, T_mel) - input_features_values = floats_tensor( - [self.batch_size, self.audio_config["num_mel_bins"], self.feat_seq_length] - ) - config = self.get_config() - # Per-window mel validity (all ones => full length) - input_features_mask = torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device) - return config, input_features_values, input_features_mask - - def _post_pool_tokens_per_window(self, T_mel): - # Mirror MusicFlamingo processor math: - pre = (T_mel - 1) // 2 + 1 - post = (pre - 2) // 2 + 1 - return post - - def prepare_config_and_inputs_for_common(self): - config, input_features_values, input_features_mask = self.prepare_config_and_inputs() - # Every window has same T_mel here - num_audio_tokens_per_sample = self._post_pool_tokens_per_window(input_features_values.shape[-1]) - - # Build token ids with valid range and K tokens - input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2 - attention_mask = torch.ones_like(input_ids, dtype=torch.long, device=torch_device) - attention_mask[:, :1] = 0 # left padding sentinel - - # Fill first K positions (after padding) with the audio token id, for each sample - input_ids[:, 1 : 1 + num_audio_tokens_per_sample] = config.audio_token_id - - inputs_dict = { - "input_features": input_features_values, - "input_features_mask": input_features_mask, - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict + # MusicFlamingoConfig requires rope_parameters. + config = super().get_config() + config.rope_parameters = {"rope_type": "default", "rope_theta": 2048, "partial_rotary_factor": 0.5} + return config @require_torch -class MusicFlamingoForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): +class MusicFlamingoForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase): """ Model tester for `MusicFlamingoForConditionalGeneration`. """ - all_model_classes = (MusicFlamingoForConditionalGeneration,) if is_torch_available() else () + model_tester_class = MusicFlamingoModelTester pipeline_model_mapping = ( { "text-to-speech": MusicFlamingoForConditionalGeneration, @@ -168,11 +98,6 @@ class MusicFlamingoForConditionalGenerationModelTest(ModelTesterMixin, Generatio if is_torch_available() else {} ) - _is_composite = True - - def setUp(self): - self.model_tester = MusicFlamingoModelTester(self) - self.config_tester = ConfigTester(self, config_class=MusicFlamingoConfig, has_text_modality=False) def test_rotary_window_axis_resets_per_audio(self): config = self.model_tester.get_config() @@ -233,61 +158,6 @@ def test_build_audio_timestamps_reconstructs_windows_from_input_ids(self): def test_inputs_embeds_matches_input_ids(self): pass - @unittest.skip(reason="Compile not yet supported for MusicFlamingo models") - @pytest.mark.torch_compile_test - def test_sdpa_can_compile_dynamic(self): - pass - - @unittest.skip(reason="Compile not yet supported for MusicFlamingo models") - def test_sdpa_can_dispatch_on_flash(self): - pass - - @unittest.skip(reason="MusicFlamingo tests avoid right-padding equivalence; fusion is in-place.") - def test_flash_attn_2_inference_equivalence_right_padding(self): - pass - - @unittest.skip(reason="MusicFlamingo has no separate base model without a head.") - def test_model_base_model_prefix(self): - pass - - def test_sdpa_can_dispatch_composite_models(self): - # MusicFlamingo is audio+text composite; verify SDPA toggles propagate to submodules. - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - if not self._is_composite: - self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - # SDPA (default) - model_sdpa = model_class.from_pretrained(tmpdirname) - model_sdpa = model_sdpa.eval().to(torch_device) - - text_attn = "sdpa" if model.language_model._supports_sdpa else "eager" - audio_attn = "sdpa" if model.audio_tower._supports_sdpa else "eager" - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - self.assertTrue(model.language_model.config._attn_implementation == text_attn) - self.assertTrue(model.audio_tower.config._attn_implementation == audio_attn) - - # Eager - model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") - model_eager = model_eager.eval().to(torch_device) - self.assertTrue(model_eager.config._attn_implementation == "eager") - self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") - self.assertTrue(model_eager.audio_tower.config._attn_implementation == "eager") - - for _, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - @require_torch class MusicFlamingoForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py index 4df16b9f6f4b..1557217fdd63 100644 --- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py +++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py @@ -13,18 +13,18 @@ # limitations under the License. """Testing suite for the PyTorch Qwen2Audio model.""" -import tempfile import unittest from io import BytesIO from urllib.request import urlopen import librosa -import pytest from transformers import ( AutoProcessor, Qwen2AudioConfig, + Qwen2AudioEncoderConfig, Qwen2AudioForConditionalGeneration, + Qwen2Config, is_torch_available, ) from transformers.testing_utils import ( @@ -34,172 +34,56 @@ torch_device, ) -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -from ...test_pipeline_mixin import PipelineTesterMixin +from ...alm_tester import ALMModelTest, ALMModelTester if is_torch_available(): import torch -class Qwen2AudioModelTester: - def __init__( - self, - parent, - ignore_index=-100, - audio_token_index=0, - seq_length=25, - feat_seq_length=60, - text_config={ - "model_type": "qwen2", - "intermediate_size": 36, - "initializer_range": 0.02, - "hidden_size": 32, - "max_position_embeddings": 52, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "num_key_value_heads": 2, - "use_labels": True, - "use_mrope": False, - "vocab_size": 99, - "pad_token_id": 1, # can't be the same as the audio token id - }, - is_training=True, - audio_config={ - "model_type": "qwen2_audio_encoder", - "d_model": 16, - "encoder_attention_heads": 4, - "encoder_ffn_dim": 16, - "encoder_layers": 2, - "num_mel_bins": 80, - "max_source_positions": 30, - "initializer_range": 0.02, - }, - ): - self.parent = parent - self.ignore_index = ignore_index - self.audio_token_index = audio_token_index - self.text_config = text_config - self.audio_config = audio_config - self.seq_length = seq_length - self.feat_seq_length = feat_seq_length - - self.num_hidden_layers = text_config["num_hidden_layers"] - self.vocab_size = text_config["vocab_size"] - self.hidden_size = text_config["hidden_size"] - self.num_attention_heads = text_config["num_attention_heads"] - self.is_training = is_training - - self.batch_size = 3 - self.encoder_seq_length = seq_length - - def get_config(self): - return Qwen2AudioConfig( - text_config=self.text_config, - audio_config=self.audio_config, - ignore_index=self.ignore_index, - audio_token_index=self.audio_token_index, - ) - - def prepare_config_and_inputs(self): - input_features_values = floats_tensor( - [ - self.batch_size, - self.audio_config["num_mel_bins"], - self.feat_seq_length, - ] - ) - config = self.get_config() - feature_attention_mask = torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.long).to(torch_device) - return config, input_features_values, feature_attention_mask - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_features_values, feature_attention_mask = config_and_inputs - input_length = (input_features_values.shape[-1] - 1) // 2 + 1 - num_audio_tokens = (input_length - 2) // 2 + 1 - input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1 - attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) - attention_mask[:, :1] = 0 - # we are giving 3 audios let's make sure we pass in 3 audios tokens - input_ids[:, 1 : 1 + num_audio_tokens] = config.audio_token_index - inputs_dict = { - "input_features": input_features_values, - "feature_attention_mask": feature_attention_mask, - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict +class Qwen2AudioModelTester(ALMModelTester): + config_class = Qwen2AudioConfig + conditional_generation_class = Qwen2AudioForConditionalGeneration + text_config_class = Qwen2Config + audio_config_class = Qwen2AudioEncoderConfig + audio_mask_key = "feature_attention_mask" + + def __init__(self, parent, **kwargs): + # feat_seq_length=60 → after conv2 s=2: 30 → after avg_pool s=2: 15 audio embed tokens. + kwargs.setdefault("feat_seq_length", 60) + # Encoder asserts input_features.shape[-1] == max_source_positions * conv1.stride * conv2.stride == 2 * max_source_positions. + kwargs.setdefault("max_source_positions", kwargs["feat_seq_length"] // 2) + super().__init__(parent, **kwargs) + + def create_audio_mask(self): + # Deterministic full-length mask: the base default randomizes via Python's `random`, which isn't + # re-seeded per test call and desynchronizes the two `prepare_config_and_inputs_for_common` + # invocations inside generation-comparison tests (e.g. test_greedy_generate_dict_outputs). + return torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.bool).to(torch_device) + + def get_audio_embeds_mask(self, audio_mask): + # Mirrors Qwen2AudioEncoder._get_feat_extract_output_lengths: conv2 (k=3,s=2,p=1) then avg_pool (k=2,s=2). + input_lengths = audio_mask.sum(-1) + input_lengths = (input_lengths - 1) // 2 + 1 + output_lengths = (input_lengths - 2) // 2 + 1 + max_len = int(output_lengths.max().item()) + positions = torch.arange(max_len, device=audio_mask.device)[None, :] + return (positions < output_lengths[:, None]).long() @require_torch -class Qwen2AudioForConditionalGenerationModelTest( - ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase -): +class Qwen2AudioForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase): """ Model tester for `Qwen2AudioForConditionalGeneration`. """ - all_model_classes = (Qwen2AudioForConditionalGeneration,) if is_torch_available() else () + model_tester_class = Qwen2AudioModelTester pipeline_model_mapping = {"any-to-any": Qwen2AudioForConditionalGeneration} if is_torch_available() else {} - _is_composite = True - - def setUp(self): - self.model_tester = Qwen2AudioModelTester(self) - self.config_tester = ConfigTester(self, config_class=Qwen2AudioConfig, has_text_modality=False) - @unittest.skip(reason="Compile not yet supported because in Qwen2Audio models") - @pytest.mark.torch_compile_test - def test_sdpa_can_compile_dynamic(self): + @unittest.skip(reason="inputs_embeds is the audio-fused path; can't match raw token-only embeddings.") + def test_inputs_embeds_matches_input_ids(self): pass - @unittest.skip(reason="Compile not yet supported because in Qwen2Audio models") - def test_sdpa_can_dispatch_on_flash(self): - pass - - @unittest.skip(reason="Qwen2Audio has no separate base model without a head.") - def test_model_base_model_prefix(self): - pass - - def test_sdpa_can_dispatch_composite_models(self): - # overwrite because Qwen2 is audio+text model (not vision+text) - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - if not self._is_composite: - self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model_sdpa = model_class.from_pretrained(tmpdirname) - model_sdpa = model_sdpa.eval().to(torch_device) - - text_attn = "sdpa" if model.language_model._supports_sdpa else "eager" - vision_attn = "sdpa" if model.audio_tower._supports_sdpa else "eager" - - # `None` as it is the requested one which will be assigned to each sub-config - # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - self.assertTrue(model.language_model.config._attn_implementation == text_attn) - self.assertTrue(model.audio_tower.config._attn_implementation == vision_attn) - - model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") - model_eager = model_eager.eval().to(torch_device) - self.assertTrue(model_eager.config._attn_implementation == "eager") - self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") - self.assertTrue(model_eager.audio_tower.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - @require_torch class Qwen2AudioForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/qwen3_vl/test_modeling_qwen3_vl.py b/tests/models/qwen3_vl/test_modeling_qwen3_vl.py index 9874ce4a8203..d80cb3819486 100644 --- a/tests/models/qwen3_vl/test_modeling_qwen3_vl.py +++ b/tests/models/qwen3_vl/test_modeling_qwen3_vl.py @@ -107,7 +107,7 @@ def place_image_tokens(self, input_ids, config): input_ids[:, 0] = self.vision_start_token_id return input_ids - def get_additional_inputs(self, config, input_ids, pixel_values): + def get_additional_inputs(self, config, input_ids, modality_inputs): mm_token_type_ids = torch.zeros_like(input_ids) mm_token_type_ids[input_ids == self.image_token_id] = 1 return { diff --git a/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py b/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py index 0b0523de3b71..03a93ef1d7fd 100644 --- a/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py +++ b/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py @@ -106,7 +106,7 @@ def place_image_tokens(self, input_ids, config): input_ids[:, 0] = self.vision_start_token_id return input_ids - def get_additional_inputs(self, config, input_ids, pixel_values): + def get_additional_inputs(self, config, input_ids, modality_inputs): # Qwen3VL requires image_grid_thw tensor mm_token_type_ids = torch.zeros_like(input_ids) mm_token_type_ids[input_ids == self.image_token_id] = 1 diff --git a/tests/models/vibevoice_asr/test_modeling_vibevoice_asr.py b/tests/models/vibevoice_asr/test_modeling_vibevoice_asr.py index be0ece165e36..fc8bb11568ea 100644 --- a/tests/models/vibevoice_asr/test_modeling_vibevoice_asr.py +++ b/tests/models/vibevoice_asr/test_modeling_vibevoice_asr.py @@ -17,7 +17,6 @@ import unittest from pathlib import Path -import pytest from parameterized import parameterized from transformers import ( @@ -150,19 +149,6 @@ def setUp(self): def test_inputs_embeds_matches_input_ids(self): pass - @unittest.skip(reason="Compile not yet supported for VibeVoiceAsr models") - @pytest.mark.torch_compile_test - def test_sdpa_can_compile_dynamic(self): - pass - - @unittest.skip(reason="Compile not yet supported for VibeVoiceAsr models") - def test_sdpa_can_dispatch_on_flash(self): - pass - - @unittest.skip(reason="VibeVoiceAsr tests avoid right-padding equivalence; fusion is in-place.") - def test_flash_attn_2_inference_equivalence_right_padding(self): - pass - @unittest.skip(reason="VibeVoiceAsr has no separate base model without a head.") def test_model_base_model_prefix(self): pass diff --git a/tests/models/voxtral/test_modeling_voxtral.py b/tests/models/voxtral/test_modeling_voxtral.py index 0cff2a66779b..4f0c604ce05f 100644 --- a/tests/models/voxtral/test_modeling_voxtral.py +++ b/tests/models/voxtral/test_modeling_voxtral.py @@ -13,12 +13,13 @@ # limitations under the License. """Testing suite for the PyTorch Voxtral model.""" -import tempfile import unittest from transformers import ( AutoProcessor, + LlamaConfig, VoxtralConfig, + VoxtralEncoderConfig, VoxtralForConditionalGeneration, is_torch_available, ) @@ -30,126 +31,50 @@ torch_device, ) -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -from ...test_pipeline_mixin import PipelineTesterMixin +from ...alm_tester import ALMModelTest, ALMModelTester if is_torch_available(): import torch -class VoxtralModelTester: - def __init__( - self, - parent, - ignore_index=-100, - audio_token_id=0, - seq_length=35, - feat_seq_length=60, - text_config={ - "model_type": "llama", - "intermediate_size": 36, - "initializer_range": 0.02, - "hidden_size": 32, - "max_position_embeddings": 52, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "num_key_value_heads": 2, - "use_labels": True, - "use_mrope": False, - "vocab_size": 99, - "head_dim": 8, - "pad_token_id": 1, # can't be the same as the audio token id - }, - is_training=True, - audio_config={ - "model_type": "voxtral_encoder", - "hidden_size": 16, - "num_attention_heads": 4, - "intermediate_size": 16, - "num_hidden_layers": 2, - "num_mel_bins": 80, - "max_source_positions": 30, - "initializer_range": 0.02, - }, - ): - self.parent = parent - self.ignore_index = ignore_index - self.audio_token_id = audio_token_id - self.text_config = text_config - self.audio_config = audio_config - self.seq_length = seq_length - self.feat_seq_length = feat_seq_length - - self.num_hidden_layers = text_config["num_hidden_layers"] - self.vocab_size = text_config["vocab_size"] - self.hidden_size = text_config["hidden_size"] - self.num_attention_heads = text_config["num_attention_heads"] - self.is_training = is_training - - self.batch_size = 3 - self.encoder_seq_length = seq_length - - def get_config(self): - return VoxtralConfig( - text_config=self.text_config, - audio_config=self.audio_config, - ignore_index=self.ignore_index, - audio_token_id=self.audio_token_id, - ) - - def prepare_config_and_inputs(self): - input_features_values = floats_tensor( - [ - self.batch_size, - self.audio_config["num_mel_bins"], - self.feat_seq_length, - ] - ) - config = self.get_config() - return config, input_features_values - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_features_values = config_and_inputs - num_audio_tokens_per_batch_idx = 30 +class VoxtralModelTester(ALMModelTester): + config_class = VoxtralConfig + conditional_generation_class = VoxtralForConditionalGeneration + text_config_class = LlamaConfig + audio_config_class = VoxtralEncoderConfig - input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1 - attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) - attention_mask[:, :1] = 0 + def __init__(self, parent, **kwargs): + # seq_length 35 = BOS + 30 audio + 4 text (keeps column -2 text-only for resize test). + kwargs.setdefault("seq_length", 35) + # feat_seq_length 60 → conv2(s=2) → 30 audio embeds (Voxtral's encoder does not apply avg_pool + # in the forward; projector reshapes to B*30 embeddings). + kwargs.setdefault("feat_seq_length", 60) + # Encoder asserts input_features.shape[-1] == max_source_positions * 2. + kwargs.setdefault("max_source_positions", kwargs["feat_seq_length"] // 2) + # Llama needs head_dim + kwargs.setdefault("head_dim", 8) + super().__init__(parent, **kwargs) - input_ids[:, 1 : 1 + num_audio_tokens_per_batch_idx] = config.audio_token_id - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "input_features": input_features_values, - } - return config, inputs_dict + def get_audio_embeds_mask(self, audio_mask): + # Voxtral encoder only applies conv2 (stride 2); no avg_pool in forward. + output_length = (self.feat_seq_length - 1) // 2 + 1 + return torch.ones([self.batch_size, output_length], dtype=torch.long).to(torch_device) @require_torch -class VoxtralForConditionalGenerationModelTest( - ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase -): +class VoxtralForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase): """ Model tester for `VoxtralForConditionalGeneration`. """ - all_model_classes = (VoxtralForConditionalGeneration,) if is_torch_available() else () + model_tester_class = VoxtralModelTester pipeline_model_mapping = ( {"text-to-speech": VoxtralForConditionalGeneration, "any-to-any": VoxtralForConditionalGeneration} if is_torch_available() else {} ) - _is_composite = True - - def setUp(self): - self.model_tester = VoxtralModelTester(self) - self.config_tester = ConfigTester(self, config_class=VoxtralConfig, has_text_modality=False) - @unittest.skip( reason="This test does not apply to Voxtral since inputs_embeds corresponding to audio tokens are replaced when input features are provided." ) @@ -192,47 +117,6 @@ def test_flash_attention_3_padding_matches_padding_free_with_position_ids(self): def test_flash_attention_3_padding_matches_padding_free_with_position_ids_and_fa_kwargs(self): pass - @unittest.skip(reason="Voxtral has no separate base model without a head.") - def test_model_base_model_prefix(self): - pass - - def test_sdpa_can_dispatch_composite_models(self): - # overwrite because Voxtral is audio+text model (not vision+text) - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - if not self._is_composite: - self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model_sdpa = model_class.from_pretrained(tmpdirname) - model_sdpa = model_sdpa.eval().to(torch_device) - - text_attn = "sdpa" if model.language_model._supports_sdpa else "eager" - vision_attn = "sdpa" if model.audio_tower._supports_sdpa else "eager" - - # `None` as it is the requested one which will be assigned to each sub-config - # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - self.assertTrue(model.language_model.config._attn_implementation == text_attn) - self.assertTrue(model.audio_tower.config._attn_implementation == vision_attn) - - model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") - model_eager = model_eager.eval().to(torch_device) - self.assertTrue(model_eager.config._attn_implementation == "eager") - self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") - self.assertTrue(model_eager.audio_tower.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - @require_torch class VoxtralForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py b/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py index 9aa817f3cba6..150d7a894104 100644 --- a/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py +++ b/tests/models/voxtral_realtime/test_modeling_voxtral_realtime.py @@ -24,6 +24,10 @@ is_torch_available, ) from transformers.audio_utils import load_audio +from transformers.models.voxtral_realtime.configuration_voxtral_realtime import ( + VoxtralRealtimeEncoderConfig, + VoxtralRealtimeTextConfig, +) from transformers.testing_utils import ( cleanup, require_torch, @@ -31,10 +35,8 @@ torch_device, ) -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -from ...test_pipeline_mixin import PipelineTesterMixin +from ...alm_tester import ALMModelTest, ALMModelTester +from ...test_modeling_common import floats_tensor, ids_tensor if is_datasets_available(): @@ -44,136 +46,84 @@ import torch -class VoxtralRealtimeModelTester: - def __init__( - self, - parent, - ignore_index=-100, - audio_token_id=0, - seq_length=5, - feat_seq_length=40, - text_config={ - "model_type": "voxtral_realtime_text", - "intermediate_size": 36, - "initializer_range": 0.02, - "hidden_size": 32, - "max_position_embeddings": 52, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "num_key_value_heads": 2, - "use_labels": True, - "vocab_size": 99, - "head_dim": 8, - "pad_token_id": 1, # can't be the same as the audio token id - "hidden_act": "silu", - "rms_norm_eps": 1e-6, - "attention_dropout": 0.0, - "rope_parameters": { - "rope_type": "default", - "rope_theta": 10000.0, - }, - }, - is_training=True, - audio_config={ - "model_type": "voxtral_realtime_encoder", - "hidden_size": 16, - "num_attention_heads": 4, - "num_key_value_heads": 2, - "intermediate_size": 64, - "encoder_layers": 2, - "num_mel_bins": 80, - "max_position_embeddings": 100, - "initializer_range": 0.02, - "rms_norm_eps": 1e-6, - "activation_function": "silu", - "activation_dropout": 0.0, - "attention_dropout": 0.0, - "head_dim": 4, - "rope_parameters": { - "rope_type": "default", - "rope_theta": 10000.0, - }, - }, - ): - self.parent = parent - self.ignore_index = ignore_index - self.audio_token_id = audio_token_id - self.text_config = text_config - self.audio_config = audio_config - self.seq_length = seq_length - self.feat_seq_length = feat_seq_length - - self.num_hidden_layers = text_config["num_hidden_layers"] - self.vocab_size = text_config["vocab_size"] - self.hidden_size = text_config["hidden_size"] - self.num_attention_heads = text_config["num_attention_heads"] - self.is_training = is_training - - self.batch_size = 3 - self.encoder_seq_length = seq_length - self._max_new_tokens = None # this is used to set - - def get_config(self): - return VoxtralRealtimeConfig( - text_config=self.text_config, - audio_config=self.audio_config, - ignore_index=self.ignore_index, - audio_token_id=self.audio_token_id, - ) - - def prepare_config_and_inputs(self): - if self._max_new_tokens is not None: - feat_seq_length = self.feat_seq_length + self._max_new_tokens * 8 - else: - feat_seq_length = self.feat_seq_length - - input_features_values = floats_tensor( - [ - self.batch_size, - self.audio_config["num_mel_bins"], - feat_seq_length, - ] - ) - config = self.get_config() - return config, input_features_values +class VoxtralRealtimeModelTester(ALMModelTester): + config_class = VoxtralRealtimeConfig + conditional_generation_class = VoxtralRealtimeForConditionalGeneration + text_config_class = VoxtralRealtimeTextConfig + audio_config_class = VoxtralRealtimeEncoderConfig + + def __init__(self, parent, **kwargs): + # VoxtralRealtime does additive audio/text fusion: seq_length must equal num_audio_embeds. + # With audio_length_per_tok=8 (config default), num_audio_embeds = feat_seq_length // 8. + kwargs.setdefault("seq_length", 32) + kwargs.setdefault("feat_seq_length", kwargs["seq_length"] * 8) + # Audio encoder uses RoPE; max position must cover post-conv length (feat_seq_length // 2). + kwargs.setdefault("max_position_embeddings", kwargs["feat_seq_length"]) + kwargs.setdefault("head_dim", 8) + kwargs.setdefault("rms_norm_eps", 1e-6) + kwargs.setdefault("activation_function", "silu") + kwargs.setdefault("hidden_act", "silu") + super().__init__(parent, **kwargs) + self._max_new_tokens = None + + def get_audio_embeds_mask(self, audio_mask): + # Causal conv2 (stride 2, left-pad 1): post_conv_len = feat_seq_length // 2. + # Projector reshapes by downsample_factor=4 → post_conv_len // downsample_factor embeds. + downsample_factor = 4 + effective_feat = self.feat_seq_length + (self._max_new_tokens or 0) * 8 + post_conv_len = effective_feat // 2 + output_length = post_conv_len // downsample_factor + return torch.ones([self.batch_size, output_length], dtype=torch.long).to(torch_device) + + def create_audio_features(self): + effective_feat = self.feat_seq_length + (self._max_new_tokens or 0) * 8 + return floats_tensor([self.batch_size, self.num_mel_bins, effective_feat]) + + def place_audio_tokens(self, input_ids, config, num_audio_tokens): + # VoxtralRealtime fuses audio additively over the whole sequence; no placeholder token required. + input_ids = input_ids.clone() + input_ids[input_ids == self.audio_token_id] = self.pad_token_id + return input_ids def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_features_values = config_and_inputs - num_audio_tokens_per_batch_idx = 30 + # Custom pipeline: input_ids at seq_length, audio covers seq_length (+ max_new_tokens extras + # during generation so the model can slice future-token audio per decode step). We do not run + # the base-class `audio_embeds_mask.shape[1] <= seq_length` invariant because, for this model, + # audio embeds legitimately exceed input length during generation. + audio_features = self.create_audio_features() + + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + special_tokens = [self.pad_token_id, self.bos_token_id, self.eos_token_id, self.audio_token_id] + for safe_id in range(self.vocab_size): + if safe_id not in special_tokens: + break + else: + raise ValueError("vocab_size too small for a non-special safe token.") + input_ids[input_ids == self.pad_token_id] = safe_id + input_ids[input_ids == self.eos_token_id] = safe_id - input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1 - attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) - attention_mask[:, :1] = 0 + config = self.get_config() + # place_audio_tokens is a no-op for this model; call for symmetry. + input_ids = self.place_audio_tokens(input_ids, config, torch.tensor([self.seq_length] * self.batch_size)) + attention_mask = self.create_attention_mask(input_ids) - input_ids[:, 1 : 1 + num_audio_tokens_per_batch_idx] = config.audio_token_id - inputs_dict = { + return config, { "input_ids": input_ids, "attention_mask": attention_mask, - "input_features": input_features_values, + "input_features": audio_features, } - return config, inputs_dict @require_torch -class VoxtralRealtimeForConditionalGenerationModelTest( - ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase -): +class VoxtralRealtimeForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase): """ Model tester for `VoxtralRealtimeForConditionalGeneration`. """ additional_model_inputs = ["input_features"] - - all_model_classes = (VoxtralRealtimeForConditionalGeneration,) if is_torch_available() else () + model_tester_class = VoxtralRealtimeModelTester pipeline_model_mapping = {"any-to-any": VoxtralRealtimeForConditionalGeneration} if is_torch_available() else {} - _is_composite = True - - def setUp(self): - self.model_tester = VoxtralRealtimeModelTester(self) - self.config_tester = ConfigTester(self, config_class=VoxtralRealtimeConfig, has_text_modality=False) - def _with_max_new_tokens(max_new_tokens): def decorator(test_func): @functools.wraps(test_func) @@ -209,8 +159,11 @@ def test_generate_compile_model_forward_fullgraph(self): def test_generate_with_and_without_position_ids(self): super().test_generate_with_and_without_position_ids() - @unittest.skip(reason="VoxtralRealtime does not have a base model") - def test_model_base_model_prefix(self): + @unittest.skip( + reason="This test does not apply to VoxtralRealtime: audio tokens are not replaced in inputs_embeds, " + "audio and text embeddings are summed instead." + ) + def test_mismatching_num_audio_tokens(self): pass @unittest.skip( diff --git a/tests/multimodal_tester.py b/tests/multimodal_tester.py new file mode 100644 index 000000000000..22559876689b --- /dev/null +++ b/tests/multimodal_tester.py @@ -0,0 +1,254 @@ +# Copyright 2026 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from inspect import signature + +from transformers.testing_utils import _TEXT_MODEL_TESTER_DEFAULTS + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ( + GenerationTesterMixin, + ModelTesterMixin, + ids_tensor, + is_torch_available, + require_torch, + torch_device, +) +from .test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + +class MultiModalModelTester: + """Shared tester base for VLM (vision-language) and ALM (audio-language) models. + + Concrete subclasses (e.g. `VLMModelTester`, `ALMModelTester`) supply: + - the modality-specific sub-config class (`vision_config_class` for VLMs, `audio_config_class` for ALMs, ...), + - the modality-specific defaults and helper methods, + - the hooks `_build_modality_sub_configs` and `_prepare_modality_inputs`, + - optionally an extended `_special_token_ids` and `pipeline_model_mapping`. + + This tester provides shared logic for evaluating and verifying models that combine text with other modalities, + centering on the needs of vision-language (VLM) and audio-language (ALM) models. + """ + + # If the model follows the standard naming conventions, only `base_model_class` needs to be set + # (the others are inferred from available public classes). + base_model_class = None + config_class = None + text_config_class = None + conditional_generation_class = None + sequence_classification_class = None + + # Required attributes after the initialization phase of the tester. Subclasses extend. + _required_attributes = ("config_class", "text_config_class", "conditional_generation_class") + + # Arguments that should be passed to the config class even if not in its signature + forced_config_args = ["pad_token_id"] + + @property + def all_model_classes(self): + # Models that set `all_model_classes` in their `XXXModelTest` class must have a new class that doesn't fit + # any of the common classes. + return [ + model_class + for model_class in ( + self.base_model_class, + self.conditional_generation_class, + self.sequence_classification_class, + ) + if model_class is not None + ] + + def __init__(self, parent, **kwargs): + self.parent = parent + + # Multimodal-specific overrides of shared defaults (applied before the shared + # defaults so they take precedence, but after any subclass setdefault calls). + kwargs.setdefault("batch_size", 3) + kwargs.setdefault("moe_intermediate_size", 12) + + # Apply shared text-model defaults for anything not already set. + # Subclasses are expected to `setdefault` their modality-specific kwargs + # (and any differing values such as `pad_token_id`) *before* calling super. + for key, default in _TEXT_MODEL_TESTER_DEFAULTS.items(): + kwargs.setdefault(key, default) + + kwargs.setdefault("ignore_index", -100) + kwargs.setdefault("scope", None) + + for key, value in kwargs.items(): + setattr(self, key, value) + + self._check_required_attributes() + + def _check_required_attributes(self): + for required_attribute in self._required_attributes: + if getattr(self, required_attribute, None) is None: + raise ValueError( + f"You have inherited from {type(self).__name__} but did not set the {required_attribute} attribute." + ) + + # -- Overridable modality hooks ----------------------------------------------------------- + + def create_attention_mask(self, input_ids): + """Default causal (lower-triangular) attention mask. Override for bidirectional models like Gemma3.""" + return torch.tril(torch.ones_like(input_ids).to(torch_device)) + + def get_additional_inputs(self, config, input_ids, modality_inputs): + """Model-specific extra inputs (e.g. LlavaNext `image_sizes`, Qwen3VL `mm_token_type_ids`). + + ``modality_inputs`` is the full dict returned by ``_prepare_modality_inputs``. + """ + return {} + + @property + def _special_token_ids(self): + """Special token ids that must never appear as random text tokens. Subclasses add modality tokens.""" + return {self.pad_token_id, self.bos_token_id, self.eos_token_id} + + def _build_modality_sub_configs(self): + """Return the {sub-config-key: sub-config-instance} entries for the main config constructor.""" + raise NotImplementedError + + def _prepare_modality_inputs(self, input_ids, config): + """Create modality features, place modality placeholder tokens in ``input_ids``, and return: + + (input_ids_with_placeholders, modality_inputs_dict) + """ + raise NotImplementedError + + # -- End of overridable hooks ------------------------------------------------------------- + + def _safe_token_id(self): + """Smallest token ID that is not a special token. Used to scrub random ids_tensor outputs.""" + special_tokens = self._special_token_ids + for i in range(self.vocab_size): + if i not in special_tokens: + return i + raise ValueError("vocab_size is too small and there is no token ID that is not a special token!") + + def prepare_config_and_inputs_for_common(self): + config = self.get_config() + + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + # Avoid flaky tests by scrubbing any accidental special tokens produced by ids_tensor. + # Modality placeholder tokens are scrubbed and placed by `_prepare_modality_inputs`. + safe_token_id = self._safe_token_id() + for token_id in self._special_token_ids: + input_ids[input_ids == token_id] = safe_token_id + + input_ids, modality_inputs = self._prepare_modality_inputs(input_ids, config) + + # Create attention mask with final input_ids (after modality placeholders are placed) — important + # for models that derive padding from token values. + attention_mask = self.create_attention_mask(input_ids) if self.use_input_mask else None + + inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask} + inputs_dict.update(modality_inputs) + inputs_dict.update(self.get_additional_inputs(config, input_ids, modality_inputs)) + return config, inputs_dict + + # -- Config construction helpers ---------------------------------------------------------- + + @property + def config_args(self): + return list(signature(self.config_class.__init__).parameters.keys()) + + @property + def text_config_args(self): + args = list(signature(self.text_config_class.__init__).parameters.keys()) + for token_arg in ["pad_token_id", "bos_token_id", "eos_token_id"]: # Not always explicitly in the sig + if token_arg not in args: + args.append(token_arg) + return args + + def _collect_kwargs(self, sig_keys, config_class): + """Collect kwargs for ``config_class`` by matching ``sig_keys`` (and its ``attribute_map``) against ``self``.""" + attribute_map = getattr(config_class, "attribute_map", {}) + model_name_to_common_name = {v: k for k, v in attribute_map.items()} + kwargs = {} + for k in sig_keys: + if hasattr(self, k) and k != "self": + kwargs[k] = getattr(self, k) + elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]): + kwargs[k] = getattr(self, model_name_to_common_name[k]) + return kwargs + + def get_config(self): + kwargs = self._collect_kwargs(self.config_args + self.forced_config_args, self.config_class) + kwargs["text_config"] = self.get_text_config() + kwargs.update(self._build_modality_sub_configs()) + return self.config_class(**kwargs) + + def get_text_config(self): + kwargs = self._collect_kwargs(self.text_config_args, self.text_config_class) + return self.text_config_class(**kwargs) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = self.base_model_class(config=config) + model.to(torch_device) + model.eval() + model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + +@require_torch +class MultiModalModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin): + """Shared test-class base for multimodal model families. + + Subclasses must set: + - ``model_tester_class``: The tester class (subclass of ``MultiModalModelTester``) + + Optional: + - ``all_model_classes``: override if not using the default from the model tester + - ``pipeline_model_mapping``: override if not using the default from the model tester + """ + + model_tester_class = None + all_model_classes = None + pipeline_model_mapping = None + + # Multimodal models are always composite + _is_composite = True + + def setUp(self): + if self.model_tester_class is None: + raise ValueError( + f"You have inherited from {type(self).__name__} but did not set the model_tester_class attribute." + ) + self.model_tester = self.model_tester_class(self) + self.config_tester = ConfigTester(self, config_class=self.model_tester.config_class, has_text_modality=False) + + if self.pipeline_model_mapping is None: + if self.all_model_classes is not None: + raise ValueError( + f"Tests that inherit from `{type(self).__name__}` and set `all_model_classes` must manually set " + "`pipeline_model_mapping`." + ) + else: + self.pipeline_model_mapping = self.model_tester.pipeline_model_mapping + + if self.all_model_classes is None: + self.all_model_classes = self.model_tester.all_model_classes + + def test_config(self): + """Test config common functionality.""" + self.config_tester.run_common_tests() diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index e7a289389cf1..e9d0ba027478 100644 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -3594,30 +3594,38 @@ def test_sdpa_can_dispatch_composite_models(self): model_sdpa = model_class.from_pretrained(tmpdirname) model_sdpa = model_sdpa.base_model - vision_model_names = {"visual", "image_tower", "vision_tower", "vision_model"} + modality_tower_names = { + "visual", + "image_tower", + "vision_tower", + "vision_model", + "audio_tower", + "audio_model", + } language_model_names = {"language_model", "model", "text_model"} - vision_model_name = [name for name in vision_model_names if hasattr(model_sdpa, name)] - vision_model_name = vision_model_name[0] if len(vision_model_name) > 0 else None + modality_tower_name = [name for name in modality_tower_names if hasattr(model_sdpa, name)] + modality_tower_name = modality_tower_name[0] if len(modality_tower_name) > 0 else None language_model_name = [name for name in language_model_names if hasattr(model_sdpa, name)] language_model_name = language_model_name[0] if len(language_model_name) > 0 else None - if language_model_name is None or vision_model_name is None: + if language_model_name is None or modality_tower_name is None: self.skipTest( - reason="Model does not have both vision and language sub-models, cannot test composite SDPA dispatch" + reason="Model does not have both a non-text modality tower and a language sub-model, " + "cannot test composite SDPA dispatch" ) - vision_model_sdpa = getattr(model_sdpa, vision_model_name) + modality_tower_sdpa = getattr(model_sdpa, modality_tower_name) language_model_sdpa = getattr(model_sdpa, language_model_name) text_attn = "sdpa" if language_model_sdpa._supports_sdpa else "eager" - vision_attn = "sdpa" if vision_model_sdpa._supports_sdpa else "eager" + modality_attn = "sdpa" if modality_tower_sdpa._supports_sdpa else "eager" # `None` as it is the requested one which will be assigned to each sub-config # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) self.assertTrue(language_model_sdpa.config._attn_implementation == text_attn) - self.assertTrue(vision_model_sdpa.config._attn_implementation == vision_attn) + self.assertTrue(modality_tower_sdpa.config._attn_implementation == modality_attn) model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") model_eager = model_eager.base_model self.assertTrue(getattr(model_eager, language_model_name).config._attn_implementation == "eager") - self.assertTrue(getattr(model_eager, vision_model_name).config._attn_implementation == "eager") + self.assertTrue(getattr(model_eager, modality_tower_name).config._attn_implementation == "eager") for name, submodule in model_eager.named_modules(): class_name = submodule.__class__.__name__ diff --git a/tests/vlm_tester.py b/tests/vlm_tester.py index c40b42785836..05be8bdfa8f1 100644 --- a/tests/vlm_tester.py +++ b/tests/vlm_tester.py @@ -16,146 +16,74 @@ import unittest from inspect import signature -from .test_configuration_common import ConfigTester +from .multimodal_tester import MultiModalModelTest, MultiModalModelTester from .test_modeling_common import ( - GenerationTesterMixin, - ModelTesterMixin, floats_tensor, - ids_tensor, is_torch_available, - require_torch, torch_device, ) -from .test_pipeline_mixin import PipelineTesterMixin if is_torch_available(): import torch -class VLMModelTester: - # If the model follows the standard naming conventions, only `base_model_class` needs to be set (the others are - # inferred from available public classes). - base_model_class = None - config_class = None - text_config_class = None +class VLMModelTester(MultiModalModelTester): vision_config_class = None - conditional_generation_class = None - sequence_classification_class = None - # These attributes are required after the initialization phase of the tester. - _required_attributes = ("base_model_class", "config_class", "conditional_generation_class") - - # Arguments that should be passed to the config class even if not in its signature - forced_config_args = ["pad_token_id"] - - @property - def all_model_classes(self): - # Models that set `all_model_classes` in their `XXXModelTest` class must have a new class that doesn't fit - # any of the common classes. - return [ - model_class - for model_class in ( - self.base_model_class, - self.conditional_generation_class, - self.sequence_classification_class, - ) - if model_class is not None - ] + _required_attributes = MultiModalModelTester._required_attributes + ("base_model_class", "vision_config_class") @property def pipeline_model_mapping(self): - mapping = { + return { "feature-extraction": self.base_model_class, "image-text-to-text": self.conditional_generation_class, } - return mapping def __init__(self, parent, **kwargs): - self.parent = parent + # Overrides of _TEXT_MODEL_TESTER_DEFAULTS + kwargs.setdefault( + "seq_length", + 7 + + kwargs.get( + "num_image_tokens", + (kwargs.get("image_size", 8) // kwargs.get("patch_size", 4)) ** 2, + ), + ) + kwargs.setdefault("pad_token_id", 0) - # Standard defaults - kwargs.setdefault("batch_size", 3) - kwargs.setdefault("is_training", True) - kwargs.setdefault("use_input_mask", True) + # VLM-specific defaults kwargs.setdefault("use_token_type_ids", False) - kwargs.setdefault("use_labels", True) - kwargs.setdefault("vocab_size", 99) - kwargs.setdefault("hidden_size", 32) - kwargs.setdefault("num_hidden_layers", 2) - kwargs.setdefault("num_attention_heads", 2) - kwargs.setdefault("num_key_value_heads", 2) - kwargs.setdefault("intermediate_size", 32) # Keep this divisible by 8 for fp16/bf16/fp32 16-bytes alignment - kwargs.setdefault("hidden_act", "gelu") kwargs.setdefault("hidden_dropout_prob", 0.1) kwargs.setdefault("attention_probs_dropout_prob", 0.1) - kwargs.setdefault("max_position_embeddings", 512) kwargs.setdefault("type_vocab_size", 16) kwargs.setdefault("type_sequence_label_size", 2) kwargs.setdefault("initializer_range", 0.02) kwargs.setdefault("num_labels", 3) kwargs.setdefault("num_choices", 4) - kwargs.setdefault("pad_token_id", 0) - kwargs.setdefault("bos_token_id", 1) - kwargs.setdefault("eos_token_id", 2) kwargs.setdefault("image_token_id", 3) kwargs.setdefault("is_decoder", False) - kwargs.setdefault("scope", None) - kwargs.setdefault("expert_interval", 1) - kwargs.setdefault("moe_layer_start_index", 0) - kwargs.setdefault("moe_intermediate_size", 12) - kwargs.setdefault("shared_expert_intermediate_size", 36) - kwargs.setdefault("shared_expert_gate", True) - kwargs.setdefault("moe_num_shared_experts", 2) - kwargs.setdefault("num_experts_per_tok", 2) - kwargs.setdefault("num_experts", 8) - kwargs.setdefault("mamba_n_groups", 1) - kwargs.setdefault("mamba_n_heads", 16) - kwargs.setdefault("mamba_d_state", 16) - kwargs.setdefault("mamba_d_conv", 4) - kwargs.setdefault("mamba_expand", 2) - kwargs.setdefault("mamba_chunk_size", 16) kwargs.setdefault("image_size", 8) kwargs.setdefault("patch_size", 4) kwargs.setdefault("num_channels", 3) kwargs.setdefault("projection_dim", 32) kwargs.setdefault("projector_hidden_act", "gelu") - kwargs.setdefault("ignore_index", -100) kwargs.setdefault("vision_feature_select_strategy", "default") kwargs.setdefault("vision_feature_layer", -1) kwargs.setdefault("tie_word_embeddings", False) - - # Computed defaults (can still be overridden in derived classes) - kwargs.setdefault("head_dim", kwargs["hidden_size"] // kwargs["num_attention_heads"]) kwargs.setdefault("num_image_tokens", (kwargs["image_size"] // kwargs["patch_size"]) ** 2) - kwargs.setdefault("seq_length", 7 + kwargs["num_image_tokens"]) - - # Set all kwargs as instance attributes - for key, value in kwargs.items(): - setattr(self, key, value) - - for required_attribute in [ - "base_model_class", - "config_class", - "conditional_generation_class", - "text_config_class", - "vision_config_class", - ]: - if getattr(self, required_attribute) is None: - raise ValueError( - f"You have inherited from VLMModelTester but did not set the {required_attribute} attribute." - ) - # Because VLMs have some different standards in how they handle image tokens, we need a few methods - # that can be overridden if required: + super().__init__(parent, **kwargs) + + # Computed default depending on base-class defaults for hidden_size / num_attention_heads. + if not hasattr(self, "head_dim"): + self.head_dim = self.hidden_size // self.num_attention_heads + + # -- Overridable VLM-specific hooks ------------------------------------------------------ def create_pixel_values(self): # Override to 5D for patch-based models return floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size], scale=1.0) - def create_attention_mask(self, input_ids): - # Override for bidirectional attention models like Gemma3 - return torch.tril(torch.ones_like(input_ids).to(torch_device)) - def place_image_tokens(self, input_ids, config): # Override if the image tokens shouldn't be placed at the start of the test sequence image_token_id = getattr(config, "image_token_id", self.image_token_id) @@ -166,111 +94,32 @@ def place_image_tokens(self, input_ids, config): input_ids[:, : self.num_image_tokens] = image_token_id return input_ids - def get_additional_inputs(self, config, input_ids, pixel_values): - # Override for model-specific inputs like LlavaNext's image_sizes - return {} - - # End of overridable methods - - def prepare_config_and_inputs_for_common(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - pixel_values = self.create_pixel_values() - - config = self.get_config() + # -- Hooks consumed by the shared base --------------------------------------------------- - special_tokens = [self.pad_token_id, self.bos_token_id, self.eos_token_id, self.image_token_id] - for i in range(self.vocab_size): - if i not in special_tokens: - # The smallest token ID that is not a special token - safe_token_id = i - break - else: - raise ValueError("vocab_size is too small and there is no token ID that is not a special token!") + @property + def _special_token_ids(self): + return super()._special_token_ids | {self.image_token_id} - # Avoid flaky tests, clear any special tokens in ids_tensor - # image_token_id is handled separately by place_image_tokens() - input_ids[input_ids == self.pad_token_id] = safe_token_id - input_ids[input_ids == self.eos_token_id] = safe_token_id + def _build_modality_sub_configs(self): + return {"vision_config": self.get_vision_config()} + def _prepare_modality_inputs(self, input_ids, config): + pixel_values = self.create_pixel_values() input_ids = self.place_image_tokens(input_ids, config) + return input_ids, {"pixel_values": pixel_values} - # Create attention mask with final input_ids (after image tokens are placed) - # This is important for models that use padding masks based on token values - input_mask = None - if self.use_input_mask: - input_mask = self.create_attention_mask(input_ids) - - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask, "pixel_values": pixel_values} - - additional_inputs = self.get_additional_inputs(config, input_ids, pixel_values) - inputs_dict.update(additional_inputs) - - return config, inputs_dict - - @property - def config_args(self): - return list(signature(self.config_class.__init__).parameters.keys()) - - @property - def text_config_args(self): - args = list(signature(self.text_config_class.__init__).parameters.keys()) - for token_arg in ["pad_token_id", "bos_token_id", "eos_token_id"]: # Not always explicitly in the sig - if token_arg not in args: - args.append(token_arg) - return args + # -- Vision sub-config construction ------------------------------------------------------ @property def vision_config_args(self): return list(signature(self.vision_config_class.__init__).parameters.keys()) - def get_config(self): - kwargs = {} - attribute_map = getattr(self.config_class, "attribute_map", {}) - model_name_to_common_name = {v: k for k, v in attribute_map.items()} - for k in self.config_args + self.forced_config_args: - if hasattr(self, k) and k != "self": - kwargs[k] = getattr(self, k) - elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]): - kwargs[k] = getattr(self, model_name_to_common_name[k]) - kwargs["text_config"] = self.get_text_config() - kwargs["vision_config"] = self.get_vision_config() - return self.config_class(**kwargs) - - def get_text_config(self): - kwargs = {} - attribute_map = getattr(self.text_config_class, "attribute_map", {}) - model_name_to_common_name = {v: k for k, v in attribute_map.items()} - for k in self.text_config_args: - if hasattr(self, k) and k != "self": - kwargs[k] = getattr(self, k) - elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]): - kwargs[k] = getattr(self, model_name_to_common_name[k]) - return self.text_config_class(**kwargs) - def get_vision_config(self): - kwargs = {} - attribute_map = getattr(self.vision_config_class, "attribute_map", {}) - model_name_to_common_name = {v: k for k, v in attribute_map.items()} - for k in self.vision_config_args: - if hasattr(self, k) and k != "self": - kwargs[k] = getattr(self, k) - elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]): - kwargs[k] = getattr(self, model_name_to_common_name[k]) + kwargs = self._collect_kwargs(self.vision_config_args, self.vision_config_class) return self.vision_config_class(**kwargs) - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = self.base_model_class(config=config) - model.to(torch_device) - model.eval() - model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - -@require_torch -class VLMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin): +class VLMModelTest(MultiModalModelTest): """ Base test class for Vision-Language Models. @@ -282,35 +131,6 @@ class VLMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin) - `pipeline_model_mapping`: Override if not using default from model_tester """ - model_tester_class = None - all_model_classes = None - pipeline_model_mapping = None - - # VLMs are always composite - _is_composite = True - - def setUp(self): - if self.model_tester_class is None: - raise ValueError("You have inherited from VLMModelTest but did not set the model_tester_class attribute.") - self.model_tester = self.model_tester_class(self) - self.config_tester = ConfigTester(self, config_class=self.model_tester.config_class, has_text_modality=False) - - if self.pipeline_model_mapping is None: - if self.all_model_classes is not None: - raise ValueError( - "Tests that inherit from `VLMModelTest` and set `all_model_classes` must manually set " - "`pipeline_model_mapping`." - ) - else: - self.pipeline_model_mapping = self.model_tester.pipeline_model_mapping - - if self.all_model_classes is None: - self.all_model_classes = self.model_tester.all_model_classes - - def test_config(self): - """Test config common functionality.""" - self.config_tester.run_common_tests() - def test_mismatching_num_image_tokens(self): """ Tests that VLMs throw an error with explicit message saying what is wrong diff --git a/utils/check_repo.py b/utils/check_repo.py index 3387f39d72ff..5a7484409e31 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -831,6 +831,24 @@ def find_tested_models(test_file: str) -> set[str]: continue model_tested.add(tested_class) + # Same as above, but for ALMModelTester. Audio-LMs typically only set `conditional_generation_class` + # (no base_model_class). `GraniteSpeechModelTester` is listed because `GraniteSpeechPlusForConditionalGenerationModelTester` + # uses `ALMModelTester` indirectly through it; in the future we may want to resolve inheritance properly. + audio_class_match = re.search(r"class \w+\((?:ALMModelTester|GraniteSpeechModelTester)\)", content) + if audio_class_match is not None: + audio_content = content[audio_class_match.start() :] + for test_class_type in [ + "config_class", + "conditional_generation_class", + "base_model_class", + "sequence_classification_class", + ]: + tested_class = re.findall(rf"{test_class_type}\s+=.*", audio_content) + if tested_class: + tested_class = tested_class[0].split("=")[1].strip() + if tested_class != "None": + model_tested.add(tested_class) + return model_tested