From 8e64ba2890bd3231916cddcec77ba6331c306031 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 12 Mar 2024 22:46:19 +0500 Subject: [PATCH] Add tests for batching support (#29297) * add tests for batching support * Update src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py Co-authored-by: Joao Gante * Update src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py Co-authored-by: Joao Gante * Update tests/test_modeling_common.py Co-authored-by: Joao Gante * Update tests/test_modeling_common.py Co-authored-by: Joao Gante * Update tests/test_modeling_common.py Co-authored-by: Joao Gante * fixes and comments * use cosine distance for conv models * skip mra model testing * Update tests/models/vilt/test_modeling_vilt.py Co-authored-by: Joao Gante * finzalize and make style * check model type by input names * Update tests/models/vilt/test_modeling_vilt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * fixed batch size for all testers * Revert "fixed batch size for all testers" This reverts commit 525f3a0a058f069fbda00352cf202b728d40df99. 
* add batch_size for all testers * dict from model output * do not skip layoutlm * bring back some code from git revert * Update tests/test_modeling_common.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/test_modeling_common.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * clean-up * where did minus go in tolerance * make whisper happy * deal with consequences of losing minus * deal with consequences of losing minus * maskformer needs its own test for happiness * fix more models * tag flaky CV models from Amy's approval * make codestyle --------- Co-authored-by: Joao Gante Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .../models/clipseg/modeling_clipseg.py | 2 +- .../models/encodec/modeling_encodec.py | 12 +-- .../models/funnel/modeling_funnel.py | 2 +- src/transformers/models/tvp/modeling_tvp.py | 2 +- src/transformers/models/yoso/modeling_yoso.py | 12 +-- tests/models/align/test_modeling_align.py | 1 + tests/models/altclip/test_modeling_altclip.py | 1 + .../autoformer/test_modeling_autoformer.py | 1 + tests/models/bark/test_modeling_bark.py | 6 +- tests/models/blip/test_modeling_blip.py | 4 + tests/models/blip_2/test_modeling_blip_2.py | 2 + .../test_modeling_chinese_clip.py | 1 + tests/models/clap/test_modeling_clap.py | 1 + tests/models/clip/test_modeling_clip.py | 1 + tests/models/clipseg/test_modeling_clipseg.py | 1 + tests/models/clvp/test_modeling_clvp.py | 1 + .../test_modeling_conditional_detr.py | 1 + tests/models/cpmant/test_modeling_cpmant.py | 2 +- tests/models/detr/test_modeling_detr.py | 1 + tests/models/dpt/test_modeling_dpt_hybrid.py | 6 +- tests/models/encodec/test_modeling_encodec.py | 15 ++- .../test_modeling_fastspeech2_conformer.py | 14 +++ tests/models/flava/test_modeling_flava.py | 1 + .../models/groupvit/test_modeling_groupvit.py | 1 + .../models/informer/test_modeling_informer.py | 4 + .../test_modeling_instructblip.py | 1 + 
tests/models/kosmos2/test_modeling_kosmos2.py | 1 + .../layoutlmv2/test_modeling_layoutlmv2.py | 59 ++++++++++++ .../longformer/test_modeling_longformer.py | 4 + .../maskformer/test_modeling_maskformer.py | 63 +++++++++++++ .../test_modeling_mobilenet_v2.py | 6 +- tests/models/mra/test_modeling_mra.py | 4 + .../models/musicgen/test_modeling_musicgen.py | 4 +- tests/models/owlv2/test_modeling_owlv2.py | 2 + tests/models/owlvit/test_modeling_owlvit.py | 2 + .../pix2struct/test_modeling_pix2struct.py | 1 + tests/models/siglip/test_modeling_siglip.py | 1 + .../models/speecht5/test_modeling_speecht5.py | 8 ++ .../test_modeling_table_transformer.py | 1 + .../test_modeling_time_series_transformer.py | 1 + tests/models/univnet/test_modeling_univnet.py | 42 ++------- tests/models/vilt/test_modeling_vilt.py | 6 ++ .../vit_hybrid/test_modeling_vit_hybrid.py | 6 +- tests/models/vit_mae/test_modeling_vit_mae.py | 4 + tests/models/vits/test_modeling_vits.py | 4 + tests/models/whisper/test_modeling_whisper.py | 7 +- tests/models/x_clip/test_modeling_x_clip.py | 1 + tests/test_modeling_common.py | 94 +++++++++++++++++++ 48 files changed, 350 insertions(+), 67 deletions(-) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index c0cf6b3b16..b250e09ad2 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -1292,7 +1292,7 @@ class CLIPSegDecoder(CLIPSegPreTrainedModel): batch_size = conditional_embeddings.shape[0] output = output.view(batch_size, output.shape[1], size, size) - logits = self.transposed_convolution(output).squeeze() + logits = self.transposed_convolution(output).squeeze(1) if not return_dict: return tuple(v for v in [logits, all_hidden_states, all_attentions] if v is not None) diff --git a/src/transformers/models/encodec/modeling_encodec.py b/src/transformers/models/encodec/modeling_encodec.py index 441f4a27d8..bf7503efb4 100644 
--- a/src/transformers/models/encodec/modeling_encodec.py +++ b/src/transformers/models/encodec/modeling_encodec.py @@ -51,13 +51,13 @@ ENCODEC_PRETRAINED_MODEL_ARCHIVE_LIST = [ class EncodecOutput(ModelOutput): """ Args: - audio_codes (`torch.FloatTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*): + audio_codes (`torch.LongTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*): Discret code embeddings computed using `model.encode`. audio_values (`torch.FlaotTensor` of shape `(batch_size, sequence_length)`, *optional*) Decoded audio values, obtained using the decoder part of Encodec. """ - audio_codes: torch.FloatTensor = None + audio_codes: torch.LongTensor = None audio_values: torch.FloatTensor = None @@ -65,13 +65,13 @@ class EncodecOutput(ModelOutput): class EncodecEncoderOutput(ModelOutput): """ Args: - audio_codes (`torch.FloatTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*): + audio_codes (`torch.LongTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*): Discret code embeddings computed using `model.encode`. audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*): Scaling factor for each `audio_codes` input. This is used to unscale each chunk of audio when decoding. """ - audio_codes: torch.FloatTensor = None + audio_codes: torch.LongTensor = None audio_scales: torch.FloatTensor = None @@ -514,7 +514,7 @@ ENCODEC_INPUTS_DOCSTRING = r""" The target bandwidth. Must be one of `config.target_bandwidths`. If `None`, uses the smallest possible bandwidth. bandwidth is represented as a thousandth of what it is, e.g. 6kbps bandwidth is represented as `bandwidth == 6.0` - audio_codes (`torch.FloatTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*): + audio_codes (`torch.LongTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*): Discret code embeddings computed using `model.encode`. 
audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*): Scaling factor for each `audio_codes` input. @@ -718,7 +718,7 @@ class EncodecModel(EncodecPreTrainedModel): trimmed. Args: - audio_codes (`torch.FloatTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*): + audio_codes (`torch.LongTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*): Discret code embeddings computed using `model.encode`. audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*): Scaling factor for each `audio_codes` input. diff --git a/src/transformers/models/funnel/modeling_funnel.py b/src/transformers/models/funnel/modeling_funnel.py index b822b67595..50f8df3743 100644 --- a/src/transformers/models/funnel/modeling_funnel.py +++ b/src/transformers/models/funnel/modeling_funnel.py @@ -776,7 +776,7 @@ class FunnelDiscriminatorPredictions(nn.Module): def forward(self, discriminator_hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.dense(discriminator_hidden_states) hidden_states = ACT2FN[self.config.hidden_act](hidden_states) - logits = self.dense_prediction(hidden_states).squeeze() + logits = self.dense_prediction(hidden_states).squeeze(-1) return logits diff --git a/src/transformers/models/tvp/modeling_tvp.py b/src/transformers/models/tvp/modeling_tvp.py index c80cc9df0b..159b4926af 100644 --- a/src/transformers/models/tvp/modeling_tvp.py +++ b/src/transformers/models/tvp/modeling_tvp.py @@ -679,7 +679,7 @@ class TvpFramePadPrompter(nn.Module): prompt = torch.cat([self.pad_left, base, self.pad_right], dim=4) prompt = torch.cat([self.pad_up, prompt, self.pad_down], dim=3) prompt = torch.cat(pixel_values.size(0) * [prompt]) - pixel_values += prompt.to(pixel_values.dtype) + pixel_values = pixel_values + prompt.to(pixel_values.dtype) return pixel_values diff --git a/src/transformers/models/yoso/modeling_yoso.py b/src/transformers/models/yoso/modeling_yoso.py index 5361adc3ed..41e34a6c66 100644 --- 
a/src/transformers/models/yoso/modeling_yoso.py +++ b/src/transformers/models/yoso/modeling_yoso.py @@ -371,10 +371,12 @@ class YosoSelfAttention(nn.Module): key_layer = key_layer.reshape(batch_size * num_heads, seq_len, head_dim) value_layer = value_layer.reshape(batch_size * num_heads, seq_len, head_dim) - # revert changes made by get_extended_attention_mask attention_mask = 1.0 + attention_mask / 10000.0 attention_mask = ( - attention_mask.squeeze().repeat(1, num_heads, 1).reshape(batch_size * num_heads, seq_len).int() + attention_mask.unsqueeze(1) + .repeat_interleave(num_heads, dim=1) + .reshape(batch_size * num_heads, seq_len) + .int() ) # The CUDA kernels are most efficient with inputs whose size is a multiple of a GPU's warp size (32). Inputs @@ -808,10 +810,6 @@ class YosoModel(YosoPreTrainedModel): else: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) - # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N @@ -827,7 +825,7 @@ class YosoModel(YosoPreTrainedModel): ) encoder_outputs = self.encoder( embedding_output, - attention_mask=extended_attention_mask, + attention_mask=attention_mask, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, diff --git a/tests/models/align/test_modeling_align.py b/tests/models/align/test_modeling_align.py index 99daeb816d..2f32978994 100644 --- a/tests/models/align/test_modeling_align.py +++ b/tests/models/align/test_modeling_align.py @@ -405,6 +405,7 @@ class AlignModelTester: self.parent = parent self.text_model_tester = AlignTextModelTester(parent, **text_kwargs) self.vision_model_tester = AlignVisionModelTester(parent, **vision_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test self.is_training = is_training def prepare_config_and_inputs(self): diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index 610a66f8ae..10b0e167d7 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -380,6 +380,7 @@ class AltCLIPModelTester: self.parent = parent self.text_model_tester = AltCLIPTextModelTester(parent, **text_kwargs) self.vision_model_tester = AltCLIPVisionModelTester(parent, **vision_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test self.is_training = is_training def prepare_config_and_inputs(self): diff --git a/tests/models/autoformer/test_modeling_autoformer.py b/tests/models/autoformer/test_modeling_autoformer.py index 965e5dcd87..265f5dd7b7 100644 --- a/tests/models/autoformer/test_modeling_autoformer.py +++ 
b/tests/models/autoformer/test_modeling_autoformer.py @@ -107,6 +107,7 @@ class AutoformerModelTester: cardinality=[self.cardinality], embedding_dimension=[self.embedding_dimension], moving_average=self.moving_average, + scaling="std", # we need std to get non-zero `loc` ) def prepare_autoformer_inputs_dict(self, config): diff --git a/tests/models/bark/test_modeling_bark.py b/tests/models/bark/test_modeling_bark.py index 1246fa5615..8744cb168f 100644 --- a/tests/models/bark/test_modeling_bark.py +++ b/tests/models/bark/test_modeling_bark.py @@ -67,7 +67,7 @@ class BarkSemanticModelTester: def __init__( self, parent, - batch_size=2, + batch_size=3, # need batch_size != num_hidden_layers seq_length=4, is_training=False, # for now training is not supported use_input_mask=True, @@ -203,7 +203,7 @@ class BarkCoarseModelTester: def __init__( self, parent, - batch_size=2, + batch_size=3, # need batch_size != num_hidden_layers seq_length=4, is_training=False, # for now training is not supported use_input_mask=True, @@ -339,7 +339,7 @@ class BarkFineModelTester: def __init__( self, parent, - batch_size=2, + batch_size=3, # need batch_size != num_hidden_layers seq_length=4, is_training=False, # for now training is not supported use_input_mask=True, diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py index 4e87dca58f..51f1690ff1 100644 --- a/tests/models/blip/test_modeling_blip.py +++ b/tests/models/blip/test_modeling_blip.py @@ -387,6 +387,7 @@ class BlipModelTester: self.parent = parent self.text_model_tester = BlipTextModelTester(parent, **text_kwargs) self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test self.is_training = is_training def prepare_config_and_inputs(self): @@ -596,6 +597,7 @@ class BlipTextRetrievalModelTester: self.parent = parent self.text_model_tester = BlipTextModelTester(parent, **text_kwargs) 
self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test self.is_training = is_training def prepare_config_and_inputs(self): @@ -643,6 +645,7 @@ class BlipTextImageModelsModelTester: self.parent = parent self.text_model_tester = BlipTextModelTester(parent, **text_kwargs) self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test self.is_training = is_training def prepare_config_and_inputs(self): @@ -691,6 +694,7 @@ class BlipVQAModelTester: self.parent = parent self.text_model_tester = BlipTextModelTester(parent, **text_kwargs) self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test self.is_training = is_training def prepare_config_and_inputs(self): diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index dd87961372..cffb7a1fe7 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -390,6 +390,7 @@ class Blip2ForConditionalGenerationDecoderOnlyModelTester: self.vision_model_tester = Blip2VisionModelTester(parent, **vision_kwargs) self.qformer_model_tester = Blip2QFormerModelTester(parent, **qformer_kwargs) self.text_model_tester = Blip2TextModelDecoderOnlyTester(parent, **text_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test self.is_training = is_training self.num_query_tokens = num_query_tokens @@ -616,6 +617,7 @@ class Blip2ModelTester: self.vision_model_tester = Blip2VisionModelTester(parent, **vision_kwargs) self.qformer_model_tester = Blip2QFormerModelTester(parent, **qformer_kwargs) self.text_model_tester = Blip2TextModelTester(parent, **text_kwargs) + self.batch_size = 
self.text_model_tester.batch_size # need bs for batching_equivalence test self.is_training = is_training self.num_query_tokens = num_query_tokens diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py index 8d0eb131e2..06c946bf10 100644 --- a/tests/models/chinese_clip/test_modeling_chinese_clip.py +++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py @@ -510,6 +510,7 @@ class ChineseCLIPModelTester: self.parent = parent self.text_model_tester = ChineseCLIPTextModelTester(parent, **text_kwargs) self.vision_model_tester = ChineseCLIPVisionModelTester(parent, **vision_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test self.is_training = is_training def prepare_config_and_inputs(self): diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 458290c921..fe3e8b0e54 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -466,6 +466,7 @@ class ClapModelTester: self.parent = parent self.text_model_tester = ClapTextModelTester(parent, **text_kwargs) self.audio_model_tester = ClapAudioModelTester(parent, **audio_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test self.is_training = is_training def prepare_config_and_inputs(self): diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index 2351f055b5..fbcb22575a 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -437,6 +437,7 @@ class CLIPModelTester: self.parent = parent self.text_model_tester = CLIPTextModelTester(parent, **text_kwargs) self.vision_model_tester = CLIPVisionModelTester(parent, **vision_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test self.is_training = is_training def 
prepare_config_and_inputs(self): diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index f8e05caa1e..8f3ab2b04f 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -388,6 +388,7 @@ class CLIPSegModelTester: self.parent = parent self.text_model_tester = CLIPSegTextModelTester(parent, **text_kwargs) self.vision_model_tester = CLIPSegVisionModelTester(parent, **vision_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test self.is_training = is_training self.extract_layers = extract_layers diff --git a/tests/models/clvp/test_modeling_clvp.py b/tests/models/clvp/test_modeling_clvp.py index e27d9e08eb..59e6c1be40 100644 --- a/tests/models/clvp/test_modeling_clvp.py +++ b/tests/models/clvp/test_modeling_clvp.py @@ -344,6 +344,7 @@ class ClvpModelForConditionalGenerationTester: self.parent = parent self.clvp_encoder_tester = ClvpEncoderTester(parent) self.is_training = is_training + self.batch_size = self.clvp_encoder_tester.batch_size # need bs for batching_equivalence test def get_config(self): decoder_config = ClvpDecoderConfig( diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index f297634a2e..d1152ed862 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -194,6 +194,7 @@ class ConditionalDetrModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline test_pruning = False test_head_masking = False test_missing_keys = False + zero_init_hidden_state = True # special case for head models def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): diff --git a/tests/models/cpmant/test_modeling_cpmant.py b/tests/models/cpmant/test_modeling_cpmant.py index 6ecfe15c2e..7a037becbf 100644 --- 
a/tests/models/cpmant/test_modeling_cpmant.py +++ b/tests/models/cpmant/test_modeling_cpmant.py @@ -57,7 +57,7 @@ class CpmAntModelTester: prompt_length=8, prompt_types=8, segment_types=8, - init_std=1.0, + init_std=0.02, return_dict=True, ): self.parent = parent diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py index 02159795e8..59b071e031 100644 --- a/tests/models/detr/test_modeling_detr.py +++ b/tests/models/detr/test_modeling_detr.py @@ -194,6 +194,7 @@ class DetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin test_pruning = False test_head_masking = False test_missing_keys = False + zero_init_hidden_state = True # special case for head models def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): diff --git a/tests/models/dpt/test_modeling_dpt_hybrid.py b/tests/models/dpt/test_modeling_dpt_hybrid.py index 13a0cf4db8..2a6e8429ab 100644 --- a/tests/models/dpt/test_modeling_dpt_hybrid.py +++ b/tests/models/dpt/test_modeling_dpt_hybrid.py @@ -19,7 +19,7 @@ import unittest from transformers import DPTConfig from transformers.file_utils import is_torch_available, is_vision_available -from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor @@ -306,6 +306,10 @@ class DPTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): with self.assertRaises(ValueError): _ = DPTForDepthEstimation(config) + @is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516") + def test_batching_equivalence(self): + super().test_batching_equivalence() + # We will verify our results on an image of cute cats def prepare_img(): diff --git 
a/tests/models/encodec/test_modeling_encodec.py b/tests/models/encodec/test_modeling_encodec.py index 8f1b06da06..0c021eaad2 100644 --- a/tests/models/encodec/test_modeling_encodec.py +++ b/tests/models/encodec/test_modeling_encodec.py @@ -33,11 +33,7 @@ from transformers.testing_utils import ( ) from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, -) +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -107,6 +103,15 @@ class EncodecModelTester: config, inputs_dict = self.prepare_config_and_inputs() return config, inputs_dict + def prepare_config_and_inputs_for_model_class(self, model_class): + config, inputs_dict = self.prepare_config_and_inputs() + inputs_dict["audio_codes"] = ids_tensor([1, self.batch_size, 1, self.num_channels], self.codebook_size).type( + torch.int32 + ) + inputs_dict["audio_scales"] = [None] + + return config, inputs_dict + def get_config(self): return EncodecConfig( audio_channels=self.num_channels, diff --git a/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py b/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py index ce6bc4218a..4cf104e693 100644 --- a/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py +++ b/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py @@ -347,6 +347,13 @@ class FastSpeech2ConformerModelTest(ModelTesterMixin, unittest.TestCase): def test_model_common_attributes(self): pass + @unittest.skip( + "FastSpeech2Conformer predicts durations in linear domain during inference" + "Even small differences on hidden states lead to different durations, due to `torch.round`" + ) + def test_batching_equivalence(self): + pass + @require_torch @require_g2p_en @@ -762,6 +769,13 @@ class FastSpeech2ConformerWithHifiGanTest(ModelTesterMixin, 
unittest.TestCase): def test_model_common_attributes(self): pass + @unittest.skip( + "FastSpeech2Conformer predicts durations in linear domain during inference" + "Even small differences on hidden states lead to different durations, due to `torch.round`" + ) + def test_batching_equivalence(self): + pass + @require_torch @require_g2p_en diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py index 2d22df3ce7..48200dd30c 100644 --- a/tests/models/flava/test_modeling_flava.py +++ b/tests/models/flava/test_modeling_flava.py @@ -836,6 +836,7 @@ class FlavaModelTester: self.projection_dim = projection_dim self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/groupvit/test_modeling_groupvit.py b/tests/models/groupvit/test_modeling_groupvit.py index 3d7f50ae6e..9f44c3d9ee 100644 --- a/tests/models/groupvit/test_modeling_groupvit.py +++ b/tests/models/groupvit/test_modeling_groupvit.py @@ -507,6 +507,7 @@ class GroupViTModelTester: self.parent = parent self.text_model_tester = GroupViTTextModelTester(parent, **text_kwargs) self.vision_model_tester = GroupViTVisionModelTester(parent, **vision_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test self.is_training = is_training def prepare_config_and_inputs(self): diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py index e68d10241d..f3ebe91ac5 100644 --- a/tests/models/informer/test_modeling_informer.py +++ b/tests/models/informer/test_modeling_informer.py @@ -279,6 +279,10 @@ class InformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase def test_determinism(self): pass + @unittest.skip("randomly selects U keys while calculating attentions") + def 
test_batching_equivalence(self): + pass + @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" ) diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index 0af427c358..ffc9c6eb0e 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -397,6 +397,7 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester: self.vision_model_tester = InstructBlipVisionModelTester(parent, **vision_kwargs) self.qformer_model_tester = InstructBlipQFormerModelTester(parent, **qformer_kwargs) self.text_model_tester = InstructBlipTextModelDecoderOnlyTester(parent, **text_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test self.is_training = is_training self.num_query_tokens = num_query_tokens diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index dd953eedc8..7fbb40e828 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -197,6 +197,7 @@ class Kosmos2ModelTester: self.parent = parent self.text_model_tester = Kosmos2TextModelTester(parent, **text_kwargs) self.vision_model_tester = Kosmos2VisionModelTester(parent, **vision_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test self.latent_query_num = latent_query_num self.is_training = is_training diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py index cffa09d6d0..f1a0cc6c43 100644 --- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py @@ -27,6 +27,7 @@ from ...test_pipeline_mixin import PipelineTesterMixin if is_torch_available(): import 
torch + import torch.nn.functional as F from transformers import ( LayoutLMv2Config, @@ -442,6 +443,64 @@ class LayoutLMv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) + def test_batching_equivalence(self): + def equivalence(tensor1, tensor2): + return 1.0 - F.cosine_similarity(tensor1.float().flatten(), tensor2.float().flatten(), dim=0, eps=0) + + def recursive_check(batched_object, single_row_object, model_name, key): + if isinstance(batched_object, (list, tuple)): + for batched_object_value, single_row_object_value in zip(batched_object, single_row_object): + recursive_check(batched_object_value, single_row_object_value, model_name, key) + elif batched_object is None: + return + else: + batched_row = batched_object[:1] + self.assertFalse( + torch.isnan(batched_row).any(), f"Batched output has `nan` in {model_name} for key={key}" + ) + self.assertFalse( + torch.isinf(batched_row).any(), f"Batched output has `inf` in {model_name} for key={key}" + ) + self.assertFalse( + torch.isnan(single_row_object).any(), f"Single row output has `nan` in {model_name} for key={key}" + ) + self.assertFalse( + torch.isinf(single_row_object).any(), f"Single row output has `inf` in {model_name} for key={key}" + ) + self.assertTrue( + (equivalence(batched_row, single_row_object)) <= 1e-03, + msg=( + f"Batched and Single row outputs are not equal in {model_name} for key={key}. " + f"Difference={equivalence(batched_row, single_row_object)}." 
+ ), + ) + + config, batched_input = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + config.output_hidden_states = True + + model_name = model_class.__name__ + batched_input_prepared = self._prepare_for_class(batched_input, model_class) + model = model_class(config).to(torch_device).eval() + batch_size = self.model_tester.batch_size + + single_row_input = {} + for key, value in batched_input_prepared.items(): + if isinstance(value, torch.Tensor) and value.shape[0] % batch_size == 0: + single_batch_shape = value.shape[0] // batch_size + single_row_input[key] = value[:single_batch_shape] + elif hasattr(value, "tensor"): + # layoutlmv2uses ImageList intead of pixel values (needs for torchscript) + single_row_input[key] = value.tensor[:single_batch_shape] + + with torch.no_grad(): + model_batched_output = model(**batched_input_prepared) + model_row_output = model(**single_row_input) + + for key in model_batched_output: + recursive_check(model_batched_output[key], model_row_output[key], model_name, key) + def prepare_layoutlmv2_batch_inputs(): # Here we prepare a batch of 2 sequences to test a LayoutLMv2 forward pass on: diff --git a/tests/models/longformer/test_modeling_longformer.py b/tests/models/longformer/test_modeling_longformer.py index 7edcd206ab..1ae3db4018 100644 --- a/tests/models/longformer/test_modeling_longformer.py +++ b/tests/models/longformer/test_modeling_longformer.py @@ -388,6 +388,10 @@ class LongformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa # longformer cannot keep gradients in attentions or hidden states return + @unittest.skip("LongFormer calculates global attn only when attn_mask has non-zero elements") + def test_batching_equivalence(self): + return + @require_torch @require_sentencepiece diff --git a/tests/models/maskformer/test_modeling_maskformer.py b/tests/models/maskformer/test_modeling_maskformer.py index d376216040..6ba48517c3 100644 --- 
a/tests/models/maskformer/test_modeling_maskformer.py +++ b/tests/models/maskformer/test_modeling_maskformer.py @@ -39,6 +39,7 @@ from ...test_pipeline_mixin import PipelineTesterMixin if is_torch_available(): import torch + import torch.nn.functional as F from transformers import MaskFormerForInstanceSegmentation, MaskFormerModel @@ -206,6 +207,7 @@ class MaskFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa test_pruning = False test_head_masking = False test_missing_keys = False + zero_init_hidden_state = True def setUp(self): self.model_tester = MaskFormerModelTester(self) @@ -381,6 +383,67 @@ class MaskFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa self.assertIsNotNone(outputs.auxiliary_logits) self.assertEqual(len(outputs.auxiliary_logits), self.model_tester.num_channels - 1) + def test_batching_equivalence(self): + def equivalence(tensor1, tensor2): + return 1.0 - F.cosine_similarity(tensor1.float().flatten(), tensor2.float().flatten(), dim=0, eps=0).max() + + def recursive_check(batched_object, single_row_object, model_name, key): + if isinstance(batched_object, (list, tuple)): + for batched_object_value, single_row_object_value in zip(batched_object, single_row_object): + recursive_check(batched_object_value, single_row_object_value, model_name, key) + elif batched_object is None: + return + else: + batched_row = batched_object[:1] + self.assertFalse( + torch.isnan(batched_row).any(), f"Batched output has `nan` in {model_name} for key={key}" + ) + self.assertFalse( + torch.isinf(batched_row).any(), f"Batched output has `inf` in {model_name} for key={key}" + ) + self.assertFalse( + torch.isnan(single_row_object).any(), f"Single row output has `nan` in {model_name} for key={key}" + ) + self.assertFalse( + torch.isinf(single_row_object).any(), f"Single row output has `inf` in {model_name} for key={key}" + ) + self.assertTrue( + (equivalence(batched_row, single_row_object)) <= 1e-03, + msg=( + f"Batched and Single 
row outputs are not equal in {model_name} for key={key}. " + f"Difference={equivalence(batched_row, single_row_object)}." + ), + ) + + config, batched_input = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + config.output_hidden_states = True + + model_name = model_class.__name__ + batched_input_prepared = self._prepare_for_class(batched_input, model_class) + model = model_class(config).to(torch_device).eval() + batch_size = self.model_tester.batch_size + + single_row_input = {} + for key, value in batched_input_prepared.items(): + single_batch_shape = value.shape[0] // batch_size + single_row_input[key] = value[:single_batch_shape] + + with torch.no_grad(): + model_batched_output = model(**batched_input_prepared) + model_row_output = model(**single_row_input) + + for key in model_batched_output: + # remove the first zero-init queries to decoder, otherwise cos_similarity = `nan` + # no need to check all hidden_states, already checked separately each one + if key == "transformer_decoder_hidden_states": + model_batched_output[key] = model_batched_output[key][1:] + model_row_output[key] = model_row_output[key][1:] + elif key == "hidden_states": + continue + recursive_check(model_batched_output[key], model_row_output[key], model_name, key) + TOLERANCE = 1e-4 diff --git a/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py b/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py index 75580bfdf2..17dfe452c2 100644 --- a/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py +++ b/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py @@ -18,7 +18,7 @@ import unittest from transformers import MobileNetV2Config -from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device from transformers.utils import cached_property, is_torch_available, is_vision_available from 
...test_configuration_common import ConfigTester @@ -271,6 +271,10 @@ class MobileNetV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC model = MobileNetV2Model.from_pretrained(model_name) self.assertIsNotNone(model) + @is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516") + def test_batching_equivalence(self): + super().test_batching_equivalence() + # We will verify our results on an image of cute cats def prepare_img(): diff --git a/tests/models/mra/test_modeling_mra.py b/tests/models/mra/test_modeling_mra.py index 02c61fa140..a1b4b4464c 100644 --- a/tests/models/mra/test_modeling_mra.py +++ b/tests/models/mra/test_modeling_mra.py @@ -378,6 +378,10 @@ class MraModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def test_training_gradient_checkpointing_use_reentrant_false(self): pass + @unittest.skip("Model has `nan` in hidden_states, see https://github.com/huggingface/transformers/issues/29373.") + def test_batching_equivalence(self): + pass + @require_torch class MraModelIntegrationTest(unittest.TestCase): diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py index b7952d27a7..cd978d8987 100644 --- a/tests/models/musicgen/test_modeling_musicgen.py +++ b/tests/models/musicgen/test_modeling_musicgen.py @@ -103,7 +103,7 @@ class MusicgenDecoderTester: def __init__( self, parent, - batch_size=2, + batch_size=3, # need batch_size != num_hidden_layers seq_length=7, is_training=False, use_labels=False, @@ -441,7 +441,7 @@ class MusicgenTester: def __init__( self, parent, - batch_size=2, + batch_size=3, # need batch_size != num_hidden_layers seq_length=7, is_training=False, use_labels=False, diff --git a/tests/models/owlv2/test_modeling_owlv2.py b/tests/models/owlv2/test_modeling_owlv2.py index 3dbcab2c93..74fbaa58d0 100644 --- a/tests/models/owlv2/test_modeling_owlv2.py +++ b/tests/models/owlv2/test_modeling_owlv2.py @@ -385,6 +385,7 @@ class 
Owlv2ModelTester: self.is_training = is_training self.text_config = self.text_model_tester.get_config().to_dict() self.vision_config = self.vision_model_tester.get_config().to_dict() + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test def prepare_config_and_inputs(self): text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() @@ -591,6 +592,7 @@ class Owlv2ForObjectDetectionTester: self.is_training = is_training self.text_config = self.text_model_tester.get_config().to_dict() self.vision_config = self.vision_model_tester.get_config().to_dict() + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test def prepare_config_and_inputs(self): text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py index e99eb736e8..1966aaeda2 100644 --- a/tests/models/owlvit/test_modeling_owlvit.py +++ b/tests/models/owlvit/test_modeling_owlvit.py @@ -381,6 +381,7 @@ class OwlViTModelTester: self.is_training = is_training self.text_config = self.text_model_tester.get_config().to_dict() self.vision_config = self.vision_model_tester.get_config().to_dict() + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test def prepare_config_and_inputs(self): text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() @@ -585,6 +586,7 @@ class OwlViTForObjectDetectionTester: self.is_training = is_training self.text_config = self.text_model_tester.get_config().to_dict() self.vision_config = self.vision_model_tester.get_config().to_dict() + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test def prepare_config_and_inputs(self): text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() diff --git 
a/tests/models/pix2struct/test_modeling_pix2struct.py b/tests/models/pix2struct/test_modeling_pix2struct.py index 204f726a24..0745362272 100644 --- a/tests/models/pix2struct/test_modeling_pix2struct.py +++ b/tests/models/pix2struct/test_modeling_pix2struct.py @@ -386,6 +386,7 @@ class Pix2StructModelTester: self.parent = parent self.text_model_tester = Pix2StructTextModelTester(parent, **text_kwargs) self.vision_model_tester = Pix2StructVisionModelTester(parent, **vision_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test self.is_training = is_training def prepare_config_and_inputs(self): diff --git a/tests/models/siglip/test_modeling_siglip.py b/tests/models/siglip/test_modeling_siglip.py index 438cc8b648..45212751a8 100644 --- a/tests/models/siglip/test_modeling_siglip.py +++ b/tests/models/siglip/test_modeling_siglip.py @@ -389,6 +389,7 @@ class SiglipModelTester: self.parent = parent self.text_model_tester = SiglipTextModelTester(parent, **text_kwargs) self.vision_model_tester = SiglipVisionModelTester(parent, **vision_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test self.is_training = is_training # Copied from tests.models.clip.test_modeling_clip.CLIPModelTester.prepare_config_and_inputs diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py index 7849b59d29..622ae196bd 100644 --- a/tests/models/speecht5/test_modeling_speecht5.py +++ b/tests/models/speecht5/test_modeling_speecht5.py @@ -916,6 +916,10 @@ class SpeechT5ForTextToSpeechTest(ModelTesterMixin, unittest.TestCase): def test_determinism(self): pass + @unittest.skip("skipped because there is always dropout in SpeechT5SpeechDecoderPrenet") + def test_batching_equivalence(self): + pass + def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -1438,6 +1442,10 @@ class 
SpeechT5ForSpeechToSpeechTest(ModelTesterMixin, unittest.TestCase): def test_determinism(self): pass + @unittest.skip("skipped because there is always dropout in SpeechT5SpeechDecoderPrenet") + def test_batching_equivalence(self): + pass + def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py index eb5e80c938..79da1d1910 100644 --- a/tests/models/table_transformer/test_modeling_table_transformer.py +++ b/tests/models/table_transformer/test_modeling_table_transformer.py @@ -209,6 +209,7 @@ class TableTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, Pipelin test_pruning = False test_head_masking = False test_missing_keys = False + zero_init_hidden_state = True # special case for head models def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index c5a3646a5b..330cf95d06 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -104,6 +104,7 @@ class TimeSeriesTransformerModelTester: num_static_categorical_features=1, cardinality=[self.cardinality], embedding_dimension=[self.embedding_dimension], + scaling="std", # we need std to get non-zero `loc` ) def prepare_time_series_transformer_inputs_dict(self, config): diff --git a/tests/models/univnet/test_modeling_univnet.py b/tests/models/univnet/test_modeling_univnet.py index b1512af284..88a610cfbb 100644 --- a/tests/models/univnet/test_modeling_univnet.py +++ b/tests/models/univnet/test_modeling_univnet.py @@ -66,13 +66,13 @@ class UnivNetModelTester: def 
prepare_noise_sequence(self): generator = torch.manual_seed(self.seed) - noise_shape = (self.seq_length, self.in_channels) + noise_shape = (self.batch_size, self.seq_length, self.in_channels) # Create noise on CPU for reproducibility noise_sequence = torch.randn(noise_shape, generator=generator, dtype=torch.float) return noise_sequence def prepare_config_and_inputs(self): - spectrogram = floats_tensor([self.seq_length, self.num_mel_bins], scale=1.0) + spectrogram = floats_tensor([self.batch_size, self.seq_length, self.num_mel_bins], scale=1.0) noise_sequence = self.prepare_noise_sequence() noise_sequence = noise_sequence.to(spectrogram.device) config = self.get_config() @@ -89,7 +89,7 @@ class UnivNetModelTester: def create_and_check_model(self, config, spectrogram, noise_sequence): model = UnivNetModel(config=config).to(torch_device).eval() result = model(spectrogram, noise_sequence)[0] - self.parent.assertEqual(result.shape, (1, self.seq_length * 256)) + self.parent.assertEqual(result.shape, (self.batch_size, self.seq_length * 256)) def prepare_config_and_inputs_for_common(self): config, spectrogram, noise_sequence = self.prepare_config_and_inputs() @@ -182,8 +182,8 @@ class UnivNetModelTest(ModelTesterMixin, unittest.TestCase): model.to(torch_device) model.eval() - batched_spectrogram = inputs["input_features"].unsqueeze(0).repeat(2, 1, 1) - batched_noise_sequence = inputs["noise_sequence"].unsqueeze(0).repeat(2, 1, 1) + batched_spectrogram = inputs["input_features"] + batched_noise_sequence = inputs["noise_sequence"] with torch.no_grad(): batched_outputs = model( batched_spectrogram.to(torch_device), @@ -205,37 +205,11 @@ class UnivNetModelTest(ModelTesterMixin, unittest.TestCase): model.eval() with torch.no_grad(): - outputs = model(inputs["input_features"].to(torch_device), inputs["noise_sequence"].to(torch_device))[ - 0 - ] + outputs = model( + inputs["input_features"][:1].to(torch_device), inputs["noise_sequence"][:1].to(torch_device) + )[0] 
self.assertTrue(outputs.shape[0] == 1, msg="Unbatched input should create batched output with bsz = 1") - def test_unbatched_batched_outputs_consistency(self): - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.to(torch_device) - model.eval() - - unbatched_spectrogram = inputs["input_features"].detach().clone() - unbatched_noise_sequence = inputs["noise_sequence"].detach().clone() - batched_spectrogram = inputs["input_features"].unsqueeze(0) - batched_noise_sequence = inputs["noise_sequence"].unsqueeze(0) - - with torch.no_grad(): - unbatched_outputs = model( - unbatched_spectrogram.to(torch_device), - unbatched_noise_sequence.to(torch_device), - )[0] - - batched_outputs = model( - batched_spectrogram.to(torch_device), - batched_noise_sequence.to(torch_device), - )[0] - - torch.testing.assert_close(unbatched_outputs, batched_outputs) - @require_torch_gpu @slow diff --git a/tests/models/vilt/test_modeling_vilt.py b/tests/models/vilt/test_modeling_vilt.py index f885afab08..afc883ef8f 100644 --- a/tests/models/vilt/test_modeling_vilt.py +++ b/tests/models/vilt/test_modeling_vilt.py @@ -345,6 +345,12 @@ class ViltModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def test_determinism(self): pass + @unittest.skip( + "VilT samples image tokens from a multinomial distribution, resulting in not deterministic hidden states" + ) + def test_batching_equivalence(self): + pass + @unittest.skip( reason="""VilT samples image tokens from a multinomial distribution, resulting in not deterministic hidden states""" diff --git a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py index 2a8b5087f3..e9fc3de258 100644 --- a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py +++ b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py @@ -18,7 +18,7 @@ import unittest from transformers import ViTHybridConfig 
-from transformers.testing_utils import require_accelerate, require_torch, require_vision, slow, torch_device +from transformers.testing_utils import is_flaky, require_accelerate, require_torch, require_vision, slow, torch_device from transformers.utils import cached_property, is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester @@ -221,6 +221,10 @@ class ViTHybridModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas model = ViTHybridModel.from_pretrained(model_name) self.assertIsNotNone(model) + @is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516") + def test_batching_equivalence(self): + super().test_batching_equivalence() + # We will verify our results on an image of cute cats def prepare_img(): diff --git a/tests/models/vit_mae/test_modeling_vit_mae.py b/tests/models/vit_mae/test_modeling_vit_mae.py index c1afc9694d..b5196f12bb 100644 --- a/tests/models/vit_mae/test_modeling_vit_mae.py +++ b/tests/models/vit_mae/test_modeling_vit_mae.py @@ -270,6 +270,10 @@ class ViTMAEModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def test_model_outputs_equivalence(self): pass + @unittest.skip(reason="ViTMAE returns a random mask + ids_restore in each forward pass") + def test_batching_equivalence(self): + pass + @slow def test_model_from_pretrained(self): for model_name in VIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: diff --git a/tests/models/vits/test_modeling_vits.py b/tests/models/vits/test_modeling_vits.py index c1c4117f7e..b83165aff4 100644 --- a/tests/models/vits/test_modeling_vits.py +++ b/tests/models/vits/test_modeling_vits.py @@ -216,6 +216,10 @@ class VitsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def test_determinism(self): pass + @unittest.skip("VITS is not deterministic") + def test_batching_equivalence(self): + pass + @is_flaky( max_attempts=3, description="Weight initialisation for the VITS conv layers sometimes exceeds the 
kaiming normal range", diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index db7c3ae82a..b79f3a2c0d 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -190,7 +190,7 @@ class WhisperModelTester: def __init__( self, parent, - batch_size=2, + batch_size=3, # need batch_size != num_hidden_layers seq_length=60, is_training=True, use_labels=False, @@ -1446,6 +1446,7 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi model = WhisperForConditionalGeneration(config).eval().to(torch_device) input_features = input_dict["input_features"].to(torch_device) + input_features = input_features[:2] # len = 250 with num_input_frames = 60 long_input_features = torch.cat([input_features.repeat(1, 1, 4), input_features[:, :, :10]], dim=-1) @@ -2626,7 +2627,7 @@ class WhisperEncoderModelTester: def __init__( self, parent, - batch_size=2, + batch_size=3, # need batch_size != num_hidden_layers seq_length=60, is_training=True, use_labels=True, @@ -2997,7 +2998,7 @@ class WhisperStandaloneDecoderModelTester: def __init__( self, parent, - batch_size=2, + batch_size=3, # need batch_size != num_hidden_layers is_training=True, use_labels=False, vocab_size=200, diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index db28b41c0b..bf8339c93e 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -479,6 +479,7 @@ class XCLIPModelTester: self.mit_hidden_size = mit_hidden_size self.text_model_tester = XCLIPTextModelTester(parent, **text_kwargs) self.vision_model_tester = XCLIPVisionModelTester(parent, **vision_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test self.is_training = is_training def prepare_config_and_inputs(self): diff --git a/tests/test_modeling_common.py 
b/tests/test_modeling_common.py index 6d4f0734cb..17865cf10f 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -99,6 +99,7 @@ if is_accelerate_available(): if is_torch_available(): import torch + import torch.nn.functional as F from safetensors.torch import load_file as safe_load_file from safetensors.torch import save_file as safe_save_file from torch import nn @@ -693,6 +694,99 @@ class ModelTesterMixin: expected_arg_names = [model.main_input_name] self.assertListEqual(arg_names[:1], expected_arg_names) + def test_batching_equivalence(self): + """ + Tests that the model supports batching and that the output is nearly the same for the same input in + different batch sizes. + (Why "nearly the same" not "exactly the same"? Batching uses different matmul shapes, which often leads to + different results: https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535) + """ + + def get_tensor_equivalence_function(batched_input): + # models operating on continuous spaces have higher abs difference than LMs + # instead, we can rely on cos distance for image/speech models, similar to `diffusers` + if "input_ids" not in batched_input: + return lambda tensor1, tensor2: ( + 1.0 - F.cosine_similarity(tensor1.float().flatten(), tensor2.float().flatten(), dim=0, eps=1e-38) + ) + return lambda tensor1, tensor2: torch.max(torch.abs(tensor1 - tensor2)) + + def recursive_check(batched_object, single_row_object, model_name, key): + if isinstance(batched_object, (list, tuple)): + for batched_object_value, single_row_object_value in zip(batched_object, single_row_object): + recursive_check(batched_object_value, single_row_object_value, model_name, key) + elif isinstance(batched_object, dict): + for batched_object_value, single_row_object_value in zip( + batched_object.values(), single_row_object.values() + ): + recursive_check(batched_object_value, single_row_object_value, model_name, key) + # do not compare returned loss (0-dim 
tensor) or codebook ids (int) + elif batched_object is None or isinstance(batched_object, int): + return + elif batched_object.dim() == 0: + return + else: + # indexing the first element does not always work + # e.g. models that output similarity scores of size (N, M) would need to index [0, 0] + slice_ids = [slice(0, index) for index in single_row_object.shape] + batched_row = batched_object[slice_ids] + self.assertFalse( + torch.isnan(batched_row).any(), f"Batched output has `nan` in {model_name} for key={key}" + ) + self.assertFalse( + torch.isinf(batched_row).any(), f"Batched output has `inf` in {model_name} for key={key}" + ) + self.assertFalse( + torch.isnan(single_row_object).any(), f"Single row output has `nan` in {model_name} for key={key}" + ) + self.assertFalse( + torch.isinf(single_row_object).any(), f"Single row output has `inf` in {model_name} for key={key}" + ) + self.assertTrue( + (equivalence(batched_row, single_row_object)) <= 1e-03, + msg=( + f"Batched and Single row outputs are not equal in {model_name} for key={key}. " + f"Difference={equivalence(batched_row, single_row_object)}." + ), + ) + + config, batched_input = self.model_tester.prepare_config_and_inputs_for_common() + equivalence = get_tensor_equivalence_function(batched_input) + + for model_class in self.all_model_classes: + config.output_hidden_states = True + + model_name = model_class.__name__ + if hasattr(self.model_tester, "prepare_config_and_inputs_for_model_class"): + config, batched_input = self.model_tester.prepare_config_and_inputs_for_model_class(model_class) + batched_input_prepared = self._prepare_for_class(batched_input, model_class) + model = model_class(config).to(torch_device).eval() + + batch_size = self.model_tester.batch_size + single_row_input = {} + for key, value in batched_input_prepared.items(): + if isinstance(value, torch.Tensor) and value.shape[0] % batch_size == 0: + # e.g. musicgen has inputs of size (bs*codebooks). 
in most cases value.shape[0] == batch_size + single_batch_shape = value.shape[0] // batch_size + single_row_input[key] = value[:single_batch_shape] + else: + single_row_input[key] = value + + with torch.no_grad(): + model_batched_output = model(**batched_input_prepared) + model_row_output = model(**single_row_input) + + if isinstance(model_batched_output, torch.Tensor): + model_batched_output = {"model_output": model_batched_output} + model_row_output = {"model_output": model_row_output} + + for key in model_batched_output: + # DETR starts from zero-init queries to decoder, leading to cos_similarity = `nan` + if hasattr(self, "zero_init_hidden_state") and "decoder_hidden_states" in key: + model_batched_output[key] = model_batched_output[key][1:] + model_row_output[key] = model_row_output[key][1:] + recursive_check(model_batched_output[key], model_row_output[key], model_name, key) + def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): if not self.model_tester.is_training: return