Add tests for batching support (#29297)

* add tests for batching support * Update src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update tests/test_modeling_common.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update tests/test_modeling_common.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update tests/test_modeling_common.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * fixes and comments * use cosine distance for conv models * skip mra model testing * Update tests/models/vilt/test_modeling_vilt.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * finzalize and make style * check model type by input names * Update tests/models/vilt/test_modeling_vilt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * fixed batch size for all testers * Revert "fixed batch size for all testers" This reverts commit 525f3a0a058f069fbda00352cf202b728d40df99. * add batch_size for all testers * dict from model output * do not skip layoutlm * bring back some code from git revert * Update tests/test_modeling_common.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/test_modeling_common.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * clean-up * where did minus go in tolerance * make whisper happy * deal with consequences of losing minus * deal with consequences of losing minus * maskformer needs its own test for happiness * fix more models * tag flaky CV models from Amy's approval * make codestyle --------- Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
2024-03-12 22:46:19 +05:00
parent 11163fff58
commit 8e64ba2890
48 changed files with 350 additions and 67 deletions
--- a/src/transformers/models/clipseg/modeling_clipseg.py
+++ b/src/transformers/models/clipseg/modeling_clipseg.py
@@ -1292,7 +1292,7 @@ class CLIPSegDecoder(CLIPSegPreTrainedModel):
        batch_size = conditional_embeddings.shape[0]
        output = output.view(batch_size, output.shape[1], size, size)
-        logits = self.transposed_convolution(output).squeeze()
+        logits = self.transposed_convolution(output).squeeze(1)
        if not return_dict:
            return tuple(v for v in [logits, all_hidden_states, all_attentions] if v is not None)
--- a/src/transformers/models/encodec/modeling_encodec.py
+++ b/src/transformers/models/encodec/modeling_encodec.py
@@ -51,13 +51,13 @@ ENCODEC_PRETRAINED_MODEL_ARCHIVE_LIST = [
 class EncodecOutput(ModelOutput):
    """
    Args:
-        audio_codes (`torch.FloatTensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
+        audio_codes (`torch.LongTensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
            Discret code embeddings computed using `model.encode`.
        audio_values (`torch.FlaotTensor` of shape `(batch_size, sequence_length)`, *optional*)
            Decoded audio values, obtained using the decoder part of Encodec.
    """
-    audio_codes: torch.FloatTensor = None
+    audio_codes: torch.LongTensor = None
    audio_values: torch.FloatTensor = None
@@ -65,13 +65,13 @@ class EncodecOutput(ModelOutput):
 class EncodecEncoderOutput(ModelOutput):
    """
    Args:
-        audio_codes (`torch.FloatTensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
+        audio_codes (`torch.LongTensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
            Discret code embeddings computed using `model.encode`.
        audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
            Scaling factor for each `audio_codes` input. This is used to unscale each chunk of audio when decoding.
    """
-    audio_codes: torch.FloatTensor = None
+    audio_codes: torch.LongTensor = None
    audio_scales: torch.FloatTensor = None
@@ -514,7 +514,7 @@ ENCODEC_INPUTS_DOCSTRING = r"""
            The target bandwidth. Must be one of `config.target_bandwidths`. If `None`, uses the smallest possible
            bandwidth. bandwidth is represented as a thousandth of what it is, e.g. 6kbps bandwidth is represented as
            `bandwidth == 6.0`
-        audio_codes (`torch.FloatTensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
+        audio_codes (`torch.LongTensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
            Discret code embeddings computed using `model.encode`.
        audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
            Scaling factor for each `audio_codes` input.
@@ -718,7 +718,7 @@ class EncodecModel(EncodecPreTrainedModel):
        trimmed.
        Args:
-            audio_codes (`torch.FloatTensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
+            audio_codes (`torch.LongTensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
                Discret code embeddings computed using `model.encode`.
            audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
                Scaling factor for each `audio_codes` input.
--- a/src/transformers/models/funnel/modeling_funnel.py
+++ b/src/transformers/models/funnel/modeling_funnel.py
@@ -776,7 +776,7 @@ class FunnelDiscriminatorPredictions(nn.Module):
    def forward(self, discriminator_hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(discriminator_hidden_states)
        hidden_states = ACT2FN[self.config.hidden_act](hidden_states)
-        logits = self.dense_prediction(hidden_states).squeeze()
+        logits = self.dense_prediction(hidden_states).squeeze(-1)
        return logits
--- a/src/transformers/models/tvp/modeling_tvp.py
+++ b/src/transformers/models/tvp/modeling_tvp.py
@@ -679,7 +679,7 @@ class TvpFramePadPrompter(nn.Module):
            prompt = torch.cat([self.pad_left, base, self.pad_right], dim=4)
            prompt = torch.cat([self.pad_up, prompt, self.pad_down], dim=3)
            prompt = torch.cat(pixel_values.size(0) * [prompt])
-            pixel_values += prompt.to(pixel_values.dtype)
+            pixel_values = pixel_values + prompt.to(pixel_values.dtype)
        return pixel_values
--- a/src/transformers/models/yoso/modeling_yoso.py
+++ b/src/transformers/models/yoso/modeling_yoso.py
@@ -371,10 +371,12 @@ class YosoSelfAttention(nn.Module):
        key_layer = key_layer.reshape(batch_size * num_heads, seq_len, head_dim)
        value_layer = value_layer.reshape(batch_size * num_heads, seq_len, head_dim)
        # revert changes made by get_extended_attention_mask
        attention_mask = 1.0 + attention_mask / 10000.0
        attention_mask = (
-            attention_mask.squeeze().repeat(1, num_heads, 1).reshape(batch_size * num_heads, seq_len).int()
+            attention_mask.unsqueeze(1)
            .repeat_interleave(num_heads, dim=1)
            .reshape(batch_size * num_heads, seq_len)
            .int()
        )
        # The CUDA kernels are most efficient with inputs whose size is a multiple of a GPU's warp size (32). Inputs
@@ -808,10 +810,6 @@ class YosoModel(YosoPreTrainedModel):
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
@@ -827,7 +825,7 @@ class YosoModel(YosoPreTrainedModel):
        )
        encoder_outputs = self.encoder(
            embedding_output,
-            attention_mask=extended_attention_mask,
+            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
--- a/tests/models/align/test_modeling_align.py
+++ b/tests/models/align/test_modeling_align.py
@@ -405,6 +405,7 @@ class AlignModelTester:
        self.parent = parent
        self.text_model_tester = AlignTextModelTester(parent, **text_kwargs)
        self.vision_model_tester = AlignVisionModelTester(parent, **vision_kwargs)
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
        self.is_training = is_training
    def prepare_config_and_inputs(self):
--- a/tests/models/altclip/test_modeling_altclip.py
+++ b/tests/models/altclip/test_modeling_altclip.py
@@ -380,6 +380,7 @@ class AltCLIPModelTester:
        self.parent = parent
        self.text_model_tester = AltCLIPTextModelTester(parent, **text_kwargs)
        self.vision_model_tester = AltCLIPVisionModelTester(parent, **vision_kwargs)
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
        self.is_training = is_training
    def prepare_config_and_inputs(self):
--- a/tests/models/autoformer/test_modeling_autoformer.py
+++ b/tests/models/autoformer/test_modeling_autoformer.py
@@ -107,6 +107,7 @@ class AutoformerModelTester:
            cardinality=[self.cardinality],
            embedding_dimension=[self.embedding_dimension],
            moving_average=self.moving_average,
            scaling="std",  # we need std to get non-zero `loc`
        )
    def prepare_autoformer_inputs_dict(self, config):
--- a/tests/models/bark/test_modeling_bark.py
+++ b/tests/models/bark/test_modeling_bark.py
@@ -67,7 +67,7 @@ class BarkSemanticModelTester:
    def __init__(
        self,
        parent,
-        batch_size=2,
+        batch_size=3,  # need batch_size != num_hidden_layers
        seq_length=4,
        is_training=False,  # for now training is not supported
        use_input_mask=True,
@@ -203,7 +203,7 @@ class BarkCoarseModelTester:
    def __init__(
        self,
        parent,
-        batch_size=2,
+        batch_size=3,  # need batch_size != num_hidden_layers
        seq_length=4,
        is_training=False,  # for now training is not supported
        use_input_mask=True,
@@ -339,7 +339,7 @@ class BarkFineModelTester:
    def __init__(
        self,
        parent,
-        batch_size=2,
+        batch_size=3,  # need batch_size != num_hidden_layers
        seq_length=4,
        is_training=False,  # for now training is not supported
        use_input_mask=True,
--- a/tests/models/blip/test_modeling_blip.py
+++ b/tests/models/blip/test_modeling_blip.py
@@ -387,6 +387,7 @@ class BlipModelTester:
        self.parent = parent
        self.text_model_tester = BlipTextModelTester(parent, **text_kwargs)
        self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs)
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
        self.is_training = is_training
    def prepare_config_and_inputs(self):
@@ -596,6 +597,7 @@ class BlipTextRetrievalModelTester:
        self.parent = parent
        self.text_model_tester = BlipTextModelTester(parent, **text_kwargs)
        self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs)
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
        self.is_training = is_training
    def prepare_config_and_inputs(self):
@@ -643,6 +645,7 @@ class BlipTextImageModelsModelTester:
        self.parent = parent
        self.text_model_tester = BlipTextModelTester(parent, **text_kwargs)
        self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs)
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
        self.is_training = is_training
    def prepare_config_and_inputs(self):
@@ -691,6 +694,7 @@ class BlipVQAModelTester:
        self.parent = parent
        self.text_model_tester = BlipTextModelTester(parent, **text_kwargs)
        self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs)
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
        self.is_training = is_training
    def prepare_config_and_inputs(self):
--- a/tests/models/blip_2/test_modeling_blip_2.py
+++ b/tests/models/blip_2/test_modeling_blip_2.py
@@ -390,6 +390,7 @@ class Blip2ForConditionalGenerationDecoderOnlyModelTester:
        self.vision_model_tester = Blip2VisionModelTester(parent, **vision_kwargs)
        self.qformer_model_tester = Blip2QFormerModelTester(parent, **qformer_kwargs)
        self.text_model_tester = Blip2TextModelDecoderOnlyTester(parent, **text_kwargs)
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
        self.is_training = is_training
        self.num_query_tokens = num_query_tokens
@@ -616,6 +617,7 @@ class Blip2ModelTester:
        self.vision_model_tester = Blip2VisionModelTester(parent, **vision_kwargs)
        self.qformer_model_tester = Blip2QFormerModelTester(parent, **qformer_kwargs)
        self.text_model_tester = Blip2TextModelTester(parent, **text_kwargs)
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
        self.is_training = is_training
        self.num_query_tokens = num_query_tokens
--- a/tests/models/chinese_clip/test_modeling_chinese_clip.py
+++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py
@@ -510,6 +510,7 @@ class ChineseCLIPModelTester:
        self.parent = parent
        self.text_model_tester = ChineseCLIPTextModelTester(parent, **text_kwargs)
        self.vision_model_tester = ChineseCLIPVisionModelTester(parent, **vision_kwargs)
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
        self.is_training = is_training
    def prepare_config_and_inputs(self):
--- a/tests/models/clap/test_modeling_clap.py
+++ b/tests/models/clap/test_modeling_clap.py
@@ -466,6 +466,7 @@ class ClapModelTester:
        self.parent = parent
        self.text_model_tester = ClapTextModelTester(parent, **text_kwargs)
        self.audio_model_tester = ClapAudioModelTester(parent, **audio_kwargs)
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
        self.is_training = is_training
    def prepare_config_and_inputs(self):
--- a/tests/models/clip/test_modeling_clip.py
+++ b/tests/models/clip/test_modeling_clip.py
@@ -437,6 +437,7 @@ class CLIPModelTester:
        self.parent = parent
        self.text_model_tester = CLIPTextModelTester(parent, **text_kwargs)
        self.vision_model_tester = CLIPVisionModelTester(parent, **vision_kwargs)
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
        self.is_training = is_training
    def prepare_config_and_inputs(self):
--- a/tests/models/clipseg/test_modeling_clipseg.py
+++ b/tests/models/clipseg/test_modeling_clipseg.py
@@ -388,6 +388,7 @@ class CLIPSegModelTester:
        self.parent = parent
        self.text_model_tester = CLIPSegTextModelTester(parent, **text_kwargs)
        self.vision_model_tester = CLIPSegVisionModelTester(parent, **vision_kwargs)
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
        self.is_training = is_training
        self.extract_layers = extract_layers
--- a/tests/models/clvp/test_modeling_clvp.py
+++ b/tests/models/clvp/test_modeling_clvp.py
@@ -344,6 +344,7 @@ class ClvpModelForConditionalGenerationTester:
        self.parent = parent
        self.clvp_encoder_tester = ClvpEncoderTester(parent)
        self.is_training = is_training
        self.batch_size = self.clvp_encoder_tester.batch_size  # need bs for batching_equivalence test
    def get_config(self):
        decoder_config = ClvpDecoderConfig(
--- a/tests/models/conditional_detr/test_modeling_conditional_detr.py
+++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py
@@ -194,6 +194,7 @@ class ConditionalDetrModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline
    test_pruning = False
    test_head_masking = False
    test_missing_keys = False
    zero_init_hidden_state = True
    # special case for head models
    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
--- a/tests/models/cpmant/test_modeling_cpmant.py
+++ b/tests/models/cpmant/test_modeling_cpmant.py
@@ -57,7 +57,7 @@ class CpmAntModelTester:
        prompt_length=8,
        prompt_types=8,
        segment_types=8,
-        init_std=1.0,
+        init_std=0.02,
        return_dict=True,
    ):
        self.parent = parent
--- a/tests/models/detr/test_modeling_detr.py
+++ b/tests/models/detr/test_modeling_detr.py
@@ -194,6 +194,7 @@ class DetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
    test_pruning = False
    test_head_masking = False
    test_missing_keys = False
    zero_init_hidden_state = True
    # special case for head models
    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
--- a/tests/models/dpt/test_modeling_dpt_hybrid.py
+++ b/tests/models/dpt/test_modeling_dpt_hybrid.py
@@ -19,7 +19,7 @@ import unittest
 from transformers import DPTConfig
 from transformers.file_utils import is_torch_available, is_vision_available
-from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
@@ -306,6 +306,10 @@ class DPTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
        with self.assertRaises(ValueError):
            _ = DPTForDepthEstimation(config)
    @is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516")
    def test_batching_equivalence(self):
        """Run the inherited batching-equivalence test under the `is_flaky` decorator.

        The override adds no logic of its own: it only delegates to the parent
        implementation. The decorator's description links the tracking issue.
        """
        super().test_batching_equivalence()
 # We will verify our results on an image of cute cats
 def prepare_img():
--- a/tests/models/encodec/test_modeling_encodec.py
+++ b/tests/models/encodec/test_modeling_encodec.py
@@ -33,11 +33,7 @@ from transformers.testing_utils import (
 )
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import (
+from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
    ModelTesterMixin,
    _config_zero_init,
    floats_tensor,
 )
 from ...test_pipeline_mixin import PipelineTesterMixin
@@ -107,6 +103,15 @@ class EncodecModelTester:
        config, inputs_dict = self.prepare_config_and_inputs()
        return config, inputs_dict
    def prepare_config_and_inputs_for_model_class(self, model_class):
        """Build the config and inputs, extended with `audio_codes` and `audio_scales`.

        Starts from `prepare_config_and_inputs()` and adds two entries to the
        inputs dict:

        - `"audio_codes"`: an int32 tensor of shape
          `[1, batch_size, 1, num_channels]` produced by `ids_tensor` with
          `codebook_size` as its second argument (presumably the exclusive
          upper bound of the generated ids -- confirm against `ids_tensor`).
        - `"audio_scales"`: the single-element list `[None]`.

        `model_class` is accepted but not used by this implementation.

        Returns:
            The `(config, inputs_dict)` pair.
        """
        config, inputs_dict = self.prepare_config_and_inputs()

        codes_shape = [1, self.batch_size, 1, self.num_channels]
        audio_codes = ids_tensor(codes_shape, self.codebook_size)
        inputs_dict["audio_codes"] = audio_codes.type(torch.int32)
        inputs_dict["audio_scales"] = [None]

        return config, inputs_dict
    def get_config(self):
        return EncodecConfig(
            audio_channels=self.num_channels,
--- a/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py
+++ b/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py
@@ -347,6 +347,13 @@ class FastSpeech2ConformerModelTest(ModelTesterMixin, unittest.TestCase):
    def test_model_common_attributes(self):
        pass
    @unittest.skip(
        "FastSpeech2Conformer predicts durations in linear domain during inference"
        "Even small differences on hidden states lead to different durations, due to `torch.round`"
    )
    def test_batching_equivalence(self):
        pass
@require_torch
@require_g2p_en
@@ -762,6 +769,13 @@ class FastSpeech2ConformerWithHifiGanTest(ModelTesterMixin, unittest.TestCase):
    def test_model_common_attributes(self):
        pass
    @unittest.skip(
        "FastSpeech2Conformer predicts durations in linear domain during inference"
        "Even small differences on hidden states lead to different durations, due to `torch.round`"
    )
    def test_batching_equivalence(self):
        pass
@require_torch
@require_g2p_en
--- a/tests/models/flava/test_modeling_flava.py
+++ b/tests/models/flava/test_modeling_flava.py
@@ -836,6 +836,7 @@ class FlavaModelTester:
        self.projection_dim = projection_dim
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
    def test_config(self):
        self.config_tester.run_common_tests()
--- a/tests/models/groupvit/test_modeling_groupvit.py
+++ b/tests/models/groupvit/test_modeling_groupvit.py
@@ -507,6 +507,7 @@ class GroupViTModelTester:
        self.parent = parent
        self.text_model_tester = GroupViTTextModelTester(parent, **text_kwargs)
        self.vision_model_tester = GroupViTVisionModelTester(parent, **vision_kwargs)
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
        self.is_training = is_training
    def prepare_config_and_inputs(self):
--- a/tests/models/informer/test_modeling_informer.py
+++ b/tests/models/informer/test_modeling_informer.py
@@ -279,6 +279,10 @@ class InformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
    def test_determinism(self):
        pass
    @unittest.skip("randomly selects U keys while calculating attentions")
    def test_batching_equivalence(self):
        """Skipped override: per the skip reason, U keys are selected randomly while calculating attentions."""
        pass
    @unittest.skip(
        reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
    )
--- a/tests/models/instructblip/test_modeling_instructblip.py
+++ b/tests/models/instructblip/test_modeling_instructblip.py
@@ -397,6 +397,7 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester:
        self.vision_model_tester = InstructBlipVisionModelTester(parent, **vision_kwargs)
        self.qformer_model_tester = InstructBlipQFormerModelTester(parent, **qformer_kwargs)
        self.text_model_tester = InstructBlipTextModelDecoderOnlyTester(parent, **text_kwargs)
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
        self.is_training = is_training
        self.num_query_tokens = num_query_tokens
--- a/tests/models/kosmos2/test_modeling_kosmos2.py
+++ b/tests/models/kosmos2/test_modeling_kosmos2.py
@@ -197,6 +197,7 @@ class Kosmos2ModelTester:
        self.parent = parent
        self.text_model_tester = Kosmos2TextModelTester(parent, **text_kwargs)
        self.vision_model_tester = Kosmos2VisionModelTester(parent, **vision_kwargs)
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
        self.latent_query_num = latent_query_num
        self.is_training = is_training
--- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
@@ -27,6 +27,7 @@ from ...test_pipeline_mixin import PipelineTesterMixin
 if is_torch_available():
    import torch
    import torch.nn.functional as F
    from transformers import (
        LayoutLMv2Config,
@@ -442,6 +443,64 @@ class LayoutLMv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                    )
    def test_batching_equivalence(self):
        """Check that a batched forward pass and a single-row forward pass agree.

        For every class in `self.all_model_classes`, the model is run once on the
        full prepared batch and once on an input reduced to its first row(s). Each
        output entry (recursing into lists/tuples, ignoring `None`) must be free
        of `nan`/`inf`, and the first row of the batched output must lie within a
        cosine distance of 1e-3 of the single-row output.
        """

        def equivalence(tensor1, tensor2):
            # Cosine distance between the two flattened tensors.
            return 1.0 - F.cosine_similarity(tensor1.float().flatten(), tensor2.float().flatten(), dim=0, eps=0)

        def recursive_check(batched_object, single_row_object, model_name, key):
            if isinstance(batched_object, (list, tuple)):
                for batched_object_value, single_row_object_value in zip(batched_object, single_row_object):
                    recursive_check(batched_object_value, single_row_object_value, model_name, key)
            elif batched_object is None:
                return
            else:
                batched_row = batched_object[:1]
                self.assertFalse(
                    torch.isnan(batched_row).any(), f"Batched output has `nan` in {model_name} for key={key}"
                )
                self.assertFalse(
                    torch.isinf(batched_row).any(), f"Batched output has `inf` in {model_name} for key={key}"
                )
                self.assertFalse(
                    torch.isnan(single_row_object).any(), f"Single row output has `nan` in {model_name} for key={key}"
                )
                self.assertFalse(
                    torch.isinf(single_row_object).any(), f"Single row output has `inf` in {model_name} for key={key}"
                )
                # Compute the distance once and reuse it for the check and the message.
                difference = equivalence(batched_row, single_row_object)
                self.assertTrue(
                    difference <= 1e-03,
                    msg=(
                        f"Batched and Single row outputs are not equal in {model_name} for key={key}. "
                        f"Difference={difference}."
                    ),
                )

        config, batched_input = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            config.output_hidden_states = True

            model_name = model_class.__name__
            batched_input_prepared = self._prepare_for_class(batched_input, model_class)
            model = model_class(config).to(torch_device).eval()
            batch_size = self.model_tester.batch_size

            single_row_input = {}
            for key, value in batched_input_prepared.items():
                if isinstance(value, torch.Tensor) and value.shape[0] % batch_size == 0:
                    single_batch_shape = value.shape[0] // batch_size
                    single_row_input[key] = value[:single_batch_shape]
                elif hasattr(value, "tensor"):
                    # LayoutLMv2 uses an ImageList instead of pixel values (needed for torchscript).
                    # Derive the slice length from this value's own tensor rather than relying on
                    # a variable left over from a previous iteration (unbound if this key is first).
                    single_batch_shape = value.tensor.shape[0] // batch_size
                    single_row_input[key] = value.tensor[:single_batch_shape]

            with torch.no_grad():
                model_batched_output = model(**batched_input_prepared)
                model_row_output = model(**single_row_input)

            for key in model_batched_output:
                recursive_check(model_batched_output[key], model_row_output[key], model_name, key)
 def prepare_layoutlmv2_batch_inputs():
    # Here we prepare a batch of 2 sequences to test a LayoutLMv2 forward pass on:
--- a/tests/models/longformer/test_modeling_longformer.py
+++ b/tests/models/longformer/test_modeling_longformer.py
@@ -388,6 +388,10 @@ class LongformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
        # longformer cannot keep gradients in attentions or hidden states
        return
    @unittest.skip("LongFormer calculates global attn only when attn_mask has non-zero elements")
    def test_batching_equivalence(self):
        """Skipped override; the reason passed to `unittest.skip` explains why the batching check is not run."""
        return
@require_torch
@require_sentencepiece
--- a/tests/models/maskformer/test_modeling_maskformer.py
+++ b/tests/models/maskformer/test_modeling_maskformer.py
@@ -39,6 +39,7 @@ from ...test_pipeline_mixin import PipelineTesterMixin
 if is_torch_available():
    import torch
    import torch.nn.functional as F
    from transformers import MaskFormerForInstanceSegmentation, MaskFormerModel
@@ -206,6 +207,7 @@ class MaskFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
    test_pruning = False
    test_head_masking = False
    test_missing_keys = False
    zero_init_hidden_state = True
    def setUp(self):
        self.model_tester = MaskFormerModelTester(self)
@@ -381,6 +383,67 @@ class MaskFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
            self.assertIsNotNone(outputs.auxiliary_logits)
            self.assertEqual(len(outputs.auxiliary_logits), self.model_tester.num_channels - 1)
    def test_batching_equivalence(self):
        """Compare batched and single-row forward passes using cosine distance.

        Each model class is run on the full prepared batch and on an input cut
        down to its first row(s). Output entries must contain no `nan`/`inf`, and
        the first batched row must be within a cosine distance of 1e-3 of the
        single-row result. The `hidden_states` entry is not compared here, and the
        first element of `transformer_decoder_hidden_states` is dropped first.
        """

        def equivalence(tensor1, tensor2):
            flat_1 = tensor1.float().flatten()
            flat_2 = tensor2.float().flatten()
            return 1.0 - F.cosine_similarity(flat_1, flat_2, dim=0, eps=0).max()

        def recursive_check(batched_object, single_row_object, model_name, key):
            if isinstance(batched_object, (list, tuple)):
                for batched_item, single_item in zip(batched_object, single_row_object):
                    recursive_check(batched_item, single_item, model_name, key)
                return
            if batched_object is None:
                return

            batched_row = batched_object[:1]
            # Same order as before: batched nan/inf first, then single-row nan/inf.
            for label, tensor in (("Batched", batched_row), ("Single row", single_row_object)):
                self.assertFalse(torch.isnan(tensor).any(), f"{label} output has `nan` in {model_name} for key={key}")
                self.assertFalse(torch.isinf(tensor).any(), f"{label} output has `inf` in {model_name} for key={key}")

            distance = equivalence(batched_row, single_row_object)
            failure_message = (
                f"Batched and Single row outputs are not equal in {model_name} for key={key}. "
                f"Difference={distance}."
            )
            self.assertTrue(distance <= 1e-03, msg=failure_message)

        config, batched_input = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            config.output_hidden_states = True

            model_name = model_class.__name__
            batched_input_prepared = self._prepare_for_class(batched_input, model_class)
            model = model_class(config).to(torch_device).eval()
            batch_size = self.model_tester.batch_size

            # Keep only the leading `shape[0] // batch_size` entries of every input.
            single_row_input = {
                name: tensor[: tensor.shape[0] // batch_size] for name, tensor in batched_input_prepared.items()
            }

            with torch.no_grad():
                model_batched_output = model(**batched_input_prepared)
                model_row_output = model(**single_row_input)

            for key in model_batched_output:
                # no need to check all hidden_states, already checked separately each one
                if key == "hidden_states":
                    continue

                batched_value = model_batched_output[key]
                row_value = model_row_output[key]
                if key == "transformer_decoder_hidden_states":
                    # remove the first zero-init queries to decoder, otherwise cos_similarity = `nan`
                    batched_value = batched_value[1:]
                    row_value = row_value[1:]

                recursive_check(batched_value, row_value, model_name, key)
 TOLERANCE = 1e-4
--- a/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py
+++ b/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py
@@ -18,7 +18,7 @@
 import unittest
 from transformers import MobileNetV2Config
-from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device
 from transformers.utils import cached_property, is_torch_available, is_vision_available
 from ...test_configuration_common import ConfigTester
@@ -271,6 +271,10 @@ class MobileNetV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
            model = MobileNetV2Model.from_pretrained(model_name)
            self.assertIsNotNone(model)
    @is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516")
    def test_batching_equivalence(self):
        # Re-declared only to attach `is_flaky` (tracked in the linked issue); the check itself is the
        # shared implementation reached through `super()`.
        super().test_batching_equivalence()
 # We will verify our results on an image of cute cats
 def prepare_img():
--- a/tests/models/mra/test_modeling_mra.py
+++ b/tests/models/mra/test_modeling_mra.py
@@ -378,6 +378,10 @@ class MraModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    def test_training_gradient_checkpointing_use_reentrant_false(self):
        pass
    @unittest.skip("Model has `nan` in hidden_states, see https://github.com/huggingface/transformers/issues/29373.")
    def test_batching_equivalence(self):
        # Intentionally empty override: the skip decorator disables the inherited batching test for this model.
        pass
@require_torch
 class MraModelIntegrationTest(unittest.TestCase):
--- a/tests/models/musicgen/test_modeling_musicgen.py
+++ b/tests/models/musicgen/test_modeling_musicgen.py
@@ -103,7 +103,7 @@ class MusicgenDecoderTester:
    def __init__(
        self,
        parent,
-        batch_size=2,
+        batch_size=3,  # need batch_size != num_hidden_layers
        seq_length=7,
        is_training=False,
        use_labels=False,
@@ -441,7 +441,7 @@ class MusicgenTester:
    def __init__(
        self,
        parent,
-        batch_size=2,
+        batch_size=3,  # need batch_size != num_hidden_layers
        seq_length=7,
        is_training=False,
        use_labels=False,
--- a/tests/models/owlv2/test_modeling_owlv2.py
+++ b/tests/models/owlv2/test_modeling_owlv2.py
@@ -385,6 +385,7 @@ class Owlv2ModelTester:
        self.is_training = is_training
        self.text_config = self.text_model_tester.get_config().to_dict()
        self.vision_config = self.vision_model_tester.get_config().to_dict()
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
    def prepare_config_and_inputs(self):
        text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
@@ -591,6 +592,7 @@ class Owlv2ForObjectDetectionTester:
        self.is_training = is_training
        self.text_config = self.text_model_tester.get_config().to_dict()
        self.vision_config = self.vision_model_tester.get_config().to_dict()
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
    def prepare_config_and_inputs(self):
        text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
--- a/tests/models/owlvit/test_modeling_owlvit.py
+++ b/tests/models/owlvit/test_modeling_owlvit.py
@@ -381,6 +381,7 @@ class OwlViTModelTester:
        self.is_training = is_training
        self.text_config = self.text_model_tester.get_config().to_dict()
        self.vision_config = self.vision_model_tester.get_config().to_dict()
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
    def prepare_config_and_inputs(self):
        text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
@@ -585,6 +586,7 @@ class OwlViTForObjectDetectionTester:
        self.is_training = is_training
        self.text_config = self.text_model_tester.get_config().to_dict()
        self.vision_config = self.vision_model_tester.get_config().to_dict()
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
    def prepare_config_and_inputs(self):
        text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
--- a/tests/models/pix2struct/test_modeling_pix2struct.py
+++ b/tests/models/pix2struct/test_modeling_pix2struct.py
@@ -386,6 +386,7 @@ class Pix2StructModelTester:
        self.parent = parent
        self.text_model_tester = Pix2StructTextModelTester(parent, **text_kwargs)
        self.vision_model_tester = Pix2StructVisionModelTester(parent, **vision_kwargs)
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
        self.is_training = is_training
    def prepare_config_and_inputs(self):
--- a/tests/models/siglip/test_modeling_siglip.py
+++ b/tests/models/siglip/test_modeling_siglip.py
@@ -389,6 +389,7 @@ class SiglipModelTester:
        self.parent = parent
        self.text_model_tester = SiglipTextModelTester(parent, **text_kwargs)
        self.vision_model_tester = SiglipVisionModelTester(parent, **vision_kwargs)
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
        self.is_training = is_training
    # Copied from tests.models.clip.test_modeling_clip.CLIPModelTester.prepare_config_and_inputs
--- a/tests/models/speecht5/test_modeling_speecht5.py
+++ b/tests/models/speecht5/test_modeling_speecht5.py
@@ -916,6 +916,10 @@ class SpeechT5ForTextToSpeechTest(ModelTesterMixin, unittest.TestCase):
    def test_determinism(self):
        pass
    @unittest.skip("skipped because there is always dropout in SpeechT5SpeechDecoderPrenet")
    def test_batching_equivalence(self):
        # Intentionally empty override: the skip decorator disables the inherited batching test for this model.
        pass
    def test_forward_signature(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
@@ -1438,6 +1442,10 @@ class SpeechT5ForSpeechToSpeechTest(ModelTesterMixin, unittest.TestCase):
    def test_determinism(self):
        pass
    @unittest.skip("skipped because there is always dropout in SpeechT5SpeechDecoderPrenet")
    def test_batching_equivalence(self):
        # Intentionally empty override: the skip decorator disables the inherited batching test for this model.
        pass
    def test_attention_outputs(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.return_dict = True
--- a/tests/models/table_transformer/test_modeling_table_transformer.py
+++ b/tests/models/table_transformer/test_modeling_table_transformer.py
@@ -209,6 +209,7 @@ class TableTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, Pipelin
    test_pruning = False
    test_head_masking = False
    test_missing_keys = False
    # Looked up with `hasattr` by `test_batching_equivalence`: when this attribute exists, the first entry of
    # every output whose key contains "decoder_hidden_states" is dropped before batched/single-row comparison.
    zero_init_hidden_state = True
    # special case for head models
    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
--- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py
+++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py
@@ -104,6 +104,7 @@ class TimeSeriesTransformerModelTester:
            num_static_categorical_features=1,
            cardinality=[self.cardinality],
            embedding_dimension=[self.embedding_dimension],
            scaling="std",  # we need std to get non-zero `loc`
        )
    def prepare_time_series_transformer_inputs_dict(self, config):
--- a/tests/models/univnet/test_modeling_univnet.py
+++ b/tests/models/univnet/test_modeling_univnet.py
@@ -66,13 +66,13 @@ class UnivNetModelTester:
    def prepare_noise_sequence(self):
        generator = torch.manual_seed(self.seed)
-        noise_shape = (self.seq_length, self.in_channels)
+        noise_shape = (self.batch_size, self.seq_length, self.in_channels)
        # Create noise on CPU for reproducibility
        noise_sequence = torch.randn(noise_shape, generator=generator, dtype=torch.float)
        return noise_sequence
    def prepare_config_and_inputs(self):
-        spectrogram = floats_tensor([self.seq_length, self.num_mel_bins], scale=1.0)
+        spectrogram = floats_tensor([self.batch_size, self.seq_length, self.num_mel_bins], scale=1.0)
        noise_sequence = self.prepare_noise_sequence()
        noise_sequence = noise_sequence.to(spectrogram.device)
        config = self.get_config()
@@ -89,7 +89,7 @@ class UnivNetModelTester:
    def create_and_check_model(self, config, spectrogram, noise_sequence):
        model = UnivNetModel(config=config).to(torch_device).eval()
        result = model(spectrogram, noise_sequence)[0]
-        self.parent.assertEqual(result.shape, (1, self.seq_length * 256))
+        self.parent.assertEqual(result.shape, (self.batch_size, self.seq_length * 256))
    def prepare_config_and_inputs_for_common(self):
        config, spectrogram, noise_sequence = self.prepare_config_and_inputs()
@@ -182,8 +182,8 @@ class UnivNetModelTest(ModelTesterMixin, unittest.TestCase):
            model.to(torch_device)
            model.eval()
-            batched_spectrogram = inputs["input_features"].unsqueeze(0).repeat(2, 1, 1)
+            batched_spectrogram = inputs["input_features"]
-            batched_noise_sequence = inputs["noise_sequence"].unsqueeze(0).repeat(2, 1, 1)
+            batched_noise_sequence = inputs["noise_sequence"]
            with torch.no_grad():
                batched_outputs = model(
                    batched_spectrogram.to(torch_device),
@@ -205,37 +205,11 @@ class UnivNetModelTest(ModelTesterMixin, unittest.TestCase):
            model.eval()
            with torch.no_grad():
-                outputs = model(inputs["input_features"].to(torch_device), inputs["noise_sequence"].to(torch_device))[
+                outputs = model(
-                    0
+                    inputs["input_features"][:1].to(torch_device), inputs["noise_sequence"][:1].to(torch_device)
-                ]
+                )[0]
            self.assertTrue(outputs.shape[0] == 1, msg="Unbatched input should create batched output with bsz = 1")
    def test_unbatched_batched_outputs_consistency(self):
        """
        Check that one unbatched sample and the same sample with an explicit batch dimension of size 1 produce
        matching outputs for every model class.
        """
        config, inputs = self.model_tester.prepare_config_and_inputs_for_common()

        def first_sample(tensor):
            # The tester builds 3-D (batch, seq, features) inputs; keep only the first row in that case so the
            # "unbatched" tensor really is 2-D. Inputs that are already 2-D are used as they are.
            sample = tensor[0] if tensor.dim() > 2 else tensor
            return sample.detach().clone()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            unbatched_spectrogram = first_sample(inputs["input_features"])
            unbatched_noise_sequence = first_sample(inputs["noise_sequence"])

            # Same data, with a leading batch dimension of size 1.
            batched_spectrogram = unbatched_spectrogram.unsqueeze(0)
            batched_noise_sequence = unbatched_noise_sequence.unsqueeze(0)

            with torch.no_grad():
                unbatched_outputs = model(
                    unbatched_spectrogram.to(torch_device),
                    unbatched_noise_sequence.to(torch_device),
                )[0]

                batched_outputs = model(
                    batched_spectrogram.to(torch_device),
                    batched_noise_sequence.to(torch_device),
                )[0]

            torch.testing.assert_close(unbatched_outputs, batched_outputs)
@require_torch_gpu
@slow
--- a/tests/models/vilt/test_modeling_vilt.py
+++ b/tests/models/vilt/test_modeling_vilt.py
@@ -345,6 +345,12 @@ class ViltModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    def test_determinism(self):
        pass
    @unittest.skip(
        "VilT samples image tokens from a multinomial distribution, resulting in not deterministic hidden states"
    )
    def test_batching_equivalence(self):
        # Intentionally empty override: the skip decorator disables the inherited batching test for this model.
        pass
    @unittest.skip(
        reason="""VilT samples image tokens from a multinomial distribution, resulting in not deterministic
                            hidden states"""
--- a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py
+++ b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py
@@ -18,7 +18,7 @@
 import unittest
 from transformers import ViTHybridConfig
-from transformers.testing_utils import require_accelerate, require_torch, require_vision, slow, torch_device
+from transformers.testing_utils import is_flaky, require_accelerate, require_torch, require_vision, slow, torch_device
 from transformers.utils import cached_property, is_torch_available, is_vision_available
 from ...test_configuration_common import ConfigTester
@@ -221,6 +221,10 @@ class ViTHybridModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
            model = ViTHybridModel.from_pretrained(model_name)
            self.assertIsNotNone(model)
    @is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516")
    def test_batching_equivalence(self):
        # Re-declared only to attach `is_flaky` (tracked in the linked issue); the check itself is the
        # shared implementation reached through `super()`.
        super().test_batching_equivalence()
 # We will verify our results on an image of cute cats
 def prepare_img():
--- a/tests/models/vit_mae/test_modeling_vit_mae.py
+++ b/tests/models/vit_mae/test_modeling_vit_mae.py
@@ -270,6 +270,10 @@ class ViTMAEModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    def test_model_outputs_equivalence(self):
        pass
    @unittest.skip(reason="ViTMAE returns a random mask + ids_restore in each forward pass")
    def test_batching_equivalence(self):
        # Intentionally empty override: the skip decorator disables the inherited batching test for this model.
        pass
    @slow
    def test_model_from_pretrained(self):
        for model_name in VIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
--- a/tests/models/vits/test_modeling_vits.py
+++ b/tests/models/vits/test_modeling_vits.py
@@ -216,6 +216,10 @@ class VitsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    def test_determinism(self):
        pass
    @unittest.skip("VITS is not deterministic")
    def test_batching_equivalence(self):
        # Intentionally empty override: the skip decorator disables the inherited batching test for this model.
        pass
    @is_flaky(
        max_attempts=3,
        description="Weight initialisation for the VITS conv layers sometimes exceeds the kaiming normal range",
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -190,7 +190,7 @@ class WhisperModelTester:
    def __init__(
        self,
        parent,
-        batch_size=2,
+        batch_size=3,  # need batch_size != num_hidden_layers
        seq_length=60,
        is_training=True,
        use_labels=False,
@@ -1446,6 +1446,7 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
        model = WhisperForConditionalGeneration(config).eval().to(torch_device)
        input_features = input_dict["input_features"].to(torch_device)
        input_features = input_features[:2]
        # len = 250 with num_input_frames = 60
        long_input_features = torch.cat([input_features.repeat(1, 1, 4), input_features[:, :, :10]], dim=-1)
@@ -2626,7 +2627,7 @@ class WhisperEncoderModelTester:
    def __init__(
        self,
        parent,
-        batch_size=2,
+        batch_size=3,  # need batch_size != num_hidden_layers
        seq_length=60,
        is_training=True,
        use_labels=True,
@@ -2997,7 +2998,7 @@ class WhisperStandaloneDecoderModelTester:
    def __init__(
        self,
        parent,
-        batch_size=2,
+        batch_size=3,  # need batch_size != num_hidden_layers
        is_training=True,
        use_labels=False,
        vocab_size=200,
--- a/tests/models/x_clip/test_modeling_x_clip.py
+++ b/tests/models/x_clip/test_modeling_x_clip.py
@@ -479,6 +479,7 @@ class XCLIPModelTester:
        self.mit_hidden_size = mit_hidden_size
        self.text_model_tester = XCLIPTextModelTester(parent, **text_kwargs)
        self.vision_model_tester = XCLIPVisionModelTester(parent, **vision_kwargs)
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
        self.is_training = is_training
    def prepare_config_and_inputs(self):
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -99,6 +99,7 @@ if is_accelerate_available():
 if is_torch_available():
    import torch
    import torch.nn.functional as F
    from safetensors.torch import load_file as safe_load_file
    from safetensors.torch import save_file as safe_save_file
    from torch import nn
@@ -693,6 +694,99 @@ class ModelTesterMixin:
                expected_arg_names = [model.main_input_name]
                self.assertListEqual(arg_names[:1], expected_arg_names)
    def test_batching_equivalence(self):
        """
        Tests that the model supports batching and that the output is nearly the same for the same input in
        different batch sizes.
        (Why "nearly the same" not "exactly the same"? Batching uses different matmul shapes, which often leads to
        different results: https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535)

        For every model class, the full batch and its first row are fed to the same model instance. Every tensor
        found in the (possibly nested) outputs is then compared: the test fails if either side contains `nan` or
        `inf`, or if the measured difference is larger than 1e-03.
        """

        def get_tensor_equivalence_function(batched_input):
            # models operating on continuous spaces have higher abs difference than LMs
            # instead, we can rely on cos distance for image/speech models, similar to `diffusers`
            if "input_ids" not in batched_input:
                return lambda tensor1, tensor2: (
                    1.0 - F.cosine_similarity(tensor1.float().flatten(), tensor2.float().flatten(), dim=0, eps=1e-38)
                )
            return lambda tensor1, tensor2: torch.max(torch.abs(tensor1 - tensor2))

        def recursive_check(batched_object, single_row_object, model_name, key):
            if isinstance(batched_object, (list, tuple)):
                for batched_object_value, single_row_object_value in zip(batched_object, single_row_object):
                    recursive_check(batched_object_value, single_row_object_value, model_name, key)
            elif isinstance(batched_object, dict):
                for batched_object_value, single_row_object_value in zip(
                    batched_object.values(), single_row_object.values()
                ):
                    recursive_check(batched_object_value, single_row_object_value, model_name, key)
            # do not compare returned loss (0-dim tensor) or codebook ids (int)
            elif batched_object is None or isinstance(batched_object, int):
                return
            elif batched_object.dim() == 0:
                return
            else:
                # indexing the first element does not always work
                # e.g. models that output similarity scores of size (N, M) would need to index [0, 0]
                # A tuple (not a list) of slices is required for multidimensional indexing: indexing with a
                # non-tuple sequence is deprecated.
                slice_ids = tuple(slice(0, index) for index in single_row_object.shape)
                batched_row = batched_object[slice_ids]
                self.assertFalse(
                    torch.isnan(batched_row).any(), f"Batched output has `nan` in {model_name} for key={key}"
                )
                self.assertFalse(
                    torch.isinf(batched_row).any(), f"Batched output has `inf` in {model_name} for key={key}"
                )
                self.assertFalse(
                    torch.isnan(single_row_object).any(), f"Single row output has `nan` in {model_name} for key={key}"
                )
                self.assertFalse(
                    torch.isinf(single_row_object).any(), f"Single row output has `inf` in {model_name} for key={key}"
                )
                # Evaluate the metric once and reuse it for both the check and the failure message.
                difference = equivalence(batched_row, single_row_object)
                self.assertTrue(
                    difference <= 1e-03,
                    msg=(
                        f"Batched and Single row outputs are not equal in {model_name} for key={key}. "
                        f"Difference={difference}."
                    ),
                )

        config, batched_input = self.model_tester.prepare_config_and_inputs_for_common()
        # The metric is picked once, from the common inputs: max abs difference when `input_ids` is present,
        # cosine distance otherwise.
        equivalence = get_tensor_equivalence_function(batched_input)

        for model_class in self.all_model_classes:
            # NOTE(review): if the tester re-creates `config` just below, this flag was set on the previous
            # config object and is not carried over - confirm whether that is intended.
            config.output_hidden_states = True

            model_name = model_class.__name__
            if hasattr(self.model_tester, "prepare_config_and_inputs_for_model_class"):
                config, batched_input = self.model_tester.prepare_config_and_inputs_for_model_class(model_class)
            batched_input_prepared = self._prepare_for_class(batched_input, model_class)
            model = model_class(config).to(torch_device).eval()

            batch_size = self.model_tester.batch_size
            single_row_input = {}
            for key, value in batched_input_prepared.items():
                if isinstance(value, torch.Tensor) and value.shape[0] % batch_size == 0:
                    # e.g. musicgen has inputs of size (bs*codebooks). in most cases value.shape[0] == batch_size
                    single_batch_shape = value.shape[0] // batch_size
                    single_row_input[key] = value[:single_batch_shape]
                else:
                    # non-tensor values, or tensors whose first dim is not a multiple of the batch size, are
                    # passed through unchanged
                    single_row_input[key] = value

            with torch.no_grad():
                model_batched_output = model(**batched_input_prepared)
                model_row_output = model(**single_row_input)

            # wrap bare tensor outputs so that the key-based loop below handles them too
            if isinstance(model_batched_output, torch.Tensor):
                model_batched_output = {"model_output": model_batched_output}
                model_row_output = {"model_output": model_row_output}

            for key in model_batched_output:
                # DETR starts from zero-init queries to decoder, leading to cos_similarity = `nan`
                if hasattr(self, "zero_init_hidden_state") and "decoder_hidden_states" in key:
                    model_batched_output[key] = model_batched_output[key][1:]
                    model_row_output[key] = model_row_output[key][1:]
                recursive_check(model_batched_output[key], model_row_output[key], model_name, key)
    def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None):
        if not self.model_tester.is_training:
            return