diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py
index 732f044e07..bba7d2e351 100644
--- a/src/transformers/feature_extraction_utils.py
+++ b/src/transformers/feature_extraction_utils.py
@@ -108,6 +108,10 @@ class BatchFeature(UserDict):
 
         # Get a function reference for the correct framework
         if tensor_type == TensorType.TENSORFLOW:
+            logger.warning_once(
+                "TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We "
+                "recommend migrating to PyTorch classes or pinning your version of Transformers."
+            )
             if not is_tf_available():
                 raise ImportError(
                     "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed."
@@ -138,6 +142,10 @@ class BatchFeature(UserDict):
 
             is_tensor = torch.is_tensor
         elif tensor_type == TensorType.JAX:
+            logger.warning_once(
+                "TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We "
+                "recommend migrating to PyTorch classes or pinning your version of Transformers."
+            )
             if not is_flax_available():
                 raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.")
             import jax.numpy as jnp  # noqa: F811
diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py
index 7697294d6a..4b260b14f6 100644
--- a/src/transformers/modeling_flax_utils.py
+++ b/src/transformers/modeling_flax_utils.py
@@ -179,6 +179,10 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
         dtype: jnp.dtype = jnp.float32,
         _do_init: bool = True,
     ):
+        logger.warning_once(
+            "TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We "
+            "recommend migrating to PyTorch classes or pinning your version of Transformers."
+        )
         if config is None:
             raise ValueError("config cannot be None")
 
diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index af436be20f..0ff83744ae 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -1199,6 +1199,10 @@ class TFPreTrainedModel(keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushT
         self.name_or_path = config.name_or_path
         self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None
         self._set_save_spec(self.input_signature)
+        logger.warning_once(
+            "TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We "
+            "recommend migrating to PyTorch classes or pinning your version of Transformers."
+        )
 
     def get_config(self):
         return self.config.to_dict()
diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
index 9ca0566789..8878ad00b9 100644
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -937,6 +937,11 @@ class Pipeline(_ScikitCompat, PushToHubMixin):
     ):
         if framework is None:
             framework, model = infer_framework_load_model(model, config=model.config)
+        if framework in ("tf", "jax", "flax"):  # `infer_framework` reports Flax/JAX models as "flax"
+            logger.warning_once(
+                "TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We "
+                "recommend migrating to PyTorch classes or pinning your version of Transformers."
+            )
 
         self.task = task
         self.model = model
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 74e2f2c1e7..eac2fcfb71 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -2838,6 +2838,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
             "split_special_tokens": kwargs.pop("split_special_tokens", self.split_special_tokens),
             "verbose": verbose,
         }
+
+        if return_tensors in ("tf", "jax"):
+            logger.warning_once(
+                "TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We "
+                "recommend migrating to PyTorch classes or pinning your version of Transformers."
+            )
         all_kwargs.update(kwargs)
         if text is None and text_target is None:
             raise ValueError("You need to specify either `text` or `text_target`.")
diff --git a/tests/models/albert/test_modeling_flax_albert.py b/tests/models/albert/test_modeling_flax_albert.py
deleted file mode 100644
index ca8eeec591..0000000000
--- a/tests/models/albert/test_modeling_flax_albert.py
+++ /dev/null
@@ -1,161 +0,0 @@
-# Copyright 2021 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-
-from transformers import AlbertConfig, is_flax_available
-from transformers.testing_utils import require_flax, slow
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask
-
-
-if is_flax_available():
-    import jax.numpy as jnp
-
-    from transformers.models.albert.modeling_flax_albert import (
-        FlaxAlbertForMaskedLM,
-        FlaxAlbertForMultipleChoice,
-        FlaxAlbertForPreTraining,
-        FlaxAlbertForQuestionAnswering,
-        FlaxAlbertForSequenceClassification,
-        FlaxAlbertForTokenClassification,
-        FlaxAlbertModel,
-    )
-
-
-class FlaxAlbertModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_attention_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_choices=4,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_attention_mask = use_attention_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.num_choices = num_choices
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        attention_mask = None
-        if self.use_attention_mask:
-            attention_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        config = AlbertConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-        )
-
-        return config, input_ids, token_type_ids, attention_mask
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, token_type_ids, attention_mask = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask}
-        return config, inputs_dict
-
-
-@require_flax
-class FlaxAlbertModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            FlaxAlbertModel,
-            FlaxAlbertForPreTraining,
-            FlaxAlbertForMaskedLM,
-            FlaxAlbertForMultipleChoice,
-            FlaxAlbertForQuestionAnswering,
-            FlaxAlbertForSequenceClassification,
-            FlaxAlbertForTokenClassification,
-            FlaxAlbertForQuestionAnswering,
-        )
-        if is_flax_available()
-        else ()
-    )
-
-    def setUp(self):
-        self.model_tester = FlaxAlbertModelTester(self)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("albert/albert-base-v2")
-            outputs = model(np.ones((1, 1)))
-            self.assertIsNotNone(outputs)
-
-
-@require_flax
-class FlaxAlbertModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_no_head_absolute_embedding(self):
-        model = FlaxAlbertModel.from_pretrained("albert/albert-base-v2")
-        input_ids = np.array([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
-        attention_mask = np.array([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
-        output = model(input_ids, attention_mask=attention_mask)[0]
-        expected_shape = (1, 11, 768)
-        self.assertEqual(output.shape, expected_shape)
-        expected_slice = np.array(
-            [[[-0.6513, 1.5035, -0.2766], [-0.6515, 1.5046, -0.2780], [-0.6512, 1.5049, -0.2784]]]
-        )
-
-        self.assertTrue(jnp.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4))
diff --git a/tests/models/albert/test_modeling_tf_albert.py b/tests/models/albert/test_modeling_tf_albert.py
deleted file mode 100644
index 339943de9e..0000000000
--- a/tests/models/albert/test_modeling_tf_albert.py
+++ /dev/null
@@ -1,328 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import AlbertConfig, is_tf_available
-from transformers.models.auto import get_values
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import TF_MODEL_FOR_PRETRAINING_MAPPING
-    from transformers.models.albert.modeling_tf_albert import (
-        TFAlbertForMaskedLM,
-        TFAlbertForMultipleChoice,
-        TFAlbertForPreTraining,
-        TFAlbertForQuestionAnswering,
-        TFAlbertForSequenceClassification,
-        TFAlbertForTokenClassification,
-        TFAlbertModel,
-    )
-
-
-class TFAlbertModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        embedding_size=16,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_labels=3,
-        num_choices=4,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.is_training = True
-        self.use_input_mask = True
-        self.use_token_type_ids = True
-        self.use_labels = True
-        self.vocab_size = 99
-        self.embedding_size = 16
-        self.hidden_size = 32
-        self.num_hidden_layers = 2
-        self.num_attention_heads = 4
-        self.intermediate_size = 37
-        self.hidden_act = "gelu"
-        self.hidden_dropout_prob = 0.1
-        self.attention_probs_dropout_prob = 0.1
-        self.max_position_embeddings = 512
-        self.type_vocab_size = 16
-        self.type_sequence_label_size = 2
-        self.initializer_range = 0.02
-        self.num_labels = 3
-        self.num_choices = 4
-        self.scope = None
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = AlbertConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            embedding_size=self.embedding_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            initializer_range=self.initializer_range,
-        )
-
-        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def create_and_check_albert_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFAlbertModel(config=config)
-        # inputs = {'input_ids': input_ids,
-        #           'attention_mask': input_mask,
-        #           'token_type_ids': token_type_ids}
-        # sequence_output, pooled_output = model(**inputs)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
-    def create_and_check_albert_for_pretraining(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFAlbertForPreTraining(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-        self.parent.assertEqual(result.sop_logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_albert_for_masked_lm(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFAlbertForMaskedLM(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_albert_for_sequence_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFAlbertForSequenceClassification(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_albert_for_question_answering(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFAlbertForQuestionAnswering(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def create_and_check_albert_for_multiple_choice(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_choices = self.num_choices
-        model = TFAlbertForMultipleChoice(config=config)
-        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
-        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
-        inputs = {
-            "input_ids": multiple_choice_inputs_ids,
-            "attention_mask": multiple_choice_input_mask,
-            "token_type_ids": multiple_choice_token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
-
-    def create_and_check_albert_for_token_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFAlbertForTokenClassification(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class TFAlbertModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFAlbertModel,
-            TFAlbertForPreTraining,
-            TFAlbertForMaskedLM,
-            TFAlbertForSequenceClassification,
-            TFAlbertForQuestionAnswering,
-            TFAlbertForTokenClassification,
-            TFAlbertForMultipleChoice,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFAlbertModel,
-            "fill-mask": TFAlbertForMaskedLM,
-            "question-answering": TFAlbertForQuestionAnswering,
-            "text-classification": TFAlbertForSequenceClassification,
-            "token-classification": TFAlbertForTokenClassification,
-            "zero-shot": TFAlbertForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_head_masking = False
-    test_onnx = False
-
-    # special case for ForPreTraining model
-    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
-        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
-
-        if return_labels:
-            if model_class in get_values(TF_MODEL_FOR_PRETRAINING_MAPPING):
-                inputs_dict["sentence_order_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-
-        return inputs_dict
-
-    def setUp(self):
-        self.model_tester = TFAlbertModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_albert_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_albert_model(*config_and_inputs)
-
-    def test_for_pretraining(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_albert_for_pretraining(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_albert_for_masked_lm(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_albert_for_multiple_choice(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_albert_for_sequence_classification(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_albert_for_question_answering(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "albert/albert-base-v1"
-        model = TFAlbertModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-@require_tf
-class TFAlbertModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_masked_lm(self):
-        model = TFAlbertForPreTraining.from_pretrained("albert/albert-base-v2")
-        input_ids = tf.constant([[0, 1, 2, 3, 4, 5]])
-        output = model(input_ids)[0]
-
-        expected_shape = [1, 6, 30000]
-        self.assertEqual(output.shape, expected_shape)
-
-        expected_slice = tf.constant(
-            [
-                [
-                    [4.595668, 0.74462754, -1.818147],
-                    [4.5954347, 0.7454184, -1.8188258],
-                    [4.5954905, 0.7448235, -1.8182316],
-                ]
-            ]
-        )
-        tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4)
diff --git a/tests/models/auto/test_modeling_flax_auto.py b/tests/models/auto/test_modeling_flax_auto.py
deleted file mode 100644
index 8880972e04..0000000000
--- a/tests/models/auto/test_modeling_flax_auto.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-from transformers import AutoConfig, AutoTokenizer, BertConfig, TensorType, is_flax_available
-from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, require_flax, slow
-
-
-if is_flax_available():
-    import jax
-
-    from transformers.models.auto.modeling_flax_auto import FlaxAutoModel
-    from transformers.models.bert.modeling_flax_bert import FlaxBertModel
-    from transformers.models.roberta.modeling_flax_roberta import FlaxRobertaModel
-
-
-@require_flax
-class FlaxAutoModelTest(unittest.TestCase):
-    @slow
-    def test_bert_from_pretrained(self):
-        for model_name in ["google-bert/bert-base-cased", "google-bert/bert-large-uncased"]:
-            with self.subTest(model_name):
-                config = AutoConfig.from_pretrained(model_name)
-                self.assertIsNotNone(config)
-                self.assertIsInstance(config, BertConfig)
-
-                model = FlaxAutoModel.from_pretrained(model_name)
-                self.assertIsNotNone(model)
-                self.assertIsInstance(model, FlaxBertModel)
-
-    @slow
-    def test_roberta_from_pretrained(self):
-        for model_name in ["FacebookAI/roberta-base", "FacebookAI/roberta-large"]:
-            with self.subTest(model_name):
-                config = AutoConfig.from_pretrained(model_name)
-                self.assertIsNotNone(config)
-                self.assertIsInstance(config, BertConfig)
-
-                model = FlaxAutoModel.from_pretrained(model_name)
-                self.assertIsNotNone(model)
-                self.assertIsInstance(model, FlaxRobertaModel)
-
-    @slow
-    def test_bert_jax_jit(self):
-        for model_name in ["google-bert/bert-base-cased", "google-bert/bert-large-uncased"]:
-            tokenizer = AutoTokenizer.from_pretrained(model_name)
-            model = FlaxBertModel.from_pretrained(model_name)
-            tokens = tokenizer("Do you support jax jitted function?", return_tensors=TensorType.JAX)
-
-            @jax.jit
-            def eval(**kwargs):
-                return model(**kwargs)
-
-            eval(**tokens).block_until_ready()
-
-    @slow
-    def test_roberta_jax_jit(self):
-        for model_name in ["FacebookAI/roberta-base", "FacebookAI/roberta-large"]:
-            tokenizer = AutoTokenizer.from_pretrained(model_name)
-            model = FlaxRobertaModel.from_pretrained(model_name)
-            tokens = tokenizer("Do you support jax jitted function?", return_tensors=TensorType.JAX)
-
-            @jax.jit
-            def eval(**kwargs):
-                return model(**kwargs)
-
-            eval(**tokens).block_until_ready()
-
-    def test_repo_not_found(self):
-        with self.assertRaisesRegex(
-            EnvironmentError, "bert-base is not a local folder and is not a valid model identifier"
-        ):
-            _ = FlaxAutoModel.from_pretrained("bert-base")
-
-    def test_revision_not_found(self):
-        with self.assertRaisesRegex(
-            EnvironmentError, r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)"
-        ):
-            _ = FlaxAutoModel.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")
-
-    def test_model_file_not_found(self):
-        with self.assertRaisesRegex(
-            EnvironmentError,
-            "hf-internal-testing/config-no-model does not appear to have a file named flax_model.msgpack",
-        ):
-            _ = FlaxAutoModel.from_pretrained("hf-internal-testing/config-no-model")
-
-    def test_model_from_pt_suggestion(self):
-        with self.assertRaisesRegex(EnvironmentError, "Use `from_pt=True` to load this model"):
-            _ = FlaxAutoModel.from_pretrained("hf-internal-testing/tiny-bert-pt-only")
diff --git a/tests/models/auto/test_modeling_tf_auto.py b/tests/models/auto/test_modeling_tf_auto.py
deleted file mode 100644
index 9957df1629..0000000000
--- a/tests/models/auto/test_modeling_tf_auto.py
+++ /dev/null
@@ -1,310 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import copy
-import tempfile
-import unittest
-
-from transformers import CONFIG_MAPPING, AutoConfig, BertConfig, GPT2Config, T5Config, TapasConfig, is_tf_available
-from transformers.testing_utils import (
-    DUMMY_UNKNOWN_IDENTIFIER,
-    SMALL_MODEL_IDENTIFIER,
-    RequestCounter,
-    require_tensorflow_probability,
-    require_tf,
-    slow,
-)
-
-from ..bert.test_modeling_bert import BertModelTester
-
-
-if is_tf_available():
-    from transformers import (
-        TFAutoModel,
-        TFAutoModelForCausalLM,
-        TFAutoModelForMaskedLM,
-        TFAutoModelForPreTraining,
-        TFAutoModelForQuestionAnswering,
-        TFAutoModelForSeq2SeqLM,
-        TFAutoModelForSequenceClassification,
-        TFAutoModelForTableQuestionAnswering,
-        TFAutoModelForTokenClassification,
-        TFAutoModelWithLMHead,
-        TFBertForMaskedLM,
-        TFBertForPreTraining,
-        TFBertForQuestionAnswering,
-        TFBertForSequenceClassification,
-        TFBertModel,
-        TFFunnelBaseModel,
-        TFFunnelModel,
-        TFGPT2LMHeadModel,
-        TFRobertaForMaskedLM,
-        TFT5ForConditionalGeneration,
-        TFTapasForQuestionAnswering,
-    )
-    from transformers.models.auto.modeling_tf_auto import (
-        TF_MODEL_FOR_CAUSAL_LM_MAPPING,
-        TF_MODEL_FOR_MASKED_LM_MAPPING,
-        TF_MODEL_FOR_PRETRAINING_MAPPING,
-        TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
-        TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
-        TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
-        TF_MODEL_MAPPING,
-    )
-
-
-class NewModelConfig(BertConfig):
-    model_type = "new-model"
-
-
-if is_tf_available():
-
-    class TFNewModel(TFBertModel):
-        config_class = NewModelConfig
-
-
-@require_tf
-class TFAutoModelTest(unittest.TestCase):
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "google-bert/bert-base-cased"
-        config = AutoConfig.from_pretrained(model_name)
-        self.assertIsNotNone(config)
-        self.assertIsInstance(config, BertConfig)
-
-        model = TFAutoModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-        self.assertIsInstance(model, TFBertModel)
-
-    @slow
-    def test_model_for_pretraining_from_pretrained(self):
-        model_name = "google-bert/bert-base-cased"
-        config = AutoConfig.from_pretrained(model_name)
-        self.assertIsNotNone(config)
-        self.assertIsInstance(config, BertConfig)
-
-        model = TFAutoModelForPreTraining.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-        self.assertIsInstance(model, TFBertForPreTraining)
-
-    @slow
-    def test_model_for_causal_lm(self):
-        model_name = "openai-community/gpt2"
-        config = AutoConfig.from_pretrained(model_name)
-        self.assertIsNotNone(config)
-        self.assertIsInstance(config, GPT2Config)
-
-        model = TFAutoModelForCausalLM.from_pretrained(model_name)
-        model, loading_info = TFAutoModelForCausalLM.from_pretrained(model_name, output_loading_info=True)
-        self.assertIsNotNone(model)
-        self.assertIsInstance(model, TFGPT2LMHeadModel)
-
-    @slow
-    def test_lmhead_model_from_pretrained(self):
-        model_name = "openai-community/gpt2"
-        config = AutoConfig.from_pretrained(model_name)
-        self.assertIsNotNone(config)
-        self.assertIsInstance(config, GPT2Config)
-
-        model = TFAutoModelWithLMHead.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-        self.assertIsInstance(model, TFGPT2LMHeadModel)
-
-    @slow
-    def test_model_for_masked_lm(self):
-        model_name = "google-bert/bert-base-uncased"
-        config = AutoConfig.from_pretrained(model_name)
-        self.assertIsNotNone(config)
-        self.assertIsInstance(config, BertConfig)
-
-        model = TFAutoModelForMaskedLM.from_pretrained(model_name)
-        model, loading_info = TFAutoModelForMaskedLM.from_pretrained(model_name, output_loading_info=True)
-        self.assertIsNotNone(model)
-        self.assertIsInstance(model, TFBertForMaskedLM)
-
-    @slow
-    def test_model_for_encoder_decoder_lm(self):
-        model_name = "google-t5/t5-base"
-        config = AutoConfig.from_pretrained(model_name)
-        self.assertIsNotNone(config)
-        self.assertIsInstance(config, T5Config)
-
-        model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)
-        model, loading_info = TFAutoModelForSeq2SeqLM.from_pretrained(model_name, output_loading_info=True)
-        self.assertIsNotNone(model)
-        self.assertIsInstance(model, TFT5ForConditionalGeneration)
-
-    @slow
-    def test_sequence_classification_model_from_pretrained(self):
-        #     model_name = 'openai-community/gpt2'
-        for model_name in ["google-bert/bert-base-uncased"]:
-            config = AutoConfig.from_pretrained(model_name)
-            self.assertIsNotNone(config)
-            self.assertIsInstance(config, BertConfig)
-
-            model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
-            self.assertIsNotNone(model)
-            self.assertIsInstance(model, TFBertForSequenceClassification)
-
-    @slow
-    def test_question_answering_model_from_pretrained(self):
-        #     model_name = 'openai-community/gpt2'
-        for model_name in ["google-bert/bert-base-uncased"]:
-            config = AutoConfig.from_pretrained(model_name)
-            self.assertIsNotNone(config)
-            self.assertIsInstance(config, BertConfig)
-
-            model = TFAutoModelForQuestionAnswering.from_pretrained(model_name)
-            self.assertIsNotNone(model)
-            self.assertIsInstance(model, TFBertForQuestionAnswering)
-
-    @slow
-    @require_tensorflow_probability
-    def test_table_question_answering_model_from_pretrained(self):
-        model_name = "google/tapas-base"
-        config = AutoConfig.from_pretrained(model_name)
-        self.assertIsNotNone(config)
-        self.assertIsInstance(config, TapasConfig)
-
-        model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_name)
-        model, loading_info = TFAutoModelForTableQuestionAnswering.from_pretrained(
-            model_name, output_loading_info=True
-        )
-        self.assertIsNotNone(model)
-        self.assertIsInstance(model, TFTapasForQuestionAnswering)
-
-    def test_from_pretrained_identifier(self):
-        model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
-        self.assertIsInstance(model, TFBertForMaskedLM)
-        self.assertEqual(model.num_parameters(), 14410)
-        self.assertEqual(model.num_parameters(only_trainable=True), 14410)
-
-    def test_from_identifier_from_model_type(self):
-        model = TFAutoModelWithLMHead.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER)
-        self.assertIsInstance(model, TFRobertaForMaskedLM)
-        self.assertEqual(model.num_parameters(), 14410)
-        self.assertEqual(model.num_parameters(only_trainable=True), 14410)
-
-    def test_from_pretrained_with_tuple_values(self):
-        # For the auto model mapping, FunnelConfig has two models: FunnelModel and FunnelBaseModel
-        model = TFAutoModel.from_pretrained("sgugger/funnel-random-tiny")
-        self.assertIsInstance(model, TFFunnelModel)
-
-        config = copy.deepcopy(model.config)
-        config.architectures = ["FunnelBaseModel"]
-        model = TFAutoModel.from_config(config)
-        model.build_in_name_scope()
-
-        self.assertIsInstance(model, TFFunnelBaseModel)
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(tmp_dir)
-            model = TFAutoModel.from_pretrained(tmp_dir)
-            self.assertIsInstance(model, TFFunnelBaseModel)
-
-    def test_new_model_registration(self):
-        try:
-            AutoConfig.register("new-model", NewModelConfig)
-
-            auto_classes = [
-                TFAutoModel,
-                TFAutoModelForCausalLM,
-                TFAutoModelForMaskedLM,
-                TFAutoModelForPreTraining,
-                TFAutoModelForQuestionAnswering,
-                TFAutoModelForSequenceClassification,
-                TFAutoModelForTokenClassification,
-            ]
-
-            for auto_class in auto_classes:
-                with self.subTest(auto_class.__name__):
-                    # Wrong config class will raise an error
-                    with self.assertRaises(ValueError):
-                        auto_class.register(BertConfig, TFNewModel)
-                    auto_class.register(NewModelConfig, TFNewModel)
-                    # Trying to register something existing in the Transformers library will raise an error
-                    with self.assertRaises(ValueError):
-                        auto_class.register(BertConfig, TFBertModel)
-
-                    # Now that the config is registered, it can be used as any other config with the auto-API
-                    tiny_config = BertModelTester(self).get_config()
-                    config = NewModelConfig(**tiny_config.to_dict())
-
-                    model = auto_class.from_config(config)
-                    model.build_in_name_scope()
-
-                    self.assertIsInstance(model, TFNewModel)
-
-                    with tempfile.TemporaryDirectory() as tmp_dir:
-                        model.save_pretrained(tmp_dir)
-                        new_model = auto_class.from_pretrained(tmp_dir)
-                        self.assertIsInstance(new_model, TFNewModel)
-
-        finally:
-            if "new-model" in CONFIG_MAPPING._extra_content:
-                del CONFIG_MAPPING._extra_content["new-model"]
-            for mapping in (
-                TF_MODEL_MAPPING,
-                TF_MODEL_FOR_PRETRAINING_MAPPING,
-                TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
-                TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
-                TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
-                TF_MODEL_FOR_CAUSAL_LM_MAPPING,
-                TF_MODEL_FOR_MASKED_LM_MAPPING,
-            ):
-                if NewModelConfig in mapping._extra_content:
-                    del mapping._extra_content[NewModelConfig]
-
-    def test_repo_not_found(self):
-        with self.assertRaisesRegex(
-            EnvironmentError, "bert-base is not a local folder and is not a valid model identifier"
-        ):
-            _ = TFAutoModel.from_pretrained("bert-base")
-
-    def test_revision_not_found(self):
-        with self.assertRaisesRegex(
-            EnvironmentError, r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)"
-        ):
-            _ = TFAutoModel.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")
-
-    def test_model_file_not_found(self):
-        with self.assertRaisesRegex(
-            EnvironmentError,
-            "hf-internal-testing/config-no-model does not appear to have a file named pytorch_model.bin",
-        ):
-            _ = TFAutoModel.from_pretrained("hf-internal-testing/config-no-model")
-
-    def test_model_from_pt_suggestion(self):
-        with self.assertRaisesRegex(EnvironmentError, "Use `from_pt=True` to load this model"):
-            _ = TFAutoModel.from_pretrained("hf-internal-testing/tiny-bert-pt-only")
-
-    @unittest.skip("Failing on main")
-    def test_cached_model_has_minimum_calls_to_head(self):
-        # Make sure we have cached the model.
-        _ = TFAutoModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-        with RequestCounter() as counter:
-            _ = TFAutoModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-        self.assertEqual(counter["GET"], 0)
-        self.assertEqual(counter["HEAD"], 1)
-        self.assertEqual(counter.total_calls, 1)
-
-        # With a sharded checkpoint
-        _ = TFAutoModel.from_pretrained("ArthurZ/tiny-random-bert-sharded")
-        with RequestCounter() as counter:
-            _ = TFAutoModel.from_pretrained("ArthurZ/tiny-random-bert-sharded")
-        self.assertEqual(counter["GET"], 0)
-        self.assertEqual(counter["HEAD"], 1)
-        self.assertEqual(counter.total_calls, 1)
diff --git a/tests/models/bart/test_modeling_flax_bart.py b/tests/models/bart/test_modeling_flax_bart.py
deleted file mode 100644
index 118398d3df..0000000000
--- a/tests/models/bart/test_modeling_flax_bart.py
+++ /dev/null
@@ -1,763 +0,0 @@
-# Copyright 2021 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-
-import numpy as np
-import timeout_decorator  # noqa
-
-from transformers import BartConfig, BartTokenizer, is_flax_available
-from transformers.testing_utils import require_flax, slow
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-
-
-if is_flax_available():
-    import os
-
-    # The slow tests are often failing with OOM error on GPU
-    # This makes JAX allocate exactly what is needed on demand, and deallocate memory that is no longer needed
-    # but will be slower as stated here https://jax.readthedocs.io/en/latest/gpu_memory_allocation.html
-    os.environ["XLA_PYTHON_CLIENT_ALLOCATOR"] = "platform"
-
-    import jax
-    import jax.numpy as jnp
-
-    from transformers.models.bart.modeling_flax_bart import (
-        FlaxBartForConditionalGeneration,
-        FlaxBartForQuestionAnswering,
-        FlaxBartForSequenceClassification,
-        FlaxBartModel,
-        shift_tokens_right,
-    )
-
-
-def prepare_bart_inputs_dict(
-    config,
-    input_ids,
-    decoder_input_ids=None,
-    attention_mask=None,
-    decoder_attention_mask=None,
-    head_mask=None,
-    decoder_head_mask=None,
-    cross_attn_head_mask=None,
-):
-    if attention_mask is None:
-        attention_mask = np.where(input_ids != config.pad_token_id, 1, 0)
-    if decoder_attention_mask is None:
-        decoder_attention_mask = np.where(decoder_input_ids != config.pad_token_id, 1, 0)
-    if head_mask is None:
-        head_mask = np.ones((config.encoder_layers, config.encoder_attention_heads))
-    if decoder_head_mask is None:
-        decoder_head_mask = np.ones((config.decoder_layers, config.decoder_attention_heads))
-    if cross_attn_head_mask is None:
-        cross_attn_head_mask = np.ones((config.decoder_layers, config.decoder_attention_heads))
-    return {
-        "input_ids": input_ids,
-        "decoder_input_ids": decoder_input_ids,
-        "attention_mask": attention_mask,
-        "decoder_attention_mask": attention_mask,
-    }
-
-
-class FlaxBartModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_labels=False,
-        vocab_size=99,
-        hidden_size=16,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=4,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=32,
-        eos_token_id=2,
-        pad_token_id=1,
-        bos_token_id=0,
-        initializer_range=0.02,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.initializer_range = initializer_range
-
-    def prepare_config_and_inputs(self):
-        input_ids = np.clip(ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size), 3, self.vocab_size)
-        input_ids = np.concatenate((input_ids, 2 * np.ones((self.batch_size, 1), dtype=np.int64)), -1)
-
-        decoder_input_ids = shift_tokens_right(input_ids, 1, 2)
-
-        config = BartConfig(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            encoder_layers=self.num_hidden_layers,
-            decoder_layers=self.num_hidden_layers,
-            encoder_attention_heads=self.num_attention_heads,
-            decoder_attention_heads=self.num_attention_heads,
-            encoder_ffn_dim=self.intermediate_size,
-            decoder_ffn_dim=self.intermediate_size,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            eos_token_id=self.eos_token_id,
-            bos_token_id=self.bos_token_id,
-            pad_token_id=self.pad_token_id,
-            initializer_range=self.initializer_range,
-            use_cache=False,
-        )
-        inputs_dict = prepare_bart_inputs_dict(config, input_ids, decoder_input_ids)
-        return config, inputs_dict
-
-    def prepare_config_and_inputs_for_common(self):
-        config, inputs_dict = self.prepare_config_and_inputs()
-        return config, inputs_dict
-
-    def check_use_cache_forward(self, model_class_name, config, inputs_dict):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        encoder_outputs = model.encode(inputs_dict["input_ids"])
-
-        decoder_input_ids, decoder_attention_mask = (
-            inputs_dict["decoder_input_ids"],
-            inputs_dict["decoder_attention_mask"],
-        )
-
-        past_key_values = model.init_cache(decoder_input_ids.shape[0], max_decoder_length, encoder_outputs)
-        decoder_attention_mask = jnp.ones((decoder_input_ids.shape[0], max_decoder_length), dtype="i4")
-
-        decoder_position_ids = jnp.broadcast_to(
-            jnp.arange(decoder_input_ids.shape[-1] - 1)[None, :],
-            (decoder_input_ids.shape[0], decoder_input_ids.shape[-1] - 1),
-        )
-        outputs_cache = model.decode(
-            decoder_input_ids[:, :-1],
-            encoder_outputs,
-            decoder_attention_mask=decoder_attention_mask,
-            past_key_values=past_key_values,
-            decoder_position_ids=decoder_position_ids,
-        )
-
-        decoder_position_ids = jnp.array(decoder_input_ids.shape[0] * [[decoder_input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model.decode(
-            decoder_input_ids[:, -1:],
-            encoder_outputs,
-            decoder_attention_mask=decoder_attention_mask,
-            past_key_values=outputs_cache.past_key_values,
-            decoder_position_ids=decoder_position_ids,
-        )
-
-        outputs = model.decode(decoder_input_ids, encoder_outputs)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-    def check_use_cache_forward_with_attn_mask(self, model_class_name, config, inputs_dict):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        encoder_outputs = model.encode(inputs_dict["input_ids"])
-
-        decoder_input_ids, decoder_attention_mask = (
-            inputs_dict["decoder_input_ids"],
-            inputs_dict["decoder_attention_mask"],
-        )
-
-        decoder_attention_mask_cache = jnp.concatenate(
-            [
-                decoder_attention_mask,
-                jnp.zeros((decoder_attention_mask.shape[0], max_decoder_length - decoder_attention_mask.shape[1])),
-            ],
-            axis=-1,
-        )
-
-        past_key_values = model.init_cache(decoder_input_ids.shape[0], max_decoder_length, encoder_outputs)
-        decoder_position_ids = jnp.broadcast_to(
-            jnp.arange(decoder_input_ids.shape[-1] - 1)[None, :],
-            (decoder_input_ids.shape[0], decoder_input_ids.shape[-1] - 1),
-        )
-
-        outputs_cache = model.decode(
-            decoder_input_ids[:, :-1],
-            encoder_outputs,
-            decoder_attention_mask=decoder_attention_mask_cache,
-            past_key_values=past_key_values,
-            decoder_position_ids=decoder_position_ids,
-        )
-        decoder_position_ids = jnp.array(decoder_input_ids.shape[0] * [[decoder_input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model.decode(
-            decoder_input_ids[:, -1:],
-            encoder_outputs,
-            past_key_values=outputs_cache.past_key_values,
-            decoder_attention_mask=decoder_attention_mask_cache,
-            decoder_position_ids=decoder_position_ids,
-        )
-
-        outputs = model.decode(decoder_input_ids, encoder_outputs, decoder_attention_mask=decoder_attention_mask)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-
-@require_flax
-class BartHeadTests(unittest.TestCase):
-    vocab_size = 99
-
-    def _get_config_and_data(self):
-        input_ids = np.array(
-            [
-                [71, 82, 18, 33, 46, 91, 2],
-                [68, 34, 26, 58, 30, 82, 2],
-                [5, 97, 17, 39, 94, 40, 2],
-                [76, 83, 94, 25, 70, 78, 2],
-                [87, 59, 41, 35, 48, 66, 2],
-                [55, 13, 16, 58, 5, 2, 1],  # note padding
-                [64, 27, 31, 51, 12, 75, 2],
-                [52, 64, 86, 17, 83, 39, 2],
-                [48, 61, 9, 24, 71, 82, 2],
-                [26, 1, 60, 48, 22, 13, 2],
-                [21, 5, 62, 28, 14, 76, 2],
-                [45, 98, 37, 86, 59, 48, 2],
-                [70, 70, 50, 9, 28, 0, 2],
-            ],
-            dtype=np.int64,
-        )
-
-        batch_size = input_ids.shape[0]
-        config = BartConfig(
-            vocab_size=self.vocab_size,
-            d_model=24,
-            encoder_layers=2,
-            decoder_layers=2,
-            encoder_attention_heads=2,
-            decoder_attention_heads=2,
-            encoder_ffn_dim=32,
-            decoder_ffn_dim=32,
-            max_position_embeddings=48,
-            eos_token_id=2,
-            pad_token_id=1,
-            bos_token_id=0,
-        )
-        return config, input_ids, batch_size
-
-    def test_sequence_classification_forward(self):
-        config, input_ids, batch_size = self._get_config_and_data()
-        model = FlaxBartForSequenceClassification(config)
-        outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
-        expected_shape = (batch_size, config.num_labels)
-        self.assertEqual(outputs["logits"].shape, expected_shape)
-
-    def test_question_answering_forward(self):
-        config, input_ids, batch_size = self._get_config_and_data()
-        model = FlaxBartForQuestionAnswering(config)
-        outputs = model(input_ids=input_ids)
-
-        self.assertEqual(outputs["start_logits"].shape, input_ids.shape)
-        self.assertEqual(outputs["end_logits"].shape, input_ids.shape)
-
-    # @timeout_decorator.timeout(1)  # not working with the decorator so far
-    def test_lm_forward(self):
-        config, input_ids, batch_size = self._get_config_and_data()
-        lm_model = FlaxBartForConditionalGeneration(config)
-        outputs = lm_model(input_ids=input_ids)
-        expected_shape = (batch_size, input_ids.shape[1], config.vocab_size)
-        self.assertEqual(outputs["logits"].shape, expected_shape)
-
-    def test_lm_uneven_forward(self):
-        config = BartConfig(
-            vocab_size=self.vocab_size,
-            d_model=14,
-            encoder_layers=2,
-            decoder_layers=2,
-            encoder_attention_heads=2,
-            decoder_attention_heads=2,
-            encoder_ffn_dim=8,
-            decoder_ffn_dim=8,
-            max_position_embeddings=48,
-        )
-        lm_model = FlaxBartForConditionalGeneration(config)
-        context = np.array([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]], dtype=np.int64)
-        summary = np.array([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]], dtype=np.int64)
-        outputs = lm_model(input_ids=context, decoder_input_ids=summary)
-        expected_shape = (*summary.shape, config.vocab_size)
-        self.assertEqual(outputs["logits"].shape, expected_shape)
-
-    def test_shift_tokens_right(self):
-        input_ids = np.array([[71, 82, 18, 33, 2, 1, 1], [68, 34, 26, 58, 30, 82, 2]], dtype=np.int64)
-        shifted = shift_tokens_right(input_ids, 1, 2)
-        n_pad_before = np.equal(input_ids, 1).astype(np.float32).sum()
-        n_pad_after = np.equal(shifted, 1).astype(np.float32).sum()
-        self.assertEqual(shifted.shape, input_ids.shape)
-        self.assertEqual(n_pad_after, n_pad_before - 1)
-        self.assertTrue(np.equal(shifted[:, 0], 2).all())
-
-
-@require_flax
-class FlaxBartModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    is_encoder_decoder = True
-    all_model_classes = (
-        (
-            FlaxBartModel,
-            FlaxBartForConditionalGeneration,
-            FlaxBartForSequenceClassification,
-            FlaxBartForQuestionAnswering,
-        )
-        if is_flax_available()
-        else ()
-    )
-
-    def setUp(self):
-        self.model_tester = FlaxBartModelTester(self)
-
-    def test_use_cache_forward(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
-        for model_class in self.all_model_classes:
-            self.model_tester.check_use_cache_forward(model_class, config, inputs_dict)
-
-    def test_use_cache_forward_with_attn_mask(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
-        for model_class in self.all_model_classes:
-            self.model_tester.check_use_cache_forward_with_attn_mask(model_class, config, inputs_dict)
-
-    def test_encode(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                model = model_class(config)
-
-                @jax.jit
-                def encode_jitted(input_ids, attention_mask=None, **kwargs):
-                    return model.encode(input_ids=input_ids, attention_mask=attention_mask)
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = encode_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = encode_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    def test_decode(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                model = model_class(config)
-                encoder_outputs = model.encode(inputs_dict["input_ids"], inputs_dict["attention_mask"])
-
-                prepared_inputs_dict = {
-                    "decoder_input_ids": inputs_dict["decoder_input_ids"],
-                    "decoder_attention_mask": inputs_dict["decoder_attention_mask"],
-                    "encoder_outputs": encoder_outputs,
-                }
-
-                @jax.jit
-                def decode_jitted(decoder_input_ids, decoder_attention_mask, encoder_outputs):
-                    return model.decode(
-                        decoder_input_ids=decoder_input_ids,
-                        decoder_attention_mask=decoder_attention_mask,
-                        encoder_outputs=encoder_outputs,
-                    )
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = decode_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = decode_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("facebook/bart-base", from_pt=True)
-            # FlaxBartForSequenceClassification expects eos token in input_ids
-            input_ids = np.ones((1, 1)) * model.config.eos_token_id
-            outputs = model(input_ids)
-            self.assertIsNotNone(outputs)
-
-    @slow
-    def test_summarization_fast(self):
-        model = FlaxBartForConditionalGeneration.from_pretrained("sshleifer/distilbart-cnn-6-6")
-        tokenizer = BartTokenizer.from_pretrained("sshleifer/distilbart-cnn-6-6")
-
-        input_str = (
-            "This sentence is made of three parts. Each part is important on its own. One part is about animals, the"
-            " other part about planes, and the last part about housing."
-        )
-
-        input_ids = tokenizer(input_str, return_tensors="np").input_ids
-        sequences = model.generate(input_ids, num_beams=2, min_length=None, max_length=20).sequences
-
-        output_str = tokenizer.batch_decode(sequences)[0]
-
-        assert (
-            output_str == "</s><s>This sentence is made of three parts. One part is about animals, the other part</s>"
-        )
-
-    @slow
-    def test_cnn_summarization_same_as_fairseq(self):
-        model = FlaxBartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
-        tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
-
-        FRANCE_ARTICLE = (  # @noq
-            " Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwings"
-            " Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane."
-            ' Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation."'
-            ' He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s'
-            " comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video"
-            " showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French"
-            " Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a"
-            " phone at the wreckage site. The two publications described the supposed video, but did not post it on"
-            " their websites. The publications said that they watched the video, which was found by a source close to"
-            " the investigation. \"One can hear cries of 'My God' in several languages,\" Paris Match reported."
-            ' "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the'
-            " cockpit door with a heavy object.  Towards the end, after a heavy shake, stronger than the others, the"
-            ' screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt,'
-            " editor-in-chief of Bild online. An official with France's accident investigation agency, the BEA, said"
-            " the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman"
-            " in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the"
-            ' reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said,'
-            ' but that they "hadn\'t been exploited yet." Menichini said he believed the cell phones would need to be'
-            " sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by"
-            " specialized technicians working hand-in-hand with investigators. But none of the cell phones found so"
-            " far have been sent to the institute, Menichini said. Asked whether staff involved in the search could"
-            ' have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin'
-            ' Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match'
-            ' are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered'
-            ' cell phones from the crash site after Bild and Paris Match published their reports. "That is something'
-            " we did not know before. ... Overall we can say many things of the investigation weren't revealed by the"
-            ' investigation at the beginning," he said. What was mental state of Germanwings co-pilot? German airline'
-            " Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the"
-            " controls of Germanwings Flight 9525, which he's accused of deliberately crashing last week in the"
-            ' French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of'
-            ' severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school'
-            " discovered in an internal investigation, Lufthansa said, included medical documents he submitted in"
-            " connection with resuming his flight training. The announcement indicates that Lufthansa, the parent"
-            " company of Germanwings, knew of Lubitz's battle with depression, allowed him to continue training and"
-            " ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100%"
-            ' fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was'
-            " sharing the information and documents -- including training and medical records -- with public"
-            " prosecutors. Spohr traveled to the crash site Wednesday, where recovery teams have been working for the"
-            " past week to recover human remains and plane debris scattered across a steep mountainside. He saw the"
-            " crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash"
-            " site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late"
-            " Tuesday that no visible human remains were left at the site but recovery teams would keep searching."
-            " French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all"
-            " the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested."
-            " In the meantime, the recovery of the victims' personal belongings will start Wednesday, Menichini said."
-            " Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew"
-            " on board. Check out the latest from our correspondents . The details about Lubitz's correspondence with"
-            " the flight school during his training were among several developments as investigators continued to"
-            " delve into what caused the crash and Lubitz's possible motive for downing the jet. A Lufthansa"
-            " spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his"
-            ' examinations and "held all the licenses required." Earlier, a spokesman for the prosecutor\'s office in'
-            " Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at"
-            " some point before his aviation career and underwent psychotherapy before he got his pilot's license."
-            " Kumpa emphasized there's no evidence suggesting Lubitz was suicidal or acting aggressively before the"
-            " crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to"
-            " lose his pilot's license, a European government official briefed on the investigation told CNN on"
-            ' Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being'
-            " considered. Another source, a law enforcement official briefed on the investigation, also told CNN that"
-            " authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would"
-            " not be allowed to fly because of his medical problems. Lubitz's girlfriend told investigators he had"
-            " seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded"
-            " he had psychological issues, the European government official said. But no matter what details emerge"
-            " about his previous mental health struggles, there's more to the story, said Brian Russell, a forensic"
-            ' psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact'
-            " that maybe they weren't going to keep doing their job and they're upset about that and so they're"
-            ' suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to'
-            " also take that rage and turn it outward on 149 other people who had nothing to do with the person's"
-            ' problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight'
-            " 9525? CNN's Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura"
-            " Smith-Spark wrote from London. CNN's Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine"
-            " Amiel and Anna-Maja Rappard contributed to this report."
-        )
-
-        SHORTER_ARTICLE = (
-            " (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on"
-            " Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The"
-            " formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based."
-            " The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its"
-            ' jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East'
-            ' Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the'
-            " situation in Palestinian territories, paving the way for possible war crimes investigations against"
-            " Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and"
-            " the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the"
-            " body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday's ceremony, said it was a"
-            ' move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the'
-            ' world is also a step closer to ending a long era of impunity and injustice," he said, according to an'
-            ' ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge'
-            " Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the"
-            ' Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine'
-            " acquires all the rights as well as responsibilities that come with being a State Party to the Statute."
-            ' These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights'
-            ' Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should'
-            " immediately end their pressure, and countries that support universal acceptance of the court's treaty"
-            ' should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the'
-            " group. \"What's objectionable is the attempts to undermine international justice, not Palestine's"
-            ' decision to join a treaty to which over 100 countries around the world are members." In January, when'
-            " the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an"
-            ' outrage, saying the court was overstepping its boundaries. The United States also said it "strongly"'
-            " disagreed with the court's decision. \"As we have said repeatedly, we do not believe that Palestine is a"
-            ' state and therefore we do not believe that it is eligible to join the ICC," the State Department said in'
-            ' a statement. It urged the warring sides to resolve their differences through direct negotiations. "We'
-            ' will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace,"'
-            " it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the"
-            ' territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the'
-            " court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou"
-            ' Bensouda said her office would "conduct its analysis in full independence and impartiality." The war'
-            " between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry"
-            " will include alleged war crimes committed since June. The International Criminal Court was set up in"
-            " 2002 to prosecute genocide, crimes against humanity and war crimes. CNN's Vasco Cotovio, Kareem Khadder"
-            " and Faith Karimi contributed to this report."
-        )
-
-        # The below article tests that we don't add any hypotheses outside of the top n_beams
-        IRAN_ARTICLE = (
-            " (CNN)The United States and its negotiating partners reached a very strong framework agreement with Iran"
-            " in Lausanne, Switzerland, on Thursday that limits Iran's nuclear program in such a way as to effectively"
-            " block it from building a nuclear weapon. Expect pushback anyway, if the recent past is any harbinger."
-            " Just last month, in an attempt to head off such an agreement, House Speaker John Boehner invited Israeli"
-            " Prime Minister Benjamin Netanyahu to preemptively blast it before Congress, and 47 senators sent a"
-            " letter to the Iranian leadership warning them away from a deal. The debate that has already begun since"
-            " the announcement of the new framework will likely result in more heat than light. It will not be helped"
-            " by the gathering swirl of dubious assumptions and doubtful assertions. Let us address some of these: ."
-            " The most misleading assertion, despite universal rejection by experts, is that the negotiations'"
-            " objective at the outset was the total elimination of any nuclear program in Iran. That is the position"
-            " of Netanyahu and his acolytes in the U.S. Congress. But that is not and never was the objective. If it"
-            " had been, there would have been no Iranian team at the negotiating table. Rather, the objective has"
-            " always been to structure an agreement or series of agreements so that Iran could not covertly develop a"
-            " nuclear arsenal before the United States and its allies could respond. The new framework has exceeded"
-            " expectations in achieving that goal. It would reduce Iran's low-enriched uranium stockpile, cut by"
-            " two-thirds its number of installed centrifuges and implement a rigorous inspection regime. Another"
-            " dubious assumption of opponents is that the Iranian nuclear program is a covert weapons program. Despite"
-            " sharp accusations by some in the United States and its allies, Iran denies having such a program, and"
-            " U.S. intelligence contends that Iran has not yet made the decision to build a nuclear weapon. Iran's"
-            " continued cooperation with International Atomic Energy Agency inspections is further evidence on this"
-            " point, and we'll know even more about Iran's program in the coming months and years because of the deal."
-            " In fact, the inspections provisions that are part of this agreement are designed to protect against any"
-            " covert action by the Iranians. What's more, the rhetoric of some members of Congress has implied that"
-            " the negotiations have been between only the United States and Iran (i.e., the 47 senators' letter"
-            " warning that a deal might be killed by Congress or a future president). This of course is not the case."
-            " The talks were between Iran and the five permanent members of the U.N. Security Council (United States,"
-            " United Kingdom, France, China and Russia) plus Germany, dubbed the P5+1. While the United States has"
-            " played a leading role in the effort, it negotiated the terms alongside its partners. If the agreement"
-            " reached by the P5+1 is rejected by Congress, it could result in an unraveling of the sanctions on Iran"
-            " and threaten NATO cohesion in other areas. Another questionable assertion is that this agreement"
-            " contains a sunset clause, after which Iran will be free to do as it pleases. Again, this is not the"
-            " case. Some of the restrictions on Iran's nuclear activities, such as uranium enrichment, will be eased"
-            " or eliminated over time, as long as 15 years. But most importantly, the framework agreement includes"
-            " Iran's ratification of the Additional Protocol, which allows IAEA inspectors expanded access to nuclear"
-            " sites both declared and nondeclared. This provision will be permanent. It does not sunset. Thus, going"
-            " forward, if Iran decides to enrich uranium to weapons-grade levels, monitors will be able to detect such"
-            " a move in a matter of days and alert the U.N. Security Council. Many in Congress have said that the"
-            ' agreement should be a formal treaty requiring the Senate to "advise and consent." But the issue is not'
-            " suited for a treaty. Treaties impose equivalent obligations on all signatories. For example, the New"
-            " START treaty limits Russia and the United States to 1,550 deployed strategic warheads. But any agreement"
-            " with Iran will not be so balanced.  The restrictions and obligations in the final framework agreement"
-            " will be imposed almost exclusively on Iran. The P5+1 are obligated only to ease and eventually remove"
-            " most but not all economic sanctions, which were imposed as leverage to gain this final deal. Finally"
-            " some insist that any agreement must address Iranian missile programs, human rights violations or support"
-            " for Hamas or Hezbollah.  As important as these issues are, and they must indeed be addressed, they are"
-            " unrelated to the most important aim of a nuclear deal: preventing a nuclear Iran.  To include them in"
-            " the negotiations would be a poison pill. This agreement should be judged on its merits and on how it"
-            " affects the security of our negotiating partners and allies, including Israel. Those judgments should be"
-            " fact-based, not based on questionable assertions or dubious assumptions."
-        )
-
-        ARTICLE_SUBWAY = (
-            " New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A"
-            " year later, she got married again in Westchester County, but to a different man and without divorcing"
-            " her first husband.  Only 18 days after that marriage, she got hitched yet again. Then, Barrientos"
-            ' declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married'
-            " once more, this time in the Bronx. In an application for a marriage license, she stated it was her"
-            ' "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false'
-            ' instrument for filing in the first degree," referring to her false statements on the 2010 marriage'
-            " license application, according to court documents. Prosecutors said the marriages were part of an"
-            " immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to"
-            " her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was"
-            " arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New"
-            " York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total,"
-            " Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.  All"
-            " occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be"
-            " married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors"
-            " said the immigration scam involved some of her husbands, who filed for permanent residence status"
-            " shortly after the marriages.  Any divorces happened only after such filings were approved. It was"
-            " unclear whether any of the men will be prosecuted. The case was referred to the Bronx District"
-            " Attorney's Office by Immigration and Customs Enforcement and the Department of Homeland Security's"
-            ' Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt,'
-            " Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his"
-            " native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces"
-            " up to four years in prison.  Her next court appearance is scheduled for May 18."
-        )
-
-        dct = tokenizer.batch_encode_plus(
-            [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY],
-            max_length=1024,
-            padding="max_length",
-            truncation_strategy="only_first",
-            truncation=True,
-            return_tensors="np",
-        )
-
-        self.assertEqual(1024, dct["input_ids"].shape[1])
-        hypotheses_batch = model.generate(
-            input_ids=dct["input_ids"],
-            attention_mask=dct["attention_mask"],
-            num_beams=2,
-        ).sequences
-        assert (hypotheses_batch[:, 1] == 0).all().item()
-
-        EXPECTED = [
-            "A French prosecutor says he is not aware of any video footage from on board the plane. Two German"
-            " magazines claim to have found a cell phone video showing the crash. The publications say they watched"
-            " the video, which was found by a source close to the investigation. All 150 on board the Germanwings"
-            " flight were killed.",
-            "Palestinian Authority becomes 123rd member of the International Criminal Court. The move gives the court"
-            " jurisdiction over alleged crimes in Palestinian territories. Israel and the United States opposed the"
-            " Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki said it was a"
-            " move toward greater justice.",
-            "U.S. and its negotiating partners reached a strong framework agreement with Iran. Peter Bergen: The"
-            " debate that has already begun will likely result in more heat than light. Bergen: The most misleading"
-            " assertion is that the negotiations' objective at the outset was the total elimination of any nuclear"
-            " program.",
-            "Liana Barrientos, 39, has been married 10 times, sometimes within two weeks of each other. Prosecutors"
-            " say the marriages were part of an immigration scam. She pleaded not guilty at State Supreme Court in the"
-            " Bronx on Friday. If convicted, Barrientos faces up to four years in prison.",
-        ]
-
-        generated_summaries = tokenizer.batch_decode(
-            hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True
-        )
-        assert generated_summaries == EXPECTED
-
-
-class FlaxBartStandaloneDecoderModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_attention_mask=True,
-        use_labels=False,
-        vocab_size=99,
-        hidden_size=16,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=4,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=32,
-        eos_token_id=2,
-        pad_token_id=1,
-        bos_token_id=0,
-        initializer_range=0.02,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_attention_mask = use_attention_mask
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.initializer_range = initializer_range
-
-    def prepare_config_and_inputs(self):
-        input_ids = jnp.clip(ids_tensor([self.batch_size, self.seq_length], self.vocab_size), 3, self.vocab_size)
-
-        attention_mask = None
-        if self.use_attention_mask:
-            attention_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        config = BartConfig(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            encoder_layers=self.num_hidden_layers,
-            decoder_layers=self.num_hidden_layers,
-            encoder_attention_heads=self.num_attention_heads,
-            decoder_attention_heads=self.num_attention_heads,
-            encoder_ffn_dim=self.intermediate_size,
-            decoder_ffn_dim=self.intermediate_size,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            eos_token_id=self.eos_token_id,
-            bos_token_id=self.bos_token_id,
-            pad_token_id=self.pad_token_id,
-            initializer_range=self.initializer_range,
-            use_cache=False,
-        )
-
-        return config, input_ids, attention_mask
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, attention_mask = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask}
-        return config, inputs_dict
-
-    def prepare_config_and_inputs_for_decoder(self):
-        config, input_ids, attention_mask = self.prepare_config_and_inputs()
-
-        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
-        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-        return (
-            config,
-            input_ids,
-            attention_mask,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
diff --git a/tests/models/bart/test_modeling_tf_bart.py b/tests/models/bart/test_modeling_tf_bart.py
deleted file mode 100644
index 63b2e8bd6d..0000000000
--- a/tests/models/bart/test_modeling_tf_bart.py
+++ /dev/null
@@ -1,1131 +0,0 @@
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import copy
-import tempfile
-import unittest
-
-import numpy as np
-
-from transformers import BartConfig, BartTokenizer, is_tf_available
-from transformers.testing_utils import require_tf, slow
-from transformers.utils import cached_property
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-from ...utils.test_modeling_tf_core import TFCoreModelTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import TFBartForConditionalGeneration, TFBartForSequenceClassification, TFBartModel
-
-
-@require_tf
-class TFBartModelTester:
-    config_cls = BartConfig
-    config_updates = {}
-    hidden_act = "gelu"
-
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_labels=False,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=20,
-        eos_token_id=2,
-        pad_token_id=1,
-        bos_token_id=0,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-
-    def prepare_config_and_inputs_for_common(self):
-        # Ids are clipped to avoid "beginng of sequence", "end of sequence", and "pad" tokens
-        input_ids = tf.clip_by_value(
-            ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size),
-            clip_value_min=self.eos_token_id + 1,
-            clip_value_max=self.vocab_size + 1,
-        )
-        # Explicitly add "end of sequence" to the inputs
-        eos_tensor = tf.expand_dims(tf.constant([self.eos_token_id] * self.batch_size), 1)
-        input_ids = tf.concat([input_ids, eos_tensor], axis=1)
-
-        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        config = self.config_cls(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            encoder_layers=self.num_hidden_layers,
-            decoder_layers=self.num_hidden_layers,
-            encoder_attention_heads=self.num_attention_heads,
-            decoder_attention_heads=self.num_attention_heads,
-            encoder_ffn_dim=self.intermediate_size,
-            decoder_ffn_dim=self.intermediate_size,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            eos_token_ids=[2],
-            bos_token_id=self.bos_token_id,
-            pad_token_id=self.pad_token_id,
-            decoder_start_token_id=self.pad_token_id,
-            **self.config_updates,
-        )
-        inputs_dict = prepare_bart_inputs_dict(config, input_ids, decoder_input_ids)
-        return config, inputs_dict
-
-    def check_decoder_model_past_large_inputs(self, config, inputs_dict):
-        model = TFBartModel(config=config).get_decoder()
-        input_ids = inputs_dict["input_ids"]
-
-        input_ids = input_ids[:1, :]
-        attention_mask = inputs_dict["attention_mask"][:1, :]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
-
-        output, past_key_values = outputs.to_tuple()
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = tf.cast(ids_tensor((self.batch_size, 3), 2), tf.int8)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([attention_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)
-        output_from_no_past = output_from_no_past[0]
-
-        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)
-        output_from_past = output_from_past[0]
-
-        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-
-def prepare_bart_inputs_dict(
-    config,
-    input_ids,
-    decoder_input_ids,
-    attention_mask=None,
-    decoder_attention_mask=None,
-):
-    if attention_mask is None:
-        attention_mask = tf.cast(tf.math.not_equal(input_ids, config.pad_token_id), tf.int8)
-    if decoder_attention_mask is None:
-        decoder_attention_mask = tf.concat(
-            [
-                tf.ones(decoder_input_ids[:, :1].shape, dtype=tf.int8),
-                tf.cast(tf.math.not_equal(decoder_input_ids[:, 1:], config.pad_token_id), tf.int8),
-            ],
-            axis=-1,
-        )
-    return {
-        "input_ids": input_ids,
-        "decoder_input_ids": decoder_input_ids,
-        "attention_mask": attention_mask,
-        "decoder_attention_mask": decoder_attention_mask,
-    }
-
-
-@require_tf
-class TFBartModelTest(TFModelTesterMixin, TFCoreModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (TFBartForConditionalGeneration, TFBartForSequenceClassification, TFBartModel) if is_tf_available() else ()
-    )
-    all_generative_model_classes = (TFBartForConditionalGeneration,) if is_tf_available() else ()
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFBartModel,
-            "summarization": TFBartForConditionalGeneration,
-            "text-classification": TFBartForSequenceClassification,
-            "text2text-generation": TFBartForConditionalGeneration,
-            "translation": TFBartForConditionalGeneration,
-            "zero-shot": TFBartForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    is_encoder_decoder = True
-    test_pruning = False
-    test_onnx = True
-    onnx_min_opset = 10
-
-    def setUp(self):
-        self.model_tester = TFBartModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=BartConfig)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_decoder_model_past_large_inputs(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
-        self.model_tester.check_decoder_model_past_large_inputs(*config_and_inputs)
-
-    # TODO (Joao): fix me
-    @unittest.skip("Onnx compliance broke with TF 2.10")
-    def test_onnx_compliancy(self):
-        pass
-
-    # TFBartForSequenceClassification does not support inputs_embeds
-    def test_inputs_embeds(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in (TFBartForConditionalGeneration, TFBartModel):
-            model = model_class(config)
-
-            inputs = copy.deepcopy(inputs_dict)
-
-            if not self.is_encoder_decoder:
-                input_ids = inputs["input_ids"]
-                del inputs["input_ids"]
-            else:
-                encoder_input_ids = inputs["input_ids"]
-                decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
-                del inputs["input_ids"]
-                inputs.pop("decoder_input_ids", None)
-
-            if not self.is_encoder_decoder:
-                inputs["inputs_embeds"] = model.get_input_embeddings()(input_ids)
-            else:
-                inputs["inputs_embeds"] = model.get_input_embeddings()(encoder_input_ids)
-                inputs["decoder_inputs_embeds"] = model.get_input_embeddings()(decoder_input_ids)
-
-            inputs = self._prepare_for_class(inputs, model_class)
-
-            model(inputs)
-
-    # TFBartForSequenceClassification does not support inputs_embeds
-    @slow
-    def test_graph_mode_with_inputs_embeds(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in (TFBartForConditionalGeneration, TFBartModel):
-            model = model_class(config)
-
-            inputs = copy.deepcopy(inputs_dict)
-
-            if not self.is_encoder_decoder:
-                input_ids = inputs["input_ids"]
-                del inputs["input_ids"]
-            else:
-                encoder_input_ids = inputs["input_ids"]
-                decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
-                del inputs["input_ids"]
-                inputs.pop("decoder_input_ids", None)
-
-            if not self.is_encoder_decoder:
-                inputs["inputs_embeds"] = model.get_input_embeddings()(input_ids)
-            else:
-                inputs["inputs_embeds"] = model.get_input_embeddings()(encoder_input_ids)
-                inputs["decoder_inputs_embeds"] = model.get_input_embeddings()(decoder_input_ids)
-
-            inputs = self._prepare_for_class(inputs, model_class)
-
-            @tf.function
-            def run_in_graph_mode():
-                return model(inputs)
-
-            outputs = run_in_graph_mode()
-            self.assertIsNotNone(outputs)
-
-    @slow
-    def test_save_load_after_resize_token_embeddings(self):
-        # Custom version of this test to ensure "end of sequence" tokens are present throughout
-        if not self.test_resize_embeddings:
-            return
-        config, original_inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            # create a model with resized (expended) embeddings
-            new_tokens_size = 10
-            old_total_size = config.vocab_size
-            new_total_size = old_total_size + new_tokens_size
-            model = model_class(config=copy.deepcopy(config))  # `resize_token_embeddings` mutates `config`
-            model.build_in_name_scope()
-            model.resize_token_embeddings(new_total_size)
-
-            # fetch the output for an input exclusively made of new members of the vocabulary
-            inputs_dict = copy.deepcopy(original_inputs_dict)
-            ids_feat_name = None
-            if "input_ids" in inputs_dict:
-                ids_feat_name = "input_ids"
-            elif "decoder_input_ids" in inputs_dict:
-                ids_feat_name = "decoder_input_ids"
-            else:
-                assert False, "No input ids feature found in the inputs dict"
-
-            new_vocab_input_ids = ids_tensor(inputs_dict[ids_feat_name].shape, new_tokens_size)
-            new_vocab_input_ids += old_total_size
-
-            # Replace last id with EOS token
-            new_vocab_input_ids = new_vocab_input_ids[:, :-1]
-            new_vocab_input_ids = tf.concat(
-                [new_vocab_input_ids, tf.ones((tf.shape(new_vocab_input_ids)[0], 1), dtype=tf.int32) * 2], axis=1
-            )
-
-            inputs_dict[ids_feat_name] = new_vocab_input_ids
-            if "input_ids" in inputs_dict:
-                inputs_dict["input_ids"] = new_vocab_input_ids
-            if "decoder_input_ids" in inputs_dict:
-                inputs_dict["decoder_input_ids"] = new_vocab_input_ids
-            prepared_inputs = self._prepare_for_class(inputs_dict, model_class)
-            outputs = model(**prepared_inputs)
-
-            # save and load the model
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname, saved_model=False)
-                model = model_class.from_pretrained(tmpdirname)
-                restored_model_outputs = model(**prepared_inputs)
-
-                # check that the output for the restored model is the same
-                self.assert_outputs_same(restored_model_outputs, outputs)
-
-
-def _long_tensor(tok_lst):
-    return tf.constant(tok_lst, dtype=tf.int32)
-
-
-@require_tf
-class TFBartHeadTests(unittest.TestCase):
-    vocab_size = 99
-
-    def _get_config_and_data(self):
-        eos_column_vector = tf.ones((4, 1), dtype=tf.int32) * 2
-        input_ids = tf.concat([ids_tensor((4, 6), self.vocab_size - 3) + 3, eos_column_vector], axis=1)
-        batch_size = input_ids.shape[0]
-        config = BartConfig(
-            vocab_size=self.vocab_size,
-            d_model=24,
-            encoder_layers=2,
-            decoder_layers=2,
-            encoder_attention_heads=2,
-            decoder_attention_heads=2,
-            encoder_ffn_dim=32,
-            decoder_ffn_dim=32,
-            max_position_embeddings=48,
-            eos_token_id=2,
-            pad_token_id=1,
-            bos_token_id=0,
-            decoder_start_token_id=2,
-        )
-        return config, input_ids, batch_size
-
-    def test_lm_forward(self):
-        config, input_ids, batch_size = self._get_config_and_data()
-        decoder_lm_labels = ids_tensor([batch_size, input_ids.shape[1]], self.vocab_size)
-        lm_model = TFBartForConditionalGeneration(config)
-        outputs = lm_model(input_ids=input_ids, labels=decoder_lm_labels, decoder_input_ids=input_ids, use_cache=False)
-        expected_shape = (batch_size, input_ids.shape[1], config.vocab_size)
-        self.assertEqual(outputs.logits.shape, expected_shape)
-
-    def test_lm_uneven_forward(self):
-        config = BartConfig(
-            vocab_size=10,
-            d_model=24,
-            encoder_layers=2,
-            decoder_layers=2,
-            encoder_attention_heads=2,
-            decoder_attention_heads=2,
-            encoder_ffn_dim=32,
-            decoder_ffn_dim=32,
-            max_position_embeddings=48,
-        )
-        lm_model = TFBartForConditionalGeneration(config)
-        context = tf.fill((7, 2), 4)
-        summary = tf.fill((7, 7), 6)
-        outputs = lm_model(input_ids=context, decoder_input_ids=summary, use_cache=False)
-        expected_shape = (*summary.shape, config.vocab_size)
-        self.assertEqual(outputs.logits.shape, expected_shape)
-
-
-@require_tf
-class TFBartForSequenceClassificationTest(unittest.TestCase):
-    def test_model_fails_for_uneven_eos_tokens(self):
-        config = BartConfig(eos_token_id=2)
-        model = TFBartForSequenceClassification(config)
-        inputs = {
-            "input_ids": tf.constant([[1, 2, 2, 2], [1, 3, 2, 2], [2, 2, 3, 3]]),
-            "attention_mask": tf.constant([[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]),
-        }
-        with self.assertRaises(tf.errors.InvalidArgumentError):
-            model(inputs)
-
-
-@slow
-@require_tf
-class TFBartModelIntegrationTest(unittest.TestCase):
-    def test_inference_no_head(self):
-        model = TFBartForConditionalGeneration.from_pretrained("facebook/bart-large").model
-
-        input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        attention_mask = tf.cast(tf.math.not_equal(input_ids, model.config.pad_token_id), tf.int8)
-        output = model(input_ids=input_ids, attention_mask=attention_mask)[0]
-        expected_shape = (1, 11, 1024)
-        self.assertEqual(output.shape, expected_shape)
-        expected_slice = tf.convert_to_tensor(
-            [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]],
-        )
-        tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-3)
-
-    def test_cnn_summarization_same_as_fairseq_hard(self):
-        hf = TFBartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
-        tok = self.tok
-
-        FRANCE_ARTICLE = (  # @noqa
-            " Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwings"
-            " Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane."
-            ' Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation."'
-            ' He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s'
-            " comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video"
-            " showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French"
-            " Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a"
-            " phone at the wreckage site. The two publications described the supposed video, but did not post it on"
-            " their websites. The publications said that they watched the video, which was found by a source close to"
-            " the investigation. \"One can hear cries of 'My God' in several languages,\" Paris Match reported."
-            ' "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the'
-            " cockpit door with a heavy object.  Towards the end, after a heavy shake, stronger than the others, the"
-            ' screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt,'
-            " editor-in-chief of Bild online. An official with France's accident investigation agency, the BEA, said"
-            " the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman"
-            " in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the"
-            ' reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said,'
-            ' but that they "hadn\'t been exploited yet." Menichini said he believed the cell phones would need to be'
-            " sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by"
-            " specialized technicians working hand-in-hand with investigators. But none of the cell phones found so"
-            " far have been sent to the institute, Menichini said. Asked whether staff involved in the search could"
-            ' have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin'
-            ' Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match'
-            ' are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered'
-            ' cell phones from the crash site after Bild and Paris Match published their reports. "That is something'
-            " we did not know before. ... Overall we can say many things of the investigation weren't revealed by the"
-            ' investigation at the beginning," he said. What was mental state of Germanwings co-pilot? German airline'
-            " Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the"
-            " controls of Germanwings Flight 9525, which he's accused of deliberately crashing last week in the"
-            ' French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of'
-            ' severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school'
-            " discovered in an internal investigation, Lufthansa said, included medical documents he submitted in"
-            " connection with resuming his flight training. The announcement indicates that Lufthansa, the parent"
-            " company of Germanwings, knew of Lubitz's battle with depression, allowed him to continue training and"
-            " ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100%"
-            ' fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was'
-            " sharing the information and documents -- including training and medical records -- with public"
-            " prosecutors. Spohr traveled to the crash site Wednesday, where recovery teams have been working for the"
-            " past week to recover human remains and plane debris scattered across a steep mountainside. He saw the"
-            " crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash"
-            " site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late"
-            " Tuesday that no visible human remains were left at the site but recovery teams would keep searching."
-            " French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all"
-            " the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested."
-            " In the meantime, the recovery of the victims' personal belongings will start Wednesday, Menichini said."
-            " Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew"
-            " on board. Check out the latest from our correspondents . The details about Lubitz's correspondence with"
-            " the flight school during his training were among several developments as investigators continued to"
-            " delve into what caused the crash and Lubitz's possible motive for downing the jet. A Lufthansa"
-            " spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his"
-            ' examinations and "held all the licenses required." Earlier, a spokesman for the prosecutor\'s office in'
-            " Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at"
-            " some point before his aviation career and underwent psychotherapy before he got his pilot's license."
-            " Kumpa emphasized there's no evidence suggesting Lubitz was suicidal or acting aggressively before the"
-            " crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to"
-            " lose his pilot's license, a European government official briefed on the investigation told CNN on"
-            ' Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being'
-            " considered. Another source, a law enforcement official briefed on the investigation, also told CNN that"
-            " authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would"
-            " not be allowed to fly because of his medical problems. Lubitz's girlfriend told investigators he had"
-            " seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded"
-            " he had psychological issues, the European government official said. But no matter what details emerge"
-            " about his previous mental health struggles, there's more to the story, said Brian Russell, a forensic"
-            ' psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact'
-            " that maybe they weren't going to keep doing their job and they're upset about that and so they're"
-            ' suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to'
-            " also take that rage and turn it outward on 149 other people who had nothing to do with the person's"
-            ' problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight'
-            " 9525? CNN's Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura"
-            " Smith-Spark wrote from London. CNN's Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine"
-            " Amiel and Anna-Maja Rappard contributed to this report."
-        )
-        EXPECTED_SUMMARY_FRANCE = (
-            "French prosecutor says he's not aware of any video footage from on board the plane. German daily Bild"
-            " and French Paris Match claim to have found a cell phone video of the crash. A French Gendarmerie"
-            ' spokesman calls the reports "completely wrong" and "unwarranted" German airline Lufthansa confirms'
-            " co-pilot Andreas Lubitz had battled depression."
-        )
-
-        SHORTER_ARTICLE = (
-            " (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on"
-            " Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The"
-            " formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based."
-            " The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its"
-            ' jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East'
-            ' Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the'
-            " situation in Palestinian territories, paving the way for possible war crimes investigations against"
-            " Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and"
-            " the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the"
-            " body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday's ceremony, said it was a"
-            ' move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the'
-            ' world is also a step closer to ending a long era of impunity and injustice," he said, according to an'
-            ' ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge'
-            " Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the"
-            ' Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine'
-            " acquires all the rights as well as responsibilities that come with being a State Party to the Statute."
-            ' These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights'
-            ' Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should'
-            " immediately end their pressure, and countries that support universal acceptance of the court's treaty"
-            ' should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the'
-            " group. \"What's objectionable is the attempts to undermine international justice, not Palestine's"
-            ' decision to join a treaty to which over 100 countries around the world are members." In January, when'
-            " the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an"
-            ' outrage, saying the court was overstepping its boundaries. The United States also said it "strongly"'
-            " disagreed with the court's decision. \"As we have said repeatedly, we do not believe that Palestine is a"
-            ' state and therefore we do not believe that it is eligible to join the ICC," the State Department said in'
-            ' a statement. It urged the warring sides to resolve their differences through direct negotiations. "We'
-            ' will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace,"'
-            " it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the"
-            ' territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the'
-            " court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou"
-            ' Bensouda said her office would "conduct its analysis in full independence and impartiality." The war'
-            " between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry"
-            " will include alleged war crimes committed since June. The International Criminal Court was set up in"
-            " 2002 to prosecute genocide, crimes against humanity and war crimes. CNN's Vasco Cotovio, Kareem Khadder"
-            " and Faith Karimi contributed to this report."
-        )
-        EXPECTED_SUMMARY_SHORTER = (
-            "The Palestinian Authority becomes the 123rd member of the International Criminal Court. The move gives"
-            " the court jurisdiction over alleged crimes in Palestinian territories. Israel and the United States"
-            " opposed the Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki said"
-            " it was a move toward greater justice."
-        )
-
-        # The below article tests that we don't add any hypotheses outside of the top n_beams
-        IRAN_ARTICLE = (
-            " (CNN)The United States and its negotiating partners reached a very strong framework agreement with Iran"
-            " in Lausanne, Switzerland, on Thursday that limits Iran's nuclear program in such a way as to effectively"
-            " block it from building a nuclear weapon. Expect pushback anyway, if the recent past is any harbinger."
-            " Just last month, in an attempt to head off such an agreement, House Speaker John Boehner invited Israeli"
-            " Prime Minister Benjamin Netanyahu to preemptively blast it before Congress, and 47 senators sent a"
-            " letter to the Iranian leadership warning them away from a deal. The debate that has already begun since"
-            " the announcement of the new framework will likely result in more heat than light. It will not be helped"
-            " by the gathering swirl of dubious assumptions and doubtful assertions. Let us address some of these: ."
-            " The most misleading assertion, despite universal rejection by experts, is that the negotiations'"
-            " objective at the outset was the total elimination of any nuclear program in Iran. That is the position"
-            " of Netanyahu and his acolytes in the U.S. Congress. But that is not and never was the objective. If it"
-            " had been, there would have been no Iranian team at the negotiating table. Rather, the objective has"
-            " always been to structure an agreement or series of agreements so that Iran could not covertly develop a"
-            " nuclear arsenal before the United States and its allies could respond. The new framework has exceeded"
-            " expectations in achieving that goal. It would reduce Iran's low-enriched uranium stockpile, cut by"
-            " two-thirds its number of installed centrifuges and implement a rigorous inspection regime. Another"
-            " dubious assumption of opponents is that the Iranian nuclear program is a covert weapons program. Despite"
-            " sharp accusations by some in the United States and its allies, Iran denies having such a program, and"
-            " U.S. intelligence contends that Iran has not yet made the decision to build a nuclear weapon. Iran's"
-            " continued cooperation with International Atomic Energy Agency inspections is further evidence on this"
-            " point, and we'll know even more about Iran's program in the coming months and years because of the deal."
-            " In fact, the inspections provisions that are part of this agreement are designed to protect against any"
-            " covert action by the Iranians. What's more, the rhetoric of some members of Congress has implied that"
-            " the negotiations have been between only the United States and Iran (i.e., the 47 senators' letter"
-            " warning that a deal might be killed by Congress or a future president). This of course is not the case."
-            " The talks were between Iran and the five permanent members of the U.N. Security Council (United States,"
-            " United Kingdom, France, China and Russia) plus Germany, dubbed the P5+1. While the United States has"
-            " played a leading role in the effort, it negotiated the terms alongside its partners. If the agreement"
-            " reached by the P5+1 is rejected by Congress, it could result in an unraveling of the sanctions on Iran"
-            " and threaten NATO cohesion in other areas. Another questionable assertion is that this agreement"
-            " contains a sunset clause, after which Iran will be free to do as it pleases. Again, this is not the"
-            " case. Some of the restrictions on Iran's nuclear activities, such as uranium enrichment, will be eased"
-            " or eliminated over time, as long as 15 years. But most importantly, the framework agreement includes"
-            " Iran's ratification of the Additional Protocol, which allows IAEA inspectors expanded access to nuclear"
-            " sites both declared and nondeclared. This provision will be permanent. It does not sunset. Thus, going"
-            " forward, if Iran decides to enrich uranium to weapons-grade levels, monitors will be able to detect such"
-            " a move in a matter of days and alert the U.N. Security Council. Many in Congress have said that the"
-            ' agreement should be a formal treaty requiring the Senate to "advise and consent." But the issue is not'
-            " suited for a treaty. Treaties impose equivalent obligations on all signatories. For example, the New"
-            " START treaty limits Russia and the United States to 1,550 deployed strategic warheads. But any agreement"
-            " with Iran will not be so balanced.  The restrictions and obligations in the final framework agreement"
-            " will be imposed almost exclusively on Iran. The P5+1 are obligated only to ease and eventually remove"
-            " most but not all economic sanctions, which were imposed as leverage to gain this final deal. Finally"
-            " some insist that any agreement must address Iranian missile programs, human rights violations or support"
-            " for Hamas or Hezbollah.  As important as these issues are, and they must indeed be addressed, they are"
-            " unrelated to the most important aim of a nuclear deal: preventing a nuclear Iran.  To include them in"
-            " the negotiations would be a poison pill. This agreement should be judged on its merits and on how it"
-            " affects the security of our negotiating partners and allies, including Israel. Those judgments should be"
-            " fact-based, not based on questionable assertions or dubious assumptions."
-        )
-        EXPECTED_SUMMARY_IRAN = (
-            "The U.S. and its negotiating partners reached a very strong framework agreement with Iran. Peter Bergen:"
-            " The debate that has already begun will likely result in more heat than light. He says the agreement"
-            " limits Iran's nuclear program in such a way as to effectively block it from building a nuclear weapon."
-            " Bergen says the most important aim of a nuclear deal is preventing a nuclear Iran."
-        )
-
-        ARTICLE_SUBWAY = (
-            " New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A"
-            " year later, she got married again in Westchester County, but to a different man and without divorcing"
-            " her first husband.  Only 18 days after that marriage, she got hitched yet again. Then, Barrientos"
-            ' declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married'
-            " once more, this time in the Bronx. In an application for a marriage license, she stated it was her"
-            ' "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false'
-            ' instrument for filing in the first degree," referring to her false statements on the 2010 marriage'
-            " license application, according to court documents. Prosecutors said the marriages were part of an"
-            " immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to"
-            " her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was"
-            " arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New"
-            " York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total,"
-            " Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.  All"
-            " occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be"
-            " married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors"
-            " said the immigration scam involved some of her husbands, who filed for permanent residence status"
-            " shortly after the marriages.  Any divorces happened only after such filings were approved. It was"
-            " unclear whether any of the men will be prosecuted. The case was referred to the Bronx District"
-            " Attorney's Office by Immigration and Customs Enforcement and the Department of Homeland Security's"
-            ' Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt,'
-            " Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his"
-            " native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces"
-            " up to four years in prison.  Her next court appearance is scheduled for May 18."
-        )
-        EXPECTED_SUMMARY_SUBWAY = (
-            "Liana Barrientos has been married 10 times, sometimes within two weeks of each other. Prosecutors say the"
-            " marriages were part of an immigration scam. On Friday, she pleaded not guilty at State Supreme Court in"
-            " the Bronx. She was arrested and charged with theft of service and criminal trespass for allegedly"
-            " sneaking into the subway."
-        )
-
-        dct = tok(
-            [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY],
-            max_length=1024,
-            truncation_strategy="only_first",
-            padding="longest",
-            truncation=True,
-            return_tensors="tf",
-        )
-        self.assertEqual(1024, dct["input_ids"].shape[1])
-        hypotheses_batch = hf.generate(
-            input_ids=dct["input_ids"],
-            attention_mask=dct["attention_mask"],
-        )
-
-        assert hypotheses_batch[:, 1].numpy().tolist() == [0, 0, 0, 0]  # test force_bos_token_to_be_generated
-        decoded = tok.batch_decode(hypotheses_batch, skip_special_tokens=True, clean_up_tokenization_spaces=False)
-        expected_batch = [
-            EXPECTED_SUMMARY_FRANCE,
-            EXPECTED_SUMMARY_SHORTER,
-            EXPECTED_SUMMARY_IRAN,
-            EXPECTED_SUMMARY_SUBWAY,
-        ]
-        assert decoded == expected_batch
-
-    @cached_property
-    def tok(self):
-        return BartTokenizer.from_pretrained("facebook/bart-large")
-
-    @slow
-    def test_contrastive_search_bart(self):
-        article = (
-            " New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A"
-            " year later, she got married again in Westchester County, but to a different man and without divorcing"
-            " her first husband.  Only 18 days after that marriage, she got hitched yet again. Then, Barrientos"
-            ' declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married'
-            " once more, this time in the Bronx. In an application for a marriage license, she stated it was her"
-            ' "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false'
-            ' instrument for filing in the first degree," referring to her false statements on the 2010 marriage'
-            " license application, according to court documents. Prosecutors said the marriages were part of an"
-            " immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to"
-            " her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was"
-            " arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New"
-            " York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total,"
-            " Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.  All"
-            " occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be"
-            " married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors"
-            " said the immigration scam involved some of her husbands, who filed for permanent residence status"
-            " shortly after the marriages.  Any divorces happened only after such filings were approved. It was"
-            " unclear whether any of the men will be prosecuted. The case was referred to the Bronx District"
-            " Attorney's Office by Immigration and Customs Enforcement and the Department of Homeland Security's"
-            ' Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt,'
-            " Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his"
-            " native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces"
-            " up to four years in prison.  Her next court appearance is scheduled for May 18."
-        )
-        bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
-        bart_model = TFBartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
-        input_ids = bart_tokenizer(
-            article, add_special_tokens=False, truncation=True, max_length=512, return_tensors="tf"
-        ).input_ids
-
-        outputs = bart_model.generate(input_ids, penalty_alpha=0.5, top_k=5, max_length=64)
-        generated_text = bart_tokenizer.batch_decode(outputs, skip_special_tokens=True)
-
-        self.assertListEqual(
-            generated_text,
-            [
-                "Liana Barrientos, 39, pleaded not guilty to charges related to false marriage statements. "
-                "Prosecutors say she married at least 10 times, sometimes within two weeks of each other. She is "
-                "accused of being part of an immigration scam to get permanent residency. If convicted, she faces up "
-                "to four years in"
-            ],
-        )
-
-    @slow
-    def test_contrastive_search_bart_xla(self):
-        article = (
-            " New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A"
-            " year later, she got married again in Westchester County, but to a different man and without divorcing"
-            " her first husband.  Only 18 days after that marriage, she got hitched yet again. Then, Barrientos"
-            ' declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married'
-            " once more, this time in the Bronx. In an application for a marriage license, she stated it was her"
-            ' "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false'
-            ' instrument for filing in the first degree," referring to her false statements on the 2010 marriage'
-            " license application, according to court documents. Prosecutors said the marriages were part of an"
-            " immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to"
-            " her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was"
-            " arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New"
-            " York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total,"
-            " Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.  All"
-            " occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be"
-            " married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors"
-            " said the immigration scam involved some of her husbands, who filed for permanent residence status"
-            " shortly after the marriages.  Any divorces happened only after such filings were approved. It was"
-            " unclear whether any of the men will be prosecuted. The case was referred to the Bronx District"
-            " Attorney's Office by Immigration and Customs Enforcement and the Department of Homeland Security's"
-            ' Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt,'
-            " Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his"
-            " native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces"
-            " up to four years in prison.  Her next court appearance is scheduled for May 18."
-        )
-        bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
-        bart_model = TFBartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
-        input_ids = bart_tokenizer(
-            article, add_special_tokens=False, truncation=True, max_length=512, return_tensors="tf"
-        ).input_ids
-
-        xla_generate = tf.function(bart_model.generate, jit_compile=True)
-        # no_repeat_ngram_size set to 0 because it isn't compatible with XLA, but doesn't change the original output
-        outputs = xla_generate(input_ids, penalty_alpha=0.5, top_k=5, max_length=64, no_repeat_ngram_size=0)
-        generated_text = bart_tokenizer.batch_decode(outputs, skip_special_tokens=True)
-
-        self.assertListEqual(
-            generated_text,
-            [
-                "Liana Barrientos, 39, pleaded not guilty to charges related to false marriage statements. "
-                "Prosecutors say she married at least 10 times, sometimes within two weeks of each other. She is "
-                "accused of being part of an immigration scam to get permanent residency. If convicted, she faces up "
-                "to four years in"
-            ],
-        )
-
-
-@slow
-@require_tf
-class FasterTFBartModelIntegrationTests(unittest.TestCase):
-    """These tests are useful for debugging since they operate on a model with 1 encoder layer and 1 decoder layer."""
-
-    @cached_property
-    def tok(self):
-        return BartTokenizer.from_pretrained("facebook/bart-large")
-
-    @cached_property
-    def xsum_1_1_model(self):
-        return TFBartForConditionalGeneration.from_pretrained("sshleifer/distilbart-xsum-1-1")
-
-    def test_xsum_1_1_generation(self):
-        model = self.xsum_1_1_model
-        assert model.model.decoder.embed_tokens == model.model.shared
-        ARTICLE = (
-            "The Palestinian Authority officially became the 123rd member of the International Criminal Court on"
-            " Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The"
-            " formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based."
-            " The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its"
-            ' jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East'
-            ' Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the'
-            " situation in Palestinian territories, paving the way for possible war crimes investigations against"
-            " Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and"
-            " the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the"
-            " body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday's ceremony, said it was a"
-            ' move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the'
-            ' world is also a step closer to ending a long era of impunity and injustice," he said, according to an'
-            ' ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge'
-            " Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the"
-            ' Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine'
-            " acquires all the rights as well as responsibilities that come with being a State Party to the Statute."
-            ' These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights'
-            ' Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should'
-            " immediately end their pressure, and countries that support universal acceptance of the court's treaty"
-            ' should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the'
-            " group. \"What's objectionable is the attempts to undermine international justice, not Palestine's"
-            ' decision to join a treaty to which over 100 countries around the world are members." In January, when'
-            " the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an"
-            ' outrage, saying the court was overstepping its boundaries. The United States also said it "strongly"'
-            " disagreed with the court's decision. \"As we have said repeatedly, we do not believe that Palestine is a"
-            ' state and therefore we do not believe that it is eligible to join the ICC," the State Department said in'
-            ' a statement. It urged the warring sides to resolve their differences through direct negotiations. "We'
-            ' will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace,"'
-            " it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the"
-            ' territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the'
-            " court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou"
-            ' Bensouda said her office would "conduct its analysis in full independence and impartiality." The war'
-            " between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry"
-            " will include alleged war crimes committed since June. The International Criminal Court was set up in"
-            " 2002 to prosecute genocide, crimes against humanity and war crimes."
-        )
-        EXPECTED = (
-            " The International Criminal Court (ICC) has announced that it has been announced by the International"
-            " Criminal court."
-        )
-        dct = self.tok(ARTICLE, return_tensors="tf")
-        generated_ids = model.generate(**dct, num_beams=4)
-        result = self.tok.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        assert result == EXPECTED
-
-    def test_xsum_1_1_xla_generation(self):
-        # same test as above, but with `no_repeat_ngram_size=0` (not compatible with XLA) and XLA comparison enabled
-        model = self.xsum_1_1_model
-        assert model.model.decoder.embed_tokens == model.model.shared
-        ARTICLE = (
-            "The Palestinian Authority officially became the 123rd member of the International Criminal Court on"
-            " Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The"
-            " formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based."
-            " The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its"
-            ' jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East'
-            ' Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the'
-            " situation in Palestinian territories, paving the way for possible war crimes investigations against"
-            " Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and"
-            " the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the"
-            " body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday's ceremony, said it was a"
-            ' move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the'
-            ' world is also a step closer to ending a long era of impunity and injustice," he said, according to an'
-            ' ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge'
-            " Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the"
-            ' Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine'
-            " acquires all the rights as well as responsibilities that come with being a State Party to the Statute."
-            ' These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights'
-            ' Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should'
-            " immediately end their pressure, and countries that support universal acceptance of the court's treaty"
-            ' should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the'
-            " group. \"What's objectionable is the attempts to undermine international justice, not Palestine's"
-            ' decision to join a treaty to which over 100 countries around the world are members." In January, when'
-            " the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an"
-            ' outrage, saying the court was overstepping its boundaries. The United States also said it "strongly"'
-            " disagreed with the court's decision. \"As we have said repeatedly, we do not believe that Palestine is a"
-            ' state and therefore we do not believe that it is eligible to join the ICC," the State Department said in'
-            ' a statement. It urged the warring sides to resolve their differences through direct negotiations. "We'
-            ' will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace,"'
-            " it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the"
-            ' territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the'
-            " court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou"
-            ' Bensouda said her office would "conduct its analysis in full independence and impartiality." The war'
-            " between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry"
-            " will include alleged war crimes committed since June. The International Criminal Court was set up in"
-            " 2002 to prosecute genocide, crimes against humanity and war crimes."
-        )
-        EXPECTED = (
-            " The International Criminal Court (ICC) has announced that it is to be investigated by the International"
-            " Criminal Court (ICC) over allegations of war crimes."
-        )
-
-        dct = self.tok(ARTICLE, return_tensors="tf")
-        generated_ids = model.generate(**dct, num_beams=4, no_repeat_ngram_size=0)
-        result = self.tok.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        assert result == EXPECTED
-
-        xla_generate = tf.function(model.generate, jit_compile=True)
-        generated_ids = xla_generate(**dct, num_beams=4, no_repeat_ngram_size=0)
-        result = self.tok.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        assert result == EXPECTED
-
-    def test_xsum_1_1_batch_generation(self):
-        batch = self.tok(
-            [
-                "The Palestinian Authority officially became the 123rd member of the International Criminal Court on"
-                " Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories."
-                " The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is"
-                " based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted"
-                ' its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including'
-                ' East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination'
-                " into the situation in Palestinian territories, paving the way for possible war crimes investigations"
-                " against Israelis. As members of the court, Palestinians may be subject to counter-charges as well."
-                " Israel and the United States, neither of which is an ICC member, opposed the Palestinians' efforts"
-                " to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday's ceremony,"
-                ' said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome'
-                ' Statute today, the world is also a step closer to ending a long era of impunity and injustice," he'
-                ' said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of'
-                ' justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was'
-                ' just the first step for the Palestinians. "As the Rome Statute today enters into force for the State'
-                " of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a"
-                ' State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she'
-                ' said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize'
-                " Palestine for joining the ICC should immediately end their pressure, and countries that support"
-                " universal acceptance of the court's treaty should speak out to welcome its membership,\" said"
-                " Balkees Jarrah, international justice counsel for the group. \"What's objectionable is the attempts"
-                " to undermine international justice, not Palestine's decision to join a treaty to which over 100"
-                ' countries around the world are members." In January, when the preliminary ICC examination was'
-                " opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was"
-                ' overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s'
-                ' decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we'
-                ' do not believe that it is eligible to join the ICC," the State Department said in a statement. It'
-                ' urged the warring sides to resolve their differences through direct negotiations. "We will continue'
-                ' to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said.'
-                " But the ICC begs to differ with the definition of a state for its purposes and refers to the"
-                ' territories as "Palestine." While a preliminary examination is not a formal investigation, it allows'
-                " the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor"
-                ' Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality."'
-                " The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The"
-                " inquiry will include alleged war crimes committed since June. The International Criminal Court was"
-                " set up in 2002 to prosecute genocide, crimes against humanity and war crimes.",
-                "The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted"
-                " Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor"
-                ' Brice Robin told CNN that "so far no videos were used in the crash investigation." He added, "A'
-                " person who has such a video needs to immediately give it to the investigators.\" Robin's comments"
-                " follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video"
-                " showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the"
-                " French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was"
-                " recovered from a phone at the wreckage site. The two publications described the supposed video, but"
-                " did not post it on their websites. The publications said that they watched the video, which was"
-                " found by a source close to the investigation. \"One can hear cries of 'My God' in several"
-                ' languages," Paris Match reported. "Metallic banging can also be heard more than three times, perhaps'
-                " of the pilot trying to open the cockpit door with a heavy object.  Towards the end, after a heavy"
-                ' shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing'
-                " scene,\" said Julian Reichelt, editor-in-chief of Bild online. An official with France's accident"
-                " investigation agency, the BEA, said the agency is not aware of any such video. Lt. Col. Jean-Marc"
-                " Menichini, a French Gendarmerie spokesman in charge of communications on rescue efforts around the"
-                ' Germanwings crash site, told CNN that the reports were "completely wrong" and "unwarranted." Cell'
-                ' phones have been collected at the site, he said, but that they "hadn\'t been exploited yet."'
-                " Menichini said he believed the cell phones would need to be sent to the Criminal Research Institute"
-                " in Rosny sous-Bois, near Paris, in order to be analyzed by specialized technicians working"
-                " hand-in-hand with investigators. But none of the cell phones found so far have been sent to the"
-                " institute, Menichini said. Asked whether staff involved in the search could have leaked a memory"
-                ' card to the media, Menichini answered with a categorical "no." Reichelt told "Erin Burnett:'
-                ' Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match are'
-                ' "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered'
-                ' cell phones from the crash site after Bild and Paris Match published their reports. "That is'
-                " something we did not know before. ... Overall we can say many things of the investigation weren't"
-                ' revealed by the investigation at the beginning," he said. What was mental state of Germanwings'
-                " co-pilot? German airline Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled"
-                " depression years before he took the controls of Germanwings Flight 9525, which he's accused of"
-                " deliberately crashing last week in the French Alps. Lubitz told his Lufthansa flight training school"
-                ' in 2009 that he had a "previous episode of severe depression," the airline said Tuesday. Email'
-                " correspondence between Lubitz and the school discovered in an internal investigation, Lufthansa"
-                " said, included medical documents he submitted in connection with resuming his flight training. The"
-                " announcement indicates that Lufthansa, the parent company of Germanwings, knew of Lubitz's battle"
-                " with depression, allowed him to continue training and ultimately put him in the cockpit. Lufthansa,"
-                " whose CEO Carsten Spohr previously said Lubitz was 100% fit to fly, described its statement Tuesday"
-                ' as a "swift and seamless clarification" and said it was sharing the information and documents --'
-                " including training and medical records -- with public prosecutors. Spohr traveled to the crash site"
-                " Wednesday, where recovery teams have been working for the past week to recover human remains and"
-                " plane debris scattered across a steep mountainside. He saw the crisis center set up in"
-                " Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash site, where grieving"
-                " families have left flowers at a simple stone memorial. Menichini told CNN late Tuesday that no"
-                " visible human remains were left at the site but recovery teams would keep searching. French"
-                " President Francois Hollande, speaking Tuesday, said that it should be possible to identify all the"
-                " victims using DNA analysis by the end of the week, sooner than authorities had previously suggested."
-                " In the meantime, the recovery of the victims' personal belongings will start Wednesday, Menichini"
-                " said. Among those personal belongings could be more cell phones belonging to the 144 passengers and"
-                " six crew on board. Check out the latest from our correspondents . The details about Lubitz's"
-                " correspondence with the flight school during his training were among several developments as"
-                " investigators continued to delve into what caused the crash and Lubitz's possible motive for"
-                " downing the jet. A Lufthansa spokesperson told CNN on Tuesday that Lubitz had a valid medical"
-                ' certificate, had passed all his examinations and "held all the licenses required." Earlier, a'
-                " spokesman for the prosecutor's office in Dusseldorf, Christoph Kumpa, said medical records reveal"
-                " Lubitz suffered from suicidal tendencies at some point before his aviation career and underwent"
-                " psychotherapy before he got his pilot's license. Kumpa emphasized there's no evidence suggesting"
-                " Lubitz was suicidal or acting aggressively before the crash. Investigators are looking into whether"
-                " Lubitz feared his medical condition would cause him to lose his pilot's license, a European"
-                ' government official briefed on the investigation told CNN on Tuesday. While flying was "a big part'
-                " of his life,\" the source said, it's only one theory being considered. Another source, a law"
-                " enforcement official briefed on the investigation, also told CNN that authorities believe the"
-                " primary motive for Lubitz to bring down the plane was that he feared he would not be allowed to fly"
-                " because of his medical problems. Lubitz's girlfriend told investigators he had seen an eye doctor"
-                " and a neuropsychologist, both of whom deemed him unfit to work recently and concluded he had"
-                " psychological issues, the European government official said. But no matter what details emerge about"
-                " his previous mental health struggles, there's more to the story, said Brian Russell, a forensic"
-                ' psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the'
-                " fact that maybe they weren't going to keep doing their job and they're upset about that and so"
-                ' they\'re suicidal," he said. "But there is no mental illness that explains why somebody then feels'
-                " entitled to also take that rage and turn it outward on 149 other people who had nothing to do with"
-                " the person's problems.\" Germanwings crash compensation: What we know . Who was the captain of"
-                " Germanwings Flight 9525? CNN's Margot Haddad reported from Marseille and Pamela Brown from"
-                " Dusseldorf, while Laura Smith-Spark wrote from London. CNN's Frederik Pleitgen, Pamela Boykoff,"
-                " Antonia Mortensen, Sandrine Amiel and Anna-Maja Rappard contributed to this report.",
-            ],
-            return_tensors="tf",
-            padding="longest",
-            truncation=True,
-        )
-        generated_ids = self.xsum_1_1_model.generate(**batch, num_beams=4)
-        result = self.tok.batch_decode(generated_ids, skip_special_tokens=True)
-        assert (
-            result[0]
-            == " The International Criminal Court (ICC) has announced that it has been announced by the International"
-            " Criminal court."
-        )
-        assert (
-            result[1]
-            == " An investigation into the crash that killed at least 10 people in the French capital has been"
-            " released by the French police investigating the crash."
-        )
-
-    def test_encoder_equiv(self):
-        batch = self.tok(
-            [
-                "The Palestinian Authority officially became the 123rd member of the International Criminal Court on"
-                " Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories."
-                " The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is"
-                " based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted"
-                ' its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including'
-                ' East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination'
-                " into the situation in Palestinian territories, paving the way for possible war crimes investigations"
-                " against Israelis. As members of the court, Palestinians may be subject to counter-charges as well."
-                " Israel and the United States, neither of which is an ICC member, opposed the Palestinians' efforts"
-                " to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday's ceremony,"
-                ' said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome'
-                ' Statute today, the world is also a step closer to ending a long era of impunity and injustice," he'
-                ' said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of'
-                ' justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was'
-                ' just the first step for the Palestinians. "As the Rome Statute today enters into force for the State'
-                " of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a"
-                ' State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she'
-                ' said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize'
-                " Palestine for joining the ICC should immediately end their pressure, and countries that support"
-                " universal acceptance of the court's treaty should speak out to welcome its membership,\" said"
-                " Balkees Jarrah, international justice counsel for the group. \"What's objectionable is the attempts"
-                " to undermine international justice, not Palestine's decision to join a treaty to which over 100"
-                ' countries around the world are members." In January, when the preliminary ICC examination was'
-                " opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was"
-                ' overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s'
-                ' decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we'
-                ' do not believe that it is eligible to join the ICC," the State Department said in a statement. It'
-                ' urged the warring sides to resolve their differences through direct negotiations. "We will continue'
-                ' to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said.'
-                " But the ICC begs to differ with the definition of a state for its purposes and refers to the"
-                ' territories as "Palestine." While a preliminary examination is not a formal investigation, it allows'
-                " the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor"
-                ' Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality."'
-                " The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The"
-                " inquiry will include alleged war crimes committed since June. The International Criminal Court was"
-                " set up in 2002 to prosecute genocide, crimes against humanity and war crimes.",
-                "The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted"
-                " Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor"
-                ' Brice Robin told CNN that "so far no videos were used in the crash investigation." He added, "A'
-                " person who has such a video needs to immediately give it to the investigators.\" Robin's comments"
-                " follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video"
-                " showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the"
-                " French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was"
-                " recovered from a phone at the wreckage site. The two publications described the supposed video, but"
-                " did not post it on their websites. The publications said that they watched the video, which was"
-                " found by a source close to the investigation. \"One can hear cries of 'My God' in several"
-                ' languages," Paris Match reported. "Metallic banging can also be heard more than three times, perhaps'
-                " of the pilot trying to open the cockpit door with a heavy object.  Towards the end, after a heavy"
-                ' shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing'
-                " scene,\" said Julian Reichelt, editor-in-chief of Bild online. An official with France's accident"
-                " investigation agency, the BEA, said the agency is not aware of any such video. Lt. Col. Jean-Marc"
-                " Menichini, a French Gendarmerie spokesman in charge of communications on rescue efforts around the"
-                ' Germanwings crash site, told CNN that the reports were "completely wrong" and "unwarranted." Cell'
-                ' phones have been collected at the site, he said, but that they "hadn\'t been exploited yet."'
-                " Menichini said he believed the cell phones would need to be sent to the Criminal Research Institute"
-                " in Rosny sous-Bois, near Paris, in order to be analyzed by specialized technicians working"
-                " hand-in-hand with investigators. But none of the cell phones found so far have been sent to the"
-                " institute, Menichini said. Asked whether staff involved in the search could have leaked a memory"
-                ' card to the media, Menichini answered with a categorical "no." Reichelt told "Erin Burnett:'
-                ' Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match are'
-                ' "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered'
-                ' cell phones from the crash site after Bild and Paris Match published their reports. "That is'
-                " something we did not know before. ... Overall we can say many things of the investigation weren't"
-                ' revealed by the investigation at the beginning," he said. What was mental state of Germanwings'
-                " co-pilot? German airline Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled"
-                " depression years before he took the controls of Germanwings Flight 9525, which he's accused of"
-                " deliberately crashing last week in the French Alps. Lubitz told his Lufthansa flight training school"
-                ' in 2009 that he had a "previous episode of severe depression," the airline said Tuesday. Email'
-                " correspondence between Lubitz and the school discovered in an internal investigation, Lufthansa"
-                " said, included medical documents he submitted in connection with resuming his flight training. The"
-                " announcement indicates that Lufthansa, the parent company of Germanwings, knew of Lubitz's battle"
-                " with depression, allowed him to continue training and ultimately put him in the cockpit. Lufthansa,"
-                " whose CEO Carsten Spohr previously said Lubitz was 100% fit to fly, described its statement Tuesday"
-                ' as a "swift and seamless clarification" and said it was sharing the information and documents --'
-                " including training and medical records -- with public prosecutors. Spohr traveled to the crash site"
-                " Wednesday, where recovery teams have been working for the past week to recover human remains and"
-                " plane debris scattered across a steep mountainside. He saw the crisis center set up in"
-                " Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash site, where grieving"
-                " families have left flowers at a simple stone memorial. Menichini told CNN late Tuesday that no"
-                " visible human remains were left at the site but recovery teams would keep searching. French"
-                " President Francois Hollande, speaking Tuesday, said that it should be possible to identify all the"
-                " victims using DNA analysis by the end of the week, sooner than authorities had previously suggested."
-                " In the meantime, the recovery of the victims' personal belongings will start Wednesday, Menichini"
-                " said. Among those personal belongings could be more cell phones belonging to the 144 passengers and"
-                " six crew on board. Check out the latest from our correspondents . The details about Lubitz's"
-                " correspondence with the flight school during his training were among several developments as"
-                " investigators continued to delve into what caused the crash and Lubitz's possible motive for"
-                " downing the jet. A Lufthansa spokesperson told CNN on Tuesday that Lubitz had a valid medical"
-                ' certificate, had passed all his examinations and "held all the licenses required." Earlier, a'
-                " spokesman for the prosecutor's office in Dusseldorf, Christoph Kumpa, said medical records reveal"
-                " Lubitz suffered from suicidal tendencies at some point before his aviation career and underwent"
-                " psychotherapy before he got his pilot's license. Kumpa emphasized there's no evidence suggesting"
-                " Lubitz was suicidal or acting aggressively before the crash. Investigators are looking into whether"
-                " Lubitz feared his medical condition would cause him to lose his pilot's license, a European"
-                ' government official briefed on the investigation told CNN on Tuesday. While flying was "a big part'
-                " of his life,\" the source said, it's only one theory being considered. Another source, a law"
-                " enforcement official briefed on the investigation, also told CNN that authorities believe the"
-                " primary motive for Lubitz to bring down the plane was that he feared he would not be allowed to fly"
-                " because of his medical problems. Lubitz's girlfriend told investigators he had seen an eye doctor"
-                " and a neuropsychologist, both of whom deemed him unfit to work recently and concluded he had"
-                " psychological issues, the European government official said. But no matter what details emerge about"
-                " his previous mental health struggles, there's more to the story, said Brian Russell, a forensic"
-                ' psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the'
-                " fact that maybe they weren't going to keep doing their job and they're upset about that and so"
-                ' they\'re suicidal," he said. "But there is no mental illness that explains why somebody then feels'
-                " entitled to also take that rage and turn it outward on 149 other people who had nothing to do with"
-                " the person's problems.\" Germanwings crash compensation: What we know . Who was the captain of"
-                " Germanwings Flight 9525? CNN's Margot Haddad reported from Marseille and Pamela Brown from"
-                " Dusseldorf, while Laura Smith-Spark wrote from London. CNN's Frederik Pleitgen, Pamela Boykoff,"
-                " Antonia Mortensen, Sandrine Amiel and Anna-Maja Rappard contributed to this report.",
-            ],
-            return_tensors="tf",
-            padding="longest",
-            truncation=True,
-        )
-        features = self.xsum_1_1_model.get_encoder()(**batch).last_hidden_state
-
-        expected = np.array([[-0.0828, -0.0251, -0.0674], [0.1277, 0.3311, -0.0255], [0.2613, -0.0840, -0.2763]])
-        assert np.allclose(features[0, :3, :3].numpy(), expected, atol=1e-3)
diff --git a/tests/models/beit/test_modeling_flax_beit.py b/tests/models/beit/test_modeling_flax_beit.py
deleted file mode 100644
index 2ac3668d3b..0000000000
--- a/tests/models/beit/test_modeling_flax_beit.py
+++ /dev/null
@@ -1,294 +0,0 @@
-# Copyright 2021 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-import unittest
-
-import numpy as np
-
-from transformers import BeitConfig
-from transformers.testing_utils import require_flax, require_vision, slow
-from transformers.utils import cached_property, is_flax_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor, ids_tensor
-
-
-if is_flax_available():
-    import jax
-
-    from transformers import FlaxBeitForImageClassification, FlaxBeitForMaskedImageModeling, FlaxBeitModel
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import BeitImageProcessor
-
-
-class FlaxBeitModelTester:
-    def __init__(
-        self,
-        parent,
-        vocab_size=100,
-        batch_size=13,
-        image_size=30,
-        patch_size=2,
-        num_channels=3,
-        is_training=True,
-        use_labels=True,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        type_sequence_label_size=10,
-        initializer_range=0.02,
-        num_labels=3,
-    ):
-        self.parent = parent
-        self.vocab_size = vocab_size
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-
-        # in BeiT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
-        num_patches = (image_size // patch_size) ** 2
-        self.seq_length = num_patches + 1
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        labels = None
-        if self.use_labels:
-            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-
-        config = BeitConfig(
-            vocab_size=self.vocab_size,
-            image_size=self.image_size,
-            patch_size=self.patch_size,
-            num_channels=self.num_channels,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-        )
-
-        return config, pixel_values, labels
-
-    def create_and_check_model(self, config, pixel_values, labels):
-        model = FlaxBeitModel(config=config)
-        result = model(pixel_values)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_for_masked_lm(self, config, pixel_values, labels):
-        model = FlaxBeitForMaskedImageModeling(config=config)
-        result = model(pixel_values)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length - 1, self.vocab_size))
-
-    def create_and_check_for_image_classification(self, config, pixel_values, labels):
-        config.num_labels = self.type_sequence_label_size
-        model = FlaxBeitForImageClassification(config=config)
-        result = model(pixel_values)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-        # test greyscale images
-        config.num_channels = 1
-        model = FlaxBeitForImageClassification(config)
-
-        pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
-        result = model(pixel_values)
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            pixel_values,
-            labels,
-        ) = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_flax
-class FlaxBeitModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (FlaxBeitModel, FlaxBeitForImageClassification, FlaxBeitForMaskedImageModeling) if is_flax_available() else ()
-    )
-
-    def setUp(self) -> None:
-        self.model_tester = FlaxBeitModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=BeitConfig, has_text_modality=False, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    # We need to override this test because Beit's forward signature is different than text models.
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.__call__)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    # We need to override this test because Beit expects pixel_values instead of input_ids
-    def test_jit_compilation(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                model = model_class(config)
-
-                @jax.jit
-                def model_jitted(pixel_values, **kwargs):
-                    return model(pixel_values=pixel_values, **kwargs)
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = model_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = model_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("microsoft/beit-base-patch16-224")
-            outputs = model(np.ones((1, 3, 224, 224)))
-            self.assertIsNotNone(outputs)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_vision
-@require_flax
-class FlaxBeitModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return BeitImageProcessor.from_pretrained("microsoft/beit-base-patch16-224") if is_vision_available() else None
-
-    @slow
-    def test_inference_masked_image_modeling_head(self):
-        model = FlaxBeitForMaskedImageModeling.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        pixel_values = image_processor(images=image, return_tensors="np").pixel_values
-
-        # prepare bool_masked_pos
-        bool_masked_pos = np.ones((1, 196), dtype=bool)
-
-        # forward pass
-        outputs = model(pixel_values=pixel_values, bool_masked_pos=bool_masked_pos)
-        logits = outputs.logits
-
-        # verify the logits
-        expected_shape = (1, 196, 8192)
-        self.assertEqual(logits.shape, expected_shape)
-
-        expected_slice = np.array(
-            [[-3.2437, 0.5072, -13.9174], [-3.2456, 0.4948, -13.9401], [-3.2033, 0.5121, -13.8550]]
-        )
-
-        self.assertTrue(np.allclose(logits[bool_masked_pos][:3, :3], expected_slice, atol=1e-2))
-
-    @slow
-    def test_inference_image_classification_head_imagenet_1k(self):
-        model = FlaxBeitForImageClassification.from_pretrained("microsoft/beit-base-patch16-224")
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="np")
-
-        # forward pass
-        outputs = model(**inputs)
-        logits = outputs.logits
-
-        # verify the logits
-        expected_shape = (1, 1000)
-        self.assertEqual(logits.shape, expected_shape)
-
-        expected_slice = np.array([-1.2385, -1.0987, -1.0108])
-
-        self.assertTrue(np.allclose(logits[0, :3], expected_slice, atol=1e-4))
-
-        expected_class_idx = 281
-        self.assertEqual(logits.argmax(-1).item(), expected_class_idx)
-
-    @slow
-    def test_inference_image_classification_head_imagenet_22k(self):
-        model = FlaxBeitForImageClassification.from_pretrained("microsoft/beit-large-patch16-224-pt22k-ft22k")
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="np")
-
-        # forward pass
-        outputs = model(**inputs)
-        logits = outputs.logits
-
-        # verify the logits
-        expected_shape = (1, 21841)
-        self.assertEqual(logits.shape, expected_shape)
-
-        expected_slice = np.array([1.6881, -0.2787, 0.5901])
-
-        self.assertTrue(np.allclose(logits[0, :3], expected_slice, atol=1e-4))
-
-        expected_class_idx = 2396
-        self.assertEqual(logits.argmax(-1).item(), expected_class_idx)
diff --git a/tests/models/bert/test_modeling_flax_bert.py b/tests/models/bert/test_modeling_flax_bert.py
deleted file mode 100644
index 72d5c951e6..0000000000
--- a/tests/models/bert/test_modeling_flax_bert.py
+++ /dev/null
@@ -1,163 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-
-from transformers import BertConfig, is_flax_available
-from transformers.testing_utils import require_flax, slow
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-
-
-if is_flax_available():
-    from transformers.models.bert.modeling_flax_bert import (
-        FlaxBertForMaskedLM,
-        FlaxBertForMultipleChoice,
-        FlaxBertForNextSentencePrediction,
-        FlaxBertForPreTraining,
-        FlaxBertForQuestionAnswering,
-        FlaxBertForSequenceClassification,
-        FlaxBertForTokenClassification,
-        FlaxBertModel,
-    )
-
-
-class FlaxBertModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_attention_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_choices=4,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_attention_mask = use_attention_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.num_choices = num_choices
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        attention_mask = None
-        if self.use_attention_mask:
-            attention_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        config = BertConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-        )
-
-        return config, input_ids, token_type_ids, attention_mask
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, token_type_ids, attention_mask = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask}
-        return config, inputs_dict
-
-    def prepare_config_and_inputs_for_decoder(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, token_type_ids, attention_mask = config_and_inputs
-
-        config.is_decoder = True
-        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
-        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-        return (
-            config,
-            input_ids,
-            attention_mask,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-
-@require_flax
-class FlaxBertModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    test_head_masking = True
-
-    all_model_classes = (
-        (
-            FlaxBertModel,
-            FlaxBertForPreTraining,
-            FlaxBertForMaskedLM,
-            FlaxBertForMultipleChoice,
-            FlaxBertForQuestionAnswering,
-            FlaxBertForNextSentencePrediction,
-            FlaxBertForSequenceClassification,
-            FlaxBertForTokenClassification,
-            FlaxBertForQuestionAnswering,
-        )
-        if is_flax_available()
-        else ()
-    )
-
-    def setUp(self):
-        self.model_tester = FlaxBertModelTester(self)
-
-    @slow
-    def test_model_from_pretrained(self):
-        # Only check this for base model, not necessary for all model classes.
-        # This will also help speed-up tests.
-        model = FlaxBertModel.from_pretrained("google-bert/bert-base-cased")
-        outputs = model(np.ones((1, 1)))
-        self.assertIsNotNone(outputs)
diff --git a/tests/models/bert/test_modeling_tf_bert.py b/tests/models/bert/test_modeling_tf_bert.py
deleted file mode 100644
index b9fbdc9d43..0000000000
--- a/tests/models/bert/test_modeling_tf_bert.py
+++ /dev/null
@@ -1,764 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import BertConfig, is_tf_available
-from transformers.models.auto import get_values
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-from ...utils.test_modeling_tf_core import TFCoreModelTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import TF_MODEL_FOR_PRETRAINING_MAPPING
-    from transformers.models.bert.modeling_tf_bert import (
-        TFBertForMaskedLM,
-        TFBertForMultipleChoice,
-        TFBertForNextSentencePrediction,
-        TFBertForPreTraining,
-        TFBertForQuestionAnswering,
-        TFBertForSequenceClassification,
-        TFBertForTokenClassification,
-        TFBertLMHeadModel,
-        TFBertModel,
-    )
-
-
-class TFBertModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_labels=3,
-        num_choices=4,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.is_training = True
-        self.use_input_mask = True
-        self.use_token_type_ids = True
-        self.use_labels = True
-        self.vocab_size = 99
-        self.hidden_size = 32
-        self.num_hidden_layers = 2
-        self.num_attention_heads = 4
-        self.intermediate_size = 37
-        self.hidden_act = "gelu"
-        self.hidden_dropout_prob = 0.1
-        self.attention_probs_dropout_prob = 0.1
-        self.max_position_embeddings = 512
-        self.type_vocab_size = 16
-        self.type_sequence_label_size = 2
-        self.initializer_range = 0.02
-        self.num_labels = 3
-        self.num_choices = 4
-        self.scope = None
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = BertConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            initializer_range=self.initializer_range,
-        )
-
-        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def prepare_config_and_inputs_for_decoder(self):
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = self.prepare_config_and_inputs()
-
-        config.is_decoder = True
-        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
-        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-        return (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-    def create_and_check_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFBertModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
-    def create_and_check_causal_lm_base_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.is_decoder = True
-
-        model = TFBertModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
-    def create_and_check_model_as_decoder(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-
-        model = TFBertModel(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-            "encoder_hidden_states": encoder_hidden_states,
-            "encoder_attention_mask": encoder_attention_mask,
-        }
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
-
-        # Also check the case where encoder outputs are not passed
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
-    def create_and_check_causal_lm_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.is_decoder = True
-
-        model = TFBertLMHeadModel(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        prediction_scores = model(inputs)["logits"]
-        self.parent.assertListEqual(
-            list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size]
-        )
-
-    def create_and_check_causal_lm_model_as_decoder(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-
-        model = TFBertLMHeadModel(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-            "encoder_hidden_states": encoder_hidden_states,
-            "encoder_attention_mask": encoder_attention_mask,
-        }
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
-
-        prediction_scores = result["logits"]
-        self.parent.assertListEqual(
-            list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size]
-        )
-
-    def create_and_check_causal_lm_model_past(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.is_decoder = True
-
-        model = TFBertLMHeadModel(config=config)
-
-        # first forward pass
-        outputs = model(input_ids, use_cache=True)
-        outputs_use_cache_conf = model(input_ids)
-        outputs_no_past = model(input_ids, use_cache=False)
-
-        self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
-        self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
-
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-        # append to next input_ids and attn_mask
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-
-        output_from_no_past = model(next_input_ids, output_hidden_states=True).hidden_states[0]
-        output_from_past = model(
-            next_tokens, past_key_values=past_key_values, output_hidden_states=True
-        ).hidden_states[0]
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
-        output_from_past_slice = output_from_past[:, 0, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)
-
-    def create_and_check_causal_lm_model_past_with_attn_mask(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.is_decoder = True
-
-        model = TFBertLMHeadModel(config=config)
-
-        # create attention mask
-        half_seq_length = self.seq_length // 2
-        attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32)
-        attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32)
-        attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1)
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=attn_mask, use_cache=True)
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-        past_key_values = outputs.past_key_values
-
-        # change a random masked slice from input_ids
-        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1
-        random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size)
-        vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change)
-        condition = tf.transpose(
-            tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size))
-        )
-        input_ids = tf.where(condition, random_other_next_tokens, input_ids)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        attn_mask = tf.concat(
-            [attn_mask, tf.ones((attn_mask.shape[0], 1), dtype=tf.int32)],
-            axis=1,
-        )
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=attn_mask,
-            output_hidden_states=True,
-        ).hidden_states[0]
-        output_from_past = model(
-            next_tokens, past_key_values=past_key_values, attention_mask=attn_mask, output_hidden_states=True
-        ).hidden_states[0]
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
-        output_from_past_slice = output_from_past[:, 0, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)
-
-    def create_and_check_causal_lm_model_past_large_inputs(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.is_decoder = True
-
-        model = TFBertLMHeadModel(config=config)
-
-        input_ids = input_ids[:1, :]
-        input_mask = input_mask[:1, :]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=input_mask, use_cache=True)
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=next_attention_mask,
-            output_hidden_states=True,
-        ).hidden_states[0]
-        output_from_past = model(
-            next_tokens,
-            attention_mask=next_attention_mask,
-            past_key_values=past_key_values,
-            output_hidden_states=True,
-        ).hidden_states[0]
-
-        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-    def create_and_check_decoder_model_past_large_inputs(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-
-        model = TFBertLMHeadModel(config=config)
-
-        input_ids = input_ids[:1, :]
-        input_mask = input_mask[:1, :]
-        encoder_hidden_states = encoder_hidden_states[:1, :, :]
-        encoder_attention_mask = encoder_attention_mask[:1, :]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(
-            input_ids,
-            attention_mask=input_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            use_cache=True,
-        )
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=next_attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            output_hidden_states=True,
-        ).hidden_states[0]
-        output_from_past = model(
-            next_tokens,
-            attention_mask=next_attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            past_key_values=past_key_values,
-            output_hidden_states=True,
-        ).hidden_states[0]
-
-        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-    def create_and_check_for_masked_lm(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFBertForMaskedLM(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_for_next_sequence_prediction(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFBertForNextSentencePrediction(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, 2))
-
-    def create_and_check_for_pretraining(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFBertForPreTraining(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-        self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2))
-
-    def create_and_check_for_sequence_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFBertForSequenceClassification(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_for_multiple_choice(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_choices = self.num_choices
-        model = TFBertForMultipleChoice(config=config)
-        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
-        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
-        inputs = {
-            "input_ids": multiple_choice_inputs_ids,
-            "attention_mask": multiple_choice_input_mask,
-            "token_type_ids": multiple_choice_token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-    def create_and_check_for_token_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFBertForTokenClassification(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def create_and_check_for_question_answering(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFBertForQuestionAnswering(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-
-        result = model(inputs)
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class TFBertModelTest(TFModelTesterMixin, TFCoreModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFBertModel,
-            TFBertForMaskedLM,
-            TFBertLMHeadModel,
-            TFBertForNextSentencePrediction,
-            TFBertForPreTraining,
-            TFBertForQuestionAnswering,
-            TFBertForSequenceClassification,
-            TFBertForTokenClassification,
-            TFBertForMultipleChoice,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFBertModel,
-            "fill-mask": TFBertForMaskedLM,
-            "question-answering": TFBertForQuestionAnswering,
-            "text-classification": TFBertForSequenceClassification,
-            "text-generation": TFBertLMHeadModel,
-            "token-classification": TFBertForTokenClassification,
-            "zero-shot": TFBertForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_head_masking = False
-    test_onnx = True
-    onnx_min_opset = 10
-
-    # special case for ForPreTraining model
-    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
-        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
-
-        if return_labels:
-            if model_class in get_values(TF_MODEL_FOR_PRETRAINING_MAPPING):
-                inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-
-        return inputs_dict
-
-    def setUp(self):
-        self.model_tester = TFBertModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        """Test the base model"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_causal_lm_base_model(self):
-        """Test the base model of the causal LM model
-
-        is_decoder=True, no cross_attention, no encoder outputs
-        """
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_base_model(*config_and_inputs)
-
-    def test_model_as_decoder(self):
-        """Test the base model as a decoder (of an encoder-decoder architecture)
-
-        is_decoder=True + cross_attention + pass encoder outputs
-        """
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_causal_lm(self):
-        """Test the causal LM model"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model(*config_and_inputs)
-
-    def test_causal_lm_model_as_decoder(self):
-        """Test the causal LM model as a decoder"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_causal_lm_model_as_decoder(*config_and_inputs)
-
-    def test_causal_lm_model_past(self):
-        """Test causal LM model with `past_key_values`"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model_past(*config_and_inputs)
-
-    def test_causal_lm_model_past_with_attn_mask(self):
-        """Test the causal LM model with `past_key_values` and `attention_mask`"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model_past_with_attn_mask(*config_and_inputs)
-
-    def test_causal_lm_model_past_with_large_inputs(self):
-        """Test the causal LM model with `past_key_values` and a longer decoder sequence length"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model_past_large_inputs(*config_and_inputs)
-
-    def test_decoder_model_past_with_large_inputs(self):
-        """Similar to `test_causal_lm_model_past_with_large_inputs` but with cross-attention"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
-    def test_for_next_sequence_prediction(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_next_sequence_prediction(*config_and_inputs)
-
-    def test_for_pretraining(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_pretraining(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
-    def test_model_from_pretrained(self):
-        model = TFBertModel.from_pretrained("jplu/tiny-tf-bert-random")
-        self.assertIsNotNone(model)
-
-    def test_custom_load_tf_weights(self):
-        model, output_loading_info = TFBertForTokenClassification.from_pretrained(
-            "jplu/tiny-tf-bert-random", output_loading_info=True
-        )
-        self.assertEqual(sorted(output_loading_info["unexpected_keys"]), [])
-        for layer in output_loading_info["missing_keys"]:
-            self.assertTrue(layer.split("_")[0] in ["dropout", "classifier"])
-
-    # TODO (Joao): fix me
-    @unittest.skip("Onnx compliance broke with TF 2.10")
-    def test_onnx_compliancy(self):
-        pass
-
-
-@require_tf
-class TFBertModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_masked_lm(self):
-        model = TFBertForPreTraining.from_pretrained("lysandre/tiny-bert-random")
-        input_ids = tf.constant([[0, 1, 2, 3, 4, 5]])
-        output = model(input_ids)[0]
-
-        expected_shape = [1, 6, 32000]
-        self.assertEqual(output.shape, expected_shape)
-
-        print(output[:, :3, :3])
-
-        expected_slice = tf.constant(
-            [
-                [
-                    [-0.05243197, -0.04498899, 0.05512108],
-                    [-0.07444685, -0.01064632, 0.04352357],
-                    [-0.05020351, 0.05530146, 0.00700043],
-                ]
-            ]
-        )
-        tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4)
diff --git a/tests/models/big_bird/test_modeling_flax_big_bird.py b/tests/models/big_bird/test_modeling_flax_big_bird.py
deleted file mode 100644
index fe1790bf75..0000000000
--- a/tests/models/big_bird/test_modeling_flax_big_bird.py
+++ /dev/null
@@ -1,214 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-from transformers import BigBirdConfig, is_flax_available
-from transformers.testing_utils import require_flax, slow
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask
-
-
-if is_flax_available():
-    import jax
-
-    from transformers.models.big_bird.modeling_flax_big_bird import (
-        FlaxBigBirdForCausalLM,
-        FlaxBigBirdForMaskedLM,
-        FlaxBigBirdForMultipleChoice,
-        FlaxBigBirdForPreTraining,
-        FlaxBigBirdForQuestionAnswering,
-        FlaxBigBirdForSequenceClassification,
-        FlaxBigBirdForTokenClassification,
-        FlaxBigBirdModel,
-    )
-
-
-class FlaxBigBirdModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=2,
-        seq_length=56,
-        is_training=True,
-        use_attention_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=2,
-        intermediate_size=7,
-        hidden_act="gelu_new",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_choices=4,
-        attention_type="block_sparse",
-        use_bias=True,
-        rescale_embeddings=False,
-        block_size=2,
-        num_random_blocks=3,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_attention_mask = use_attention_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.num_choices = num_choices
-
-        self.rescale_embeddings = rescale_embeddings
-        self.attention_type = attention_type
-        self.use_bias = use_bias
-        self.block_size = block_size
-        self.num_random_blocks = num_random_blocks
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        attention_mask = None
-        if self.use_attention_mask:
-            attention_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        config = BigBirdConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-            attention_type=self.attention_type,
-            block_size=self.block_size,
-            num_random_blocks=self.num_random_blocks,
-            use_bias=self.use_bias,
-            rescale_embeddings=self.rescale_embeddings,
-        )
-
-        return config, input_ids, token_type_ids, attention_mask
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, token_type_ids, attention_mask = config_and_inputs
-        inputs_dict = {
-            "input_ids": input_ids,
-            "token_type_ids": token_type_ids,
-            "attention_mask": attention_mask,
-        }
-        return config, inputs_dict
-
-
-@require_flax
-class FlaxBigBirdModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            FlaxBigBirdForCausalLM,
-            FlaxBigBirdModel,
-            FlaxBigBirdForPreTraining,
-            FlaxBigBirdForMaskedLM,
-            FlaxBigBirdForMultipleChoice,
-            FlaxBigBirdForQuestionAnswering,
-            FlaxBigBirdForSequenceClassification,
-            FlaxBigBirdForTokenClassification,
-        )
-        if is_flax_available()
-        else ()
-    )
-
-    test_attn_probs = False
-    test_mismatched_shapes = False
-
-    def setUp(self):
-        self.model_tester = FlaxBigBirdModelTester(self)
-
-    @slow
-    # copied from `test_modeling_flax_common` because it takes much longer than other models
-    def test_from_pretrained_save_pretrained(self):
-        super().test_from_pretrained_save_pretrained()
-
-    @slow
-    # copied from `test_modeling_flax_common` because it takes much longer than other models
-    def test_from_pretrained_with_no_automatic_init(self):
-        super().test_from_pretrained_with_no_automatic_init()
-
-    @slow
-    # copied from `test_modeling_flax_common` because it takes much longer than other models
-    def test_no_automatic_init(self):
-        super().test_no_automatic_init()
-
-    @slow
-    # copied from `test_modeling_flax_common` because it takes much longer than other models
-    def test_hidden_states_output(self):
-        super().test_hidden_states_output()
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("google/bigbird-roberta-base")
-            self.assertIsNotNone(model)
-
-    def test_attention_outputs(self):
-        if self.test_attn_probs:
-            super().test_attention_outputs()
-
-    @slow
-    # copied from `test_modeling_flax_common` because it takes much longer than other models
-    def test_jit_compilation(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                model = model_class(config)
-
-                @jax.jit
-                def model_jitted(input_ids, attention_mask=None, **kwargs):
-                    return model(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = model_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = model_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
diff --git a/tests/models/blenderbot/test_modeling_flax_blenderbot.py b/tests/models/blenderbot/test_modeling_flax_blenderbot.py
deleted file mode 100644
index 1d3f77ee38..0000000000
--- a/tests/models/blenderbot/test_modeling_flax_blenderbot.py
+++ /dev/null
@@ -1,416 +0,0 @@
-# Copyright 2021 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-import timeout_decorator  # noqa
-
-from transformers import BlenderbotConfig, is_flax_available
-from transformers.testing_utils import jax_device, require_flax, slow
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor
-
-
-if is_flax_available():
-    import os
-
-    # The slow tests are often failing with OOM error on GPU
-    # This makes JAX allocate exactly what is needed on demand, and deallocate memory that is no longer needed
-    # but will be slower as stated here https://jax.readthedocs.io/en/latest/gpu_memory_allocation.html
-    os.environ["XLA_PYTHON_CLIENT_ALLOCATOR"] = "platform"
-
-    import jax
-    import jax.numpy as jnp
-
-    from transformers import BlenderbotTokenizer
-    from transformers.models.blenderbot.modeling_flax_blenderbot import (
-        FlaxBlenderbotForConditionalGeneration,
-        FlaxBlenderbotModel,
-        shift_tokens_right,
-    )
-
-
-def prepare_blenderbot_inputs_dict(
-    config,
-    input_ids,
-    decoder_input_ids=None,
-    attention_mask=None,
-    decoder_attention_mask=None,
-    head_mask=None,
-    decoder_head_mask=None,
-    cross_attn_head_mask=None,
-):
-    if attention_mask is None:
-        attention_mask = np.where(input_ids != config.pad_token_id, 1, 0)
-    if decoder_attention_mask is None:
-        decoder_attention_mask = np.where(decoder_input_ids != config.pad_token_id, 1, 0)
-    if head_mask is None:
-        head_mask = np.ones((config.encoder_layers, config.encoder_attention_heads))
-    if decoder_head_mask is None:
-        decoder_head_mask = np.ones((config.decoder_layers, config.decoder_attention_heads))
-    if cross_attn_head_mask is None:
-        cross_attn_head_mask = np.ones((config.decoder_layers, config.decoder_attention_heads))
-    return {
-        "input_ids": input_ids,
-        "decoder_input_ids": decoder_input_ids,
-        "attention_mask": attention_mask,
-        "decoder_attention_mask": attention_mask,
-    }
-
-
-class FlaxBlenderbotModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_labels=False,
-        vocab_size=99,
-        hidden_size=16,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=4,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=50,
-        eos_token_id=2,
-        pad_token_id=1,
-        bos_token_id=0,
-        initializer_range=0.02,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.initializer_range = initializer_range
-
-    def prepare_config_and_inputs(self):
-        input_ids = np.clip(ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size), 3, self.vocab_size)
-        input_ids = np.concatenate((input_ids, 2 * np.ones((self.batch_size, 1), dtype=np.int64)), -1)
-
-        decoder_input_ids = shift_tokens_right(input_ids, 1, 2)
-
-        config = BlenderbotConfig(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            encoder_layers=self.num_hidden_layers,
-            decoder_layers=self.num_hidden_layers,
-            encoder_attention_heads=self.num_attention_heads,
-            decoder_attention_heads=self.num_attention_heads,
-            encoder_ffn_dim=self.intermediate_size,
-            decoder_ffn_dim=self.intermediate_size,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            eos_token_id=self.eos_token_id,
-            bos_token_id=self.bos_token_id,
-            pad_token_id=self.pad_token_id,
-            initializer_range=self.initializer_range,
-            use_cache=False,
-        )
-        inputs_dict = prepare_blenderbot_inputs_dict(config, input_ids, decoder_input_ids)
-        return config, inputs_dict
-
-    def prepare_config_and_inputs_for_common(self):
-        config, inputs_dict = self.prepare_config_and_inputs()
-        return config, inputs_dict
-
-    def check_use_cache_forward(self, model_class_name, config, inputs_dict):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        encoder_outputs = model.encode(inputs_dict["input_ids"])
-
-        decoder_input_ids, decoder_attention_mask = (
-            inputs_dict["decoder_input_ids"],
-            inputs_dict["decoder_attention_mask"],
-        )
-
-        past_key_values = model.init_cache(decoder_input_ids.shape[0], max_decoder_length, encoder_outputs)
-        decoder_attention_mask = jnp.ones((decoder_input_ids.shape[0], max_decoder_length), dtype="i4")
-
-        decoder_position_ids = jnp.broadcast_to(
-            jnp.arange(decoder_input_ids.shape[-1] - 1)[None, :],
-            (decoder_input_ids.shape[0], decoder_input_ids.shape[-1] - 1),
-        )
-        outputs_cache = model.decode(
-            decoder_input_ids[:, :-1],
-            encoder_outputs,
-            decoder_attention_mask=decoder_attention_mask,
-            past_key_values=past_key_values,
-            decoder_position_ids=decoder_position_ids,
-        )
-
-        decoder_position_ids = jnp.array(decoder_input_ids.shape[0] * [[decoder_input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model.decode(
-            decoder_input_ids[:, -1:],
-            encoder_outputs,
-            decoder_attention_mask=decoder_attention_mask,
-            past_key_values=outputs_cache.past_key_values,
-            decoder_position_ids=decoder_position_ids,
-        )
-
-        outputs = model.decode(decoder_input_ids, encoder_outputs)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-    def check_use_cache_forward_with_attn_mask(self, model_class_name, config, inputs_dict):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        encoder_outputs = model.encode(inputs_dict["input_ids"])
-
-        decoder_input_ids, decoder_attention_mask = (
-            inputs_dict["decoder_input_ids"],
-            inputs_dict["decoder_attention_mask"],
-        )
-
-        decoder_attention_mask_cache = jnp.concatenate(
-            [
-                decoder_attention_mask,
-                jnp.zeros((decoder_attention_mask.shape[0], max_decoder_length - decoder_attention_mask.shape[1])),
-            ],
-            axis=-1,
-        )
-
-        past_key_values = model.init_cache(decoder_input_ids.shape[0], max_decoder_length, encoder_outputs)
-        decoder_position_ids = jnp.broadcast_to(
-            jnp.arange(decoder_input_ids.shape[-1] - 1)[None, :],
-            (decoder_input_ids.shape[0], decoder_input_ids.shape[-1] - 1),
-        )
-
-        outputs_cache = model.decode(
-            decoder_input_ids[:, :-1],
-            encoder_outputs,
-            decoder_attention_mask=decoder_attention_mask_cache,
-            past_key_values=past_key_values,
-            decoder_position_ids=decoder_position_ids,
-        )
-        decoder_position_ids = jnp.array(decoder_input_ids.shape[0] * [[decoder_input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model.decode(
-            decoder_input_ids[:, -1:],
-            encoder_outputs,
-            past_key_values=outputs_cache.past_key_values,
-            decoder_attention_mask=decoder_attention_mask_cache,
-            decoder_position_ids=decoder_position_ids,
-        )
-
-        outputs = model.decode(decoder_input_ids, encoder_outputs, decoder_attention_mask=decoder_attention_mask)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-
-@require_flax
-class BlenderbotHeadTests(unittest.TestCase):
-    vocab_size = 99
-
-    def _get_config_and_data(self):
-        input_ids = np.array(
-            [
-                [71, 82, 18, 33, 46, 91, 2],
-                [68, 34, 26, 58, 30, 82, 2],
-                [5, 97, 17, 39, 94, 40, 2],
-                [76, 83, 94, 25, 70, 78, 2],
-                [87, 59, 41, 35, 48, 66, 2],
-                [55, 13, 16, 58, 5, 2, 1],  # note padding
-                [64, 27, 31, 51, 12, 75, 2],
-                [52, 64, 86, 17, 83, 39, 2],
-                [48, 61, 9, 24, 71, 82, 2],
-                [26, 1, 60, 48, 22, 13, 2],
-                [21, 5, 62, 28, 14, 76, 2],
-                [45, 98, 37, 86, 59, 48, 2],
-                [70, 70, 50, 9, 28, 0, 2],
-            ],
-            dtype=np.int64,
-        )
-
-        batch_size = input_ids.shape[0]
-        config = BlenderbotConfig(
-            vocab_size=self.vocab_size,
-            d_model=24,
-            encoder_layers=2,
-            decoder_layers=2,
-            encoder_attention_heads=2,
-            decoder_attention_heads=2,
-            encoder_ffn_dim=32,
-            decoder_ffn_dim=32,
-            max_position_embeddings=48,
-            eos_token_id=2,
-            pad_token_id=1,
-            bos_token_id=0,
-        )
-        return config, input_ids, batch_size
-
-    # @timeout_decorator.timeout(1)  # not working with the decorator so far
-    def test_lm_forward(self):
-        config, input_ids, batch_size = self._get_config_and_data()
-        lm_model = FlaxBlenderbotForConditionalGeneration(config)
-        outputs = lm_model(input_ids=input_ids)
-        expected_shape = (batch_size, input_ids.shape[1], config.vocab_size)
-        self.assertEqual(outputs["logits"].shape, expected_shape)
-
-    def test_lm_uneven_forward(self):
-        config = BlenderbotConfig(
-            vocab_size=self.vocab_size,
-            d_model=14,
-            encoder_layers=2,
-            decoder_layers=2,
-            encoder_attention_heads=2,
-            decoder_attention_heads=2,
-            encoder_ffn_dim=8,
-            decoder_ffn_dim=8,
-            max_position_embeddings=48,
-        )
-        lm_model = FlaxBlenderbotForConditionalGeneration(config)
-        context = np.array([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]], dtype=np.int64)
-        summary = np.array([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]], dtype=np.int64)
-        outputs = lm_model(input_ids=context, decoder_input_ids=summary)
-        expected_shape = (*summary.shape, config.vocab_size)
-        self.assertEqual(outputs["logits"].shape, expected_shape)
-
-    def test_shift_tokens_right(self):
-        input_ids = np.array([[71, 82, 18, 33, 2, 1, 1], [68, 34, 26, 58, 30, 82, 2]], dtype=np.int64)
-        shifted = shift_tokens_right(input_ids, 1, 2)
-        n_pad_before = np.equal(input_ids, 1).astype(np.float32).sum()
-        n_pad_after = np.equal(shifted, 1).astype(np.float32).sum()
-        self.assertEqual(shifted.shape, input_ids.shape)
-        self.assertEqual(n_pad_after, n_pad_before - 1)
-        self.assertTrue(np.equal(shifted[:, 0], 2).all())
-
-
-@require_flax
-class FlaxBlenderbotModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    is_encoder_decoder = True
-    all_model_classes = (
-        (
-            FlaxBlenderbotModel,
-            FlaxBlenderbotForConditionalGeneration,
-        )
-        if is_flax_available()
-        else ()
-    )
-
-    def setUp(self):
-        self.model_tester = FlaxBlenderbotModelTester(self)
-
-    def test_use_cache_forward(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
-        for model_class in self.all_model_classes:
-            self.model_tester.check_use_cache_forward(model_class, config, inputs_dict)
-
-    def test_use_cache_forward_with_attn_mask(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
-        for model_class in self.all_model_classes:
-            self.model_tester.check_use_cache_forward_with_attn_mask(model_class, config, inputs_dict)
-
-    def test_encode(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                model = model_class(config)
-
-                @jax.jit
-                def encode_jitted(input_ids, attention_mask=None, **kwargs):
-                    return model.encode(input_ids=input_ids, attention_mask=attention_mask)
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = encode_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = encode_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    def test_decode(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                model = model_class(config)
-                encoder_outputs = model.encode(inputs_dict["input_ids"], inputs_dict["attention_mask"])
-
-                prepared_inputs_dict = {
-                    "decoder_input_ids": inputs_dict["decoder_input_ids"],
-                    "decoder_attention_mask": inputs_dict["decoder_attention_mask"],
-                    "encoder_outputs": encoder_outputs,
-                }
-
-                @jax.jit
-                def decode_jitted(decoder_input_ids, decoder_attention_mask, encoder_outputs):
-                    return model.decode(
-                        decoder_input_ids=decoder_input_ids,
-                        decoder_attention_mask=decoder_attention_mask,
-                        encoder_outputs=encoder_outputs,
-                    )
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = decode_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = decode_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("facebook/blenderbot-400M-distill")
-            # FlaxBlenderbotForSequenceClassification expects eos token in input_ids
-            input_ids = np.ones((1, 1)) * model.config.eos_token_id
-            outputs = model(input_ids)
-            self.assertIsNotNone(outputs)
-
-    @unittest.skipUnless(jax_device != "cpu", "3B test too slow on CPU.")
-    @slow
-    def test_generation_from_short_input_same_as_parlai_3B(self):
-        FASTER_GEN_KWARGS = {"num_beams": 1, "early_stopping": True, "min_length": 15, "max_length": 25}
-        TOK_DECODE_KW = {"skip_special_tokens": True, "clean_up_tokenization_spaces": True}
-
-        model = FlaxBlenderbotForConditionalGeneration.from_pretrained("facebook/blenderbot-3B", from_pt=True)
-        tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-3B")
-
-        src_text = ["Sam"]
-        model_inputs = tokenizer(src_text, return_tensors="jax")
-
-        generated_utterances = model.generate(**model_inputs, **FASTER_GEN_KWARGS)
-        tgt_text = 'Sam is a great name. It means "sun" in Gaelic.'
-
-        generated_txt = tokenizer.batch_decode(generated_utterances, **TOK_DECODE_KW)
-        assert generated_txt[0].strip() == tgt_text
diff --git a/tests/models/blenderbot/test_modeling_tf_blenderbot.py b/tests/models/blenderbot/test_modeling_tf_blenderbot.py
deleted file mode 100644
index 435c37d5fc..0000000000
--- a/tests/models/blenderbot/test_modeling_tf_blenderbot.py
+++ /dev/null
@@ -1,234 +0,0 @@
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import BlenderbotConfig, BlenderbotTokenizer, is_tf_available
-from transformers.testing_utils import require_tf, require_tokenizers, slow
-from transformers.utils import cached_property
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import TFAutoModelForSeq2SeqLM, TFBlenderbotForConditionalGeneration, TFBlenderbotModel
-
-
-@require_tf
-class TFBlenderbotModelTester:
-    config_cls = BlenderbotConfig
-    config_updates = {}
-    hidden_act = "gelu"
-
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_labels=False,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=50,
-        eos_token_id=2,
-        pad_token_id=1,
-        bos_token_id=0,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-
-    def prepare_config_and_inputs_for_common(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size)
-        eos_tensor = tf.expand_dims(tf.constant([self.eos_token_id] * self.batch_size), 1)
-        input_ids = tf.concat([input_ids, eos_tensor], axis=1)
-
-        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        config = self.config_cls(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            encoder_layers=self.num_hidden_layers,
-            decoder_layers=self.num_hidden_layers,
-            encoder_attention_heads=self.num_attention_heads,
-            decoder_attention_heads=self.num_attention_heads,
-            encoder_ffn_dim=self.intermediate_size,
-            decoder_ffn_dim=self.intermediate_size,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            eos_token_ids=[2],
-            bos_token_id=self.bos_token_id,
-            pad_token_id=self.pad_token_id,
-            decoder_start_token_id=self.pad_token_id,
-            **self.config_updates,
-        )
-        inputs_dict = prepare_blenderbot_inputs_dict(config, input_ids, decoder_input_ids)
-        return config, inputs_dict
-
-    def check_decoder_model_past_large_inputs(self, config, inputs_dict):
-        model = TFBlenderbotModel(config=config).get_decoder()
-        input_ids = inputs_dict["input_ids"]
-
-        input_ids = input_ids[:1, :]
-        attention_mask = inputs_dict["attention_mask"][:1, :]
-        head_mask = inputs_dict["head_mask"]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True)
-
-        output, past_key_values = outputs.to_tuple()
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = tf.cast(ids_tensor((self.batch_size, 3), 2), tf.int8)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([attention_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)[0]
-        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[0]
-
-        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-
-def prepare_blenderbot_inputs_dict(
-    config,
-    input_ids,
-    decoder_input_ids,
-    attention_mask=None,
-    decoder_attention_mask=None,
-    head_mask=None,
-    decoder_head_mask=None,
-    cross_attn_head_mask=None,
-):
-    if attention_mask is None:
-        attention_mask = tf.cast(tf.math.not_equal(input_ids, config.pad_token_id), tf.int8)
-    if decoder_attention_mask is None:
-        decoder_attention_mask = tf.concat(
-            [
-                tf.ones(decoder_input_ids[:, :1].shape, dtype=tf.int8),
-                tf.cast(tf.math.not_equal(decoder_input_ids[:, 1:], config.pad_token_id), tf.int8),
-            ],
-            axis=-1,
-        )
-    if head_mask is None:
-        head_mask = tf.ones((config.encoder_layers, config.encoder_attention_heads))
-    if decoder_head_mask is None:
-        decoder_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads))
-    if cross_attn_head_mask is None:
-        cross_attn_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads))
-    return {
-        "input_ids": input_ids,
-        "decoder_input_ids": decoder_input_ids,
-        "attention_mask": attention_mask,
-        "decoder_attention_mask": decoder_attention_mask,
-        "head_mask": head_mask,
-        "decoder_head_mask": decoder_head_mask,
-        "cross_attn_head_mask": cross_attn_head_mask,
-    }
-
-
-@require_tf
-class TFBlenderbotModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (TFBlenderbotForConditionalGeneration, TFBlenderbotModel) if is_tf_available() else ()
-    all_generative_model_classes = (TFBlenderbotForConditionalGeneration,) if is_tf_available() else ()
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFBlenderbotModel,
-            "summarization": TFBlenderbotForConditionalGeneration,
-            "text2text-generation": TFBlenderbotForConditionalGeneration,
-            "translation": TFBlenderbotForConditionalGeneration,
-        }
-        if is_tf_available()
-        else {}
-    )
-    is_encoder_decoder = True
-    test_pruning = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFBlenderbotModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=BlenderbotConfig)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_decoder_model_past_large_inputs(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
-        self.model_tester.check_decoder_model_past_large_inputs(*config_and_inputs)
-
-
-@require_tokenizers
-@require_tf
-class TFBlenderbot400MIntegrationTests(unittest.TestCase):
-    src_text = ["My friends are cool but they eat too many carbs."]
-    model_name = "facebook/blenderbot-400M-distill"
-
-    @cached_property
-    def tokenizer(self):
-        return BlenderbotTokenizer.from_pretrained(self.model_name)
-
-    @cached_property
-    def model(self):
-        model = TFAutoModelForSeq2SeqLM.from_pretrained(self.model_name)
-        return model
-
-    @slow
-    def test_generation_from_long_input(self):
-        model_inputs = self.tokenizer(self.src_text, return_tensors="tf")
-        generated_ids = self.model.generate(
-            model_inputs.input_ids,
-        )
-        generated_words = self.tokenizer.batch_decode(generated_ids.numpy(), skip_special_tokens=True)[0]
-        assert (
-            generated_words
-            == " That's unfortunate. Are they trying to lose weight or are they just trying to be healthier?"
-        )
diff --git a/tests/models/blenderbot_small/test_modeling_flax_blenderbot_small.py b/tests/models/blenderbot_small/test_modeling_flax_blenderbot_small.py
deleted file mode 100644
index 7d810feb8d..0000000000
--- a/tests/models/blenderbot_small/test_modeling_flax_blenderbot_small.py
+++ /dev/null
@@ -1,409 +0,0 @@
-# Copyright 2021 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-import timeout_decorator  # noqa
-
-from transformers import BlenderbotSmallConfig, is_flax_available
-from transformers.testing_utils import require_flax, slow
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor
-
-
-if is_flax_available():
-    import os
-
-    # The slow tests are often failing with OOM error on GPU
-    # This makes JAX allocate exactly what is needed on demand, and deallocate memory that is no longer needed
-    # but will be slower as stated here https://jax.readthedocs.io/en/latest/gpu_memory_allocation.html
-    os.environ["XLA_PYTHON_CLIENT_ALLOCATOR"] = "platform"
-
-    import jax
-    import jax.numpy as jnp
-
-    from transformers.models.blenderbot_small.modeling_flax_blenderbot_small import (
-        FlaxBlenderbotSmallForConditionalGeneration,
-        FlaxBlenderbotSmallModel,
-        shift_tokens_right,
-    )
-
-
-def prepare_blenderbot_inputs_dict(
-    config,
-    input_ids,
-    decoder_input_ids=None,
-    attention_mask=None,
-    decoder_attention_mask=None,
-    head_mask=None,
-    decoder_head_mask=None,
-    cross_attn_head_mask=None,
-):
-    if attention_mask is None:
-        attention_mask = np.where(input_ids != config.pad_token_id, 1, 0)
-    if decoder_attention_mask is None:
-        decoder_attention_mask = np.where(decoder_input_ids != config.pad_token_id, 1, 0)
-    if head_mask is None:
-        head_mask = np.ones((config.encoder_layers, config.encoder_attention_heads))
-    if decoder_head_mask is None:
-        decoder_head_mask = np.ones((config.decoder_layers, config.decoder_attention_heads))
-    if cross_attn_head_mask is None:
-        cross_attn_head_mask = np.ones((config.decoder_layers, config.decoder_attention_heads))
-    return {
-        "input_ids": input_ids,
-        "decoder_input_ids": decoder_input_ids,
-        "attention_mask": attention_mask,
-        "decoder_attention_mask": attention_mask,
-    }
-
-
-class FlaxBlenderbotSmallModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_labels=False,
-        vocab_size=99,
-        hidden_size=16,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=4,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=50,
-        eos_token_id=2,
-        pad_token_id=1,
-        bos_token_id=0,
-        initializer_range=0.02,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.initializer_range = initializer_range
-
-    def prepare_config_and_inputs(self):
-        input_ids = np.clip(ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size), 3, self.vocab_size)
-        input_ids = np.concatenate((input_ids, 2 * np.ones((self.batch_size, 1), dtype=np.int64)), -1)
-
-        decoder_input_ids = shift_tokens_right(input_ids, 1, 2)
-
-        config = BlenderbotSmallConfig(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            encoder_layers=self.num_hidden_layers,
-            decoder_layers=self.num_hidden_layers,
-            encoder_attention_heads=self.num_attention_heads,
-            decoder_attention_heads=self.num_attention_heads,
-            encoder_ffn_dim=self.intermediate_size,
-            decoder_ffn_dim=self.intermediate_size,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            eos_token_id=self.eos_token_id,
-            bos_token_id=self.bos_token_id,
-            pad_token_id=self.pad_token_id,
-            initializer_range=self.initializer_range,
-            use_cache=False,
-        )
-        inputs_dict = prepare_blenderbot_inputs_dict(config, input_ids, decoder_input_ids)
-        return config, inputs_dict
-
-    def prepare_config_and_inputs_for_common(self):
-        config, inputs_dict = self.prepare_config_and_inputs()
-        return config, inputs_dict
-
-    def check_use_cache_forward(self, model_class_name, config, inputs_dict):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        encoder_outputs = model.encode(inputs_dict["input_ids"])
-
-        decoder_input_ids, decoder_attention_mask = (
-            inputs_dict["decoder_input_ids"],
-            inputs_dict["decoder_attention_mask"],
-        )
-
-        past_key_values = model.init_cache(decoder_input_ids.shape[0], max_decoder_length, encoder_outputs)
-        decoder_attention_mask = jnp.ones((decoder_input_ids.shape[0], max_decoder_length), dtype="i4")
-
-        decoder_position_ids = jnp.broadcast_to(
-            jnp.arange(decoder_input_ids.shape[-1] - 1)[None, :],
-            (decoder_input_ids.shape[0], decoder_input_ids.shape[-1] - 1),
-        )
-        outputs_cache = model.decode(
-            decoder_input_ids[:, :-1],
-            encoder_outputs,
-            decoder_attention_mask=decoder_attention_mask,
-            past_key_values=past_key_values,
-            decoder_position_ids=decoder_position_ids,
-        )
-
-        decoder_position_ids = jnp.array(decoder_input_ids.shape[0] * [[decoder_input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model.decode(
-            decoder_input_ids[:, -1:],
-            encoder_outputs,
-            decoder_attention_mask=decoder_attention_mask,
-            past_key_values=outputs_cache.past_key_values,
-            decoder_position_ids=decoder_position_ids,
-        )
-
-        outputs = model.decode(decoder_input_ids, encoder_outputs)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-    def check_use_cache_forward_with_attn_mask(self, model_class_name, config, inputs_dict):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        encoder_outputs = model.encode(inputs_dict["input_ids"])
-
-        decoder_input_ids, decoder_attention_mask = (
-            inputs_dict["decoder_input_ids"],
-            inputs_dict["decoder_attention_mask"],
-        )
-
-        decoder_attention_mask_cache = jnp.concatenate(
-            [
-                decoder_attention_mask,
-                jnp.zeros((decoder_attention_mask.shape[0], max_decoder_length - decoder_attention_mask.shape[1])),
-            ],
-            axis=-1,
-        )
-
-        past_key_values = model.init_cache(decoder_input_ids.shape[0], max_decoder_length, encoder_outputs)
-        decoder_position_ids = jnp.broadcast_to(
-            jnp.arange(decoder_input_ids.shape[-1] - 1)[None, :],
-            (decoder_input_ids.shape[0], decoder_input_ids.shape[-1] - 1),
-        )
-
-        outputs_cache = model.decode(
-            decoder_input_ids[:, :-1],
-            encoder_outputs,
-            decoder_attention_mask=decoder_attention_mask_cache,
-            past_key_values=past_key_values,
-            decoder_position_ids=decoder_position_ids,
-        )
-        decoder_position_ids = jnp.array(decoder_input_ids.shape[0] * [[decoder_input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model.decode(
-            decoder_input_ids[:, -1:],
-            encoder_outputs,
-            past_key_values=outputs_cache.past_key_values,
-            decoder_attention_mask=decoder_attention_mask_cache,
-            decoder_position_ids=decoder_position_ids,
-        )
-
-        outputs = model.decode(decoder_input_ids, encoder_outputs, decoder_attention_mask=decoder_attention_mask)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-
-@require_flax
-class BlenderbotHeadTests(unittest.TestCase):
-    vocab_size = 99
-
-    def _get_config_and_data(self):
-        input_ids = np.array(
-            [
-                [71, 82, 18, 33, 46, 91, 2],
-                [68, 34, 26, 58, 30, 82, 2],
-                [5, 97, 17, 39, 94, 40, 2],
-                [76, 83, 94, 25, 70, 78, 2],
-                [87, 59, 41, 35, 48, 66, 2],
-                [55, 13, 16, 58, 5, 2, 1],  # note padding
-                [64, 27, 31, 51, 12, 75, 2],
-                [52, 64, 86, 17, 83, 39, 2],
-                [48, 61, 9, 24, 71, 82, 2],
-                [26, 1, 60, 48, 22, 13, 2],
-                [21, 5, 62, 28, 14, 76, 2],
-                [45, 98, 37, 86, 59, 48, 2],
-                [70, 70, 50, 9, 28, 0, 2],
-            ],
-            dtype=np.int64,
-        )
-
-        batch_size = input_ids.shape[0]
-        config = BlenderbotSmallConfig(
-            vocab_size=self.vocab_size,
-            d_model=24,
-            encoder_layers=2,
-            decoder_layers=2,
-            encoder_attention_heads=2,
-            decoder_attention_heads=2,
-            encoder_ffn_dim=32,
-            decoder_ffn_dim=32,
-            max_position_embeddings=48,
-            eos_token_id=2,
-            pad_token_id=1,
-            bos_token_id=0,
-        )
-        return config, input_ids, batch_size
-
-    # @timeout_decorator.timeout(1)  # not working with the decorator so far
-    def test_lm_forward(self):
-        config, input_ids, batch_size = self._get_config_and_data()
-        lm_model = FlaxBlenderbotSmallForConditionalGeneration(config)
-        outputs = lm_model(input_ids=input_ids)
-        expected_shape = (batch_size, input_ids.shape[1], config.vocab_size)
-        self.assertEqual(outputs["logits"].shape, expected_shape)
-
-    def test_lm_uneven_forward(self):
-        config = BlenderbotSmallConfig(
-            vocab_size=self.vocab_size,
-            d_model=14,
-            encoder_layers=2,
-            decoder_layers=2,
-            encoder_attention_heads=2,
-            decoder_attention_heads=2,
-            encoder_ffn_dim=8,
-            decoder_ffn_dim=8,
-            max_position_embeddings=48,
-        )
-        lm_model = FlaxBlenderbotSmallForConditionalGeneration(config)
-        context = np.array([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]], dtype=np.int64)
-        summary = np.array([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]], dtype=np.int64)
-        outputs = lm_model(input_ids=context, decoder_input_ids=summary)
-        expected_shape = (*summary.shape, config.vocab_size)
-        self.assertEqual(outputs["logits"].shape, expected_shape)
-
-    def test_shift_tokens_right(self):
-        input_ids = np.array([[71, 82, 18, 33, 2, 1, 1], [68, 34, 26, 58, 30, 82, 2]], dtype=np.int64)
-        shifted = shift_tokens_right(input_ids, 1, 2)
-        n_pad_before = np.equal(input_ids, 1).astype(np.float32).sum()
-        n_pad_after = np.equal(shifted, 1).astype(np.float32).sum()
-        self.assertEqual(shifted.shape, input_ids.shape)
-        self.assertEqual(n_pad_after, n_pad_before - 1)
-        self.assertTrue(np.equal(shifted[:, 0], 2).all())
-
-
-@require_flax
-class FlaxBlenderbotSmallModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    is_encoder_decoder = True
-    all_model_classes = (
-        (
-            FlaxBlenderbotSmallModel,
-            FlaxBlenderbotSmallForConditionalGeneration,
-        )
-        if is_flax_available()
-        else ()
-    )
-
-    def is_pipeline_test_to_skip(
-        self,
-        pipeline_test_case_name,
-        config_class,
-        model_architecture,
-        tokenizer_name,
-        image_processor_name,
-        feature_extractor_name,
-        processor_name,
-    ):
-        return pipeline_test_case_name == "TextGenerationPipelineTests"
-
-    def setUp(self):
-        self.model_tester = FlaxBlenderbotSmallModelTester(self)
-
-    def test_use_cache_forward(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
-        for model_class in self.all_model_classes:
-            self.model_tester.check_use_cache_forward(model_class, config, inputs_dict)
-
-    def test_use_cache_forward_with_attn_mask(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
-        for model_class in self.all_model_classes:
-            self.model_tester.check_use_cache_forward_with_attn_mask(model_class, config, inputs_dict)
-
-    def test_encode(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                model = model_class(config)
-
-                @jax.jit
-                def encode_jitted(input_ids, attention_mask=None, **kwargs):
-                    return model.encode(input_ids=input_ids, attention_mask=attention_mask)
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = encode_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = encode_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    def test_decode(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                model = model_class(config)
-                encoder_outputs = model.encode(inputs_dict["input_ids"], inputs_dict["attention_mask"])
-
-                prepared_inputs_dict = {
-                    "decoder_input_ids": inputs_dict["decoder_input_ids"],
-                    "decoder_attention_mask": inputs_dict["decoder_attention_mask"],
-                    "encoder_outputs": encoder_outputs,
-                }
-
-                @jax.jit
-                def decode_jitted(decoder_input_ids, decoder_attention_mask, encoder_outputs):
-                    return model.decode(
-                        decoder_input_ids=decoder_input_ids,
-                        decoder_attention_mask=decoder_attention_mask,
-                        encoder_outputs=encoder_outputs,
-                    )
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = decode_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = decode_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("facebook/blenderbot_small-90M")
-            # FlaxBlenderbotForSequenceClassification expects eos token in input_ids
-            input_ids = np.ones((1, 1)) * model.config.eos_token_id
-            outputs = model(input_ids)
-            self.assertIsNotNone(outputs)
diff --git a/tests/models/blenderbot_small/test_modeling_tf_blenderbot_small.py b/tests/models/blenderbot_small/test_modeling_tf_blenderbot_small.py
deleted file mode 100644
index 70cb580cd6..0000000000
--- a/tests/models/blenderbot_small/test_modeling_tf_blenderbot_small.py
+++ /dev/null
@@ -1,256 +0,0 @@
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import BlenderbotSmallConfig, BlenderbotSmallTokenizer, is_tf_available
-from transformers.testing_utils import require_tf, require_tokenizers, slow
-from transformers.utils import cached_property
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import TFAutoModelForSeq2SeqLM, TFBlenderbotSmallForConditionalGeneration, TFBlenderbotSmallModel
-
-
-@require_tf
-class TFBlenderbotSmallModelTester:
-    config_cls = BlenderbotSmallConfig
-    config_updates = {}
-    hidden_act = "gelu"
-
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_labels=False,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=50,
-        eos_token_id=2,
-        pad_token_id=1,
-        bos_token_id=0,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-
-    def prepare_config_and_inputs_for_common(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size)
-        eos_tensor = tf.expand_dims(tf.constant([self.eos_token_id] * self.batch_size), 1)
-        input_ids = tf.concat([input_ids, eos_tensor], axis=1)
-
-        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        config = self.config_cls(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            encoder_layers=self.num_hidden_layers,
-            decoder_layers=self.num_hidden_layers,
-            encoder_attention_heads=self.num_attention_heads,
-            decoder_attention_heads=self.num_attention_heads,
-            encoder_ffn_dim=self.intermediate_size,
-            decoder_ffn_dim=self.intermediate_size,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            eos_token_ids=[2],
-            bos_token_id=self.bos_token_id,
-            pad_token_id=self.pad_token_id,
-            decoder_start_token_id=self.pad_token_id,
-            **self.config_updates,
-        )
-        inputs_dict = prepare_blenderbot_small_inputs_dict(config, input_ids, decoder_input_ids)
-        return config, inputs_dict
-
-    def check_decoder_model_past_large_inputs(self, config, inputs_dict):
-        model = TFBlenderbotSmallModel(config=config).get_decoder()
-        input_ids = inputs_dict["input_ids"]
-
-        input_ids = input_ids[:1, :]
-        attention_mask = inputs_dict["attention_mask"][:1, :]
-        head_mask = inputs_dict["head_mask"]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True)
-
-        output, past_key_values = outputs.to_tuple()
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = tf.cast(ids_tensor((self.batch_size, 3), 2), tf.int8)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([attention_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)[0]
-        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[0]
-
-        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-
-def prepare_blenderbot_small_inputs_dict(
-    config,
-    input_ids,
-    decoder_input_ids,
-    attention_mask=None,
-    decoder_attention_mask=None,
-    head_mask=None,
-    decoder_head_mask=None,
-    cross_attn_head_mask=None,
-):
-    if attention_mask is None:
-        attention_mask = tf.cast(tf.math.not_equal(input_ids, config.pad_token_id), tf.int8)
-    if decoder_attention_mask is None:
-        decoder_attention_mask = tf.concat(
-            [
-                tf.ones(decoder_input_ids[:, :1].shape, dtype=tf.int8),
-                tf.cast(tf.math.not_equal(decoder_input_ids[:, 1:], config.pad_token_id), tf.int8),
-            ],
-            axis=-1,
-        )
-    if head_mask is None:
-        head_mask = tf.ones((config.encoder_layers, config.encoder_attention_heads))
-    if decoder_head_mask is None:
-        decoder_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads))
-    if cross_attn_head_mask is None:
-        cross_attn_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads))
-    return {
-        "input_ids": input_ids,
-        "decoder_input_ids": decoder_input_ids,
-        "attention_mask": attention_mask,
-        "decoder_attention_mask": decoder_attention_mask,
-        "head_mask": head_mask,
-        "decoder_head_mask": decoder_head_mask,
-        "cross_attn_head_mask": cross_attn_head_mask,
-    }
-
-
-@require_tf
-class TFBlenderbotSmallModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (TFBlenderbotSmallForConditionalGeneration, TFBlenderbotSmallModel) if is_tf_available() else ()
-    )
-    all_generative_model_classes = (TFBlenderbotSmallForConditionalGeneration,) if is_tf_available() else ()
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFBlenderbotSmallModel,
-            "summarization": TFBlenderbotSmallForConditionalGeneration,
-            "text2text-generation": TFBlenderbotSmallForConditionalGeneration,
-            "translation": TFBlenderbotSmallForConditionalGeneration,
-        }
-        if is_tf_available()
-        else {}
-    )
-    is_encoder_decoder = True
-    test_pruning = False
-    test_onnx = False
-
-    def is_pipeline_test_to_skip(
-        self,
-        pipeline_test_case_name,
-        config_class,
-        model_architecture,
-        tokenizer_name,
-        image_processor_name,
-        feature_extractor_name,
-        processor_name,
-    ):
-        return pipeline_test_case_name == "TextGenerationPipelineTests"
-
-    def setUp(self):
-        self.model_tester = TFBlenderbotSmallModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=BlenderbotSmallConfig)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_decoder_model_past_large_inputs(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
-        self.model_tester.check_decoder_model_past_large_inputs(*config_and_inputs)
-
-
-@require_tokenizers
-@require_tf
-class TFBlenderbot90MIntegrationTests(unittest.TestCase):
-    src_text = [
-        "Social anxiety\nWow, I am never shy. Do you have anxiety?\nYes. I end up sweating and blushing and feel like "
-        "  i'm going to throw up.\nand why is that?"
-    ]
-    model_name = "facebook/blenderbot_small-90M"
-
-    @cached_property
-    def tokenizer(self):
-        # use "old" tokenizer here because of bug when downloading new tokenizer
-        return BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M")
-
-    @cached_property
-    def model(self):
-        model = TFAutoModelForSeq2SeqLM.from_pretrained(self.model_name)
-        return model
-
-    @slow
-    def test_90_generation_from_long_input(self):
-        model_inputs = self.tokenizer(self.src_text, return_tensors="tf")
-        generated_ids = self.model.generate(
-            model_inputs.input_ids,
-            attention_mask=model_inputs.attention_mask,
-            num_beams=2,
-            use_cache=True,
-        )
-        generated_words = self.tokenizer.batch_decode(generated_ids.numpy(), skip_special_tokens=True)[0]
-        assert generated_words in (
-            "i don't know. i just feel like i'm going to throw up. it's not fun.",
-            "i'm not sure. i just feel like i've been feeling like i have to be in a certain place",
-            "i'm not sure. i just feel like i've been in a bad situation.",
-        )
diff --git a/tests/models/blip/test_modeling_tf_blip.py b/tests/models/blip/test_modeling_tf_blip.py
deleted file mode 100644
index 71269048a3..0000000000
--- a/tests/models/blip/test_modeling_tf_blip.py
+++ /dev/null
@@ -1,878 +0,0 @@
-# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TensorFlow Blip model."""
-
-from __future__ import annotations
-
-import inspect
-import tempfile
-import unittest
-
-import numpy as np
-import requests
-
-from transformers import BlipConfig, BlipTextConfig, BlipVisionConfig
-from transformers.testing_utils import require_tf, require_vision, slow
-from transformers.utils import is_tf_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import (
-        TFBlipForConditionalGeneration,
-        TFBlipForImageTextRetrieval,
-        TFBlipForQuestionAnswering,
-        TFBlipModel,
-        TFBlipTextModel,
-        TFBlipVisionModel,
-    )
-    from transformers.modeling_tf_utils import keras
-
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import BlipProcessor
-
-
-class TFBlipVisionModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=12,
-        image_size=30,
-        patch_size=2,
-        num_channels=3,
-        is_training=True,
-        hidden_size=32,
-        projection_dim=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        dropout=0.1,
-        attention_dropout=0.1,
-        initializer_range=1e-10,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.is_training = is_training
-        self.hidden_size = hidden_size
-        self.projection_dim = projection_dim
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.initializer_range = initializer_range
-        self.scope = scope
-
-        # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
-        num_patches = (image_size // patch_size) ** 2
-        self.seq_length = num_patches + 1
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-        config = self.get_config()
-
-        return config, pixel_values
-
-    def get_config(self):
-        return BlipVisionConfig(
-            image_size=self.image_size,
-            patch_size=self.patch_size,
-            num_channels=self.num_channels,
-            hidden_size=self.hidden_size,
-            projection_dim=self.projection_dim,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            dropout=self.dropout,
-            attention_dropout=self.attention_dropout,
-            initializer_range=self.initializer_range,
-        )
-
-    def create_and_check_model(self, config, pixel_values):
-        model = TFBlipVisionModel(config=config)
-        result = model(pixel_values)
-        # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
-        image_size = (self.image_size, self.image_size)
-        patch_size = (self.patch_size, self.patch_size)
-        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
-        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_tf
-class TFBlipVisionModelTest(TFModelTesterMixin, unittest.TestCase):
-    """
-    Here we also overwrite some of the tests of test_modeling_common.py, as Blip does not use input_ids, inputs_embeds,
-    attention_mask and seq_length.
-    """
-
-    all_model_classes = (TFBlipVisionModel,) if is_tf_available() else ()
-    fx_compatible = False
-    test_pruning = False
-    test_resize_embeddings = False
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFBlipVisionModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=BlipVisionConfig, has_text_modality=False, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    @unittest.skip(reason="Blip does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_model_common_attributes(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer))
-            x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, keras.layers.Layer))
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "Salesforce/blip-vqa-base"
-        model = TFBlipVisionModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-class TFBlipTextModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=12,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        projection_dim=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        dropout=0.1,
-        attention_dropout=0.1,
-        max_position_embeddings=512,
-        initializer_range=0.02,
-        bos_token_id=0,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.projection_dim = projection_dim
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.max_position_embeddings = max_position_embeddings
-        self.initializer_range = initializer_range
-        self.scope = scope
-        self.bos_token_id = bos_token_id
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        if input_mask is not None:
-            input_mask = input_mask.numpy()
-            batch_size, seq_length = input_mask.shape
-            rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,))
-            for batch_idx, start_index in enumerate(rnd_start_indices):
-                input_mask[batch_idx, :start_index] = 1
-                input_mask[batch_idx, start_index:] = 0
-            input_mask = tf.convert_to_tensor(input_mask)
-
-        config = self.get_config()
-
-        return config, input_ids, input_mask
-
-    def get_config(self):
-        return BlipTextConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            projection_dim=self.projection_dim,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            dropout=self.dropout,
-            attention_dropout=self.attention_dropout,
-            max_position_embeddings=self.max_position_embeddings,
-            initializer_range=self.initializer_range,
-            bos_token_id=self.bos_token_id,
-        )
-
-    def create_and_check_model(self, config, input_ids, input_mask):
-        model = TFBlipTextModel(config=config)
-        result = model(input_ids, attention_mask=input_mask, training=False)
-        result = model(input_ids, training=False)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, input_mask = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class TFBlipTextModelTest(TFModelTesterMixin, unittest.TestCase):
-    all_model_classes = (TFBlipTextModel,) if is_tf_available() else ()
-    fx_compatible = False
-    test_pruning = False
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFBlipTextModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=BlipTextConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    @unittest.skip(reason="Blip does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "Salesforce/blip-vqa-base"
-        model = TFBlipTextModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-class TFBlipModelTester:
-    def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True):
-        if text_kwargs is None:
-            text_kwargs = {}
-        if vision_kwargs is None:
-            vision_kwargs = {}
-
-        self.parent = parent
-        self.text_model_tester = TFBlipTextModelTester(parent, **text_kwargs)
-        self.vision_model_tester = TFBlipVisionModelTester(parent, **vision_kwargs)
-        self.is_training = is_training
-
-    def prepare_config_and_inputs(self):
-        text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
-        vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
-
-        config = self.get_config()
-
-        return config, input_ids, attention_mask, pixel_values
-
-    def get_config(self):
-        return BlipConfig.from_text_vision_configs(
-            self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64
-        )
-
-    def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
-        model = TFBlipModel(config)
-        result = model(input_ids, pixel_values, attention_mask, training=False)
-        self.parent.assertEqual(
-            result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size)
-        )
-        self.parent.assertEqual(
-            result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size)
-        )
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, attention_mask, pixel_values = config_and_inputs
-        inputs_dict = {
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-            "pixel_values": pixel_values,
-            "return_loss": True,
-        }
-        return config, inputs_dict
-
-
-@require_tf
-class TFBlipModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (TFBlipModel,) if is_tf_available() else ()
-    pipeline_model_mapping = (
-        {"feature-extraction": TFBlipModel, "image-to-text": TFBlipForConditionalGeneration}
-        if is_tf_available()
-        else {}
-    )
-    test_head_masking = False
-    test_pruning = False
-    test_resize_embeddings = False
-    test_attention_outputs = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFBlipModelTester(self)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    @unittest.skip(reason="Hidden_states is tested in individual model tests")
-    def test_hidden_states_output(self):
-        pass
-
-    @unittest.skip(reason="Inputs_embeds is tested in individual model tests")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="Retain_grad is tested in individual model tests")
-    def test_retain_grad_hidden_states_attentions(self):
-        pass
-
-    @unittest.skip(reason="BlipModel does not have input/output embeddings")
-    def test_model_common_attributes(self):
-        pass
-
-    def test_load_vision_text_config(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        # Save BlipConfig and check if we can load BlipVisionConfig from it
-        with tempfile.TemporaryDirectory() as tmp_dir_name:
-            config.save_pretrained(tmp_dir_name)
-            vision_config = BlipVisionConfig.from_pretrained(tmp_dir_name)
-            self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict())
-
-        # Save BlipConfig and check if we can load BlipTextConfig from it
-        with tempfile.TemporaryDirectory() as tmp_dir_name:
-            config.save_pretrained(tmp_dir_name)
-            text_config = BlipTextConfig.from_pretrained(tmp_dir_name)
-            self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict())
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "Salesforce/blip-vqa-base"
-        model = TFBlipModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    @unittest.skip("Matt: Re-enable this test when we have a proper export function for TF models.")
-    def test_saved_model_creation(self):
-        # This fails because the if return_loss: conditional can return None or a Tensor and TF hates that.
-        # We could fix that by setting the bool to a constant when exporting, but that requires a dedicated export
-        # function that we don't have yet.
-        pass
-
-
-class BlipTextRetrievalModelTester:
-    def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True):
-        if text_kwargs is None:
-            text_kwargs = {}
-        if vision_kwargs is None:
-            vision_kwargs = {}
-
-        self.parent = parent
-        self.text_model_tester = TFBlipTextModelTester(parent, **text_kwargs)
-        self.vision_model_tester = TFBlipVisionModelTester(parent, **vision_kwargs)
-        self.is_training = is_training
-
-    def prepare_config_and_inputs(self):
-        text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
-        vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
-
-        config = self.get_config()
-
-        return config, input_ids, attention_mask, pixel_values
-
-    def get_config(self):
-        return BlipConfig.from_text_vision_configs(
-            self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64
-        )
-
-    def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
-        model = TFBlipModel(config)
-        result = model(input_ids, pixel_values, attention_mask, training=False)
-        self.parent.assertEqual(
-            result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size)
-        )
-        self.parent.assertEqual(
-            result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size)
-        )
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, attention_mask, pixel_values = config_and_inputs
-        inputs_dict = {
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-            "pixel_values": pixel_values,
-        }
-        return config, inputs_dict
-
-
-class BlipTextImageModelsModelTester:
-    def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True):
-        if text_kwargs is None:
-            text_kwargs = {}
-        if vision_kwargs is None:
-            vision_kwargs = {}
-
-        self.parent = parent
-        self.text_model_tester = TFBlipTextModelTester(parent, **text_kwargs)
-        self.vision_model_tester = TFBlipVisionModelTester(parent, **vision_kwargs)
-        self.is_training = is_training
-
-    def prepare_config_and_inputs(self):
-        text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
-        vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
-
-        config = self.get_config()
-
-        return config, input_ids, attention_mask, pixel_values
-
-    def get_config(self):
-        return BlipConfig.from_text_vision_configs(
-            self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64
-        )
-
-    def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
-        model = TFBlipModel(config)
-        result = model(input_ids, pixel_values, attention_mask, training=False)
-        self.parent.assertEqual(
-            result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size)
-        )
-        self.parent.assertEqual(
-            result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size)
-        )
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, attention_mask, pixel_values = config_and_inputs
-        inputs_dict = {
-            "input_ids": input_ids,
-            "labels": input_ids,
-            "attention_mask": attention_mask,
-            "pixel_values": pixel_values,
-        }
-        return config, inputs_dict
-
-
-class BlipVQAModelsModelTester:
-    def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True):
-        if text_kwargs is None:
-            text_kwargs = {}
-        if vision_kwargs is None:
-            vision_kwargs = {}
-
-        self.parent = parent
-        self.text_model_tester = TFBlipTextModelTester(parent, **text_kwargs)
-        self.vision_model_tester = TFBlipVisionModelTester(parent, **vision_kwargs)
-        self.is_training = is_training
-
-    def prepare_config_and_inputs(self):
-        text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
-        vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
-
-        config = self.get_config()
-
-        return config, input_ids, attention_mask, pixel_values
-
-    def get_config(self):
-        return BlipConfig.from_text_vision_configs(
-            self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64
-        )
-
-    def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
-        model = TFBlipModel(config)
-        result = model(input_ids, pixel_values, attention_mask, training=False)
-        self.parent.assertEqual(
-            result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size)
-        )
-        self.parent.assertEqual(
-            result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size)
-        )
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, attention_mask, pixel_values = config_and_inputs
-        inputs_dict = {
-            "input_ids": input_ids,
-            "decoder_input_ids": input_ids,
-            "labels": input_ids,
-            "attention_mask": attention_mask,
-            "pixel_values": pixel_values,
-        }
-        return config, inputs_dict
-
-
-@require_tf
-@require_vision
-class TFBlipVQAModelTest(TFModelTesterMixin, unittest.TestCase):
-    all_model_classes = (TFBlipForQuestionAnswering,) if is_tf_available() else ()
-    test_head_masking = False
-    test_pruning = False
-    test_resize_embeddings = False
-    test_attention_outputs = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = BlipVQAModelsModelTester(self)
-
-    def _prepare_inputs_for_vqa(self):
-        _, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        inputs_dict["labels"] = inputs_dict["input_ids"]
-        inputs_dict["decoder_input_ids"] = inputs_dict["input_ids"]
-        inputs_dict.pop("return_loss")
-        return inputs_dict
-
-    def test_class_name_consistency(self):
-        """
-        Tests that all VQA models have a class name that ends with "ForQuestionAnswering"
-        """
-        for model_class in self.all_model_classes:
-            model = model_class(self.model_tester.get_config())
-            self.assertTrue(
-                model.__class__.__name__.endswith("ForQuestionAnswering"),
-                f"Class name should end with 'ForVisualQuestionAnswering' got {model.__class__.__name__}",
-            )
-
-    def test_training(self):
-        """
-        Tests that all VQA models can be trained on a single batch
-        """
-        for model_class in self.all_model_classes:
-            model = model_class(self.model_tester.get_config())
-            loss = model(**self.model_tester.prepare_config_and_inputs_for_common()[1], training=True).loss
-
-            self.assertIsNotNone(loss, "Loss should not be None")
-
-    @unittest.skip(reason="Hidden_states is tested in individual model tests")
-    def test_hidden_states_output(self):
-        pass
-
-    @unittest.skip(reason="Inputs_embeds is tested in individual model tests")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="Retain_grad is tested in individual model tests")
-    def test_retain_grad_hidden_states_attentions(self):
-        pass
-
-    @unittest.skip(reason="BlipModel does not have input/output embeddings")
-    def test_model_common_attributes(self):
-        pass
-
-    @unittest.skip(reason="Tested in individual model tests")
-    def test_compile_tf_model(self):
-        pass
-
-    @unittest.skip("Model doesn't have a clean loss output.")
-    def test_keras_fit(self):
-        pass
-
-
-@require_tf
-class TFBlipTextRetrievalModelTest(TFModelTesterMixin, unittest.TestCase):
-    all_model_classes = (TFBlipForImageTextRetrieval,) if is_tf_available() else ()
-    test_head_masking = False
-    test_pruning = False
-    test_resize_embeddings = False
-    test_attention_outputs = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = BlipTextRetrievalModelTester(self)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    @unittest.skip(reason="Hidden_states is tested in individual model tests")
-    def test_hidden_states_output(self):
-        pass
-
-    @unittest.skip(reason="Inputs_embeds is tested in individual model tests")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="Retain_grad is tested in individual model tests")
-    def test_retain_grad_hidden_states_attentions(self):
-        pass
-
-    @unittest.skip(reason="BlipModel does not have input/output embeddings")
-    def test_model_common_attributes(self):
-        pass
-
-    def test_training(self):
-        if not self.model_tester.is_training:
-            return
-
-        for model_class in self.all_model_classes[:-1]:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            config.return_dict = True
-
-            model = model_class(config)
-            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-
-            # hardcode labels to be the same as input_ids
-            inputs["labels"] = inputs["input_ids"]
-
-            loss = model(**inputs, training=True).loss
-            self.assertTrue(loss is not None)
-
-    def test_load_vision_text_config(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        # Save BlipConfig and check if we can load BlipVisionConfig from it
-        with tempfile.TemporaryDirectory() as tmp_dir_name:
-            config.save_pretrained(tmp_dir_name)
-            vision_config = BlipVisionConfig.from_pretrained(tmp_dir_name)
-            self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict())
-
-        # Save BlipConfig and check if we can load BlipTextConfig from it
-        with tempfile.TemporaryDirectory() as tmp_dir_name:
-            config.save_pretrained(tmp_dir_name)
-            text_config = BlipTextConfig.from_pretrained(tmp_dir_name)
-            self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict())
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "Salesforce/blip-vqa-base"
-        model = TFBlipModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    @unittest.skip(reason="Tested in individual model tests")
-    def test_compile_tf_model(self):
-        pass
-
-    @unittest.skip("Model doesn't have a clean loss output.")
-    def test_keras_fit(self):
-        pass
-
-
-@require_tf
-class TFBlipTextImageModelTest(TFModelTesterMixin, unittest.TestCase):
-    all_model_classes = (TFBlipForConditionalGeneration,) if is_tf_available() else ()
-    test_head_masking = False
-    test_pruning = False
-    test_resize_embeddings = False
-    test_attention_outputs = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = BlipTextImageModelsModelTester(self)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    @unittest.skip(reason="Hidden_states is tested in individual model tests")
-    def test_hidden_states_output(self):
-        pass
-
-    @unittest.skip(reason="Inputs_embeds is tested in individual model tests")
-    def test_inputs_embeds(self):
-        pass
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            if model.config.is_encoder_decoder:
-                expected_arg_names = [
-                    "input_ids",
-                    "attention_mask",
-                    "decoder_input_ids",
-                    "decoder_attention_mask",
-                ]
-                expected_arg_names.extend(
-                    ["head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs"]
-                    if "head_mask" and "decoder_head_mask" and "cross_attn_head_mask" in arg_names
-                    else ["encoder_outputs"]
-                )
-                self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
-            else:
-                expected_arg_names = (
-                    ["input_ids"] if model_class != TFBlipForConditionalGeneration else ["pixel_values"]
-                )
-                self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    @unittest.skip(reason="Tested in individual model tests")
-    def test_compile_tf_model(self):
-        pass
-
-    @unittest.skip("Has some odd input names!")
-    def test_keras_fit(self):
-        pass
-
-    @unittest.skip(reason="Retain_grad is tested in individual model tests")
-    def test_retain_grad_hidden_states_attentions(self):
-        pass
-
-    @unittest.skip(reason="BlipModel does not have input/output embeddings")
-    def test_model_common_attributes(self):
-        pass
-
-    def test_training(self):
-        if not self.model_tester.is_training:
-            return
-
-        for model_class in self.all_model_classes[:-1]:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            config.return_dict = True
-
-            model = model_class(config)
-            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-
-            # hardcode labels to be the same as input_ids
-            inputs["labels"] = inputs["input_ids"]
-
-            loss = model(**inputs, training=True).loss
-            self.assertIsNotNone(loss)
-
-    def test_load_vision_text_config(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        # Save BlipConfig and check if we can load BlipVisionConfig from it
-        with tempfile.TemporaryDirectory() as tmp_dir_name:
-            config.save_pretrained(tmp_dir_name)
-            vision_config = BlipVisionConfig.from_pretrained(tmp_dir_name)
-            self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict())
-
-        # Save BlipConfig and check if we can load BlipTextConfig from it
-        with tempfile.TemporaryDirectory() as tmp_dir_name:
-            config.save_pretrained(tmp_dir_name)
-            text_config = BlipTextConfig.from_pretrained(tmp_dir_name)
-            self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict())
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "Salesforce/blip-vqa-base"
-        model = TFBlipModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    url = "https://huggingface.co/hf-internal-testing/blip-test-image/resolve/main/demo.jpg"
-    im = Image.open(requests.get(url, stream=True).raw)
-    return im
-
-
-@require_vision
-@require_tf
-@slow
-class TFBlipModelIntegrationTest(unittest.TestCase):
-    def test_inference_image_captioning(self):
-        model = TFBlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
-        processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-        image = prepare_img()
-
-        # image only
-        inputs = processor(images=image, return_tensors="tf")
-
-        predictions = model.generate(**inputs)
-
-        # Test output
-        self.assertEqual(
-            predictions[0].numpy().tolist(), [30522, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102]
-        )
-
-        # image and context
-        context = ["a picture of"]
-        inputs = processor(images=image, text=context, return_tensors="tf")
-
-        predictions = model.generate(**inputs)
-
-        # Test output
-        self.assertEqual(
-            predictions[0].numpy().tolist(),
-            [30522, 1037, 3861, 1997, 1037, 2450, 1998, 2014, 3899, 2006, 1996, 3509, 102],
-        )
-
-    def test_inference_vqa(self):
-        model = TFBlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
-        processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
-
-        image = prepare_img()
-        text = "how many dogs are in the picture?"
-        inputs = processor(image, text=text, return_tensors="tf")
-        out = model.generate(**inputs)
-
-        # Test output
-        self.assertEqual(out[0].numpy().tolist(), [30522, 1015, 102])
-
-    def test_inference_itm(self):
-        model = TFBlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco")
-        processor = BlipProcessor.from_pretrained("Salesforce/blip-itm-base-coco")
-
-        image = prepare_img()
-        text = "A woman and her dog sitting in a beach"
-
-        inputs = processor(image, text, return_tensors="tf")
-
-        out_itm = model(**inputs)
-        out = model(**inputs, use_itm_head=False, training=False)
-
-        expected_scores = tf.convert_to_tensor([[0.0029, 0.9971]])
-        self.assertTrue(np.allclose(tf.nn.softmax(out_itm[0]).numpy(), expected_scores, rtol=1e-3, atol=1e-3))
-        self.assertTrue(np.allclose(out[0], tf.convert_to_tensor([[0.5162]]), rtol=1e-3, atol=1e-3))
diff --git a/tests/models/blip/test_modeling_tf_blip_text.py b/tests/models/blip/test_modeling_tf_blip_text.py
deleted file mode 100644
index 757ce98eed..0000000000
--- a/tests/models/blip/test_modeling_tf_blip_text.py
+++ /dev/null
@@ -1,169 +0,0 @@
-# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TensorFlow Blip model."""
-
-from __future__ import annotations
-
-import unittest
-
-import numpy as np
-
-from transformers import BlipTextConfig
-from transformers.testing_utils import require_tf, slow
-from transformers.utils import is_tf_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import TFBlipTextModel
-
-
-class BlipTextModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=12,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        projection_dim=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        dropout=0.1,
-        attention_dropout=0.1,
-        max_position_embeddings=512,
-        initializer_range=0.02,
-        bos_token_id=0,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.projection_dim = projection_dim
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.max_position_embeddings = max_position_embeddings
-        self.initializer_range = initializer_range
-        self.scope = scope
-        self.bos_token_id = bos_token_id
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        if input_mask is not None:
-            input_mask = input_mask.numpy()
-            batch_size, seq_length = input_mask.shape
-            rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,))
-            for batch_idx, start_index in enumerate(rnd_start_indices):
-                input_mask[batch_idx, :start_index] = 1
-                input_mask[batch_idx, start_index:] = 0
-
-        config = self.get_config()
-
-        return config, input_ids, tf.convert_to_tensor(input_mask)
-
-    def get_config(self):
-        return BlipTextConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            projection_dim=self.projection_dim,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            dropout=self.dropout,
-            attention_dropout=self.attention_dropout,
-            max_position_embeddings=self.max_position_embeddings,
-            initializer_range=self.initializer_range,
-            bos_token_id=self.bos_token_id,
-        )
-
-    def create_and_check_model(self, config, input_ids, input_mask):
-        model = TFBlipTextModel(config=config)
-        result = model(input_ids, attention_mask=input_mask, training=False)
-        result = model(input_ids, training=False)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, input_mask = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class BlipTextModelTest(TFModelTesterMixin, unittest.TestCase):
-    all_model_classes = (TFBlipTextModel,) if is_tf_available() else ()
-    test_onnx = False
-    test_pruning = False
-    test_head_masking = False
-
-    def setUp(self):
-        self.model_tester = BlipTextModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=BlipTextConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_training(self):
-        pass
-
-    def test_training_gradient_checkpointing(self):
-        pass
-
-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant(self):
-        pass
-
-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant_false(self):
-        pass
-
-    @unittest.skip(reason="Blip does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "Salesforce/blip-vqa-base"
-        model = TFBlipTextModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
diff --git a/tests/models/bloom/test_modeling_flax_bloom.py b/tests/models/bloom/test_modeling_flax_bloom.py
deleted file mode 100644
index 77b30fe19e..0000000000
--- a/tests/models/bloom/test_modeling_flax_bloom.py
+++ /dev/null
@@ -1,248 +0,0 @@
-# Copyright 2023 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-
-import numpy as np
-
-from transformers import BloomConfig, BloomTokenizerFast, is_flax_available
-from transformers.testing_utils import require_flax, slow
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor
-
-
-if is_flax_available():
-    import os
-
-    # The slow tests are often failing with OOM error on GPU
-    # This makes JAX allocate exactly what is needed on demand, and deallocate memory that is no longer needed
-    # but will be slower as stated here https://jax.readthedocs.io/en/latest/gpu_memory_allocation.html
-    os.environ["XLA_PYTHON_CLIENT_ALLOCATOR"] = "platform"
-
-    import jax.numpy as jnp
-
-    from transformers import FlaxBloomForCausalLM, FlaxBloomModel
-
-
-def prepare_bloom_inputs_dict(config, input_ids, attention_mask=None):
-    if attention_mask is None:
-        attention_mask = np.where(input_ids != config.pad_token_id, 1, 0)
-    return {"input_ids": input_ids, "attention_mask": attention_mask}
-
-
-@require_flax
-class FlaxBloomModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_labels=False,
-        vocab_size=99,
-        hidden_size=16,
-        n_layer=2,
-        n_head=4,
-        hidden_act="gelu",
-        hidden_dropout=0.1,
-        attention_probs_dropout_prob=0.1,
-        eos_token_id=2,
-        pad_token_id=1,
-        bos_token_id=0,
-        initializer_range=0.02,
-        apply_residual_connection_post_layernorm=False,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = n_layer
-        self.num_attention_heads = n_head
-        self.hidden_act = hidden_act
-        self.hidden_dropout = hidden_dropout
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.initializer_range = initializer_range
-        self.is_encoder_decoder = False
-        self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
-
-    def prepare_config_and_inputs(self):
-        input_ids = np.clip(ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size), 3, self.vocab_size)
-        input_ids = np.concatenate((input_ids, 2 * np.ones((self.batch_size, 1), dtype=np.int64)), -1)
-
-        config = BloomConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            n_layer=self.num_hidden_layers,
-            n_head=self.num_attention_heads,
-            hidden_dropout=self.hidden_dropout,
-            attention_dropout=self.attention_probs_dropout_prob,
-            eos_token_id=self.eos_token_id,
-            bos_token_id=self.bos_token_id,
-            pad_token_id=self.pad_token_id,
-            is_encoder_decoder=False,
-            use_cache=False,
-        )
-        inputs_dict = prepare_bloom_inputs_dict(config, input_ids)
-        return config, inputs_dict
-
-    def prepare_config_and_inputs_for_common(self):
-        config, inputs_dict = self.prepare_config_and_inputs()
-        return config, inputs_dict
-
-    def check_use_cache_forward(self, model_class_name, config, inputs_dict):
-        max_length = 20
-        model = model_class_name(config)
-
-        input_ids = inputs_dict["input_ids"]
-        attention_mask = jnp.ones((input_ids.shape[0], max_length), dtype="i4")
-
-        past_key_values = model.init_cache(input_ids.shape[0], max_length)
-
-        outputs_cache = model(
-            input_ids[:, :-1],
-            attention_mask=attention_mask,
-            past_key_values=past_key_values,
-        )
-
-        outputs_cache_next = model(
-            input_ids[:, -1:],
-            attention_mask=attention_mask,
-            past_key_values=outputs_cache.past_key_values,
-        )
-
-        outputs = model(input_ids)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-    def check_use_cache_forward_with_attn_mask(self, model_class_name, config, inputs_dict):
-        max_length = 20
-        model = model_class_name(config)
-
-        input_ids, attention_mask = (
-            inputs_dict["input_ids"],
-            inputs_dict["attention_mask"],
-        )
-
-        attention_mask_cache = jnp.concatenate(
-            [
-                attention_mask,
-                jnp.zeros((attention_mask.shape[0], max_length - attention_mask.shape[1])),
-            ],
-            axis=-1,
-        )
-
-        past_key_values = model.init_cache(input_ids.shape[0], max_length)
-
-        outputs_cache = model(
-            input_ids[:, :-1],
-            attention_mask=attention_mask_cache,
-            past_key_values=past_key_values,
-        )
-        outputs_cache_next = model(
-            input_ids[:, -1:],
-            past_key_values=outputs_cache.past_key_values,
-            attention_mask=attention_mask_cache,
-        )
-
-        outputs = model(input_ids, attention_mask=attention_mask)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-
-@require_flax
-class FlaxBloomModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (FlaxBloomModel, FlaxBloomForCausalLM) if is_flax_available() else ()
-
-    def setUp(self):
-        self.model_tester = FlaxBloomModelTester(self)
-
-    def test_use_cache_forward(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
-        for model_class in self.all_model_classes:
-            self.model_tester.check_use_cache_forward(model_class, config, inputs_dict)
-
-    def test_use_cache_forward_with_attn_mask(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
-        for model_class in self.all_model_classes:
-            self.model_tester.check_use_cache_forward_with_attn_mask(model_class, config, inputs_dict)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("bigscience/bloom-560m")
-            input_ids = np.ones((1, 1)) * model.config.eos_token_id
-            outputs = model(input_ids)
-            self.assertIsNotNone(outputs)
-
-
-@slow
-@require_flax
-class FlaxBloomGenerationTest(unittest.TestCase):
-    all_model_classes = (FlaxBloomForCausalLM,) if is_flax_available() else ()
-
-    def setUp(self):
-        self.model_id = "bigscience/bloom-560m"
-        self.tokenizer = BloomTokenizerFast.from_pretrained(self.model_id, padding_side="left")
-        self.model_tester = FlaxBloomModelTester(self)
-        self.model = FlaxBloomForCausalLM.from_pretrained(self.model_id, from_pt=True, revision="gs555750")
-
-    def test_model_batched_gen(self):
-        # tests if the model outputs the same generation for the same batched input
-        input_sentences = [
-            "Hello there is this string is definitely longer I believe that",
-            "Hello there is this string is definitely longer I believe that",
-        ]
-        inputs = self.tokenizer(input_sentences, return_tensors="np", padding=True, truncation=True)
-        sequences_fx = self.model.generate(**inputs, max_length=20).sequences
-        self.assertEqual(sequences_fx[0].tolist(), sequences_fx[1].tolist())
-
-    def test_model_batched_padding_left(self):
-        # tests if the model outputs the same generation for an input that is part of a batch
-        # and a single input
-        input_sentences_batch = [
-            "Hello there is this string is definitely longer I believe that",
-            "Hi I want to order",
-        ]
-        inputs = self.tokenizer(input_sentences_batch, return_tensors="np", padding=True, truncation=True)
-        sequences_fx_batch = self.model.generate(**inputs, max_length=20).sequences
-
-        input_sentence_simple = "Hi I want to order"
-        inputs_simple = self.tokenizer(input_sentence_simple, return_tensors="np")
-        sequences_fx_simple = self.model.generate(**inputs_simple, max_length=20).sequences
-
-        self.assertEqual(sequences_fx_batch[1][6:].tolist(), sequences_fx_simple[0][:-6].tolist())
-
-    def test_batch_generated_text(self):
-        input_sentences = [
-            "Hello what is",
-            "Running a quick test with the",
-        ]
-        inputs = self.tokenizer(input_sentences, return_tensors="np", padding=True, truncation=True)
-        generated_ids = self.model.generate(**inputs, max_length=20).sequences
-        generated_text = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
-
-        # these generations match those of the PyTorch model, ensuring correctness
-        EXPECTED_GENERATIONS = [
-            "Hello what is the best way to get the data from the server? I have tried",
-            "Running a quick test with the following command:\nsudo apt-get install python3\nsudo apt-get install python2",
-        ]
-
-        self.assertListEqual(generated_text, EXPECTED_GENERATIONS)
diff --git a/tests/models/camembert/test_modeling_tf_camembert.py b/tests/models/camembert/test_modeling_tf_camembert.py
deleted file mode 100644
index f9f8ba61d0..0000000000
--- a/tests/models/camembert/test_modeling_tf_camembert.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import is_tf_available
-from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
-
-
-if is_tf_available():
-    import numpy as np
-    import tensorflow as tf
-
-    from transformers import TFCamembertModel
-
-
-@require_tf
-@require_sentencepiece
-@require_tokenizers
-class TFCamembertModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_output_embeds_base_model(self):
-        model = TFCamembertModel.from_pretrained("jplu/tf-camembert-base")
-
-        input_ids = tf.convert_to_tensor(
-            [[5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]],
-            dtype=tf.int32,
-        )  # J'aime le camembert !"
-
-        output = model(input_ids)["last_hidden_state"]
-        expected_shape = tf.TensorShape((1, 10, 768))
-        self.assertEqual(output.shape, expected_shape)
-        # compare the actual values for a slice.
-        expected_slice = tf.convert_to_tensor(
-            [[[-0.0254, 0.0235, 0.1027], [0.0606, -0.1811, -0.0418], [-0.1561, -0.1127, 0.2687]]],
-            dtype=tf.float32,
-        )
-        # camembert = torch.hub.load('pytorch/fairseq', 'camembert.v0')
-        # camembert.eval()
-        # expected_slice = roberta.model.forward(input_ids)[0][:, :3, :3].detach()
-
-        self.assertTrue(np.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4))
diff --git a/tests/models/clip/test_modeling_flax_clip.py b/tests/models/clip/test_modeling_flax_clip.py
deleted file mode 100644
index d499f4bf7d..0000000000
--- a/tests/models/clip/test_modeling_flax_clip.py
+++ /dev/null
@@ -1,468 +0,0 @@
-import inspect
-import tempfile
-import unittest
-
-import numpy as np
-
-from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig, is_flax_available
-from transformers.testing_utils import require_flax, slow
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-
-
-if is_flax_available():
-    import jax
-
-    from transformers.models.clip.modeling_flax_clip import (
-        FlaxCLIPModel,
-        FlaxCLIPTextModel,
-        FlaxCLIPTextModelWithProjection,
-        FlaxCLIPVisionModel,
-    )
-
-
-class FlaxCLIPVisionModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=12,
-        image_size=30,
-        patch_size=2,
-        num_channels=3,
-        is_training=True,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        dropout=0.1,
-        attention_dropout=0.1,
-        initializer_range=0.02,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.is_training = is_training
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.initializer_range = initializer_range
-        self.scope = scope
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-        config = CLIPVisionConfig(
-            image_size=self.image_size,
-            patch_size=self.patch_size,
-            num_channels=self.num_channels,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            dropout=self.dropout,
-            attention_dropout=self.attention_dropout,
-            initializer_range=self.initializer_range,
-        )
-
-        return config, pixel_values
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_flax
-class FlaxCLIPVisionModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    """
-    Here we also overwrite some of the tests of test_modeling_common.py, as CLIP does not use input_ids, inputs_embeds,
-    attention_mask and seq_length.
-    """
-
-    all_model_classes = (FlaxCLIPVisionModel,) if is_flax_available() else ()
-
-    def setUp(self):
-        self.model_tester = FlaxCLIPVisionModelTester(self)
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.__call__)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_jit_compilation(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                model = model_class(config)
-
-                @jax.jit
-                def model_jitted(pixel_values, **kwargs):
-                    return model(pixel_values=pixel_values, **kwargs).to_tuple()
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = model_jitted(**prepared_inputs_dict)
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = model_jitted(**prepared_inputs_dict)
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            hidden_states = outputs.hidden_states
-
-            self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
-
-            # CLIP has a different seq_length
-            image_size = (self.model_tester.image_size, self.model_tester.image_size)
-            patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
-            num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
-            seq_length = num_patches + 1
-
-            self.assertListEqual(
-                list(hidden_states[0].shape[-2:]),
-                [seq_length, self.model_tester.hidden_size],
-            )
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        # in CLIP, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token)
-        image_size = (self.model_tester.image_size, self.model_tester.image_size)
-        patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
-        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
-        seq_length = num_patches + 1
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = False
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-            # check that output_attentions also work using config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-            self.assertListEqual(
-                list(attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, seq_length, seq_length],
-            )
-            out_len = len(outputs)
-
-            # Check attention is always last and order is fine
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            added_hidden_states = 1
-            self.assertEqual(out_len + added_hidden_states, len(outputs))
-
-            self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
-
-            self.assertListEqual(
-                list(self_attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, seq_length, seq_length],
-            )
-
-    # FlaxCLIPVisionModel does not have any base model
-    def test_save_load_from_base(self):
-        pass
-
-    # FlaxCLIPVisionModel does not have any base model
-    def test_save_load_to_base(self):
-        pass
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("openai/clip-vit-base-patch32", from_pt=True)
-            outputs = model(np.ones((1, 3, 224, 224)))
-            self.assertIsNotNone(outputs)
-
-
-class FlaxCLIPTextModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=12,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        dropout=0.1,
-        attention_dropout=0.1,
-        max_position_embeddings=512,
-        initializer_range=0.02,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.max_position_embeddings = max_position_embeddings
-        self.initializer_range = initializer_range
-        self.scope = scope
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        if input_mask is not None:
-            batch_size, seq_length = input_mask.shape
-            rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,))
-            for batch_idx, start_index in enumerate(rnd_start_indices):
-                input_mask[batch_idx, :start_index] = 1
-                input_mask[batch_idx, start_index:] = 0
-
-        config = CLIPTextConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            dropout=self.dropout,
-            attention_dropout=self.attention_dropout,
-            max_position_embeddings=self.max_position_embeddings,
-            initializer_range=self.initializer_range,
-        )
-
-        return config, input_ids, input_mask
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, input_mask = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_flax
-class FlaxCLIPTextModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (FlaxCLIPTextModel, FlaxCLIPTextModelWithProjection) if is_flax_available() else ()
-
-    def setUp(self):
-        self.model_tester = FlaxCLIPTextModelTester(self)
-
-    # FlaxCLIPTextModel does not have any base model
-    def test_save_load_from_base(self):
-        pass
-
-    # FlaxCLIPVisionModel does not have any base model
-    def test_save_load_to_base(self):
-        pass
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("openai/clip-vit-base-patch32", from_pt=True)
-            outputs = model(np.ones((1, 1)))
-            self.assertIsNotNone(outputs)
-
-
-class FlaxCLIPModelTester:
-    def __init__(self, parent, is_training=True):
-        self.parent = parent
-        self.text_model_tester = FlaxCLIPTextModelTester(parent)
-        self.vision_model_tester = FlaxCLIPVisionModelTester(parent)
-        self.is_training = is_training
-
-    def prepare_config_and_inputs(self):
-        text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
-        vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
-
-        config = CLIPConfig.from_text_vision_configs(text_config, vision_config, projection_dim=64)
-
-        return config, input_ids, attention_mask, pixel_values
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, attention_mask, pixel_values = config_and_inputs
-        inputs_dict = {
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-            "pixel_values": pixel_values,
-        }
-        return config, inputs_dict
-
-
-@require_flax
-class FlaxCLIPModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (FlaxCLIPModel,) if is_flax_available() else ()
-    test_attention_outputs = False
-
-    def setUp(self):
-        self.model_tester = FlaxCLIPModelTester(self)
-
-    # hidden_states are tested in individual model tests
-    def test_hidden_states_output(self):
-        pass
-
-    def test_jit_compilation(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                model = model_class(config)
-
-                @jax.jit
-                def model_jitted(input_ids, pixel_values, **kwargs):
-                    return model(input_ids=input_ids, pixel_values=pixel_values, **kwargs).to_tuple()
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = model_jitted(**prepared_inputs_dict)
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = model_jitted(**prepared_inputs_dict)
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs[:4], outputs[:4]):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.__call__)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["input_ids", "pixel_values", "attention_mask", "position_ids"]
-            self.assertListEqual(arg_names[:4], expected_arg_names)
-
-    def test_get_image_features(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        model = FlaxCLIPModel(config)
-
-        @jax.jit
-        def model_jitted(pixel_values):
-            return model.get_image_features(pixel_values=pixel_values)
-
-        with self.subTest("JIT Enabled"):
-            jitted_output = model_jitted(inputs_dict["pixel_values"])
-
-        with self.subTest("JIT Disabled"):
-            with jax.disable_jit():
-                output = model_jitted(inputs_dict["pixel_values"])
-
-        self.assertEqual(jitted_output.shape, output.shape)
-        self.assertTrue(np.allclose(jitted_output, output, atol=1e-3))
-
-    def test_get_text_features(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        model = FlaxCLIPModel(config)
-
-        @jax.jit
-        def model_jitted(input_ids, attention_mask, **kwargs):
-            return model.get_text_features(input_ids=input_ids, attention_mask=attention_mask)
-
-        with self.subTest("JIT Enabled"):
-            jitted_output = model_jitted(**inputs_dict)
-
-        with self.subTest("JIT Disabled"):
-            with jax.disable_jit():
-                output = model_jitted(**inputs_dict)
-
-        self.assertEqual(jitted_output.shape, output.shape)
-        self.assertTrue(np.allclose(jitted_output, output, atol=1e-3))
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("openai/clip-vit-base-patch32", from_pt=True)
-            outputs = model(input_ids=np.ones((1, 1)), pixel_values=np.ones((1, 3, 224, 224)))
-            self.assertIsNotNone(outputs)
-
-    # overwrite from common since FlaxCLIPModel returns nested output
-    # which is not supported in the common test
-    def test_from_pretrained_save_pretrained(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            if model_class.__name__ != "FlaxBertModel":
-                continue
-
-            with self.subTest(model_class.__name__):
-                model = model_class(config)
-
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                outputs = model(**prepared_inputs_dict).to_tuple()
-
-                # verify that normal save_pretrained works as expected
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    model.save_pretrained(tmpdirname)
-                    model_loaded = model_class.from_pretrained(tmpdirname)
-
-                outputs_loaded = model_loaded(**prepared_inputs_dict).to_tuple()[:4]
-                for output_loaded, output in zip(outputs_loaded, outputs):
-                    self.assert_almost_equals(output_loaded, output, 1e-3)
-
-                # verify that save_pretrained for distributed training
-                # with `params=params` works as expected
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    model.save_pretrained(tmpdirname, params=model.params)
-                    model_loaded = model_class.from_pretrained(tmpdirname)
-
-                outputs_loaded = model_loaded(**prepared_inputs_dict).to_tuple()[:4]
-                for output_loaded, output in zip(outputs_loaded, outputs):
-                    self.assert_almost_equals(output_loaded, output, 1e-3)
diff --git a/tests/models/clip/test_modeling_tf_clip.py b/tests/models/clip/test_modeling_tf_clip.py
deleted file mode 100644
index 27db72e397..0000000000
--- a/tests/models/clip/test_modeling_tf_clip.py
+++ /dev/null
@@ -1,662 +0,0 @@
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TensorFlow CLIP model."""
-
-from __future__ import annotations
-
-import inspect
-import os
-import tempfile
-import unittest
-from importlib import import_module
-
-import requests
-
-from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
-from transformers.testing_utils import require_tf, require_vision, slow
-from transformers.utils import is_tf_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import TFCLIPModel, TFCLIPTextModel, TFCLIPVisionModel, TFSharedEmbeddings
-    from transformers.modeling_tf_utils import keras
-
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import CLIPProcessor
-
-
-class TFCLIPVisionModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=12,
-        image_size=30,
-        patch_size=2,
-        num_channels=3,
-        is_training=True,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        dropout=0.1,
-        attention_dropout=0.1,
-        initializer_range=0.02,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.is_training = is_training
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.initializer_range = initializer_range
-        self.scope = scope
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-        config = self.get_config()
-
-        return config, pixel_values
-
-    def get_config(self):
-        return CLIPVisionConfig(
-            image_size=self.image_size,
-            patch_size=self.patch_size,
-            num_channels=self.num_channels,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            dropout=self.dropout,
-            attention_dropout=self.attention_dropout,
-            initializer_range=self.initializer_range,
-        )
-
-    def create_and_check_model(self, config, pixel_values):
-        model = TFCLIPVisionModel(config=config)
-        result = model(pixel_values, training=False)
-        # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
-        image_size = (self.image_size, self.image_size)
-        patch_size = (self.patch_size, self.patch_size)
-        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
-        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_tf
-class TFCLIPVisionModelTest(TFModelTesterMixin, unittest.TestCase):
-    """
-    Here we also overwrite some of the tests of test_modeling_common.py, as CLIP does not use input_ids, inputs_embeds,
-    attention_mask and seq_length.
-    """
-
-    all_model_classes = (TFCLIPVisionModel,) if is_tf_available() else ()
-
-    test_pruning = False
-    test_resize_embeddings = False
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFCLIPVisionModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=CLIPVisionConfig, has_text_modality=False, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_inputs_embeds(self):
-        # CLIP does not use inputs_embeds
-        pass
-
-    def test_graph_mode_with_inputs_embeds(self):
-        # CLIP does not use inputs_embeds
-        pass
-
-    def test_model_common_attributes(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer))
-            x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, keras.layers.Layer))
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        # in CLIP, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token)
-        image_size = (self.model_tester.image_size, self.model_tester.image_size)
-        patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
-        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
-        seq_len = num_patches + 1
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = False
-            config.return_dict = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
-            attentions = outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-            # check that output_attentions also work using config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
-            attentions = outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-            out_len = len(outputs)
-
-            # Check attention is always last and order is fine
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
-
-            added_hidden_states = 1
-            self.assertEqual(out_len + added_hidden_states, len(outputs))
-
-            self_attentions = outputs.attentions
-
-            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
-
-            self.assertListEqual(
-                list(self_attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, seq_len, seq_len],
-            )
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
-
-            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
-
-            expected_num_layers = getattr(
-                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-            )
-            self.assertEqual(len(hidden_states), expected_num_layers)
-
-            # CLIP has a different seq_length
-            image_size = (self.model_tester.image_size, self.model_tester.image_size)
-            patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
-            num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
-            seq_length = num_patches + 1
-
-            self.assertListEqual(
-                list(hidden_states[0].shape[-2:]),
-                [seq_length, self.model_tester.hidden_size],
-            )
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "openai/clip-vit-base-patch32"
-        model = TFCLIPVisionModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    @slow
-    def test_saved_model_creation_extended(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.output_hidden_states = True
-        config.output_attentions = True
-
-        if hasattr(config, "use_cache"):
-            config.use_cache = True
-
-        # in CLIP, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token)
-        image_size = (self.model_tester.image_size, self.model_tester.image_size)
-        patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
-        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
-        seq_len = num_patches + 1
-
-        for model_class in self.all_model_classes:
-            class_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-            model = model_class(config)
-            num_out = len(model(class_inputs_dict))
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname, saved_model=True)
-                saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
-                model = keras.models.load_model(saved_model_dir)
-                outputs = model(class_inputs_dict)
-                output_hidden_states = outputs["hidden_states"]
-                output_attentions = outputs["attentions"]
-
-                # Check num outputs
-                self.assertEqual(len(outputs), num_out)
-
-                # Check num layers
-                expected_num_layers = getattr(
-                    self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-                )
-
-                self.assertEqual(len(output_hidden_states), expected_num_layers)
-                self.assertEqual(len(output_attentions), self.model_tester.num_hidden_layers)
-
-                # Check attention outputs
-                image_size = (self.model_tester.image_size, self.model_tester.image_size)
-                patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
-                num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
-                seq_len = num_patches + 1
-
-                self.assertListEqual(
-                    list(output_attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads, seq_len, seq_len],
-                )
-
-                # Check hidden states
-                self.assertListEqual(
-                    list(output_hidden_states[0].shape[-2:]),
-                    [seq_len, self.model_tester.hidden_size],
-                )
-
-
-class TFCLIPTextModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=12,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        dropout=0.1,
-        attention_dropout=0.1,
-        max_position_embeddings=512,
-        initializer_range=0.02,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.max_position_embeddings = max_position_embeddings
-        self.initializer_range = initializer_range
-        self.scope = scope
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-            # make sure the first token has attention mask `1` to ensure that, after combining the causal mask, there
-            # is still at least one token being attended to for each batch.
-            # TODO: Change `random_attention_mask` in PT/TF/Flax common test file, after a discussion with the team.
-            input_mask = tf.concat(
-                [tf.ones_like(input_mask[:, :1], dtype=input_mask.dtype), input_mask[:, 1:]], axis=-1
-            )
-
-        config = self.get_config()
-
-        return config, input_ids, input_mask
-
-    def get_config(self):
-        return CLIPTextConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            dropout=self.dropout,
-            attention_dropout=self.attention_dropout,
-            max_position_embeddings=self.max_position_embeddings,
-            initializer_range=self.initializer_range,
-        )
-
-    def create_and_check_model(self, config, input_ids, input_mask):
-        model = TFCLIPTextModel(config=config)
-        result = model(input_ids, attention_mask=input_mask, training=False)
-        result = model(input_ids, training=False)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, input_mask = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class TFCLIPTextModelTest(TFModelTesterMixin, unittest.TestCase):
-    all_model_classes = (TFCLIPTextModel,) if is_tf_available() else ()
-    test_pruning = False
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFCLIPTextModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=CLIPTextConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_inputs_embeds(self):
-        # CLIP does not use inputs_embeds
-        pass
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "openai/clip-vit-base-patch32"
-        model = TFCLIPTextModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    @slow
-    def test_saved_model_creation_extended(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.output_hidden_states = True
-        config.output_attentions = True
-
-        if hasattr(config, "use_cache"):
-            config.use_cache = True
-
-        for model_class in self.all_model_classes:
-            class_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-            model = model_class(config)
-            num_out = len(model(class_inputs_dict))
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname, saved_model=True)
-                saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
-                model = keras.models.load_model(saved_model_dir)
-                outputs = model(class_inputs_dict)
-                output_hidden_states = outputs["hidden_states"]
-                output_attentions = outputs["attentions"]
-
-                # Check number of outputs
-                self.assertEqual(len(outputs), num_out)
-
-                # Check number of layers
-                expected_num_layers = getattr(
-                    self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-                )
-
-                # Check hidden states
-                self.assertEqual(len(output_hidden_states), expected_num_layers)
-                self.assertListEqual(
-                    list(output_hidden_states[0].shape[-2:]),
-                    [self.model_tester.seq_length, self.model_tester.hidden_size],
-                )
-
-                # Check attention outputs
-                self.assertEqual(len(output_attentions), self.model_tester.num_hidden_layers)
-
-                seq_length = self.model_tester.seq_length
-                key_length = getattr(self.model_tester, "key_length", seq_length)
-
-                self.assertListEqual(
-                    list(output_attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads, seq_length, key_length],
-                )
-
-
-class TFCLIPModelTester:
-    def __init__(self, parent, is_training=True):
-        self.parent = parent
-        self.text_model_tester = TFCLIPTextModelTester(parent)
-        self.vision_model_tester = TFCLIPVisionModelTester(parent)
-        self.is_training = is_training
-
-    def prepare_config_and_inputs(self):
-        text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
-        vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
-
-        config = self.get_config()
-
-        return config, input_ids, attention_mask, pixel_values
-
-    def get_config(self):
-        return CLIPConfig.from_text_vision_configs(
-            self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64
-        )
-
-    def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
-        model = TFCLIPModel(config)
-        result = model(input_ids, pixel_values, attention_mask, training=False)
-        self.parent.assertEqual(
-            result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size)
-        )
-        self.parent.assertEqual(
-            result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size)
-        )
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, attention_mask, pixel_values = config_and_inputs
-        inputs_dict = {
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-            "pixel_values": pixel_values,
-            "return_loss": True,
-        }
-        return config, inputs_dict
-
-
-@require_tf
-class TFCLIPModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (TFCLIPModel,) if is_tf_available() else ()
-    pipeline_model_mapping = {"feature-extraction": TFCLIPModel} if is_tf_available() else {}
-    test_head_masking = False
-    test_pruning = False
-    test_resize_embeddings = False
-    test_attention_outputs = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFCLIPModelTester(self)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    # hidden_states are tested in individual model tests
-    def test_hidden_states_output(self):
-        pass
-
-    # input_embeds are tested in individual model tests
-    def test_inputs_embeds(self):
-        pass
-
-    # CLIPModel does not have input/output embeddings
-    def test_model_common_attributes(self):
-        pass
-
-    # overwrite from common since `TFCLIPModelTester` set `return_loss` to `True` and causes the preparation of
-    # `symbolic_inputs` failed.
-    def test_keras_save_load(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        # remove `return_loss` to make code work
-        if self.__class__.__name__ == "TFCLIPModelTest":
-            inputs_dict.pop("return_loss", None)
-
-        tf_main_layer_classes = {
-            module_member
-            for model_class in self.all_model_classes
-            for module in (import_module(model_class.__module__),)
-            for module_member_name in dir(module)
-            if module_member_name.endswith("MainLayer")
-            # This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`.
-            and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")]
-            for module_member in (getattr(module, module_member_name),)
-            if isinstance(module_member, type)
-            and keras.layers.Layer in module_member.__bases__
-            and getattr(module_member, "_keras_serializable", False)
-        }
-        for main_layer_class in tf_main_layer_classes:
-            # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter
-            if "T5" in main_layer_class.__name__:
-                # Take the same values than in TFT5ModelTester for this shared layer
-                shared = TFSharedEmbeddings(99, 32, name="shared")
-                config.use_cache = inputs_dict.pop("use_cache", None)
-                main_layer = main_layer_class(config, embed_tokens=shared)
-            else:
-                main_layer = main_layer_class(config)
-
-            symbolic_inputs = {
-                name: keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items()
-            }
-
-            model = keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs))
-            outputs = model(inputs_dict)
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                filepath = os.path.join(tmpdirname, "keras_model.h5")
-                model.save(filepath)
-                if "T5" in main_layer_class.__name__:
-                    model = keras.models.load_model(
-                        filepath,
-                        custom_objects={
-                            main_layer_class.__name__: main_layer_class,
-                            "TFSharedEmbeddings": TFSharedEmbeddings,
-                        },
-                    )
-                else:
-                    model = keras.models.load_model(
-                        filepath, custom_objects={main_layer_class.__name__: main_layer_class}
-                    )
-                assert isinstance(model, keras.Model)
-                after_outputs = model(inputs_dict)
-                self.assert_outputs_same(after_outputs, outputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "openai/clip-vit-base-patch32"
-        model = TFCLIPModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    @unittest.skip(reason="Currently `saved_model` doesn't work with nested outputs.")
-    @slow
-    def test_saved_model_creation(self):
-        pass
-
-    @unittest.skip(reason="Currently `saved_model` doesn't work with nested outputs.")
-    @slow
-    def test_saved_model_creation_extended(self):
-        pass
-
-    @unittest.skip(reason="`saved_model` doesn't work with nested outputs so no preparation happens.")
-    @slow
-    def test_prepare_serving_output(self):
-        pass
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-    im = Image.open(requests.get(url, stream=True).raw)
-    return im
-
-
-@require_vision
-@require_tf
-class TFCLIPModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference(self):
-        model_name = "openai/clip-vit-base-patch32"
-        model = TFCLIPModel.from_pretrained(model_name)
-        processor = CLIPProcessor.from_pretrained(model_name)
-
-        image = prepare_img()
-        inputs = processor(
-            text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="tf"
-        )
-
-        outputs = model(**inputs, training=False)
-
-        # verify the logits
-        self.assertEqual(
-            outputs.logits_per_image.shape,
-            tf.TensorShape((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
-        )
-        self.assertEqual(
-            outputs.logits_per_text.shape,
-            tf.TensorShape((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
-        )
-
-        expected_logits = tf.constant([[24.5701, 19.3049]])
-
-        tf.debugging.assert_near(outputs.logits_per_image, expected_logits, atol=1e-3)
diff --git a/tests/models/convbert/test_modeling_tf_convbert.py b/tests/models/convbert/test_modeling_tf_convbert.py
deleted file mode 100644
index 7bd21778eb..0000000000
--- a/tests/models/convbert/test_modeling_tf_convbert.py
+++ /dev/null
@@ -1,424 +0,0 @@
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import annotations
-
-import os
-import tempfile
-import unittest
-
-from transformers import ConvBertConfig, is_tf_available
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import (
-        TFConvBertForMaskedLM,
-        TFConvBertForMultipleChoice,
-        TFConvBertForQuestionAnswering,
-        TFConvBertForSequenceClassification,
-        TFConvBertForTokenClassification,
-        TFConvBertModel,
-    )
-    from transformers.modeling_tf_utils import keras
-
-
-class TFConvBertModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_labels=3,
-        num_choices=4,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.is_training = True
-        self.use_input_mask = True
-        self.use_token_type_ids = True
-        self.use_labels = True
-        self.vocab_size = 99
-        self.hidden_size = 384
-        self.num_hidden_layers = 2
-        self.num_attention_heads = 4
-        self.intermediate_size = 37
-        self.hidden_act = "gelu"
-        self.hidden_dropout_prob = 0.1
-        self.attention_probs_dropout_prob = 0.1
-        self.max_position_embeddings = 512
-        self.type_vocab_size = 16
-        self.type_sequence_label_size = 2
-        self.initializer_range = 0.02
-        self.num_labels = 3
-        self.num_choices = 4
-        self.embedding_size = 128
-        self.head_ratio = 2
-        self.conv_kernel_size = 9
-        self.num_groups = 1
-        self.scope = None
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = ConvBertConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            initializer_range=self.initializer_range,
-            return_dict=True,
-        )
-
-        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def create_and_check_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFConvBertModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_for_masked_lm(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFConvBertForMaskedLM(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_for_sequence_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFConvBertForSequenceClassification(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_for_multiple_choice(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_choices = self.num_choices
-        model = TFConvBertForMultipleChoice(config=config)
-        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
-        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
-        inputs = {
-            "input_ids": multiple_choice_inputs_ids,
-            "attention_mask": multiple_choice_input_mask,
-            "token_type_ids": multiple_choice_token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-    def create_and_check_for_token_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFConvBertForTokenClassification(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def create_and_check_for_question_answering(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFConvBertForQuestionAnswering(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-
-        result = model(inputs)
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class TFConvBertModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFConvBertModel,
-            TFConvBertForMaskedLM,
-            TFConvBertForQuestionAnswering,
-            TFConvBertForSequenceClassification,
-            TFConvBertForTokenClassification,
-            TFConvBertForMultipleChoice,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFConvBertModel,
-            "fill-mask": TFConvBertForMaskedLM,
-            "question-answering": TFConvBertForQuestionAnswering,
-            "text-classification": TFConvBertForSequenceClassification,
-            "token-classification": TFConvBertForTokenClassification,
-            "zero-shot": TFConvBertForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_pruning = False
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFConvBertModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=ConvBertConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
-    @slow
-    def test_saved_model_creation_extended(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.output_hidden_states = True
-        config.output_attentions = True
-
-        if hasattr(config, "use_cache"):
-            config.use_cache = True
-
-        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length)
-        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
-
-        for model_class in self.all_model_classes:
-            class_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-            model = model_class(config)
-            num_out = len(model(class_inputs_dict))
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname, saved_model=True)
-                saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
-                model = keras.models.load_model(saved_model_dir)
-                outputs = model(class_inputs_dict)
-
-                if self.is_encoder_decoder:
-                    output_hidden_states = outputs["encoder_hidden_states"]
-                    output_attentions = outputs["encoder_attentions"]
-                else:
-                    output_hidden_states = outputs["hidden_states"]
-                    output_attentions = outputs["attentions"]
-
-                self.assertEqual(len(outputs), num_out)
-
-                expected_num_layers = getattr(
-                    self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-                )
-
-                self.assertEqual(len(output_hidden_states), expected_num_layers)
-                self.assertListEqual(
-                    list(output_hidden_states[0].shape[-2:]),
-                    [self.model_tester.seq_length, self.model_tester.hidden_size],
-                )
-
-                self.assertEqual(len(output_attentions), self.model_tester.num_hidden_layers)
-                self.assertListEqual(
-                    list(output_attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads / 2, encoder_seq_length, encoder_key_length],
-                )
-
-    @slow
-    def test_model_from_pretrained(self):
-        model = TFConvBertModel.from_pretrained("YituTech/conv-bert-base")
-        self.assertIsNotNone(model)
-
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", self.model_tester.seq_length)
-        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length)
-        decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length)
-        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
-
-        def check_decoder_attentions_output(outputs):
-            out_len = len(outputs)
-            self.assertEqual(out_len % 2, 0)
-            decoder_attentions = outputs.decoder_attentions
-            self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(decoder_attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads / 2, decoder_seq_length, decoder_key_length],
-            )
-
-        def check_encoder_attentions_output(outputs):
-            attentions = [
-                t.numpy() for t in (outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions)
-            ]
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads / 2, encoder_seq_length, encoder_key_length],
-            )
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            config.output_hidden_states = False
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-            out_len = len(outputs)
-            self.assertEqual(config.output_hidden_states, False)
-            check_encoder_attentions_output(outputs)
-
-            if self.is_encoder_decoder:
-                model = model_class(config)
-                outputs = model(self._prepare_for_class(inputs_dict, model_class))
-                self.assertEqual(config.output_hidden_states, False)
-                check_decoder_attentions_output(outputs)
-
-            # Check that output attentions can also be changed via the config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-            self.assertEqual(config.output_hidden_states, False)
-            check_encoder_attentions_output(outputs)
-
-            # Check attention is always last and order is fine
-            inputs_dict["output_attentions"] = True
-            config.output_hidden_states = True
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-
-            self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
-            self.assertEqual(model.config.output_hidden_states, True)
-            check_encoder_attentions_output(outputs)
-
-
-@require_tf
-class TFConvBertModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_masked_lm(self):
-        model = TFConvBertModel.from_pretrained("YituTech/conv-bert-base")
-        input_ids = tf.constant([[0, 1, 2, 3, 4, 5]])
-        output = model(input_ids)[0]
-
-        expected_shape = [1, 6, 768]
-        self.assertEqual(output.shape, expected_shape)
-
-        expected_slice = tf.constant(
-            [
-                [
-                    [-0.03475493, -0.4686034, -0.30638832],
-                    [0.22637248, -0.26988646, -0.7423424],
-                    [0.10324868, -0.45013508, -0.58280784],
-                ]
-            ]
-        )
-        tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4)
diff --git a/tests/models/convnext/test_modeling_tf_convnext.py b/tests/models/convnext/test_modeling_tf_convnext.py
deleted file mode 100644
index 1e46e57fb2..0000000000
--- a/tests/models/convnext/test_modeling_tf_convnext.py
+++ /dev/null
@@ -1,300 +0,0 @@
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TensorFlow ConvNext model."""
-
-from __future__ import annotations
-
-import inspect
-import unittest
-
-from transformers import ConvNextConfig
-from transformers.testing_utils import require_tf, require_vision, slow
-from transformers.utils import cached_property, is_tf_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import TFConvNextForImageClassification, TFConvNextModel
-
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import ConvNextImageProcessor
-
-
-class TFConvNextModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        image_size=32,
-        num_channels=3,
-        num_stages=4,
-        hidden_sizes=[10, 20, 30, 40],
-        depths=[2, 2, 3, 2],
-        is_training=True,
-        use_labels=True,
-        intermediate_size=37,
-        hidden_act="gelu",
-        type_sequence_label_size=10,
-        initializer_range=0.02,
-        num_labels=3,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.num_channels = num_channels
-        self.num_stages = num_stages
-        self.hidden_sizes = hidden_sizes
-        self.depths = depths
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.scope = scope
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        labels = None
-        if self.use_labels:
-            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-
-        config = self.get_config()
-
-        return config, pixel_values, labels
-
-    def get_config(self):
-        return ConvNextConfig(
-            num_channels=self.num_channels,
-            hidden_sizes=self.hidden_sizes,
-            depths=self.depths,
-            num_stages=self.num_stages,
-            hidden_act=self.hidden_act,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-        )
-
-    def create_and_check_model(self, config, pixel_values, labels):
-        model = TFConvNextModel(config=config)
-        result = model(pixel_values, training=False)
-        # expected last hidden states: B, C, H // 32, W // 32
-        self.parent.assertEqual(
-            result.last_hidden_state.shape,
-            (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32),
-        )
-
-    def create_and_check_for_image_classification(self, config, pixel_values, labels):
-        config.num_labels = self.type_sequence_label_size
-        model = TFConvNextForImageClassification(config)
-        result = model(pixel_values, labels=labels, training=False)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values, labels = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_tf
-class TFConvNextModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    """
-    Here we also overwrite some of the tests of test_modeling_common.py, as ConvNext does not use input_ids, inputs_embeds,
-    attention_mask and seq_length.
-    """
-
-    all_model_classes = (TFConvNextModel, TFConvNextForImageClassification) if is_tf_available() else ()
-    pipeline_model_mapping = (
-        {"feature-extraction": TFConvNextModel, "image-classification": TFConvNextForImageClassification}
-        if is_tf_available()
-        else {}
-    )
-
-    test_pruning = False
-    test_onnx = False
-    test_resize_embeddings = False
-    test_head_masking = False
-    has_attentions = False
-
-    def setUp(self):
-        self.model_tester = TFConvNextModelTester(self)
-        self.config_tester = ConfigTester(
-            self,
-            config_class=ConvNextConfig,
-            has_text_modality=False,
-            hidden_size=37,
-        )
-
-    @unittest.skip(reason="ConvNext does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skipIf(
-        not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0,
-        reason="TF does not support backprop for grouped convolutions on CPU.",
-    )
-    @slow
-    def test_keras_fit(self):
-        super().test_keras_fit()
-
-    @unittest.skip(reason="ConvNext does not support input and output embeddings")
-    def test_model_common_attributes(self):
-        pass
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    @unittest.skipIf(
-        not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0,
-        reason="TF does not support backprop for grouped convolutions on CPU.",
-    )
-    def test_dataset_conversion(self):
-        super().test_dataset_conversion()
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
-
-            expected_num_stages = self.model_tester.num_stages
-            self.assertEqual(len(hidden_states), expected_num_stages + 1)
-
-            # ConvNext's feature maps are of shape (batch_size, num_channels, height, width)
-            self.assertListEqual(
-                list(hidden_states[0].shape[-2:]),
-                [self.model_tester.image_size // 4, self.model_tester.image_size // 4],
-            )
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-    # Since ConvNext does not have any attention we need to rewrite this test.
-    def test_model_outputs_equivalence(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
-            tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs)
-            dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple()
-
-            def recursive_check(tuple_object, dict_object):
-                if isinstance(tuple_object, (list, tuple)):
-                    for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
-                        recursive_check(tuple_iterable_value, dict_iterable_value)
-                elif tuple_object is None:
-                    return
-                else:
-                    self.assertTrue(
-                        all(tf.equal(tuple_object, dict_object)),
-                        msg=(
-                            "Tuple and dict output are not equal. Difference:"
-                            f" {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}"
-                        ),
-                    )
-
-                recursive_check(tuple_output, dict_output)
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
-            check_equivalence(model, tuple_inputs, dict_inputs)
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            check_equivalence(model, tuple_inputs, dict_inputs)
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
-            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
-
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model = TFConvNextModel.from_pretrained("facebook/convnext-tiny-224")
-        self.assertIsNotNone(model)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_tf
-@require_vision
-class TFConvNextModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return ConvNextImageProcessor.from_pretrained("facebook/convnext-tiny-224") if is_vision_available() else None
-
-    @slow
-    def test_inference_image_classification_head(self):
-        model = TFConvNextForImageClassification.from_pretrained("facebook/convnext-tiny-224")
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="tf")
-
-        # forward pass
-        outputs = model(**inputs)
-
-        # verify the logits
-        expected_shape = tf.TensorShape((1, 1000))
-        self.assertEqual(outputs.logits.shape, expected_shape)
-
-        expected_slice = tf.constant([-0.0260, -0.4739, 0.1911])
-
-        tf.debugging.assert_near(outputs.logits[0, :3], expected_slice, atol=1e-4)
diff --git a/tests/models/convnextv2/test_modeling_tf_convnextv2.py b/tests/models/convnextv2/test_modeling_tf_convnextv2.py
deleted file mode 100644
index 08e458609c..0000000000
--- a/tests/models/convnextv2/test_modeling_tf_convnextv2.py
+++ /dev/null
@@ -1,306 +0,0 @@
-# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TensorFlow ConvNext model."""
-
-from __future__ import annotations
-
-import inspect
-import unittest
-
-import numpy as np
-
-from transformers import ConvNextV2Config
-from transformers.testing_utils import require_tf, require_vision, slow
-from transformers.utils import cached_property, is_tf_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import TFConvNextV2ForImageClassification, TFConvNextV2Model
-
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import ConvNextImageProcessor
-
-
-class TFConvNextV2ModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        image_size=32,
-        num_channels=3,
-        num_stages=4,
-        hidden_sizes=[10, 20, 30, 40],
-        depths=[2, 2, 3, 2],
-        is_training=True,
-        use_labels=True,
-        intermediate_size=37,
-        hidden_act="gelu",
-        type_sequence_label_size=10,
-        initializer_range=0.02,
-        num_labels=3,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.num_channels = num_channels
-        self.num_stages = num_stages
-        self.hidden_sizes = hidden_sizes
-        self.depths = depths
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.scope = scope
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        labels = None
-        if self.use_labels:
-            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-
-        config = self.get_config()
-
-        return config, pixel_values, labels
-
-    def get_config(self):
-        return ConvNextV2Config(
-            num_channels=self.num_channels,
-            hidden_sizes=self.hidden_sizes,
-            depths=self.depths,
-            num_stages=self.num_stages,
-            hidden_act=self.hidden_act,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-        )
-
-    def create_and_check_model(self, config, pixel_values, labels):
-        model = TFConvNextV2Model(config=config)
-        result = model(pixel_values, training=False)
-        # expected last hidden states: batch_size, channels, height // 32, width // 32
-        self.parent.assertEqual(
-            result.last_hidden_state.shape,
-            (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32),
-        )
-
-    def create_and_check_for_image_classification(self, config, pixel_values, labels):
-        config.num_labels = self.type_sequence_label_size
-        model = TFConvNextV2ForImageClassification(config)
-        result = model(pixel_values, labels=labels, training=False)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values, labels = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_tf
-class TFConvNextV2ModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    """
-    Here we also overwrite some of the tests of test_modeling_common.py, as ConvNext does not use input_ids, inputs_embeds,
-    attention_mask and seq_length.
-    """
-
-    all_model_classes = (TFConvNextV2Model, TFConvNextV2ForImageClassification) if is_tf_available() else ()
-    pipeline_model_mapping = (
-        {"feature-extraction": TFConvNextV2Model, "image-classification": TFConvNextV2ForImageClassification}
-        if is_tf_available()
-        else {}
-    )
-
-    test_pruning = False
-    test_onnx = False
-    test_resize_embeddings = False
-    test_head_masking = False
-    has_attentions = False
-
-    def setUp(self):
-        self.model_tester = TFConvNextV2ModelTester(self)
-        self.config_tester = ConfigTester(
-            self,
-            config_class=ConvNextV2Config,
-            has_text_modality=False,
-            hidden_size=37,
-        )
-
-    @unittest.skip(reason="ConvNext does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skipIf(
-        not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0,
-        reason="TF does not support backprop for grouped convolutions on CPU.",
-    )
-    @slow
-    def test_keras_fit(self):
-        super().test_keras_fit()
-
-    @unittest.skip(reason="ConvNext does not support input and output embeddings")
-    def test_model_common_attributes(self):
-        pass
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    @unittest.skipIf(
-        not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0,
-        reason="TF does not support backprop for grouped convolutions on CPU.",
-    )
-    def test_dataset_conversion(self):
-        super().test_dataset_conversion()
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
-
-            expected_num_stages = self.model_tester.num_stages
-            self.assertEqual(len(hidden_states), expected_num_stages + 1)
-
-            # ConvNext's feature maps are of shape (batch_size, num_channels, height, width)
-            self.assertListEqual(
-                list(hidden_states[0].shape[-2:]),
-                [self.model_tester.image_size // 4, self.model_tester.image_size // 4],
-            )
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-    # Since ConvNext does not have any attention we need to rewrite this test.
-    def test_model_outputs_equivalence(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
-            tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs)
-            dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple()
-
-            def recursive_check(tuple_object, dict_object):
-                if isinstance(tuple_object, (list, tuple)):
-                    for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
-                        recursive_check(tuple_iterable_value, dict_iterable_value)
-                elif tuple_object is None:
-                    return
-                else:
-                    self.assertTrue(
-                        all(tf.equal(tuple_object, dict_object)),
-                        msg=(
-                            "Tuple and dict output are not equal. Difference:"
-                            f" {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}"
-                        ),
-                    )
-
-                recursive_check(tuple_output, dict_output)
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
-            check_equivalence(model, tuple_inputs, dict_inputs)
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            check_equivalence(model, tuple_inputs, dict_inputs)
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
-            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
-
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model = TFConvNextV2Model.from_pretrained("facebook/convnextv2-tiny-1k-224")
-        self.assertIsNotNone(model)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_tf
-@require_vision
-class TFConvNextV2ModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return (
-            ConvNextImageProcessor.from_pretrained("facebook/convnextv2-tiny-1k-224")
-            if is_vision_available()
-            else None
-        )
-
-    @slow
-    def test_inference_image_classification_head(self):
-        model = TFConvNextV2ForImageClassification.from_pretrained("facebook/convnextv2-tiny-1k-224")
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="tf")
-
-        # forward pass
-        outputs = model(**inputs)
-
-        # verify the logits
-        expected_shape = tf.TensorShape((1, 1000))
-        self.assertEqual(outputs.logits.shape, expected_shape)
-
-        expected_slice = np.array([0.9996, 0.1966, -0.4386])
-
-        self.assertTrue(np.allclose(outputs.logits[0, :3].numpy(), expected_slice, atol=1e-4))
diff --git a/tests/models/ctrl/test_modeling_tf_ctrl.py b/tests/models/ctrl/test_modeling_tf_ctrl.py
deleted file mode 100644
index 38623d442a..0000000000
--- a/tests/models/ctrl/test_modeling_tf_ctrl.py
+++ /dev/null
@@ -1,292 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import CTRLConfig, is_tf_available
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers.modeling_tf_utils import keras
-    from transformers.models.ctrl.modeling_tf_ctrl import (
-        TFCTRLForSequenceClassification,
-        TFCTRLLMHeadModel,
-        TFCTRLModel,
-    )
-
-
-class TFCTRLModelTester:
-    def __init__(
-        self,
-        parent,
-    ):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.is_training = True
-        self.use_token_type_ids = True
-        self.use_input_mask = True
-        self.use_labels = True
-        self.use_mc_token_ids = True
-        self.vocab_size = 99
-        self.hidden_size = 32
-        self.num_hidden_layers = 2
-        self.num_attention_heads = 4
-        self.intermediate_size = 37
-        self.hidden_act = "gelu"
-        self.hidden_dropout_prob = 0.1
-        self.attention_probs_dropout_prob = 0.1
-        self.max_position_embeddings = 512
-        self.type_vocab_size = 16
-        self.type_sequence_label_size = 2
-        self.initializer_range = 0.02
-        self.num_labels = 3
-        self.num_choices = 4
-        self.scope = None
-        self.pad_token_id = self.vocab_size - 1
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        mc_token_ids = None
-        if self.use_mc_token_ids:
-            mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = CTRLConfig(
-            vocab_size=self.vocab_size,
-            n_embd=self.hidden_size,
-            n_layer=self.num_hidden_layers,
-            n_head=self.num_attention_heads,
-            dff=self.intermediate_size,
-            # hidden_act=self.hidden_act,
-            # hidden_dropout_prob=self.hidden_dropout_prob,
-            # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            n_positions=self.max_position_embeddings,
-            # type_vocab_size=self.type_vocab_size,
-            # initializer_range=self.initializer_range,
-            pad_token_id=self.pad_token_id,
-        )
-
-        head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
-
-        return (
-            config,
-            input_ids,
-            input_mask,
-            head_mask,
-            token_type_ids,
-            mc_token_ids,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        )
-
-    def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-        model = TFCTRLModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-
-        inputs = [input_ids, None, input_mask]  # None is the input for 'past'
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_ctrl_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-        model = TFCTRLLMHeadModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_ctrl_for_sequence_classification(
-        self, config, input_ids, input_mask, head_mask, token_type_ids, *args
-    ):
-        config.num_labels = self.num_labels
-        sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-        inputs = {
-            "input_ids": input_ids,
-            "token_type_ids": token_type_ids,
-            "labels": sequence_labels,
-        }
-        model = TFCTRLForSequenceClassification(config)
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-
-        (
-            config,
-            input_ids,
-            input_mask,
-            head_mask,
-            token_type_ids,
-            mc_token_ids,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class TFCTRLModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel, TFCTRLForSequenceClassification) if is_tf_available() else ()
-    all_generative_model_classes = (TFCTRLLMHeadModel,) if is_tf_available() else ()
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFCTRLModel,
-            "text-classification": TFCTRLForSequenceClassification,
-            "text-generation": TFCTRLLMHeadModel,
-            "zero-shot": TFCTRLForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_head_masking = False
-    test_onnx = False
-
-    # TODO: Fix the failed tests
-    def is_pipeline_test_to_skip(
-        self,
-        pipeline_test_case_name,
-        config_class,
-        model_architecture,
-        tokenizer_name,
-        image_processor_name,
-        feature_extractor_name,
-        processor_name,
-    ):
-        if pipeline_test_case_name == "ZeroShotClassificationPipelineTests":
-            # Get `tokenizer does not have a padding token` error for both fast/slow tokenizers.
-            # `CTRLConfig` was never used in pipeline tests, either because of a missing checkpoint or because a tiny
-            # config could not be created.
-            return True
-
-        return False
-
-    def setUp(self):
-        self.model_tester = TFCTRLModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=CTRLConfig, n_embd=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_ctrl_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_ctrl_model(*config_and_inputs)
-
-    def test_ctrl_lm_head(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_ctrl_lm_head(*config_and_inputs)
-
-    def test_ctrl_sequence_classification_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_ctrl_for_sequence_classification(*config_and_inputs)
-
-    def test_model_common_attributes(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        list_lm_models = [TFCTRLLMHeadModel]
-        list_other_models_with_output_ebd = [TFCTRLForSequenceClassification]
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            model.build_in_name_scope()  # may be needed for the get_bias() call below
-            assert isinstance(model.get_input_embeddings(), keras.layers.Layer)
-
-            if model_class in list_lm_models:
-                x = model.get_output_embeddings()
-                assert isinstance(x, keras.layers.Layer)
-                name = model.get_bias()
-                assert isinstance(name, dict)
-                for k, v in name.items():
-                    assert isinstance(v, tf.Variable)
-            elif model_class in list_other_models_with_output_ebd:
-                x = model.get_output_embeddings()
-                assert isinstance(x, keras.layers.Layer)
-                name = model.get_bias()
-                assert name is None
-            else:
-                x = model.get_output_embeddings()
-                assert x is None
-                name = model.get_bias()
-                assert name is None
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "Salesforce/ctrl"
-        model = TFCTRLModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-@require_tf
-class TFCTRLModelLanguageGenerationTest(unittest.TestCase):
-    @slow
-    def test_lm_generate_ctrl(self):
-        model = TFCTRLLMHeadModel.from_pretrained("Salesforce/ctrl")
-        input_ids = tf.convert_to_tensor([[11859, 0, 1611, 8]], dtype=tf.int32)  # Legal the president is
-        expected_output_ids = [
-            11859,
-            0,
-            1611,
-            8,
-            5,
-            150,
-            26449,
-            2,
-            19,
-            348,
-            469,
-            3,
-            2595,
-            48,
-            20740,
-            246533,
-            246533,
-            19,
-            30,
-            5,
-        ]  # Legal the president is a good guy and I don't want to lose my job. \n \n I have a
-
-        output_ids = model.generate(input_ids, do_sample=False)
-        self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids)
diff --git a/tests/models/cvt/test_modeling_tf_cvt.py b/tests/models/cvt/test_modeling_tf_cvt.py
deleted file mode 100644
index 211529719a..0000000000
--- a/tests/models/cvt/test_modeling_tf_cvt.py
+++ /dev/null
@@ -1,286 +0,0 @@
-"""Testing suite for the Tensorflow CvT model."""
-
-from __future__ import annotations
-
-import inspect
-import unittest
-from math import floor
-
-import numpy as np
-
-from transformers import CvtConfig
-from transformers.testing_utils import require_tf, require_vision, slow
-from transformers.utils import cached_property, is_tf_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import TFCvtForImageClassification, TFCvtModel
-    from transformers.modeling_tf_utils import keras
-
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import AutoImageProcessor
-
-
-class TFCvtConfigTester(ConfigTester):
-    def create_and_test_config_common_properties(self):
-        config = self.config_class(**self.inputs_dict)
-        self.parent.assertTrue(hasattr(config, "embed_dim"))
-        self.parent.assertTrue(hasattr(config, "num_heads"))
-
-
-class TFCvtModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        image_size=64,
-        num_channels=3,
-        embed_dim=[16, 32, 48],
-        num_heads=[1, 2, 3],
-        depth=[1, 2, 10],
-        patch_sizes=[7, 3, 3],
-        patch_stride=[4, 2, 2],
-        patch_padding=[2, 1, 1],
-        stride_kv=[2, 2, 2],
-        cls_token=[False, False, True],
-        attention_drop_rate=[0.0, 0.0, 0.0],
-        initializer_range=0.02,
-        layer_norm_eps=1e-12,
-        is_training=True,
-        use_labels=True,
-        num_labels=2,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.patch_sizes = patch_sizes
-        self.patch_stride = patch_stride
-        self.patch_padding = patch_padding
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.num_labels = num_labels
-        self.num_channels = num_channels
-        self.embed_dim = embed_dim
-        self.num_heads = num_heads
-        self.stride_kv = stride_kv
-        self.depth = depth
-        self.cls_token = cls_token
-        self.attention_drop_rate = attention_drop_rate
-        self.initializer_range = initializer_range
-        self.layer_norm_eps = layer_norm_eps
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        labels = None
-        if self.use_labels:
-            # create a random int32 tensor of given shape
-            labels = ids_tensor([self.batch_size], self.num_labels)
-
-        config = self.get_config()
-        return config, pixel_values, labels
-
-    def get_config(self):
-        return CvtConfig(
-            image_size=self.image_size,
-            num_labels=self.num_labels,
-            num_channels=self.num_channels,
-            embed_dim=self.embed_dim,
-            num_heads=self.num_heads,
-            patch_sizes=self.patch_sizes,
-            patch_padding=self.patch_padding,
-            patch_stride=self.patch_stride,
-            stride_kv=self.stride_kv,
-            depth=self.depth,
-            cls_token=self.cls_token,
-            attention_drop_rate=self.attention_drop_rate,
-            initializer_range=self.initializer_range,
-        )
-
-    def create_and_check_model(self, config, pixel_values, labels):
-        model = TFCvtModel(config=config)
-        result = model(pixel_values, training=False)
-        image_size = (self.image_size, self.image_size)
-        height, width = image_size[0], image_size[1]
-        for i in range(len(self.depth)):
-            height = floor(((height + 2 * self.patch_padding[i] - self.patch_sizes[i]) / self.patch_stride[i]) + 1)
-            width = floor(((width + 2 * self.patch_padding[i] - self.patch_sizes[i]) / self.patch_stride[i]) + 1)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.embed_dim[-1], height, width))
-
-    def create_and_check_for_image_classification(self, config, pixel_values, labels):
-        config.num_labels = self.num_labels
-        model = TFCvtForImageClassification(config)
-        result = model(pixel_values, labels=labels, training=False)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values, labels = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_tf
-class TFCvtModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    """
-    Here we also overwrite some of the tests of test_modeling_common.py, as Cvt
-    does not use input_ids, inputs_embeds, attention_mask and seq_length.
-    """
-
-    all_model_classes = (TFCvtModel, TFCvtForImageClassification) if is_tf_available() else ()
-    pipeline_model_mapping = (
-        {"feature-extraction": TFCvtModel, "image-classification": TFCvtForImageClassification}
-        if is_tf_available()
-        else {}
-    )
-    test_pruning = False
-    test_resize_embeddings = False
-    test_head_masking = False
-    has_attentions = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFCvtModelTester(self)
-        self.config_tester = TFCvtConfigTester(self, config_class=CvtConfig, has_text_modality=False, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.create_and_test_config_common_properties()
-        self.config_tester.create_and_test_config_to_json_string()
-        self.config_tester.create_and_test_config_to_json_file()
-        self.config_tester.create_and_test_config_from_and_save_pretrained()
-        self.config_tester.create_and_test_config_with_num_labels()
-        self.config_tester.check_config_can_be_init_without_params()
-        self.config_tester.check_config_arguments_init()
-
-    @unittest.skip(reason="Cvt does not output attentions")
-    def test_attention_outputs(self):
-        pass
-
-    @unittest.skip(reason="Cvt does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="Cvt does not support input and output embeddings")
-    def test_model_common_attributes(self):
-        pass
-
-    @unittest.skipIf(
-        not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0,
-        reason="TF does not support backprop for grouped convolutions on CPU.",
-    )
-    def test_dataset_conversion(self):
-        super().test_dataset_conversion()
-
-    @unittest.skipIf(
-        not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0,
-        reason="TF does not support backprop for grouped convolutions on CPU.",
-    )
-    @slow
-    def test_keras_fit(self):
-        super().test_keras_fit()
-
-    @unittest.skip(reason="Get `Failed to determine best cudnn convolution algo.` error after using TF 2.12+cuda 11.8")
-    def test_keras_fit_mixed_precision(self):
-        policy = keras.mixed_precision.Policy("mixed_float16")
-        keras.mixed_precision.set_global_policy(policy)
-        super().test_keras_fit()
-        keras.mixed_precision.set_global_policy("float32")
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            hidden_states = outputs.hidden_states
-
-            expected_num_layers = len(self.model_tester.depth)
-            self.assertEqual(len(hidden_states), expected_num_layers)
-
-            # verify the first hidden states (first block)
-            self.assertListEqual(
-                list(hidden_states[0].shape[-3:]),
-                [
-                    self.model_tester.embed_dim[0],
-                    self.model_tester.image_size // 4,
-                    self.model_tester.image_size // 4,
-                ],
-            )
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "microsoft/cvt-13"
-        model = TFCvtModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_tf
-@require_vision
-class TFCvtModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return AutoImageProcessor.from_pretrained("microsoft/cvt-13")
-
-    @slow
-    def test_inference_image_classification_head(self):
-        model = TFCvtForImageClassification.from_pretrained("microsoft/cvt-13")
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="tf")
-
-        # forward pass
-        outputs = model(**inputs)
-
-        # verify the logits
-        expected_shape = tf.TensorShape((1, 1000))
-        self.assertEqual(outputs.logits.shape, expected_shape)
-
-        expected_slice = tf.constant([0.9285, 0.9015, -0.3150])
-        self.assertTrue(np.allclose(outputs.logits[0, :3].numpy(), expected_slice, atol=1e-4))
diff --git a/tests/models/data2vec/test_modeling_tf_data2vec_vision.py b/tests/models/data2vec/test_modeling_tf_data2vec_vision.py
deleted file mode 100644
index 3f88801534..0000000000
--- a/tests/models/data2vec/test_modeling_tf_data2vec_vision.py
+++ /dev/null
@@ -1,491 +0,0 @@
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TensorFlow Data2VecVision model."""
-
-from __future__ import annotations
-
-import collections.abc
-import inspect
-import unittest
-
-import numpy as np
-
-from transformers import Data2VecVisionConfig
-from transformers.file_utils import cached_property, is_tf_available, is_vision_available
-from transformers.testing_utils import require_tf, require_vision, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import (
-        TFData2VecVisionForImageClassification,
-        TFData2VecVisionForSemanticSegmentation,
-        TFData2VecVisionModel,
-    )
-    from transformers.modeling_tf_utils import keras
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import BeitImageProcessor
-
-
-class TFData2VecVisionModelTester:
-    def __init__(
-        self,
-        parent,
-        vocab_size=100,
-        batch_size=13,
-        image_size=30,
-        patch_size=2,
-        num_channels=3,
-        is_training=True,
-        use_labels=True,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        type_sequence_label_size=10,
-        initializer_range=0.02,
-        num_labels=3,
-        scope=None,
-        out_indices=[0, 1, 2, 3],
-    ):
-        self.parent = parent
-        self.vocab_size = 100
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.scope = scope
-        self.out_indices = out_indices
-        self.num_labels = num_labels
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        labels = None
-        pixel_labels = None
-        if self.use_labels:
-            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            pixel_labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels)
-
-        config = self.get_config()
-
-        return config, pixel_values, labels, pixel_labels
-
-    def get_config(self):
-        return Data2VecVisionConfig(
-            vocab_size=self.vocab_size,
-            image_size=self.image_size,
-            patch_size=self.patch_size,
-            num_channels=self.num_channels,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-            out_indices=self.out_indices,
-        )
-
-    def create_and_check_model(self, config, pixel_values, labels, pixel_labels):
-        model = TFData2VecVisionModel(config=config)
-        result = model(pixel_values, training=False)
-        # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
-        image_size = (
-            self.image_size
-            if isinstance(self.image_size, collections.abc.Iterable)
-            else (self.image_size, self.image_size)
-        )
-        patch_size = (
-            self.patch_size
-            if isinstance(self.image_size, collections.abc.Iterable)
-            else (self.patch_size, self.patch_size)
-        )
-        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
-
-    def create_and_check_for_image_classification(self, config, pixel_values, labels, pixel_labels):
-        config.num_labels = self.type_sequence_label_size
-        model = TFData2VecVisionForImageClassification(config)
-
-        result = model(pixel_values, labels=labels, training=False)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-    def create_and_check_for_image_segmentation(self, config, pixel_values, labels, pixel_labels):
-        config.num_labels = self.num_labels
-        model = TFData2VecVisionForSemanticSegmentation(config)
-        result = model(pixel_values, training=False)
-        self.parent.assertEqual(
-            result.logits.shape, (self.batch_size, self.num_labels, self.image_size * 2, self.image_size * 2)
-        )
-        result = model(pixel_values, labels=pixel_labels)
-        self.parent.assertEqual(
-            result.logits.shape, (self.batch_size, self.num_labels, self.image_size * 2, self.image_size * 2)
-        )
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values, labels, pixel_labels = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-    def prepare_config_and_inputs_for_keras_fit(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values, _, _ = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values, "labels": tf.zeros(self.batch_size)}
-        return config, inputs_dict
-
-
-@require_tf
-class TFData2VecVisionModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    """
-    Here we also overwrite some of the tests of test_modeling_common.py, as Data2VecVision does not use input_ids, inputs_embeds,
-    attention_mask and seq_length.
-    """
-
-    all_model_classes = (
-        (TFData2VecVisionModel, TFData2VecVisionForImageClassification, TFData2VecVisionForSemanticSegmentation)
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {"feature-extraction": TFData2VecVisionModel, "image-classification": TFData2VecVisionForImageClassification}
-        if is_tf_available()
-        else {}
-    )
-
-    test_pruning = False
-    test_onnx = False
-    test_resize_embeddings = False
-    test_head_masking = False
-
-    def setUp(self):
-        self.model_tester = TFData2VecVisionModelTester(self)
-        self.config_tester = ConfigTester(
-            self, config_class=Data2VecVisionConfig, has_text_modality=False, hidden_size=37
-        )
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    @unittest.skip(reason="Data2VecVision does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        # Data2VecVision does not use inputs_embeds
-        pass
-
-    def test_model_common_attributes(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer))
-            x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, keras.layers.Layer))
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_image_segmentation(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_segmentation(*config_and_inputs)
-
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        # in Data2VecVision, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token)
-        image_size = (
-            self.model_tester.image_size
-            if isinstance(self.model_tester.image_size, collections.abc.Iterable)
-            else (self.model_tester.image_size, self.model_tester.image_size)
-        )
-        patch_size = (
-            self.model_tester.patch_size
-            if isinstance(self.model_tester.patch_size, collections.abc.Iterable)
-            else (self.model_tester.patch_size, self.model_tester.patch_size)
-        )
-        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
-        seq_len = num_patches + 1
-        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
-        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
-        chunk_length = getattr(self.model_tester, "chunk_length", None)
-        if chunk_length is not None and hasattr(self.model_tester, "num_hashes"):
-            encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = False
-            config.return_dict = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
-            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-            # check that output_attentions also work using config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
-            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-            self.assertListEqual(
-                list(attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
-            )
-            out_len = len(outputs)
-
-            # Check attention is always last and order is fine
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
-
-            self.assertEqual(out_len + 1, len(outputs))
-
-            self_attentions = outputs.attentions
-
-            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(self_attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
-            )
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
-
-            expected_num_layers = getattr(
-                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-            )
-            self.assertEqual(len(hidden_states), expected_num_layers)
-
-            # Data2VecVision has a different seq_length
-            image_size = (
-                self.model_tester.image_size
-                if isinstance(self.model_tester.image_size, collections.abc.Iterable)
-                else (self.model_tester.image_size, self.model_tester.image_size)
-            )
-            patch_size = (
-                self.model_tester.patch_size
-                if isinstance(self.model_tester.patch_size, collections.abc.Iterable)
-                else (self.model_tester.patch_size, self.model_tester.patch_size)
-            )
-            num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
-            seq_length = num_patches + 1
-
-            self.assertListEqual(
-                list(hidden_states[0].shape[-2:]),
-                [seq_length, self.model_tester.hidden_size],
-            )
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-    # Overriding this method since the base method won't be compatible with Data2VecVision.
-    @slow
-    def test_keras_fit(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            # Since `TFData2VecVisionModel` cannot operate with the default `fit()` method.
-            if model_class.__name__ != "TFData2VecVisionModel":
-                model = model_class(config)
-                if getattr(model, "hf_compute_loss", None):
-                    # Test that model correctly compute the loss with kwargs
-                    _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit()
-
-                    label_names = {"labels"}
-                    self.assertGreater(len(label_names), 0, msg="No matching label names found!")
-                    labels = {key: val for key, val in prepared_for_class.items() if key in label_names}
-                    inputs_minus_labels = {
-                        key: val for key, val in prepared_for_class.items() if key not in label_names
-                    }
-                    self.assertGreater(len(inputs_minus_labels), 0)
-                    model.compile(optimizer=keras.optimizers.SGD(0.0), run_eagerly=True)
-
-                    # Make sure the model fits without crashing regardless of where we pass the labels
-                    history1 = model.fit(
-                        prepared_for_class,
-                        validation_data=prepared_for_class,
-                        steps_per_epoch=1,
-                        validation_steps=1,
-                        shuffle=False,
-                    )
-                    val_loss1 = history1.history["val_loss"][0]
-                    history2 = model.fit(
-                        inputs_minus_labels,
-                        labels,
-                        validation_data=(inputs_minus_labels, labels),
-                        steps_per_epoch=1,
-                        validation_steps=1,
-                        shuffle=False,
-                    )
-                    val_loss2 = history2.history["val_loss"][0]
-                    self.assertTrue(np.allclose(val_loss1, val_loss2, atol=1e-2, rtol=1e-3))
-
-    # Overriding this method since the base method won't be compatible with Data2VecVision.
-    def test_loss_computation(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            # Since `TFData2VecVisionModel` won't have labels against which we
-            # could compute loss.
-            if model_class.__name__ != "TFData2VecVisionModel":
-                model = model_class(config)
-                if getattr(model, "hf_compute_loss", None):
-                    # The number of elements in the loss should be the same as the number of elements in the label
-                    _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit()
-                    added_label = prepared_for_class[
-                        sorted(prepared_for_class.keys() - inputs_dict.keys(), reverse=True)[0]
-                    ]
-                    loss_size = tf.size(added_label)
-
-                    # Test that model correctly compute the loss with kwargs
-                    possible_input_names = {"input_ids", "pixel_values", "input_features"}
-                    input_name = possible_input_names.intersection(set(prepared_for_class)).pop()
-                    model_input = prepared_for_class.pop(input_name)
-
-                    loss = model(model_input, **prepared_for_class)[0]
-                    self.assertEqual(loss.shape, [loss_size])
-
-                    # Test that model correctly compute the loss with a dict
-                    _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit()
-                    loss = model(**prepared_for_class)[0]
-                    self.assertEqual(loss.shape, [loss_size])
-
-                    # Test that model correctly compute the loss with a tuple
-                    label_keys = prepared_for_class.keys() - inputs_dict.keys()
-                    signature = inspect.signature(model.call).parameters
-                    signature_names = list(signature.keys())
-
-                    # Create a dictionary holding the location of the tensors in the tuple
-                    tuple_index_mapping = {0: input_name}
-                    for label_key in label_keys:
-                        label_key_index = signature_names.index(label_key)
-                        tuple_index_mapping[label_key_index] = label_key
-                    sorted_tuple_index_mapping = sorted(tuple_index_mapping.items())
-                    # Initialize a list with their default values, update the values and convert to a tuple
-                    list_input = []
-
-                    for name in signature_names:
-                        if name != "kwargs":
-                            list_input.append(signature[name].default)
-
-                    for index, value in sorted_tuple_index_mapping:
-                        list_input[index] = prepared_for_class[value]
-
-                    tuple_input = tuple(list_input)
-
-                    # Send to model
-                    loss = model(tuple_input[:-1])[0]
-
-                    self.assertEqual(loss.shape, [loss_size])
-
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "facebook/data2vec-vision-base-ft1k"
-        model = TFData2VecVisionModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_tf
-@require_vision
-class TFData2VecVisionModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return (
-            BeitImageProcessor.from_pretrained("facebook/data2vec-vision-base-ft1k") if is_vision_available() else None
-        )
-
-    @slow
-    def test_inference_image_classification_head_imagenet_1k(self):
-        model = TFData2VecVisionForImageClassification.from_pretrained("facebook/data2vec-vision-base-ft1k")
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="tf")
-
-        # forward pass
-        outputs = model(**inputs)
-        logits = outputs.logits
-
-        # verify the logits
-        expected_shape = tf.convert_to_tensor([1, 1000])
-        self.assertEqual(logits.shape, expected_shape)
-
-        expected_slice = tf.convert_to_tensor([0.3277, -0.1395, 0.0911])
-
-        tf.debugging.assert_near(logits[0, :3], expected_slice, atol=1e-4)
-
-        expected_top2 = [model.config.label2id[i] for i in ["remote control, remote", "tabby, tabby cat"]]
-        self.assertEqual(tf.nn.top_k(outputs.logits[0], 2).indices.numpy().tolist(), expected_top2)
diff --git a/tests/models/deberta/test_modeling_tf_deberta.py b/tests/models/deberta/test_modeling_tf_deberta.py
deleted file mode 100644
index ea1e716dd6..0000000000
--- a/tests/models/deberta/test_modeling_tf_deberta.py
+++ /dev/null
@@ -1,295 +0,0 @@
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import DebertaConfig, is_tf_available
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import (
-        TFDebertaForMaskedLM,
-        TFDebertaForQuestionAnswering,
-        TFDebertaForSequenceClassification,
-        TFDebertaForTokenClassification,
-        TFDebertaModel,
-    )
-
-
-class TFDebertaModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_labels=3,
-        num_choices=4,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.is_training = True
-        self.use_input_mask = True
-        self.use_token_type_ids = True
-        self.use_labels = True
-        self.vocab_size = 99
-        self.hidden_size = 32
-        self.num_hidden_layers = 2
-        self.num_attention_heads = 4
-        self.intermediate_size = 37
-        self.hidden_act = "gelu"
-        self.hidden_dropout_prob = 0.1
-        self.attention_probs_dropout_prob = 0.1
-        self.max_position_embeddings = 512
-        self.type_vocab_size = 16
-        self.relative_attention = False
-        self.max_relative_positions = -1
-        self.position_biased_input = True
-        self.type_sequence_label_size = 2
-        self.initializer_range = 0.02
-        self.num_labels = 3
-        self.num_choices = 4
-        self.scope = None
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-
-        config = DebertaConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            relative_attention=self.relative_attention,
-            max_relative_positions=self.max_relative_positions,
-            position_biased_input=self.position_biased_input,
-            initializer_range=self.initializer_range,
-            return_dict=True,
-        )
-
-        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def create_and_check_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFDebertaModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_for_masked_lm(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFDebertaForMaskedLM(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_for_sequence_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFDebertaForSequenceClassification(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_for_token_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFDebertaForTokenClassification(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def create_and_check_for_question_answering(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFDebertaForQuestionAnswering(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-
-        result = model(inputs)
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class TFDebertaModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFDebertaModel,
-            TFDebertaForMaskedLM,
-            TFDebertaForQuestionAnswering,
-            TFDebertaForSequenceClassification,
-            TFDebertaForTokenClassification,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFDebertaModel,
-            "fill-mask": TFDebertaForMaskedLM,
-            "question-answering": TFDebertaForQuestionAnswering,
-            "text-classification": TFDebertaForSequenceClassification,
-            "token-classification": TFDebertaForTokenClassification,
-            "zero-shot": TFDebertaForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFDebertaModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=DebertaConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model = TFDebertaModel.from_pretrained("kamalkraj/deberta-base")
-        self.assertIsNotNone(model)
-
-
-@require_tf
-class TFDeBERTaModelIntegrationTest(unittest.TestCase):
-    @unittest.skip(reason="Model not available yet")
-    def test_inference_masked_lm(self):
-        pass
-
-    @slow
-    def test_inference_no_head(self):
-        model = TFDebertaModel.from_pretrained("kamalkraj/deberta-base")
-        input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        attention_mask = tf.constant([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
-        output = model(input_ids, attention_mask=attention_mask)[0]
-
-        expected_slice = tf.constant(
-            [
-                [
-                    [-0.59855896, -0.80552566, -0.8462135],
-                    [1.4484025, -0.93483794, -0.80593085],
-                    [0.3122741, 0.00316059, -1.4131377],
-                ]
-            ]
-        )
-        tf.debugging.assert_near(output[:, 1:4, 1:4], expected_slice, atol=1e-4)
diff --git a/tests/models/deberta_v2/test_modeling_tf_deberta_v2.py b/tests/models/deberta_v2/test_modeling_tf_deberta_v2.py
deleted file mode 100644
index b69e2eb489..0000000000
--- a/tests/models/deberta_v2/test_modeling_tf_deberta_v2.py
+++ /dev/null
@@ -1,309 +0,0 @@
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import DebertaV2Config, is_tf_available
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import (
-        TFDebertaV2ForMaskedLM,
-        TFDebertaV2ForMultipleChoice,
-        TFDebertaV2ForQuestionAnswering,
-        TFDebertaV2ForSequenceClassification,
-        TFDebertaV2ForTokenClassification,
-        TFDebertaV2Model,
-    )
-
-
-class TFDebertaV2ModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        relative_attention=False,
-        position_biased_input=True,
-        pos_att_type="None",
-        num_labels=3,
-        num_choices=4,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.num_labels = num_labels
-        self.num_choices = num_choices
-        self.relative_attention = relative_attention
-        self.position_biased_input = position_biased_input
-        self.pos_att_type = pos_att_type
-        self.scope = scope
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-
-        config = DebertaV2Config(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            relative_attention=self.relative_attention,
-            position_biased_input=self.position_biased_input,
-            initializer_range=self.initializer_range,
-            return_dict=True,
-        )
-
-        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def create_and_check_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFDebertaV2Model(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_for_masked_lm(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFDebertaV2ForMaskedLM(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_for_sequence_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFDebertaV2ForSequenceClassification(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_for_token_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFDebertaV2ForTokenClassification(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def create_and_check_for_question_answering(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFDebertaV2ForQuestionAnswering(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-
-        result = model(inputs)
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def create_and_check_for_multiple_choice(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_choices = self.num_choices
-        model = TFDebertaV2ForMultipleChoice(config=config)
-        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
-        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
-        inputs = {
-            "input_ids": multiple_choice_inputs_ids,
-            "attention_mask": multiple_choice_input_mask,
-            "token_type_ids": multiple_choice_token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class TFDebertaModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFDebertaV2Model,
-            TFDebertaV2ForMaskedLM,
-            TFDebertaV2ForQuestionAnswering,
-            TFDebertaV2ForMultipleChoice,
-            TFDebertaV2ForSequenceClassification,
-            TFDebertaV2ForTokenClassification,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFDebertaV2Model,
-            "fill-mask": TFDebertaV2ForMaskedLM,
-            "question-answering": TFDebertaV2ForQuestionAnswering,
-            "text-classification": TFDebertaV2ForSequenceClassification,
-            "token-classification": TFDebertaV2ForTokenClassification,
-            "zero-shot": TFDebertaV2ForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFDebertaV2ModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=DebertaV2Config, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model = TFDebertaV2Model.from_pretrained("kamalkraj/deberta-v2-xlarge")
-        self.assertIsNotNone(model)
-
-
-@require_tf
-class TFDeBERTaV2ModelIntegrationTest(unittest.TestCase):
-    @unittest.skip(reason="Model not available yet")
-    def test_inference_masked_lm(self):
-        pass
-
-    @slow
-    def test_inference_no_head(self):
-        model = TFDebertaV2Model.from_pretrained("kamalkraj/deberta-v2-xlarge")
-        input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        attention_mask = tf.constant([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
-        output = model(input_ids, attention_mask=attention_mask)[0]
-
-        expected_slice = tf.constant(
-            [[[0.2356, 0.1948, 0.0369], [-0.1063, 0.3586, -0.5152], [-0.6399, -0.0259, -0.2525]]]
-        )
-        tf.debugging.assert_near(output[:, 1:4, 1:4], expected_slice, atol=1e-4)
diff --git a/tests/models/deit/test_modeling_tf_deit.py b/tests/models/deit/test_modeling_tf_deit.py
deleted file mode 100644
index 1ca091f526..0000000000
--- a/tests/models/deit/test_modeling_tf_deit.py
+++ /dev/null
@@ -1,311 +0,0 @@
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TensorFlow DeiT model."""
-
-from __future__ import annotations
-
-import inspect
-import unittest
-
-import numpy as np
-
-from transformers import DeiTConfig
-from transformers.testing_utils import require_tf, require_vision, slow
-from transformers.utils import cached_property, is_tf_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import (
-        TFDeiTForImageClassification,
-        TFDeiTForImageClassificationWithTeacher,
-        TFDeiTForMaskedImageModeling,
-        TFDeiTModel,
-    )
-    from transformers.modeling_tf_utils import keras
-
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import DeiTImageProcessor
-
-
-class TFDeiTModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        image_size=30,
-        patch_size=2,
-        num_channels=3,
-        is_training=True,
-        use_labels=True,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        type_sequence_label_size=10,
-        initializer_range=0.02,
-        num_labels=3,
-        scope=None,
-        encoder_stride=2,
-        attn_implementation="eager",
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.scope = scope
-        self.encoder_stride = encoder_stride
-        self.attn_implementation = attn_implementation
-
-        # in DeiT, the seq length equals the number of patches + 2 (we add 2 for the [CLS] and distilation tokens)
-        num_patches = (image_size // patch_size) ** 2
-        self.seq_length = num_patches + 2
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        labels = None
-        if self.use_labels:
-            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-
-        config = self.get_config()
-
-        return config, pixel_values, labels
-
-    def get_config(self):
-        return DeiTConfig(
-            image_size=self.image_size,
-            patch_size=self.patch_size,
-            num_channels=self.num_channels,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-            encoder_stride=self.encoder_stride,
-            attn_implementation=self.attn_implementation,
-        )
-
-    def create_and_check_model(self, config, pixel_values, labels):
-        model = TFDeiTModel(config=config)
-        result = model(pixel_values)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_for_masked_image_modeling(self, config, pixel_values, labels):
-        model = TFDeiTForMaskedImageModeling(config=config)
-        result = model(pixel_values)
-        self.parent.assertEqual(
-            result.reconstruction.shape, (self.batch_size, self.num_channels, self.image_size, self.image_size)
-        )
-
-        # test greyscale images
-        config.num_channels = 1
-        model = TFDeiTForMaskedImageModeling(config)
-
-        pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
-        result = model(pixel_values)
-        self.parent.assertEqual(result.reconstruction.shape, (self.batch_size, 1, self.image_size, self.image_size))
-
-    def create_and_check_for_image_classification(self, config, pixel_values, labels):
-        config.num_labels = self.type_sequence_label_size
-        model = TFDeiTForImageClassification(config)
-        result = model(pixel_values, labels=labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-        # test greyscale images
-        config.num_channels = 1
-        model = TFDeiTForImageClassification(config)
-
-        pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
-        result = model(pixel_values, labels=labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values, labels = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_tf
-class TFDeiTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    """
-    Here we also overwrite some of the tests of test_modeling_tf_common.py, as DeiT does not use input_ids, inputs_embeds,
-    attention_mask and seq_length.
-    """
-
-    all_model_classes = (
-        (
-            TFDeiTModel,
-            TFDeiTForImageClassification,
-            TFDeiTForImageClassificationWithTeacher,
-            TFDeiTForMaskedImageModeling,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFDeiTModel,
-            "image-classification": (TFDeiTForImageClassification, TFDeiTForImageClassificationWithTeacher),
-        }
-        if is_tf_available()
-        else {}
-    )
-
-    test_pruning = False
-    test_resize_embeddings = False
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFDeiTModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=DeiTConfig, has_text_modality=False, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    @unittest.skip(reason="DeiT does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    def test_model_common_attributes(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer))
-            x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, keras.layers.Dense))
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_masked_image_modeling(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs)
-
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
-    # special case for DeiTForImageClassificationWithTeacher model
-    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
-        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
-
-        if return_labels:
-            if "labels" in inputs_dict and "labels" not in inspect.signature(model_class.call).parameters:
-                del inputs_dict["labels"]
-
-        return inputs_dict
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "facebook/deit-base-distilled-patch16-224"
-        model = TFDeiTModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_tf
-@require_vision
-class DeiTModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return (
-            DeiTImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
-            if is_vision_available()
-            else None
-        )
-
-    @slow
-    def test_inference_image_classification_head(self):
-        model = TFDeiTForImageClassificationWithTeacher.from_pretrained("facebook/deit-base-distilled-patch16-224")
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="tf")
-
-        # forward pass
-        outputs = model(**inputs)
-
-        # verify the logits
-        expected_shape = tf.TensorShape((1, 1000))
-        self.assertEqual(outputs.logits.shape, expected_shape)
-
-        expected_slice = tf.constant([-1.0266, 0.1912, -1.2861])
-
-        self.assertTrue(np.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
-
-    @slow
-    def test_inference_interpolate_pos_encoding(self):
-        model = TFDeiTForImageClassificationWithTeacher.from_pretrained("facebook/deit-base-distilled-patch16-224")
-
-        image_processor = self.default_image_processor
-        # image size is {"height": 480, "width": 640}
-        image = prepare_img()
-        image_processor.size = {"height": 480, "width": 640}
-        # center crop set to False so image is not center cropped to 224x224
-        inputs = image_processor(images=image, return_tensors="tf", do_center_crop=False)
-        # forward pass
-        outputs = model(**inputs, interpolate_pos_encoding=True)
-
-        # verify the logits
-        expected_shape = tf.TensorShape((1, 1000))
-        self.assertEqual(outputs.logits.shape, expected_shape)
diff --git a/tests/models/dinov2/test_modeling_flax_dinov2.py b/tests/models/dinov2/test_modeling_flax_dinov2.py
deleted file mode 100644
index 161e49e3db..0000000000
--- a/tests/models/dinov2/test_modeling_flax_dinov2.py
+++ /dev/null
@@ -1,270 +0,0 @@
-# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the Flax Dinov2 model."""
-
-import inspect
-import unittest
-
-import numpy as np
-
-from transformers import Dinov2Config
-from transformers.testing_utils import require_flax, require_vision, slow
-from transformers.utils import cached_property, is_flax_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor
-
-
-if is_flax_available():
-    import jax
-
-    from transformers.models.dinov2.modeling_flax_dinov2 import FlaxDinov2ForImageClassification, FlaxDinov2Model
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import AutoImageProcessor
-
-
-class FlaxDinov2ModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=2,
-        image_size=30,
-        patch_size=2,
-        num_channels=3,
-        is_training=True,
-        use_labels=True,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        type_sequence_label_size=10,
-        initializer_range=0.02,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-
-        # in Dinov2, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
-        num_patches = (image_size // patch_size) ** 2
-        self.seq_length = num_patches + 1
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        config = Dinov2Config(
-            image_size=self.image_size,
-            patch_size=self.patch_size,
-            num_channels=self.num_channels,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-        )
-
-        return config, pixel_values
-
-    # Copied from transformers.models.vit.test_modeling_flax_vit.FlaxViTModelTester.prepare_config_and_inputs with ViT -> Dinov2
-    def create_and_check_model(self, config, pixel_values):
-        model = FlaxDinov2Model(config=config)
-        result = model(pixel_values)
-        # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
-        image_size = (self.image_size, self.image_size)
-        patch_size = (self.patch_size, self.patch_size)
-        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
-
-    # Copied from transformers.models.vit.test_modeling_flax_vit.FlaxViTModelTester.create_and_check_for_image_classification with ViT -> Dinov2
-    def create_and_check_for_image_classification(self, config, pixel_values):
-        config.num_labels = self.type_sequence_label_size
-        model = FlaxDinov2ForImageClassification(config=config)
-        result = model(pixel_values)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-        # test greyscale images
-        config.num_channels = 1
-        model = FlaxDinov2ForImageClassification(config)
-
-        pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
-        result = model(pixel_values)
-
-    # Copied from transformers.models.vit.test_modeling_flax_vit.FlaxViTModelTester.prepare_config_and_inputs_for_common
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            pixel_values,
-        ) = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_flax
-# Copied from transformers.models.vit.test_modeling_flax_vit.FlaxViTModelTest with google/vit-base-patch16-224 -> facebook/dinov2-base
-class FlaxDionv2ModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (FlaxDinov2Model, FlaxDinov2ForImageClassification) if is_flax_available() else ()
-
-    def setUp(self) -> None:
-        self.model_tester = FlaxDinov2ModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=Dinov2Config, has_text_modality=False, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
-    # We need to override this test because Dinov2's forward signature is different than text models.
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.__call__)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    # We need to override this test because Dinov2 expects pixel_values instead of input_ids
-    def test_jit_compilation(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                model = model_class(config)
-
-                @jax.jit
-                def model_jitted(pixel_values, **kwargs):
-                    return model(pixel_values=pixel_values, **kwargs)
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = model_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = model_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("facebook/dinov2-base")
-            outputs = model(np.ones((1, 3, 224, 224)))
-            self.assertIsNotNone(outputs)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return [image, image]
-
-
-@require_vision
-@require_flax
-class FlaxDinov2ModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return AutoImageProcessor.from_pretrained("facebook/dinov2-base") if is_vision_available() else None
-
-    @slow
-    def test_inference_no_head(self):
-        model = FlaxDinov2Model.from_pretrained("facebook/dinov2-base")
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        pixel_values = image_processor(images=image, return_tensors="np").pixel_values
-
-        # forward pass
-        outputs = model(pixel_values=pixel_values)
-
-        # verify the logits
-        expected_shape = (2, 257, 768)
-        self.assertEqual(outputs.last_hidden_state.shape, expected_shape)
-
-        expected_slice = np.array(
-            [
-                [
-                    [-2.1629121, -0.46566057, 1.0925977],
-                    [-3.5971704, -1.0283585, -1.1780515],
-                    [-2.900407, 1.1334689, -0.74357724],
-                ],
-                [
-                    [-2.1629121, -0.46566057, 1.0925977],
-                    [-3.5971704, -1.0283585, -1.1780515],
-                    [-2.900407, 1.1334689, -0.74357724],
-                ],
-            ]
-        )
-
-        self.assertTrue(np.allclose(outputs.last_hidden_state[:2, :3, :3], expected_slice, atol=1e-4))
-
-    @slow
-    def test_inference_image_classification_head_imagenet_1k(self):
-        model = FlaxDinov2ForImageClassification.from_pretrained(
-            "facebook/dinov2-base-imagenet1k-1-layer", from_pt=True
-        )
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="np")
-
-        # forward pass
-        outputs = model(**inputs)
-        logits = outputs.logits
-
-        # verify the logits
-        expected_shape = (2, 1000)
-        self.assertEqual(logits.shape, expected_shape)
-
-        expected_slice = np.array([[-2.1776447, 0.36716992, 0.13870952], [-2.1776447, 0.36716992, 0.13870952]])
-
-        self.assertTrue(np.allclose(logits[:2, :3], expected_slice, atol=1e-3))
-
-        expected_class_idx = 281
-        self.assertEqual(logits[0].argmax(-1).item(), expected_class_idx)
-        self.assertEqual(logits[1].argmax(-1).item(), expected_class_idx)
diff --git a/tests/models/distilbert/test_modeling_flax_distilbert.py b/tests/models/distilbert/test_modeling_flax_distilbert.py
deleted file mode 100644
index 50655771ed..0000000000
--- a/tests/models/distilbert/test_modeling_flax_distilbert.py
+++ /dev/null
@@ -1,152 +0,0 @@
-# Copyright 2021 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-
-from transformers import DistilBertConfig, is_flax_available
-from transformers.testing_utils import require_flax, slow
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask
-
-
-if is_flax_available():
-    import jax.numpy as jnp
-
-    from transformers.models.distilbert.modeling_flax_distilbert import (
-        FlaxDistilBertForMaskedLM,
-        FlaxDistilBertForMultipleChoice,
-        FlaxDistilBertForQuestionAnswering,
-        FlaxDistilBertForSequenceClassification,
-        FlaxDistilBertForTokenClassification,
-        FlaxDistilBertModel,
-    )
-
-
-class FlaxDistilBertModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_attention_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_choices=4,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_attention_mask = use_attention_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.num_choices = num_choices
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        attention_mask = None
-        if self.use_attention_mask:
-            attention_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        config = DistilBertConfig(
-            vocab_size=self.vocab_size,
-            dim=self.hidden_size,
-            n_layers=self.num_hidden_layers,
-            n_heads=self.num_attention_heads,
-            hidden_dim=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            initializer_range=self.initializer_range,
-            tie_weights_=True,
-        )
-
-        return config, input_ids, attention_mask
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, attention_mask = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask}
-        return config, inputs_dict
-
-
-@require_flax
-class FlaxDistilBertModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            FlaxDistilBertModel,
-            FlaxDistilBertForMaskedLM,
-            FlaxDistilBertForMultipleChoice,
-            FlaxDistilBertForQuestionAnswering,
-            FlaxDistilBertForSequenceClassification,
-            FlaxDistilBertForTokenClassification,
-            FlaxDistilBertForQuestionAnswering,
-        )
-        if is_flax_available()
-        else ()
-    )
-
-    def setUp(self):
-        self.model_tester = FlaxDistilBertModelTester(self)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("distilbert-base-uncased")
-            outputs = model(np.ones((1, 1)))
-            self.assertIsNotNone(outputs)
-
-
-@require_flax
-class FlaxDistilBertModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_no_head_absolute_embedding(self):
-        model = FlaxDistilBertModel.from_pretrained("distilbert-base-uncased")
-        input_ids = np.array([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
-        attention_mask = np.array([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
-        output = model(input_ids, attention_mask=attention_mask)[0]
-        expected_shape = (1, 11, 768)
-        self.assertEqual(output.shape, expected_shape)
-        expected_slice = np.array([[[-0.1639, 0.3299, 0.1648], [-0.1746, 0.3289, 0.1710], [-0.1884, 0.3357, 0.1810]]])
-
-        self.assertTrue(jnp.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4))
diff --git a/tests/models/distilbert/test_modeling_tf_distilbert.py b/tests/models/distilbert/test_modeling_tf_distilbert.py
deleted file mode 100644
index 674acdad26..0000000000
--- a/tests/models/distilbert/test_modeling_tf_distilbert.py
+++ /dev/null
@@ -1,259 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import DistilBertConfig, is_tf_available
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers.models.distilbert.modeling_tf_distilbert import (
-        TFDistilBertForMaskedLM,
-        TFDistilBertForMultipleChoice,
-        TFDistilBertForQuestionAnswering,
-        TFDistilBertForSequenceClassification,
-        TFDistilBertForTokenClassification,
-        TFDistilBertModel,
-    )
-
-
-class TFDistilBertModelTester:
-    def __init__(
-        self,
-        parent,
-    ):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.is_training = True
-        self.use_input_mask = True
-        self.use_token_type_ids = False
-        self.use_labels = True
-        self.vocab_size = 99
-        self.hidden_size = 32
-        self.num_hidden_layers = 2
-        self.num_attention_heads = 4
-        self.intermediate_size = 37
-        self.hidden_act = "gelu"
-        self.hidden_dropout_prob = 0.1
-        self.attention_probs_dropout_prob = 0.1
-        self.max_position_embeddings = 512
-        self.type_vocab_size = 16
-        self.type_sequence_label_size = 2
-        self.initializer_range = 0.02
-        self.num_labels = 3
-        self.num_choices = 4
-        self.scope = None
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = DistilBertConfig(
-            vocab_size=self.vocab_size,
-            dim=self.hidden_size,
-            n_layers=self.num_hidden_layers,
-            n_heads=self.num_attention_heads,
-            hidden_dim=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            initializer_range=self.initializer_range,
-        )
-
-        return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def create_and_check_distilbert_model(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFDistilBertModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask}
-
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-
-        result = model(inputs)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_distilbert_for_masked_lm(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFDistilBertForMaskedLM(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_distilbert_for_question_answering(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFDistilBertForQuestionAnswering(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def create_and_check_distilbert_for_sequence_classification(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFDistilBertForSequenceClassification(config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_distilbert_for_multiple_choice(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_choices = self.num_choices
-        model = TFDistilBertForMultipleChoice(config)
-        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
-        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-        inputs = {
-            "input_ids": multiple_choice_inputs_ids,
-            "attention_mask": multiple_choice_input_mask,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-    def create_and_check_distilbert_for_token_classification(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFDistilBertForTokenClassification(config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class TFDistilBertModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFDistilBertModel,
-            TFDistilBertForMaskedLM,
-            TFDistilBertForQuestionAnswering,
-            TFDistilBertForSequenceClassification,
-            TFDistilBertForTokenClassification,
-            TFDistilBertForMultipleChoice,
-        )
-        if is_tf_available()
-        else None
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFDistilBertModel,
-            "fill-mask": TFDistilBertForMaskedLM,
-            "question-answering": TFDistilBertForQuestionAnswering,
-            "text-classification": TFDistilBertForSequenceClassification,
-            "token-classification": TFDistilBertForTokenClassification,
-            "zero-shot": TFDistilBertForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFDistilBertModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=DistilBertConfig, dim=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_distilbert_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_distilbert_model(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_distilbert_for_masked_lm(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_distilbert_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_distilbert_for_multiple_choice(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_distilbert_for_token_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "distilbert/distilbert-base-cased"
-        model = TFDistilBertModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-@require_tf
-class TFDistilBertModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_masked_lm(self):
-        model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")
-        input_ids = tf.constant([[0, 1, 2, 3, 4, 5]])
-        output = model(input_ids)[0]
-
-        expected_shape = [1, 6, 768]
-        self.assertEqual(output.shape, expected_shape)
-
-        expected_slice = tf.constant(
-            [
-                [
-                    [0.19261885, -0.13732955, 0.4119799],
-                    [0.22150156, -0.07422661, 0.39037204],
-                    [0.22756018, -0.0896414, 0.3701467],
-                ]
-            ]
-        )
-        tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4)
diff --git a/tests/models/dpr/test_modeling_tf_dpr.py b/tests/models/dpr/test_modeling_tf_dpr.py
deleted file mode 100644
index 81427f3d94..0000000000
--- a/tests/models/dpr/test_modeling_tf_dpr.py
+++ /dev/null
@@ -1,256 +0,0 @@
-# Copyright 2020 Huggingface
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import is_tf_available
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import numpy
-    import tensorflow as tf
-
-    from transformers import (
-        BertConfig,
-        DPRConfig,
-        TFDPRContextEncoder,
-        TFDPRQuestionEncoder,
-        TFDPRReader,
-    )
-
-
-class TFDPRModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_labels=3,
-        num_choices=4,
-        scope=None,
-        projection_dim=0,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.num_labels = num_labels
-        self.num_choices = num_choices
-        self.scope = scope
-        self.projection_dim = projection_dim
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            # follow test_modeling_tf_ctrl.py
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = BertConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-        )
-        config = DPRConfig(projection_dim=self.projection_dim, **config.to_dict())
-
-        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def create_and_check_dpr_context_encoder(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFDPRContextEncoder(config=config)
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-        result = model(input_ids, token_type_ids=token_type_ids)
-        result = model(input_ids)
-        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.projection_dim or self.hidden_size))
-
-    def create_and_check_dpr_question_encoder(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFDPRQuestionEncoder(config=config)
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-        result = model(input_ids, token_type_ids=token_type_ids)
-        result = model(input_ids)
-        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.projection_dim or self.hidden_size))
-
-    def create_and_check_dpr_reader(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFDPRReader(config=config)
-        result = model(input_ids, attention_mask=input_mask)
-
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.relevance_logits.shape, (self.batch_size,))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids}
-        return config, inputs_dict
-
-
-@require_tf
-class TFDPRModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFDPRContextEncoder,
-            TFDPRQuestionEncoder,
-            TFDPRReader,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = {"feature-extraction": TFDPRQuestionEncoder} if is_tf_available() else {}
-
-    test_resize_embeddings = False
-    test_missing_keys = False
-    test_pruning = False
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFDPRModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=DPRConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_dpr_context_encoder_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_dpr_context_encoder(*config_and_inputs)
-
-    def test_dpr_question_encoder_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_dpr_question_encoder(*config_and_inputs)
-
-    def test_dpr_reader_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_dpr_reader(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "facebook/dpr-ctx_encoder-single-nq-base"
-        model = TFDPRContextEncoder.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-        model_name = "facebook/dpr-ctx_encoder-single-nq-base"
-        model = TFDPRContextEncoder.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-        model_name = "facebook/dpr-ctx_encoder-single-nq-base"
-        model = TFDPRQuestionEncoder.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-        model_name = "facebook/dpr-ctx_encoder-single-nq-base"
-        model = TFDPRReader.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-@require_tf
-class TFDPRModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_no_head(self):
-        model = TFDPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
-
-        input_ids = tf.constant(
-            [[101, 7592, 1010, 2003, 2026, 3899, 10140, 1029, 102]]
-        )  # [CLS] hello, is my dog cute? [SEP]
-        output = model(input_ids)[0]  # embedding shape = (1, 768)
-        # compare the actual values for a slice.
-        expected_slice = tf.constant(
-            [
-                [
-                    0.03236253,
-                    0.12753335,
-                    0.16818509,
-                    0.00279786,
-                    0.3896933,
-                    0.24264945,
-                    0.2178971,
-                    -0.02335227,
-                    -0.08481959,
-                    -0.14324117,
-                ]
-            ]
-        )
-        self.assertTrue(numpy.allclose(output[:, :10].numpy(), expected_slice.numpy(), atol=1e-4))
diff --git a/tests/models/electra/test_modeling_flax_electra.py b/tests/models/electra/test_modeling_flax_electra.py
deleted file mode 100644
index 698a492fc3..0000000000
--- a/tests/models/electra/test_modeling_flax_electra.py
+++ /dev/null
@@ -1,136 +0,0 @@
-import unittest
-
-import numpy as np
-
-from transformers import ElectraConfig, is_flax_available
-from transformers.testing_utils import require_flax, slow
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask
-
-
-if is_flax_available():
-    from transformers.models.electra.modeling_flax_electra import (
-        FlaxElectraForCausalLM,
-        FlaxElectraForMaskedLM,
-        FlaxElectraForMultipleChoice,
-        FlaxElectraForPreTraining,
-        FlaxElectraForQuestionAnswering,
-        FlaxElectraForSequenceClassification,
-        FlaxElectraForTokenClassification,
-        FlaxElectraModel,
-    )
-
-
-class FlaxElectraModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_attention_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        embedding_size=24,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_choices=4,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_attention_mask = use_attention_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.embedding_size = embedding_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.num_choices = num_choices
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        attention_mask = None
-        if self.use_attention_mask:
-            attention_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        config = ElectraConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            embedding_size=self.embedding_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            initializer_range=self.initializer_range,
-        )
-
-        return config, input_ids, token_type_ids, attention_mask
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, token_type_ids, attention_mask = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask}
-        return config, inputs_dict
-
-
-@require_flax
-class FlaxElectraModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    test_head_masking = True
-
-    all_model_classes = (
-        (
-            FlaxElectraModel,
-            FlaxElectraForCausalLM,
-            FlaxElectraForMaskedLM,
-            FlaxElectraForPreTraining,
-            FlaxElectraForTokenClassification,
-            FlaxElectraForQuestionAnswering,
-            FlaxElectraForMultipleChoice,
-            FlaxElectraForSequenceClassification,
-        )
-        if is_flax_available()
-        else ()
-    )
-
-    def setUp(self):
-        self.model_tester = FlaxElectraModelTester(self)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            if model_class_name == FlaxElectraForMaskedLM:
-                model = model_class_name.from_pretrained("google/electra-small-generator")
-            else:
-                model = model_class_name.from_pretrained("google/electra-small-discriminator")
-            outputs = model(np.ones((1, 1)))
-            self.assertIsNotNone(outputs)
diff --git a/tests/models/electra/test_modeling_tf_electra.py b/tests/models/electra/test_modeling_tf_electra.py
deleted file mode 100644
index de9e61ea54..0000000000
--- a/tests/models/electra/test_modeling_tf_electra.py
+++ /dev/null
@@ -1,615 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import ElectraConfig, is_tf_available
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers.models.electra.modeling_tf_electra import (
-        TFElectraForMaskedLM,
-        TFElectraForMultipleChoice,
-        TFElectraForPreTraining,
-        TFElectraForQuestionAnswering,
-        TFElectraForSequenceClassification,
-        TFElectraForTokenClassification,
-        TFElectraModel,
-    )
-
-
-class TFElectraModelTester:
-    def __init__(
-        self,
-        parent,
-    ):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.is_training = True
-        self.use_input_mask = True
-        self.use_token_type_ids = True
-        self.use_labels = True
-        self.vocab_size = 99
-        self.hidden_size = 32
-        self.num_hidden_layers = 2
-        self.num_attention_heads = 4
-        self.intermediate_size = 37
-        self.hidden_act = "gelu"
-        self.hidden_dropout_prob = 0.1
-        self.attention_probs_dropout_prob = 0.1
-        self.max_position_embeddings = 512
-        self.type_vocab_size = 16
-        self.type_sequence_label_size = 2
-        self.initializer_range = 0.02
-        self.num_labels = 3
-        self.num_choices = 4
-        self.scope = None
-        self.embedding_size = 128
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = ElectraConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            initializer_range=self.initializer_range,
-        )
-
-        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def prepare_config_and_inputs_for_decoder(self):
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = self.prepare_config_and_inputs()
-
-        config.is_decoder = True
-        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
-        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-        return (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-    def create_and_check_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFElectraModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_causal_lm_base_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.is_decoder = True
-
-        model = TFElectraModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_model_as_decoder(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-
-        model = TFElectraModel(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-            "encoder_hidden_states": encoder_hidden_states,
-            "encoder_attention_mask": encoder_attention_mask,
-        }
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
-
-        # Also check the case where encoder outputs are not passed
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_causal_lm_base_model_past(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.is_decoder = True
-
-        model = TFElectraModel(config=config)
-
-        # first forward pass
-        outputs = model(input_ids, use_cache=True)
-        outputs_use_cache_conf = model(input_ids)
-        outputs_no_past = model(input_ids, use_cache=False)
-
-        self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
-        self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
-
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-        # append to next input_ids and attn_mask
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-
-        output_from_no_past = model(next_input_ids, output_hidden_states=True).hidden_states[0]
-        output_from_past = model(
-            next_tokens, past_key_values=past_key_values, output_hidden_states=True
-        ).hidden_states[0]
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
-        output_from_past_slice = output_from_past[:, 0, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)
-
-    def create_and_check_causal_lm_base_model_past_with_attn_mask(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.is_decoder = True
-
-        model = TFElectraModel(config=config)
-
-        # create attention mask
-        half_seq_length = self.seq_length // 2
-        attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32)
-        attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32)
-        attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1)
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=attn_mask, use_cache=True)
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-        past_key_values = outputs.past_key_values
-
-        # change a random masked slice from input_ids
-        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1
-        random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size)
-        vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change)
-        condition = tf.transpose(
-            tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size))
-        )
-        input_ids = tf.where(condition, random_other_next_tokens, input_ids)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        attn_mask = tf.concat(
-            [attn_mask, tf.ones((attn_mask.shape[0], 1), dtype=tf.int32)],
-            axis=1,
-        )
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=attn_mask,
-            output_hidden_states=True,
-        ).hidden_states[0]
-        output_from_past = model(
-            next_tokens, past_key_values=past_key_values, attention_mask=attn_mask, output_hidden_states=True
-        ).hidden_states[0]
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
-        output_from_past_slice = output_from_past[:, 0, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)
-
-    def create_and_check_causal_lm_base_model_past_large_inputs(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.is_decoder = True
-
-        model = TFElectraModel(config=config)
-
-        input_ids = input_ids[:1, :]
-        input_mask = input_mask[:1, :]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=input_mask, use_cache=True)
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=next_attention_mask,
-            output_hidden_states=True,
-        ).hidden_states[0]
-        output_from_past = model(
-            next_tokens,
-            attention_mask=next_attention_mask,
-            past_key_values=past_key_values,
-            output_hidden_states=True,
-        ).hidden_states[0]
-
-        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-    def create_and_check_decoder_model_past_large_inputs(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-
-        model = TFElectraModel(config=config)
-
-        input_ids = input_ids[:1, :]
-        input_mask = input_mask[:1, :]
-        encoder_hidden_states = encoder_hidden_states[:1, :, :]
-        encoder_attention_mask = encoder_attention_mask[:1, :]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(
-            input_ids,
-            attention_mask=input_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            use_cache=True,
-        )
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=next_attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            output_hidden_states=True,
-        ).hidden_states[0]
-        output_from_past = model(
-            next_tokens,
-            attention_mask=next_attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            past_key_values=past_key_values,
-            output_hidden_states=True,
-        ).hidden_states[0]
-
-        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-    def create_and_check_for_masked_lm(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFElectraForMaskedLM(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_for_pretraining(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFElectraForPreTraining(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length))
-
-    def create_and_check_for_sequence_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFElectraForSequenceClassification(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_for_multiple_choice(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_choices = self.num_choices
-        model = TFElectraForMultipleChoice(config=config)
-        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
-        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
-        inputs = {
-            "input_ids": multiple_choice_inputs_ids,
-            "attention_mask": multiple_choice_input_mask,
-            "token_type_ids": multiple_choice_token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-    def create_and_check_for_question_answering(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFElectraForQuestionAnswering(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def create_and_check_for_token_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFElectraForTokenClassification(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class TFElectraModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFElectraModel,
-            TFElectraForMaskedLM,
-            TFElectraForPreTraining,
-            TFElectraForTokenClassification,
-            TFElectraForMultipleChoice,
-            TFElectraForSequenceClassification,
-            TFElectraForQuestionAnswering,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFElectraModel,
-            "fill-mask": TFElectraForMaskedLM,
-            "question-answering": TFElectraForQuestionAnswering,
-            "text-classification": TFElectraForSequenceClassification,
-            "token-classification": TFElectraForTokenClassification,
-            "zero-shot": TFElectraForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFElectraModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=ElectraConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        """Test the base model"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_causal_lm_base_model(self):
-        """Test the base model of the causal LM model
-
-        is_decoder=True, no cross_attention, no encoder outputs
-        """
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_base_model(*config_and_inputs)
-
-    def test_model_as_decoder(self):
-        """Test the base model as a decoder (of an encoder-decoder architecture)
-
-        is_decoder=True + cross_attention + pass encoder outputs
-        """
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
-
-    def test_causal_lm_base_model_past(self):
-        """Test causal LM base model with `past_key_values`"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_base_model_past(*config_and_inputs)
-
-    def test_causal_lm_base_model_past_with_attn_mask(self):
-        """Test the causal LM base model with `past_key_values` and `attention_mask`"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_base_model_past_with_attn_mask(*config_and_inputs)
-
-    def test_causal_lm_base_model_past_with_large_inputs(self):
-        """Test the causal LM base model with `past_key_values` and a longer decoder sequence length"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_base_model_past_large_inputs(*config_and_inputs)
-
-    def test_decoder_model_past_with_large_inputs(self):
-        """Similar to `test_causal_lm_base_model_past_with_large_inputs` but with cross-attention"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_pretraining(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_pretraining(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        #     model_name = 'google/electra-small-generator'
-        for model_name in ["google/electra-small-discriminator"]:
-            model = TFElectraModel.from_pretrained(model_name)
-            self.assertIsNotNone(model)
-
-
-@require_tf
-class TFElectraModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_masked_lm(self):
-        model = TFElectraForPreTraining.from_pretrained("lysandre/tiny-electra-random")
-        input_ids = tf.constant([[0, 1, 2, 3, 4, 5]])
-        output = model(input_ids)[0]
-
-        expected_shape = [1, 6]
-        self.assertEqual(output.shape, expected_shape)
-
-        print(output[:, :3])
-
-        expected_slice = tf.constant([[-0.24651965, 0.8835437, 1.823782]])
-        tf.debugging.assert_near(output[:, :3], expected_slice, atol=1e-4)
diff --git a/tests/models/encoder_decoder/test_modeling_flax_encoder_decoder.py b/tests/models/encoder_decoder/test_modeling_flax_encoder_decoder.py
deleted file mode 100644
index b17f9ed37b..0000000000
--- a/tests/models/encoder_decoder/test_modeling_flax_encoder_decoder.py
+++ /dev/null
@@ -1,498 +0,0 @@
-# Copyright 2020 HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import tempfile
-import unittest
-
-import numpy as np
-
-from transformers import is_flax_available
-from transformers.testing_utils import require_flax, slow
-
-from ...test_modeling_flax_common import ids_tensor
-from ..bart.test_modeling_flax_bart import FlaxBartStandaloneDecoderModelTester
-from ..bert.test_modeling_flax_bert import FlaxBertModelTester
-from ..gpt2.test_modeling_flax_gpt2 import FlaxGPT2ModelTester
-
-
-if is_flax_available():
-    from transformers import (
-        AutoTokenizer,
-        EncoderDecoderConfig,
-        FlaxBartForCausalLM,
-        FlaxBertForCausalLM,
-        FlaxBertModel,
-        FlaxEncoderDecoderModel,
-        FlaxGPT2LMHeadModel,
-    )
-
-
-@require_flax
-class FlaxEncoderDecoderMixin:
-    def get_encoder_decoder_model(self, config, decoder_config):
-        raise NotImplementedError
-
-    def prepare_config_and_inputs(self):
-        raise NotImplementedError
-
-    def get_pretrained_model(self):
-        raise NotImplementedError
-
-    def check_encoder_decoder_model_from_pretrained_configs(
-        self,
-        config,
-        input_ids,
-        attention_mask,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        **kwargs,
-    ):
-        encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)
-        self.assertTrue(encoder_decoder_config.decoder.is_decoder)
-
-        enc_dec_model = FlaxEncoderDecoderModel(encoder_decoder_config)
-
-        self.assertTrue(enc_dec_model.config.is_encoder_decoder)
-
-        outputs_encoder_decoder = enc_dec_model(
-            input_ids=input_ids,
-            decoder_input_ids=decoder_input_ids,
-            attention_mask=attention_mask,
-            decoder_attention_mask=decoder_attention_mask,
-        )
-
-        self.assertEqual(
-            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
-        )
-        self.assertEqual(
-            outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,))
-        )
-
-    def check_encoder_decoder_model_from_pretrained(
-        self,
-        config,
-        input_ids,
-        attention_mask,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        return_dict,
-        **kwargs,
-    ):
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict}
-        enc_dec_model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs)
-        outputs_encoder_decoder = enc_dec_model(
-            input_ids=input_ids,
-            decoder_input_ids=decoder_input_ids,
-            attention_mask=attention_mask,
-            decoder_attention_mask=decoder_attention_mask,
-            return_dict=True,
-        )
-
-        self.assertEqual(
-            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
-        )
-        self.assertEqual(
-            outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,))
-        )
-
-    def check_save_and_load(
-        self,
-        config,
-        input_ids,
-        attention_mask,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        **kwargs,
-    ):
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model}
-        enc_dec_model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs)
-
-        outputs = enc_dec_model(
-            input_ids=input_ids,
-            decoder_input_ids=decoder_input_ids,
-            attention_mask=attention_mask,
-            decoder_attention_mask=decoder_attention_mask,
-        )
-        out_2 = np.array(outputs[0])
-        out_2[np.isnan(out_2)] = 0
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            enc_dec_model.save_pretrained(tmpdirname)
-            FlaxEncoderDecoderModel.from_pretrained(tmpdirname)
-
-            after_outputs = enc_dec_model(
-                input_ids=input_ids,
-                decoder_input_ids=decoder_input_ids,
-                attention_mask=attention_mask,
-                decoder_attention_mask=decoder_attention_mask,
-            )
-            out_1 = np.array(after_outputs[0])
-            out_1[np.isnan(out_1)] = 0
-            max_diff = np.amax(np.abs(out_1 - out_2))
-            self.assertLessEqual(max_diff, 1e-5)
-
-    def check_encoder_decoder_model_from_encoder_decoder_pretrained(
-        self,
-        config,
-        input_ids,
-        attention_mask,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        **kwargs,
-    ):
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        # assert that model attributes match those of configs
-        self.assertEqual(config.use_cache, encoder_model.config.use_cache)
-        self.assertEqual(decoder_config.use_cache, decoder_model.config.use_cache)
-
-        with tempfile.TemporaryDirectory() as enc_tmpdir:
-            with tempfile.TemporaryDirectory() as dec_tmpdir:
-                encoder_model.save_pretrained(enc_tmpdir)
-                decoder_model.save_pretrained(dec_tmpdir)
-                # load a model from pretrained encoder and decoder checkpoints, setting one encoder and one decoder kwarg opposite to that specified in their respective configs
-                enc_dec_model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained(
-                    encoder_pretrained_model_name_or_path=enc_tmpdir,
-                    decoder_pretrained_model_name_or_path=dec_tmpdir,
-                    encoder_use_cache=not config.use_cache,
-                    decoder_use_cache=not decoder_config.use_cache,
-                )
-
-        # assert that setting encoder and decoder kwargs opposite to those in the configs has correctly been applied
-        self.assertNotEqual(config.use_cache, enc_dec_model.config.encoder.use_cache)
-        self.assertNotEqual(decoder_config.use_cache, enc_dec_model.config.decoder.use_cache)
-
-        outputs_encoder_decoder = enc_dec_model(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-            output_hidden_states=True,
-            return_dict=True,
-        )
-
-        self.assertEqual(
-            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
-        )
-
-    def check_encoder_decoder_model_output_attentions(
-        self,
-        config,
-        input_ids,
-        attention_mask,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        **kwargs,
-    ):
-        # make the decoder inputs a different shape from the encoder inputs to harden the test
-        decoder_input_ids = decoder_input_ids[:, :-1]
-        decoder_attention_mask = decoder_attention_mask[:, :-1]
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model}
-        enc_dec_model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs)
-        outputs_encoder_decoder = enc_dec_model(
-            input_ids=input_ids,
-            decoder_input_ids=decoder_input_ids,
-            attention_mask=attention_mask,
-            decoder_attention_mask=decoder_attention_mask,
-            output_attentions=True,
-        )
-
-        encoder_attentions = outputs_encoder_decoder["encoder_attentions"]
-        self.assertEqual(len(encoder_attentions), config.num_hidden_layers)
-
-        self.assertEqual(
-            encoder_attentions[0].shape[-3:], (config.num_attention_heads, input_ids.shape[-1], input_ids.shape[-1])
-        )
-
-        decoder_attentions = outputs_encoder_decoder["decoder_attentions"]
-        num_decoder_layers = (
-            decoder_config.num_decoder_layers
-            if hasattr(decoder_config, "num_decoder_layers")
-            else decoder_config.num_hidden_layers
-        )
-        self.assertEqual(len(decoder_attentions), num_decoder_layers)
-
-        self.assertEqual(
-            decoder_attentions[0].shape[-3:],
-            (decoder_config.num_attention_heads, decoder_input_ids.shape[-1], decoder_input_ids.shape[-1]),
-        )
-
-        cross_attentions = outputs_encoder_decoder["cross_attentions"]
-        self.assertEqual(len(cross_attentions), num_decoder_layers)
-
-        cross_attention_input_seq_len = decoder_input_ids.shape[-1] * (
-            1 + (decoder_config.ngram if hasattr(decoder_config, "ngram") else 0)
-        )
-        self.assertEqual(
-            cross_attentions[0].shape[-3:],
-            (decoder_config.num_attention_heads, cross_attention_input_seq_len, input_ids.shape[-1]),
-        )
-
-    def check_encoder_decoder_model_generate(self, input_ids, config, decoder_config, **kwargs):
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model}
-        enc_dec_model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs)
-
-        pad_token_id = enc_dec_model.config.decoder.pad_token_id
-        eos_token_id = enc_dec_model.config.decoder.eos_token_id
-        decoder_start_token_id = enc_dec_model.config.decoder.decoder_start_token_id
-
-        # Copied from generation.utils (GPT2 doesn't have `pad_token_id`)
-        if pad_token_id is None and eos_token_id is not None:
-            pad_token_id = eos_token_id
-        if decoder_start_token_id is None:
-            decoder_start_token_id = enc_dec_model.config.decoder.bos_token_id
-
-        # Bert does not have a bos token id, so use pad_token_id instead
-        # Copied from `test_modeling_encoder_decoder.py`
-        if decoder_start_token_id is None:
-            decoder_start_token_id = pad_token_id
-
-        generated_output = enc_dec_model.generate(
-            input_ids,
-            pad_token_id=pad_token_id,
-            eos_token_id=eos_token_id,
-            decoder_start_token_id=decoder_start_token_id,
-        )
-        generated_sequences = generated_output.sequences
-        self.assertEqual(generated_sequences.shape, (input_ids.shape[0],) + (decoder_config.max_length,))
-
-    def test_encoder_decoder_model_from_pretrained_configs(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_from_pretrained_configs(**input_ids_dict)
-
-    def test_encoder_decoder_model_from_pretrained(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_from_pretrained(**input_ids_dict, return_dict=False)
-
-    def test_encoder_decoder_model_from_pretrained_return_dict(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_from_pretrained(**input_ids_dict, return_dict=True)
-
-    def test_save_and_load_from_pretrained(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_save_and_load(**input_ids_dict)
-
-    def test_encoder_decoder_model_from_encoder_decoder_pretrained(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_from_encoder_decoder_pretrained(**input_ids_dict)
-
-    def test_encoder_decoder_model_output_attentions(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_output_attentions(**input_ids_dict)
-
-    def test_encoder_decoder_model_generate(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_generate(**input_ids_dict)
-
-    def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float):
-        diff = np.abs(a - b).max()
-        self.assertLessEqual(diff, tol, f"Difference between torch and flax is {diff} (>= {tol}).")
-
-    @slow
-    def test_real_model_save_load_from_pretrained(self):
-        model_2 = self.get_pretrained_model()
-        input_ids = ids_tensor([13, 5], model_2.config.encoder.vocab_size)
-        decoder_input_ids = ids_tensor([13, 1], model_2.config.encoder.vocab_size)
-        attention_mask = ids_tensor([13, 5], vocab_size=2)
-
-        outputs = model_2(
-            input_ids=input_ids,
-            decoder_input_ids=decoder_input_ids,
-            attention_mask=attention_mask,
-        )
-        out_2 = np.array(outputs[0])
-        out_2[np.isnan(out_2)] = 0
-
-        with tempfile.TemporaryDirectory() as tmp_dirname:
-            model_2.save_pretrained(tmp_dirname)
-            model_1 = FlaxEncoderDecoderModel.from_pretrained(tmp_dirname)
-
-            after_outputs = model_1(
-                input_ids=input_ids,
-                decoder_input_ids=decoder_input_ids,
-                attention_mask=attention_mask,
-            )
-            out_1 = np.array(after_outputs[0])
-            out_1[np.isnan(out_1)] = 0
-            max_diff = np.amax(np.abs(out_1 - out_2))
-            self.assertLessEqual(max_diff, 1e-5)
-
-
-@require_flax
-class FlaxGPT2EncoderDecoderModelTest(FlaxEncoderDecoderMixin, unittest.TestCase):
-    def get_encoder_decoder_model(self, config, decoder_config):
-        encoder_model = FlaxBertModel(config)
-        decoder_model = FlaxGPT2LMHeadModel(decoder_config)
-        return encoder_model, decoder_model
-
-    def prepare_config_and_inputs(self):
-        model_tester_encoder = FlaxBertModelTester(self, batch_size=13)
-        model_tester_decoder = FlaxGPT2ModelTester(self, batch_size=13)
-        encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs()
-        decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs_for_decoder()
-        (config, input_ids, token_type_ids, attention_mask) = encoder_config_and_inputs
-        (
-            decoder_config,
-            decoder_input_ids,
-            decoder_attention_mask,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        ) = decoder_config_and_inputs
-
-        # make sure that cross attention layers are added
-        decoder_config.add_cross_attention = True
-        return {
-            "config": config,
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-            "decoder_config": decoder_config,
-            "decoder_input_ids": decoder_input_ids,
-            "decoder_attention_mask": decoder_attention_mask,
-            "encoder_hidden_states": encoder_hidden_states,
-        }
-
-    def get_pretrained_model(self):
-        return FlaxEncoderDecoderModel.from_encoder_decoder_pretrained(
-            "google-bert/bert-base-cased", "openai-community/gpt2"
-        )
-
-    @slow
-    def test_bert2gpt2_summarization(self):
-        tokenizer_in = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
-        tokenizer_out = AutoTokenizer.from_pretrained("openai-community/gpt2")
-
-        model = FlaxEncoderDecoderModel.from_pretrained(
-            "patrickvonplaten/bert2gpt2-cnn_dailymail-fp16", pad_token_id=tokenizer_out.eos_token_id
-        )
-
-        ARTICLE_STUDENTS = """(CNN)Sigma Alpha Epsilon is under fire for a video showing party-bound fraternity members singing a racist chant. SAE's national chapter suspended the students, but University of Oklahoma President David Boren took it a step further, saying the university's affiliation with the fraternity is permanently done. The news is shocking, but it's not the first time SAE has faced controversy. SAE was founded March 9, 1856, at the University of Alabama, five years before the American Civil War, according to the fraternity website. When the war began, the group had fewer than 400 members, of which "369 went to war for the Confederate States and seven for the Union Army," the website says. The fraternity now boasts more than 200,000 living alumni, along with about 15,000 undergraduates populating 219 chapters and 20 "colonies" seeking full membership at universities. SAE has had to work hard to change recently after a string of member deaths, many blamed on the hazing of new recruits, SAE national President Bradley Cohen wrote in a message on the fraternity's website. The fraternity's website lists more than 130 chapters cited or suspended for "health and safety incidents" since 2010. At least 30 of the incidents involved hazing, and dozens more involved alcohol. However, the list is missing numerous incidents from recent months. Among them, according to various media outlets: Yale University banned the SAEs from campus activities last month after members allegedly tried to interfere with a sexual misconduct investigation connected to an initiation rite. Stanford University in December suspended SAE housing privileges after finding sorority members attending a fraternity function were subjected to graphic sexual content. And Johns Hopkins University in November suspended the fraternity for underage drinking. "The media has labeled us as the 'nation's deadliest fraternity,' " Cohen said. 
In 2011, for example, a student died while being coerced into excessive alcohol consumption, according to a lawsuit. SAE's previous insurer dumped the fraternity. "As a result, we are paying Lloyd's of London the highest insurance rates in the Greek-letter world," Cohen said. Universities have turned down SAE's attempts to open new chapters, and the fraternity had to close 12 in 18 months over hazing incidents."""
-
-        EXPECTED_SUMMARY_STUDENTS = """SAE's national chapter suspended the students, but university president says it's permanent.\nSAE's national chapter has had to work hard to change recently.\nSAE's chapter has more than 200,000 members.\nSAE's chapter has been criticized for its hazing of new recruits."""
-
-        input_dict = tokenizer_in(ARTICLE_STUDENTS, return_tensors="np")
-        output_ids = model.generate(input_dict["input_ids"]).sequences
-        summary = tokenizer_out.batch_decode(output_ids, skip_special_tokens=True)
-
-        self.assertEqual(summary, [EXPECTED_SUMMARY_STUDENTS])
-
-
-@require_flax
-class FlaxBartEncoderDecoderModelTest(FlaxEncoderDecoderMixin, unittest.TestCase):
-    def get_encoder_decoder_model(self, config, decoder_config):
-        encoder_model = FlaxBertModel(config)
-        decoder_model = FlaxBartForCausalLM(decoder_config)
-        return encoder_model, decoder_model
-
-    def prepare_config_and_inputs(self):
-        model_tester_encoder = FlaxBertModelTester(self, batch_size=13)
-        model_tester_decoder = FlaxBartStandaloneDecoderModelTester(self, batch_size=13)
-        encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs()
-        decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs_for_decoder()
-        (config, input_ids, token_type_ids, attention_mask) = encoder_config_and_inputs
-        (
-            decoder_config,
-            decoder_input_ids,
-            decoder_attention_mask,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        ) = decoder_config_and_inputs
-
-        # make sure that cross attention layers are added
-        decoder_config.add_cross_attention = True
-        return {
-            "config": config,
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-            "decoder_config": decoder_config,
-            "decoder_input_ids": decoder_input_ids,
-            "decoder_attention_mask": decoder_attention_mask,
-            "encoder_hidden_states": encoder_hidden_states,
-        }
-
-    def get_pretrained_model(self):
-        return FlaxEncoderDecoderModel.from_encoder_decoder_pretrained(
-            "google-bert/bert-base-cased", "facebook/bart-base"
-        )
-
-
-@require_flax
-class FlaxBertEncoderDecoderModelTest(FlaxEncoderDecoderMixin, unittest.TestCase):
-    def get_encoder_decoder_model(self, config, decoder_config):
-        encoder_model = FlaxBertModel(config)
-        decoder_model = FlaxBertForCausalLM(decoder_config)
-        return encoder_model, decoder_model
-
-    def prepare_config_and_inputs(self):
-        model_tester_encoder = FlaxBertModelTester(self, batch_size=13)
-        model_tester_decoder = FlaxBertModelTester(self, batch_size=13)
-        encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs()
-        decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs_for_decoder()
-        (config, input_ids, token_type_ids, attention_mask) = encoder_config_and_inputs
-        (
-            decoder_config,
-            decoder_input_ids,
-            decoder_attention_mask,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        ) = decoder_config_and_inputs
-
-        # make sure that cross attention layers are added
-        decoder_config.add_cross_attention = True
-        return {
-            "config": config,
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-            "decoder_config": decoder_config,
-            "decoder_input_ids": decoder_input_ids,
-            "decoder_attention_mask": decoder_attention_mask,
-            "encoder_hidden_states": encoder_hidden_states,
-        }
-
-    def get_pretrained_model(self):
-        return FlaxEncoderDecoderModel.from_encoder_decoder_pretrained(
-            "google-bert/bert-base-cased", "google-bert/bert-base-cased"
-        )
-
-
-@require_flax
-class FlaxEncoderDecoderModelTest(unittest.TestCase):
-    def get_from_encoderdecoder_pretrained_model(self):
-        return FlaxEncoderDecoderModel.from_encoder_decoder_pretrained(
-            "google-bert/bert-base-cased", "openai-community/gpt2"
-        )
-
-    def _check_configuration_tie(self, model):
-        module = model.module.bind(model.params)
-
-        assert id(module.decoder.config) == id(model.config.decoder)
-        assert id(module.encoder.config) == id(model.config.encoder)
-
-    @slow
-    def test_configuration_tie(self):
-        model = self.get_from_encoderdecoder_pretrained_model()
-        self._check_configuration_tie(model)
diff --git a/tests/models/encoder_decoder/test_modeling_tf_encoder_decoder.py b/tests/models/encoder_decoder/test_modeling_tf_encoder_decoder.py
deleted file mode 100644
index 5e1da3242b..0000000000
--- a/tests/models/encoder_decoder/test_modeling_tf_encoder_decoder.py
+++ /dev/null
@@ -1,850 +0,0 @@
-# Copyright 2020 HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import os
-import tempfile
-import unittest
-
-import numpy as np
-
-from transformers import is_tf_available
-from transformers.testing_utils import require_tf, slow
-
-from ...test_modeling_tf_common import ids_tensor
-from ..bert.test_modeling_tf_bert import TFBertModelTester
-from ..gpt2.test_modeling_tf_gpt2 import TFGPT2ModelTester
-from ..rembert.test_modeling_tf_rembert import TFRemBertModelTester
-from ..roberta.test_modeling_tf_roberta import TFRobertaModelTester
-
-
-if is_tf_available():
-    from transformers import (
-        AutoConfig,
-        AutoTokenizer,
-        EncoderDecoderConfig,
-        TFAutoModel,
-        TFAutoModelForCausalLM,
-        TFBertLMHeadModel,
-        TFBertModel,
-        TFEncoderDecoderModel,
-        TFGPT2LMHeadModel,
-        TFRemBertForCausalLM,
-        TFRemBertModel,
-        TFRobertaForCausalLM,
-        TFRobertaModel,
-    )
-    from transformers.modeling_tf_outputs import TFBaseModelOutput
-
-
-@require_tf
-class TFEncoderDecoderMixin:
-    def get_encoder_decoder_model(self, config, decoder_config):
-        raise NotImplementedError
-
-    def prepare_config_and_inputs(self):
-        raise NotImplementedError
-
-    def get_pretrained_model(self):
-        raise NotImplementedError
-
-    def check_encoder_decoder_model_from_pretrained_configs(
-        self,
-        config,
-        input_ids,
-        attention_mask,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        **kwargs,
-    ):
-        encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)
-        self.assertTrue(encoder_decoder_config.decoder.is_decoder)
-
-        enc_dec_model = TFEncoderDecoderModel(encoder_decoder_config)
-
-        self.assertTrue(enc_dec_model.config.is_encoder_decoder)
-
-        outputs_encoder_decoder = enc_dec_model(
-            input_ids=input_ids,
-            decoder_input_ids=decoder_input_ids,
-            attention_mask=attention_mask,
-            decoder_attention_mask=decoder_attention_mask,
-            kwargs=kwargs,
-        )
-
-        self.assertEqual(
-            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
-        )
-        self.assertEqual(
-            outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,))
-        )
-
-    def check_encoder_decoder_model(
-        self,
-        config,
-        input_ids,
-        attention_mask,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        **kwargs,
-    ):
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        enc_dec_model = TFEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
-        self.assertTrue(enc_dec_model.config.decoder.is_decoder)
-        self.assertTrue(enc_dec_model.config.decoder.add_cross_attention)
-        self.assertTrue(enc_dec_model.config.is_encoder_decoder)
-
-        outputs_encoder_decoder = enc_dec_model(
-            input_ids=input_ids,
-            decoder_input_ids=decoder_input_ids,
-            attention_mask=attention_mask,
-            decoder_attention_mask=decoder_attention_mask,
-            kwargs=kwargs,
-        )
-        self.assertEqual(
-            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
-        )
-        self.assertEqual(
-            outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,))
-        )
-
-        encoder_outputs = TFBaseModelOutput(last_hidden_state=encoder_hidden_states)
-        outputs_encoder_decoder = enc_dec_model(
-            input_ids=None,
-            encoder_outputs=encoder_outputs,
-            decoder_input_ids=decoder_input_ids,
-            attention_mask=attention_mask,
-            decoder_attention_mask=decoder_attention_mask,
-            kwargs=kwargs,
-        )
-
-        self.assertEqual(
-            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
-        )
-        self.assertEqual(
-            outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,))
-        )
-
-    def check_encoder_decoder_model_from_pretrained(
-        self,
-        config,
-        input_ids,
-        attention_mask,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        return_dict,
-        **kwargs,
-    ):
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict}
-        enc_dec_model = TFEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs)
-        outputs_encoder_decoder = enc_dec_model(
-            input_ids=input_ids,
-            decoder_input_ids=decoder_input_ids,
-            attention_mask=attention_mask,
-            decoder_attention_mask=decoder_attention_mask,
-            return_dict=True,
-            kwargs=kwargs,
-        )
-
-        self.assertEqual(
-            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
-        )
-        self.assertEqual(
-            outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,))
-        )
-
-    def check_save_and_load(
-        self,
-        config,
-        input_ids,
-        attention_mask,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        **kwargs,
-    ):
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        enc_dec_model = TFEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
-
-        outputs = enc_dec_model(
-            input_ids=input_ids,
-            decoder_input_ids=decoder_input_ids,
-            attention_mask=attention_mask,
-            decoder_attention_mask=decoder_attention_mask,
-            kwargs=kwargs,
-        )
-        out_2 = np.array(outputs[0])
-        out_2[np.isnan(out_2)] = 0
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            enc_dec_model.save_pretrained(tmpdirname)
-            enc_dec_model = TFEncoderDecoderModel.from_pretrained(tmpdirname)
-
-            after_outputs = enc_dec_model(
-                input_ids=input_ids,
-                decoder_input_ids=decoder_input_ids,
-                attention_mask=attention_mask,
-                decoder_attention_mask=decoder_attention_mask,
-                kwargs=kwargs,
-            )
-            out_1 = np.array(after_outputs[0])
-            out_1[np.isnan(out_1)] = 0
-            max_diff = np.amax(np.abs(out_1 - out_2))
-            self.assertLessEqual(max_diff, 1e-5)
-
-    def check_encoder_decoder_model_labels(
-        self,
-        config,
-        input_ids,
-        attention_mask,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        labels,
-        **kwargs,
-    ):
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        enc_dec_model = TFEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
-
-        outputs_encoder_decoder = enc_dec_model(
-            input_ids=input_ids,
-            decoder_input_ids=decoder_input_ids,
-            attention_mask=attention_mask,
-            decoder_attention_mask=decoder_attention_mask,
-            labels=labels,
-            kwargs=kwargs,
-        )
-
-        # Make sure `loss` exist
-        self.assertIn("loss", outputs_encoder_decoder)
-
-        batch_size, seq_len = decoder_input_ids.shape
-        expected_shape = (batch_size, seq_len, decoder_config.vocab_size)
-        self.assertEqual(outputs_encoder_decoder["logits"].shape, expected_shape)
-        self.assertEqual(
-            outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,))
-        )
-
-    def _check_output_with_attentions(
-        self, outputs_encoder_decoder, config, input_ids, decoder_config, decoder_input_ids
-    ):
-        encoder_attentions = outputs_encoder_decoder["encoder_attentions"]
-        self.assertEqual(len(encoder_attentions), config.num_hidden_layers)
-
-        self.assertEqual(
-            encoder_attentions[0].shape[-3:], (config.num_attention_heads, input_ids.shape[-1], input_ids.shape[-1])
-        )
-
-        decoder_attentions = outputs_encoder_decoder["decoder_attentions"]
-        num_decoder_layers = (
-            decoder_config.num_decoder_layers
-            if hasattr(decoder_config, "num_decoder_layers")
-            else decoder_config.num_hidden_layers
-        )
-        self.assertEqual(len(decoder_attentions), num_decoder_layers)
-
-        self.assertEqual(
-            decoder_attentions[0].shape[-3:],
-            (decoder_config.num_attention_heads, decoder_input_ids.shape[-1], decoder_input_ids.shape[-1]),
-        )
-
-        cross_attentions = outputs_encoder_decoder["cross_attentions"]
-        self.assertEqual(len(cross_attentions), num_decoder_layers)
-
-        cross_attention_input_seq_len = decoder_input_ids.shape[-1] * (
-            1 + (decoder_config.ngram if hasattr(decoder_config, "ngram") else 0)
-        )
-        self.assertEqual(
-            cross_attentions[0].shape[-3:],
-            (decoder_config.num_attention_heads, cross_attention_input_seq_len, input_ids.shape[-1]),
-        )
-
-    def check_encoder_decoder_model_output_attentions(
-        self,
-        config,
-        input_ids,
-        attention_mask,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        **kwargs,
-    ):
-        # make the decoder inputs a different shape from the encoder inputs to harden the test
-        decoder_input_ids = decoder_input_ids[:, :-1]
-        decoder_attention_mask = decoder_attention_mask[:, :-1]
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        enc_dec_model = TFEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
-        outputs_encoder_decoder = enc_dec_model(
-            input_ids=input_ids,
-            decoder_input_ids=decoder_input_ids,
-            attention_mask=attention_mask,
-            decoder_attention_mask=decoder_attention_mask,
-            output_attentions=True,
-            kwargs=kwargs,
-        )
-        self._check_output_with_attentions(
-            outputs_encoder_decoder, config, input_ids, decoder_config, decoder_input_ids
-        )
-
-    def check_encoder_decoder_model_output_attentions_from_config(
-        self,
-        config,
-        input_ids,
-        attention_mask,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        **kwargs,
-    ):
-        # Similar to `check_encoder_decoder_model_output_attentions`, but with `output_attentions` triggered from the
-        # config file. Contrarily to most models, changing the model's config won't work -- the defaults are loaded
-        # from the inner models' configurations.
-
-        decoder_input_ids = decoder_input_ids[:, :-1]
-        decoder_attention_mask = decoder_attention_mask[:, :-1]
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        enc_dec_model = TFEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
-        enc_dec_model.config.output_attentions = True  # model config -> won't work
-        outputs_encoder_decoder = enc_dec_model(
-            input_ids=input_ids,
-            decoder_input_ids=decoder_input_ids,
-            attention_mask=attention_mask,
-            decoder_attention_mask=decoder_attention_mask,
-            kwargs=kwargs,
-        )
-        self.assertTrue(
-            all(
-                key not in outputs_encoder_decoder
-                for key in ["encoder_attentions", "decoder_attentions", "cross_attentions"]
-            )
-        )
-
-        config.output_attentions = True  # inner model config -> will work
-        decoder_config.output_attentions = True
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        enc_dec_model = TFEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
-        outputs_encoder_decoder = enc_dec_model(
-            input_ids=input_ids,
-            decoder_input_ids=decoder_input_ids,
-            attention_mask=attention_mask,
-            decoder_attention_mask=decoder_attention_mask,
-            kwargs=kwargs,
-        )
-        self._check_output_with_attentions(
-            outputs_encoder_decoder, config, input_ids, decoder_config, decoder_input_ids
-        )
-
-    def check_encoder_decoder_model_generate(self, input_ids, config, decoder_config, **kwargs):
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        enc_dec_model = TFEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
-
-        # Generate until max length
-        if hasattr(enc_dec_model.config, "eos_token_id"):
-            enc_dec_model.config.eos_token_id = None
-        if hasattr(enc_dec_model.config, "decoder") and hasattr(enc_dec_model.config.decoder, "eos_token_id"):
-            enc_dec_model.config.decoder.eos_token_id = None
-        if hasattr(enc_dec_model.generation_config, "eos_token_id"):
-            enc_dec_model.generation_config.eos_token_id = None
-
-        # Bert does not have a bos token id, so use pad_token_id instead
-        generated_output = enc_dec_model.generate(
-            input_ids, decoder_start_token_id=enc_dec_model.config.decoder.pad_token_id
-        )
-        self.assertEqual(tuple(generated_output.shape.as_list()), (input_ids.shape[0],) + (decoder_config.max_length,))
-
-    def test_encoder_decoder_model(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model(**input_ids_dict)
-
-    def test_encoder_decoder_model_from_pretrained_configs(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_from_pretrained_configs(**input_ids_dict)
-
-    def test_encoder_decoder_model_from_pretrained(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_from_pretrained(**input_ids_dict, return_dict=False)
-
-    def test_encoder_decoder_model_from_pretrained_return_dict(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_from_pretrained(**input_ids_dict, return_dict=True)
-
-    def test_save_and_load_from_pretrained(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_save_and_load(**input_ids_dict)
-
-    def test_encoder_decoder_model_labels(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_labels(**input_ids_dict)
-
-    def test_encoder_decoder_model_output_attentions(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_output_attentions(**input_ids_dict)
-
-    def test_encoder_decoder_model_output_attentions_from_config(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_output_attentions_from_config(**input_ids_dict)
-
-    def test_encoder_decoder_model_generate(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_generate(**input_ids_dict)
-
-    def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float):
-        diff = np.abs(a - b).max()
-        self.assertLessEqual(diff, tol, f"Difference between torch and tf is {diff} (>= {tol}).")
-
-    def test_model_save_load_from_pretrained(self):
-        model_2 = self.get_pretrained_model()
-        input_ids = ids_tensor([13, 5], model_2.config.encoder.vocab_size)
-        decoder_input_ids = ids_tensor([13, 1], model_2.config.decoder.vocab_size)
-        attention_mask = ids_tensor([13, 5], vocab_size=2)
-
-        outputs = model_2(
-            input_ids=input_ids,
-            decoder_input_ids=decoder_input_ids,
-            attention_mask=attention_mask,
-        )
-        out_2 = np.array(outputs[0])
-        out_2[np.isnan(out_2)] = 0
-
-        with tempfile.TemporaryDirectory() as tmp_dirname:
-            model_2.save_pretrained(tmp_dirname)
-            model_1 = TFEncoderDecoderModel.from_pretrained(tmp_dirname)
-
-            after_outputs = model_1(
-                input_ids=input_ids,
-                decoder_input_ids=decoder_input_ids,
-                attention_mask=attention_mask,
-            )
-            out_1 = np.array(after_outputs[0])
-            out_1[np.isnan(out_1)] = 0
-            max_diff = np.amax(np.abs(out_1 - out_2))
-            self.assertLessEqual(max_diff, 1e-5)
-
-
-@require_tf
-class TFBertEncoderDecoderModelTest(TFEncoderDecoderMixin, unittest.TestCase):
-    def setUp(self):
-        self.encoder_model_tester = TFBertModelTester(self, batch_size=13)
-        self.decoder_model_tester = TFBertModelTester(self, batch_size=13)
-
-    def get_pretrained_model(self):
-        return TFEncoderDecoderModel.from_encoder_decoder_pretrained(
-            "hf-internal-testing/tiny-random-bert",
-            "hf-internal-testing/tiny-random-bert",
-        )
-
-    def get_encoder_decoder_model(self, config, decoder_config):
-        encoder_model = TFBertModel(config, name="encoder")
-        decoder_model = TFBertLMHeadModel(decoder_config, name="decoder")
-        return encoder_model, decoder_model
-
-    def prepare_config_and_inputs(self):
-        encoder_config_and_inputs = self.encoder_model_tester.prepare_config_and_inputs()
-        decoder_config_and_inputs = self.decoder_model_tester.prepare_config_and_inputs_for_decoder()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            attention_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = encoder_config_and_inputs
-        (
-            decoder_config,
-            decoder_input_ids,
-            decoder_token_type_ids,
-            decoder_attention_mask,
-            decoder_sequence_labels,
-            decoder_token_labels,
-            decoder_choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        ) = decoder_config_and_inputs
-
-        # make sure that cross attention layers are added
-        decoder_config.add_cross_attention = True
-        #  disable cache for now
-        decoder_config.use_cache = False
-        return {
-            "config": config,
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-            "decoder_config": decoder_config,
-            "decoder_input_ids": decoder_input_ids,
-            "decoder_token_type_ids": decoder_token_type_ids,
-            "decoder_attention_mask": decoder_attention_mask,
-            "decoder_sequence_labels": decoder_sequence_labels,
-            "decoder_token_labels": decoder_token_labels,
-            "decoder_choice_labels": decoder_choice_labels,
-            "encoder_hidden_states": encoder_hidden_states,
-            "labels": decoder_token_labels,
-        }
-
-
-@require_tf
-class TFGPT2EncoderDecoderModelTest(TFEncoderDecoderMixin, unittest.TestCase):
-    def setUp(self):
-        self.encoder_model_tester = TFBertModelTester(self, batch_size=13)
-        self.decoder_model_tester = TFGPT2ModelTester(self)
-
-    def get_pretrained_model(self):
-        return TFEncoderDecoderModel.from_encoder_decoder_pretrained(
-            "hf-internal-testing/tiny-random-bert",
-            "hf-internal-testing/tiny-random-gpt2",
-        )
-
-    def get_encoder_decoder_model(self, config, decoder_config):
-        encoder_model = TFBertModel(config, name="encoder")
-        decoder_model = TFGPT2LMHeadModel(decoder_config, name="decoder")
-        return encoder_model, decoder_model
-
-    def prepare_config_and_inputs(self):
-        encoder_config_and_inputs = self.encoder_model_tester.prepare_config_and_inputs()
-        decoder_config_and_inputs = self.decoder_model_tester.prepare_config_and_inputs_for_decoder()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            attention_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = encoder_config_and_inputs
-        (
-            decoder_config,
-            decoder_input_ids,
-            decoder_attention_mask,
-            decoder_head_mask,
-            decoder_token_type_ids,
-            decoder_sequence_labels,
-            decoder_token_labels,
-            decoder_choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        ) = decoder_config_and_inputs
-
-        # make sure that cross attention layers are added
-        decoder_config.add_cross_attention = True
-        # disable cache for now
-        decoder_config.use_cache = False
-        return {
-            "config": config,
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-            "decoder_config": decoder_config,
-            "decoder_input_ids": decoder_input_ids,
-            "decoder_token_type_ids": decoder_token_type_ids,
-            "decoder_attention_mask": decoder_attention_mask,
-            "decoder_sequence_labels": decoder_sequence_labels,
-            "decoder_token_labels": decoder_token_labels,
-            "decoder_choice_labels": decoder_choice_labels,
-            "encoder_hidden_states": encoder_hidden_states,
-            "labels": decoder_token_labels,
-        }
-
-
-@require_tf
-class TFRoBertaEncoderDecoderModelTest(TFEncoderDecoderMixin, unittest.TestCase):
-    def setUp(self):
-        self.encoder_model_tester = TFRobertaModelTester(self)
-        self.decoder_model_tester = TFRobertaModelTester(self)
-
-    def get_pretrained_model(self):
-        return TFEncoderDecoderModel.from_encoder_decoder_pretrained(
-            "hf-internal-testing/tiny-random-roberta",
-            "hf-internal-testing/tiny-random-roberta",
-        )
-
-    def get_encoder_decoder_model(self, config, decoder_config):
-        encoder_model = TFRobertaModel(config, name="encoder")
-        decoder_model = TFRobertaForCausalLM(decoder_config, name="decoder")
-        return encoder_model, decoder_model
-
-    def prepare_config_and_inputs(self):
-        encoder_config_and_inputs = self.encoder_model_tester.prepare_config_and_inputs()
-        decoder_config_and_inputs = self.decoder_model_tester.prepare_config_and_inputs_for_decoder()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = encoder_config_and_inputs
-        (
-            decoder_config,
-            decoder_input_ids,
-            decoder_token_type_ids,
-            decoder_input_mask,
-            decoder_sequence_labels,
-            decoder_token_labels,
-            decoder_choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        ) = decoder_config_and_inputs
-
-        # make sure that cross attention layers are added
-        decoder_config.add_cross_attention = True
-        #  disable cache for now
-        decoder_config.use_cache = False
-        return {
-            "config": config,
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "decoder_config": decoder_config,
-            "decoder_input_ids": decoder_input_ids,
-            "decoder_token_type_ids": decoder_token_type_ids,
-            "decoder_attention_mask": decoder_input_mask,
-            "decoder_sequence_labels": decoder_sequence_labels,
-            "decoder_token_labels": decoder_token_labels,
-            "decoder_choice_labels": decoder_choice_labels,
-            "encoder_hidden_states": encoder_hidden_states,
-            "labels": decoder_token_labels,
-        }
-
-
-@require_tf
-class TFRembertEncoderDecoderModelTest(TFEncoderDecoderMixin, unittest.TestCase):
-    def setUp(self):
-        self.encoder_model_tester = TFRemBertModelTester(self)
-        self.decoder_model_tester = TFRemBertModelTester(self)
-
-    def get_pretrained_model(self):
-        return TFEncoderDecoderModel.from_encoder_decoder_pretrained(
-            "hf-internal-testing/tiny-random-rembert",
-            "hf-internal-testing/tiny-random-rembert",
-        )
-
-    def get_encoder_decoder_model(self, config, decoder_config):
-        encoder_model = TFRemBertModel(config, name="encoder")
-        decoder_model = TFRemBertForCausalLM(decoder_config, name="decoder")
-        return encoder_model, decoder_model
-
-    def prepare_config_and_inputs(self):
-        encoder_config_and_inputs = self.encoder_model_tester.prepare_config_and_inputs()
-        decoder_config_and_inputs = self.decoder_model_tester.prepare_config_and_inputs_for_decoder()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = encoder_config_and_inputs
-        (
-            decoder_config,
-            decoder_input_ids,
-            decoder_token_type_ids,
-            decoder_input_mask,
-            decoder_sequence_labels,
-            decoder_token_labels,
-            decoder_choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        ) = decoder_config_and_inputs
-
-        # make sure that cross attention layers are added
-        decoder_config.add_cross_attention = True
-        #  disable cache for now
-        decoder_config.use_cache = False
-        return {
-            "config": config,
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "decoder_config": decoder_config,
-            "decoder_input_ids": decoder_input_ids,
-            "decoder_token_type_ids": decoder_token_type_ids,
-            "decoder_attention_mask": decoder_input_mask,
-            "decoder_sequence_labels": decoder_sequence_labels,
-            "decoder_token_labels": decoder_token_labels,
-            "decoder_choice_labels": decoder_choice_labels,
-            "encoder_hidden_states": encoder_hidden_states,
-            "labels": decoder_token_labels,
-        }
-
-
-@require_tf
-class TFEncoderDecoderModelTest(unittest.TestCase):
-    def get_from_encoderdecoder_pretrained_model(self):
-        return TFEncoderDecoderModel.from_encoder_decoder_pretrained(
-            "google-bert/bert-base-cased", "google-bert/bert-base-cased"
-        )
-
-    def get_decoder_config(self):
-        config = AutoConfig.from_pretrained("google-bert/bert-base-cased")
-        config.is_decoder = True
-        config.add_cross_attention = True
-        return config
-
-    def get_encoderdecoder_model(self):
-        return TFEncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16")
-
-    def get_encoder_decoder_models(self):
-        encoder_model = TFBertModel.from_pretrained("google-bert/bert-base-cased", name="encoder")
-        decoder_model = TFBertLMHeadModel.from_pretrained(
-            "google-bert/bert-base-cased", config=self.get_decoder_config(), name="decoder"
-        )
-        return {"encoder": encoder_model, "decoder": decoder_model}
-
-    def _check_configuration_tie(self, model):
-        assert id(model.decoder.config) == id(model.config.decoder)
-        assert id(model.encoder.config) == id(model.config.encoder)
-
-    @slow
-    def test_configuration_tie(self):
-        model = self.get_from_encoderdecoder_pretrained_model()
-        self._check_configuration_tie(model)
-
-        model = TFEncoderDecoderModel(**self.get_encoder_decoder_models())
-        self._check_configuration_tie(model)
-
-        # # This should be enabled once we upload the TF version of
-        # # "patrickvonplaten/bert2bert-cnn_dailymail-fp16" to the Hub.
-        # model = self.get_encoderdecoder_model()
-        # self._check_configuration_tie(model)
-
-
-@require_tf
-class TFEncoderDecoderModelSaveLoadTests(unittest.TestCase):
-    def get_encoder_decoder_config(self):
-        encoder_config = AutoConfig.from_pretrained("google-bert/bert-base-uncased")
-        decoder_config = AutoConfig.from_pretrained(
-            "google-bert/bert-base-uncased", is_decoder=True, add_cross_attention=True
-        )
-        return EncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config)
-
-    def get_encoder_decoder_config_small(self):
-        encoder_config = AutoConfig.from_pretrained("hf-internal-testing/tiny-bert")
-        decoder_config = AutoConfig.from_pretrained(
-            "hf-internal-testing/tiny-bert", is_decoder=True, add_cross_attention=True
-        )
-        return EncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config)
-
-    def test_encoder_decoder_save_load_from_encoder_decoder(self):
-        config = self.get_encoder_decoder_config_small()
-
-        # create two random BERT models for bert2bert & initialize weights (+cross_attention weights)
-        encoder = TFBertModel(config.encoder)
-        encoder.build_in_name_scope()
-        decoder = TFBertLMHeadModel(config.decoder)
-        decoder.build_in_name_scope()
-
-        encoder_decoder_orig = TFEncoderDecoderModel(encoder=encoder, decoder=decoder)
-
-        input_ids = ids_tensor([13, 5], encoder.config.vocab_size)
-        decoder_input_ids = ids_tensor([13, 1], decoder.config.vocab_size)
-
-        logits_orig = encoder_decoder_orig(input_ids=input_ids, decoder_input_ids=decoder_input_ids).logits
-
-        with tempfile.TemporaryDirectory() as tmp_dirname:
-            encoder_path = os.path.join(tmp_dirname, "encoder")
-            decoder_path = os.path.join(tmp_dirname, "decoder")
-
-            encoder.save_pretrained(encoder_path)
-            decoder.save_pretrained(decoder_path)
-
-            encoder_decoder = TFEncoderDecoderModel.from_encoder_decoder_pretrained(encoder_path, decoder_path)
-
-        logits_1 = encoder_decoder(input_ids=input_ids, decoder_input_ids=decoder_input_ids).logits
-
-        self.assertTrue(logits_orig.numpy().sum() - logits_1.numpy().sum() < 1e-3)
-
-        max_diff = np.max(np.abs(logits_1.numpy() - logits_orig.numpy()))
-        self.assertAlmostEqual(max_diff, 0.0, places=4)
-
-        with tempfile.TemporaryDirectory() as tmp_dirname:
-            encoder_decoder.save_pretrained(tmp_dirname)
-            encoder_decoder = TFEncoderDecoderModel.from_pretrained(tmp_dirname)
-
-        logits_2 = encoder_decoder(input_ids=input_ids, decoder_input_ids=decoder_input_ids).logits
-
-        max_diff = np.max(np.abs(logits_2.numpy() - logits_orig.numpy()))
-        self.assertAlmostEqual(max_diff, 0.0, places=4)
-
-    @slow
-    def test_encoder_decoder_from_pretrained(self):
-        load_weight_prefix = TFEncoderDecoderModel.load_weight_prefix
-
-        config = self.get_encoder_decoder_config()
-        encoder_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
-        decoder_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
-
-        input_ids = encoder_tokenizer("who sings does he love me with reba", return_tensors="tf").input_ids
-        decoder_input_ids = decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids
-
-        with tempfile.TemporaryDirectory() as tmp_dirname:
-            # Since most of HF's models don't have pretrained cross-attention layers, they are randomly
-            # initialized even if we create models using `from_pretrained` method.
-            # For the tests, the decoder need to be a model with pretrained cross-attention layers.
-            # So we create pretrained models (without `load_weight_prefix`), save them, and later,
-            # we load them using `from_pretrained`.
-            # (we don't need to do this for encoder, but let's make the code more similar between encoder/decoder)
-            encoder = TFAutoModel.from_pretrained("google-bert/bert-base-uncased", name="encoder")
-            # It's necessary to specify `add_cross_attention=True` here.
-            decoder = TFAutoModelForCausalLM.from_pretrained(
-                "google-bert/bert-base-uncased", is_decoder=True, add_cross_attention=True, name="decoder"
-            )
-            pretrained_encoder_dir = os.path.join(tmp_dirname, "pretrained_encoder")
-            pretrained_decoder_dir = os.path.join(tmp_dirname, "pretrained_decoder")
-            encoder.save_pretrained(pretrained_encoder_dir)
-            decoder.save_pretrained(pretrained_decoder_dir)
-            del encoder
-            del decoder
-
-            enc_dec_model = TFEncoderDecoderModel.from_encoder_decoder_pretrained(
-                pretrained_encoder_dir,
-                pretrained_decoder_dir,
-            )
-            # check that the from pretrained methods work
-            enc_dec_model.save_pretrained(tmp_dirname)
-            enc_dec_model = TFEncoderDecoderModel.from_pretrained(tmp_dirname)
-
-            output = enc_dec_model(input_ids, decoder_input_ids=decoder_input_ids, labels=decoder_input_ids)
-
-            loss_pretrained = output.loss
-            del enc_dec_model
-
-            # Create the model using `__init__` with loaded ``pretrained`` encoder / decoder
-            encoder = TFAutoModel.from_pretrained(
-                pretrained_encoder_dir, load_weight_prefix=load_weight_prefix, name="encoder"
-            )
-            decoder = TFAutoModelForCausalLM.from_pretrained(
-                pretrained_decoder_dir, load_weight_prefix=load_weight_prefix, name="decoder"
-            )
-            enc_dec_model = TFEncoderDecoderModel(config=config, encoder=encoder, decoder=decoder)
-
-        output = enc_dec_model(input_ids, decoder_input_ids=decoder_input_ids, labels=decoder_input_ids)
-
-        loss_init = output.loss
-
-        max_diff = np.max(np.abs(loss_pretrained - loss_init))
-        expected_diff = 0.0
-
-        self.assertAlmostEqual(max_diff, expected_diff, places=4)
diff --git a/tests/models/esm/test_modeling_tf_esm.py b/tests/models/esm/test_modeling_tf_esm.py
deleted file mode 100644
index c7478ab3c0..0000000000
--- a/tests/models/esm/test_modeling_tf_esm.py
+++ /dev/null
@@ -1,323 +0,0 @@
-# Copyright 2022 The HuggingFace Inc. Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import EsmConfig, is_tf_available
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import numpy
-    import tensorflow as tf
-
-    from transformers.modeling_tf_utils import keras
-    from transformers.models.esm.modeling_tf_esm import (
-        TFEsmForMaskedLM,
-        TFEsmForSequenceClassification,
-        TFEsmForTokenClassification,
-        TFEsmModel,
-    )
-
-
-# copied from tests.test_modeling_tf_roberta
-class TFEsmModelTester:
-    def __init__(
-        self,
-        parent,
-    ):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.is_training = True
-        self.use_input_mask = True
-        self.use_labels = True
-        self.vocab_size = 99
-        self.hidden_size = 32
-        self.num_hidden_layers = 2
-        self.num_attention_heads = 4
-        self.intermediate_size = 37
-        self.hidden_act = "gelu"
-        self.hidden_dropout_prob = 0.1
-        self.attention_probs_dropout_prob = 0.1
-        self.max_position_embeddings = 512
-        self.type_vocab_size = 16
-        self.type_sequence_label_size = 2
-        self.initializer_range = 0.02
-        self.num_labels = 3
-        self.num_choices = 4
-        self.scope = None
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = EsmConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            pad_token_id=1,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            initializer_range=self.initializer_range,
-        )
-
-        return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def prepare_config_and_inputs_for_decoder(self):
-        (
-            config,
-            input_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = self.prepare_config_and_inputs()
-
-        config.is_decoder = True
-        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
-        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-        return (
-            config,
-            input_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-    def create_and_check_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
-        model = TFEsmModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask}
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_model_as_decoder(
-        self,
-        config,
-        input_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-
-        model = TFEsmModel(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "encoder_hidden_states": encoder_hidden_states,
-            "encoder_attention_mask": encoder_attention_mask,
-        }
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs, encoder_hidden_states=encoder_hidden_states)
-
-        # Also check the case where encoder outputs are not passed
-        result = model(input_ids, attention_mask=input_mask)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_for_masked_lm(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFEsmForMaskedLM(config=config)
-        result = model([input_ids, input_mask])
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_for_token_classification(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFEsmForTokenClassification(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class TFEsmModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFEsmModel,
-            TFEsmForMaskedLM,
-            TFEsmForSequenceClassification,
-            TFEsmForTokenClassification,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFEsmModel,
-            "fill-mask": TFEsmForMaskedLM,
-            "text-classification": TFEsmForSequenceClassification,
-            "token-classification": TFEsmForTokenClassification,
-            "zero-shot": TFEsmForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFEsmModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=EsmConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        """Test the base model"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_model_as_decoder(self):
-        """Test the base model as a decoder (of an encoder-decoder architecture)
-
-        is_decoder=True + cross_attention + pass encoder outputs
-        """
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "facebook/esm2_t6_8M_UR50D"
-        model = TFEsmModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    @unittest.skip("Protein models do not support embedding resizing.")
-    def test_resize_token_embeddings(self):
-        pass
-
-    @unittest.skip("Protein models do not support embedding resizing.")
-    def test_save_load_after_resize_token_embeddings(self):
-        pass
-
-    def test_model_common_attributes(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            assert isinstance(model.get_input_embeddings(), keras.layers.Layer)
-            if model_class is TFEsmForMaskedLM:
-                # Output embedding test differs from the main test because they're a matrix, not a layer
-                name = model.get_bias()
-                assert isinstance(name, dict)
-                for k, v in name.items():
-                    assert isinstance(v, tf.Variable)
-            else:
-                x = model.get_output_embeddings()
-                assert x is None
-                name = model.get_bias()
-                assert name is None
-
-
-@require_tf
-class TFEsmModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_masked_lm(self):
-        model = TFEsmForMaskedLM.from_pretrained("facebook/esm2_t6_8M_UR50D")
-
-        input_ids = tf.constant([[0, 1, 2, 3, 4, 5]])
-        output = model(input_ids)[0]
-        expected_shape = [1, 6, 33]
-        self.assertEqual(list(output.numpy().shape), expected_shape)
-        # compare the actual values for a slice.
-        expected_slice = tf.constant(
-            [
-                [
-                    [8.921518, -10.589814, -6.4671307],
-                    [-6.3967156, -13.911377, -1.1211915],
-                    [-7.781247, -13.951557, -3.740592],
-                ]
-            ]
-        )
-        self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-2))
-
-    @slow
-    def test_inference_no_head(self):
-        model = TFEsmModel.from_pretrained("facebook/esm2_t6_8M_UR50D")
-
-        input_ids = tf.constant([[0, 6, 4, 13, 5, 4, 16, 12, 11, 7, 2]])
-        output = model(input_ids)[0]
-        # compare the actual values for a slice.
-        expected_slice = tf.constant(
-            [
-                [
-                    [0.14443092, 0.54125327, 0.3247739],
-                    [0.30340484, 0.00526676, 0.31077722],
-                    [0.32278043, -0.24987096, 0.3414628],
-                ]
-            ]
-        )
-        self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4))
diff --git a/tests/models/flaubert/test_modeling_tf_flaubert.py b/tests/models/flaubert/test_modeling_tf_flaubert.py
deleted file mode 100644
index 1a2931c398..0000000000
--- a/tests/models/flaubert/test_modeling_tf_flaubert.py
+++ /dev/null
@@ -1,398 +0,0 @@
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import is_tf_available
-from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import numpy as np
-    import tensorflow as tf
-
-    from transformers import (
-        FlaubertConfig,
-        TFFlaubertForMultipleChoice,
-        TFFlaubertForQuestionAnsweringSimple,
-        TFFlaubertForSequenceClassification,
-        TFFlaubertForTokenClassification,
-        TFFlaubertModel,
-        TFFlaubertWithLMHeadModel,
-    )
-
-
-class TFFlaubertModelTester:
-    def __init__(
-        self,
-        parent,
-    ):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.is_training = True
-        self.use_input_lengths = True
-        self.use_token_type_ids = True
-        self.use_labels = True
-        self.gelu_activation = True
-        self.sinusoidal_embeddings = False
-        self.causal = False
-        self.asm = False
-        self.n_langs = 2
-        self.vocab_size = 99
-        self.n_special = 0
-        self.hidden_size = 32
-        self.num_hidden_layers = 2
-        self.num_attention_heads = 4
-        self.hidden_dropout_prob = 0.1
-        self.attention_probs_dropout_prob = 0.1
-        self.max_position_embeddings = 512
-        self.type_vocab_size = 16
-        self.type_sequence_label_size = 2
-        self.initializer_range = 0.02
-        self.num_labels = 3
-        self.num_choices = 4
-        self.summary_type = "last"
-        self.use_proj = True
-        self.scope = None
-        self.bos_token_id = 0
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-        input_mask = random_attention_mask([self.batch_size, self.seq_length], dtype=tf.float32)
-
-        input_lengths = None
-        if self.use_input_lengths:
-            input_lengths = (
-                ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2
-            )  # small variation of seq_length
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs)
-
-        sequence_labels = None
-        token_labels = None
-        is_impossible_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = FlaubertConfig(
-            vocab_size=self.vocab_size,
-            n_special=self.n_special,
-            emb_dim=self.hidden_size,
-            n_layers=self.num_hidden_layers,
-            n_heads=self.num_attention_heads,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            gelu_activation=self.gelu_activation,
-            sinusoidal_embeddings=self.sinusoidal_embeddings,
-            asm=self.asm,
-            causal=self.causal,
-            n_langs=self.n_langs,
-            max_position_embeddings=self.max_position_embeddings,
-            initializer_range=self.initializer_range,
-            summary_type=self.summary_type,
-            use_proj=self.use_proj,
-            bos_token_id=self.bos_token_id,
-        )
-
-        return (
-            config,
-            input_ids,
-            token_type_ids,
-            input_lengths,
-            sequence_labels,
-            token_labels,
-            is_impossible_labels,
-            choice_labels,
-            input_mask,
-        )
-
-    def create_and_check_flaubert_model(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_lengths,
-        sequence_labels,
-        token_labels,
-        is_impossible_labels,
-        choice_labels,
-        input_mask,
-    ):
-        model = TFFlaubertModel(config=config)
-        inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids}
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_flaubert_lm_head(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_lengths,
-        sequence_labels,
-        token_labels,
-        is_impossible_labels,
-        choice_labels,
-        input_mask,
-    ):
-        model = TFFlaubertWithLMHeadModel(config)
-
-        inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids}
-        result = model(inputs)
-
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_flaubert_qa(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_lengths,
-        sequence_labels,
-        token_labels,
-        is_impossible_labels,
-        choice_labels,
-        input_mask,
-    ):
-        model = TFFlaubertForQuestionAnsweringSimple(config)
-
-        inputs = {"input_ids": input_ids, "lengths": input_lengths}
-
-        result = model(inputs)
-
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def create_and_check_flaubert_sequence_classif(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_lengths,
-        sequence_labels,
-        token_labels,
-        is_impossible_labels,
-        choice_labels,
-        input_mask,
-    ):
-        model = TFFlaubertForSequenceClassification(config)
-
-        inputs = {"input_ids": input_ids, "lengths": input_lengths}
-
-        result = model(inputs)
-
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-    def create_and_check_flaubert_for_token_classification(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_lengths,
-        sequence_labels,
-        token_labels,
-        is_impossible_labels,
-        choice_labels,
-        input_mask,
-    ):
-        config.num_labels = self.num_labels
-        model = TFFlaubertForTokenClassification(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def create_and_check_flaubert_for_multiple_choice(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_lengths,
-        sequence_labels,
-        token_labels,
-        is_impossible_labels,
-        choice_labels,
-        input_mask,
-    ):
-        config.num_choices = self.num_choices
-        model = TFFlaubertForMultipleChoice(config=config)
-        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
-        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
-        inputs = {
-            "input_ids": multiple_choice_inputs_ids,
-            "attention_mask": multiple_choice_input_mask,
-            "token_type_ids": multiple_choice_token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_lengths,
-            sequence_labels,
-            token_labels,
-            is_impossible_labels,
-            choice_labels,
-            input_mask,
-        ) = config_and_inputs
-        inputs_dict = {
-            "input_ids": input_ids,
-            "token_type_ids": token_type_ids,
-            "langs": token_type_ids,
-            "lengths": input_lengths,
-        }
-        return config, inputs_dict
-
-
-@require_tf
-class TFFlaubertModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFFlaubertModel,
-            TFFlaubertWithLMHeadModel,
-            TFFlaubertForSequenceClassification,
-            TFFlaubertForQuestionAnsweringSimple,
-            TFFlaubertForTokenClassification,
-            TFFlaubertForMultipleChoice,
-        )
-        if is_tf_available()
-        else ()
-    )
-    all_generative_model_classes = (
-        (TFFlaubertWithLMHeadModel,) if is_tf_available() else ()
-    )  # TODO (PVP): Check other models whether language generation is also applicable
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFFlaubertModel,
-            "fill-mask": TFFlaubertWithLMHeadModel,
-            "question-answering": TFFlaubertForQuestionAnsweringSimple,
-            "text-classification": TFFlaubertForSequenceClassification,
-            "token-classification": TFFlaubertForTokenClassification,
-            "zero-shot": TFFlaubertForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_head_masking = False
-    test_onnx = False
-
-    # TODO: Fix the failed tests
-    def is_pipeline_test_to_skip(
-        self,
-        pipeline_test_case_name,
-        config_class,
-        model_architecture,
-        tokenizer_name,
-        image_processor_name,
-        feature_extractor_name,
-        processor_name,
-    ):
-        if (
-            pipeline_test_case_name == "QAPipelineTests"
-            and tokenizer_name is not None
-            and not tokenizer_name.endswith("Fast")
-        ):
-            # `QAPipelineTests` fails for a few models when the slower tokenizer are used.
-            # (The slower tokenizers were never used for pipeline tests before the pipeline testing rework)
-            # TODO: check (and possibly fix) the `QAPipelineTests` with slower tokenizer
-            return True
-
-        return False
-
-    def setUp(self):
-        self.model_tester = TFFlaubertModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=FlaubertConfig, emb_dim=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_flaubert_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_flaubert_model(*config_and_inputs)
-
-    def test_flaubert_lm_head(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_flaubert_lm_head(*config_and_inputs)
-
-    def test_flaubert_qa(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_flaubert_qa(*config_and_inputs)
-
-    def test_flaubert_sequence_classif(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_flaubert_sequence_classif(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_flaubert_for_token_classification(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_flaubert_for_multiple_choice(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "hf-internal-testing/tiny-random-flaubert"
-        model = TFFlaubertModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-@require_tf
-@require_sentencepiece
-@require_tokenizers
-class TFFlaubertModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_output_embeds_base_model(self):
-        model = TFFlaubertModel.from_pretrained("jplu/tf-flaubert-small-cased")
-
-        input_ids = tf.convert_to_tensor(
-            [[0, 158, 735, 2592, 1424, 6727, 82, 1]],
-            dtype=tf.int32,
-        )  # "J'aime flaubert !"
-
-        output = model(input_ids)[0]
-        expected_shape = tf.TensorShape((1, 8, 512))
-        self.assertEqual(output.shape, expected_shape)
-        # compare the actual values for a slice.
-        expected_slice = tf.convert_to_tensor(
-            [
-                [
-                    [-1.8768773, -1.566555, 0.27072418],
-                    [-1.6920038, -0.5873505, 1.9329599],
-                    [-2.9563985, -1.6993835, 1.7972052],
-                ]
-            ],
-            dtype=tf.float32,
-        )
-
-        self.assertTrue(np.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4))
diff --git a/tests/models/funnel/test_modeling_tf_funnel.py b/tests/models/funnel/test_modeling_tf_funnel.py
deleted file mode 100644
index 673982eb7b..0000000000
--- a/tests/models/funnel/test_modeling_tf_funnel.py
+++ /dev/null
@@ -1,414 +0,0 @@
-# Copyright 2020 HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import FunnelConfig, is_tf_available
-from transformers.testing_utils import require_tf
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import (
-        TFFunnelBaseModel,
-        TFFunnelForMaskedLM,
-        TFFunnelForMultipleChoice,
-        TFFunnelForPreTraining,
-        TFFunnelForQuestionAnswering,
-        TFFunnelForSequenceClassification,
-        TFFunnelForTokenClassification,
-        TFFunnelModel,
-    )
-
-
-class TFFunnelModelTester:
-    """You can also import this e.g, from .test_modeling_funnel import FunnelModelTester"""
-
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        block_sizes=[1, 1, 2],
-        num_decoder_layers=1,
-        d_model=32,
-        n_head=4,
-        d_head=8,
-        d_inner=37,
-        hidden_act="gelu_new",
-        hidden_dropout=0.1,
-        attention_dropout=0.1,
-        activation_dropout=0.0,
-        max_position_embeddings=512,
-        type_vocab_size=3,
-        initializer_std=0.02,  # Set to a smaller value, so we can keep the small error threshold (1e-5) in the test
-        num_labels=3,
-        num_choices=4,
-        scope=None,
-        base=False,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.block_sizes = block_sizes
-        self.num_decoder_layers = num_decoder_layers
-        self.d_model = d_model
-        self.n_head = n_head
-        self.d_head = d_head
-        self.d_inner = d_inner
-        self.hidden_act = hidden_act
-        self.hidden_dropout = hidden_dropout
-        self.attention_dropout = attention_dropout
-        self.activation_dropout = activation_dropout
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = 2
-        self.num_labels = num_labels
-        self.num_choices = num_choices
-        self.scope = scope
-        self.initializer_std = initializer_std
-
-        # Used in the tests to check the size of the first attention layer
-        self.num_attention_heads = n_head
-        # Used in the tests to check the size of the first hidden state
-        self.hidden_size = self.d_model
-        # Used in the tests to check the number of output hidden states/attentions
-        self.num_hidden_layers = sum(self.block_sizes) + (0 if base else self.num_decoder_layers)
-        # FunnelModel adds two hidden layers: input embeddings and the sum of the upsampled encoder hidden state with
-        # the last hidden state of the first block (which is the first hidden state of the decoder).
-        if not base:
-            self.expected_num_hidden_layers = self.num_hidden_layers + 2
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = FunnelConfig(
-            vocab_size=self.vocab_size,
-            block_sizes=self.block_sizes,
-            num_decoder_layers=self.num_decoder_layers,
-            d_model=self.d_model,
-            n_head=self.n_head,
-            d_head=self.d_head,
-            d_inner=self.d_inner,
-            hidden_act=self.hidden_act,
-            hidden_dropout=self.hidden_dropout,
-            attention_dropout=self.attention_dropout,
-            activation_dropout=self.activation_dropout,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            initializer_std=self.initializer_std,
-        )
-
-        return (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        )
-
-    def create_and_check_model(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        model = TFFunnelModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-
-        result = model(input_ids)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.d_model))
-
-        config.truncate_seq = False
-        model = TFFunnelModel(config=config)
-        result = model(input_ids)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.d_model))
-
-        config.separate_cls = False
-        model = TFFunnelModel(config=config)
-        result = model(input_ids)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.d_model))
-
-    def create_and_check_base_model(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        model = TFFunnelBaseModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-
-        result = model(input_ids)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 2, self.d_model))
-
-        config.truncate_seq = False
-        model = TFFunnelBaseModel(config=config)
-        result = model(input_ids)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 3, self.d_model))
-
-        config.separate_cls = False
-        model = TFFunnelBaseModel(config=config)
-        result = model(input_ids)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 2, self.d_model))
-
-    def create_and_check_for_pretraining(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        model = TFFunnelForPreTraining(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length))
-
-    def create_and_check_for_masked_lm(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        model = TFFunnelForMaskedLM(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_for_sequence_classification(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.num_labels = self.num_labels
-        model = TFFunnelForSequenceClassification(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_for_multiple_choice(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.num_choices = self.num_choices
-        model = TFFunnelForMultipleChoice(config=config)
-        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
-        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
-        inputs = {
-            "input_ids": multiple_choice_inputs_ids,
-            "attention_mask": multiple_choice_input_mask,
-            "token_type_ids": multiple_choice_token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-    def create_and_check_for_token_classification(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.num_labels = self.num_labels
-        model = TFFunnelForTokenClassification(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def create_and_check_for_question_answering(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        model = TFFunnelForQuestionAnswering(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class TFFunnelModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFFunnelModel,
-            TFFunnelForMaskedLM,
-            TFFunnelForPreTraining,
-            TFFunnelForQuestionAnswering,
-            TFFunnelForTokenClassification,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": (TFFunnelBaseModel, TFFunnelModel),
-            "fill-mask": TFFunnelForMaskedLM,
-            "question-answering": TFFunnelForQuestionAnswering,
-            "text-classification": TFFunnelForSequenceClassification,
-            "token-classification": TFFunnelForTokenClassification,
-            "zero-shot": TFFunnelForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFFunnelModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=FunnelConfig)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_pretraining(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_pretraining(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-
-@require_tf
-class TFFunnelBaseModelTest(TFModelTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (TFFunnelBaseModel, TFFunnelForMultipleChoice, TFFunnelForSequenceClassification) if is_tf_available() else ()
-    )
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFFunnelModelTester(self, base=True)
-        self.config_tester = ConfigTester(self, config_class=FunnelConfig)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_base_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_base_model(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
diff --git a/tests/models/gemma/test_modeling_flax_gemma.py b/tests/models/gemma/test_modeling_flax_gemma.py
deleted file mode 100644
index 8bd5a5bb41..0000000000
--- a/tests/models/gemma/test_modeling_flax_gemma.py
+++ /dev/null
@@ -1,264 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-
-import numpy as np
-
-from transformers import AutoTokenizer, GemmaConfig, is_flax_available
-from transformers.testing_utils import require_flax, require_read_token, slow
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor
-
-
-if is_flax_available():
-    import jax
-    import jax.numpy as jnp
-
-    from transformers.models.gemma.modeling_flax_gemma import (
-        FlaxGemmaForCausalLM,
-        FlaxGemmaModel,
-    )
-
-
-class FlaxGemmaModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=2,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=False,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        num_key_value_heads=2,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        initializer_range=0.02,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.num_key_value_heads = num_key_value_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.initializer_range = initializer_range
-        self.scope = None
-        self.bos_token_id = vocab_size - 1
-        self.eos_token_id = vocab_size - 1
-        self.pad_token_id = vocab_size - 1
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = np.tril(np.ones((self.batch_size, self.seq_length)))
-
-        config = GemmaConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            num_key_value_heads=self.num_key_value_heads,
-            head_dim=self.hidden_size // self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            use_cache=True,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-        )
-
-        return config, input_ids, input_mask
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, attention_mask = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask}
-        return config, inputs_dict
-
-    def check_use_cache_forward(self, model_class_name, config, input_ids, attention_mask):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length)
-        attention_mask = jnp.ones((input_ids.shape[0], max_decoder_length), dtype="i4")
-
-        position_ids = jnp.broadcast_to(
-            jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1)
-        )
-        outputs_cache = model(
-            input_ids[:, :-1],
-            attention_mask=attention_mask,
-            past_key_values=past_key_values,
-            position_ids=position_ids,
-        )
-
-        position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model(
-            input_ids[:, -1:],
-            attention_mask=attention_mask,
-            past_key_values=outputs_cache.past_key_values,
-            position_ids=position_ids,
-        )
-
-        outputs = model(input_ids)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-    def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input_ids, attention_mask):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        attention_mask_cache = jnp.concatenate(
-            [attention_mask, jnp.zeros((attention_mask.shape[0], max_decoder_length - attention_mask.shape[1]))],
-            axis=-1,
-        )
-
-        past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length)
-        position_ids = jnp.broadcast_to(
-            jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1)
-        )
-
-        outputs_cache = model(
-            input_ids[:, :-1],
-            attention_mask=attention_mask_cache,
-            past_key_values=past_key_values,
-            position_ids=position_ids,
-        )
-        position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model(
-            input_ids[:, -1:],
-            past_key_values=outputs_cache.past_key_values,
-            attention_mask=attention_mask_cache,
-            position_ids=position_ids,
-        )
-
-        outputs = model(input_ids, attention_mask=attention_mask)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-
-@require_flax
-class FlaxGemmaModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (FlaxGemmaModel, FlaxGemmaForCausalLM) if is_flax_available() else ()
-
-    def setUp(self):
-        self.model_tester = FlaxGemmaModelTester(self)
-
-    def test_use_cache_forward(self):
-        for model_class_name in self.all_model_classes:
-            config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs()
-            self.model_tester.check_use_cache_forward(model_class_name, config, input_ids, attention_mask)
-
-    def test_use_cache_forward_with_attn_mask(self):
-        for model_class_name in self.all_model_classes:
-            config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs()
-            self.model_tester.check_use_cache_forward_with_attn_mask(
-                model_class_name, config, input_ids, attention_mask
-            )
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("google/gemma-2b", from_pt=True)
-            outputs = model(np.ones((1, 1)))
-            self.assertIsNotNone(outputs)
-
-
-@slow
-@require_flax
-@require_read_token
-class FlaxGemmaIntegrationTest(unittest.TestCase):
-    input_text = ["The capital of France is", "To play the perfect cover drive"]
-    model_id = "google/gemma-2b"
-    revision = "flax"
-
-    def setUp(self):
-        self.model, self.params = FlaxGemmaForCausalLM.from_pretrained(
-            self.model_id, revision=self.revision, _do_init=False
-        )
-        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
-        self.tokenizer.padding_side = "left"
-
-    def test_logits(self):
-        inputs = self.tokenizer(self.input_text, return_tensors="np", padding=True)
-        # fmt: off
-        EXPECTED_MEAN = [
-            [-16.427, -21.386, -35.491, -36.258, -31.401, -36.370, -37.598],
-            [-21.386, -32.150, -33.155, -34.344, -34.706, -34.678, -38.495],
-        ]
-        EXPECTED_SLICE = [-33.462, -16.481, -30.837, -32.195, -33.113]
-        # fmt: on
-
-        logits = self.model(**inputs, params=self.params).logits
-
-        diff_mean = jnp.abs(logits.mean(-1) - np.array(EXPECTED_MEAN)).max()
-        diff_slice = jnp.abs(logits[0, -1, 475:480] - np.array(EXPECTED_SLICE)).max()
-
-        self.assertAlmostEqual(diff_mean, 0, places=3)
-        self.assertAlmostEqual(diff_slice, 0, places=3)
-
-    def test_generation(self):
-        EXPECTED_TEXTS = [
-            "The capital of France is a city of contrasts. It is a city of history, of art, of culture, of fashion",
-            "To play the perfect cover drive, you need to have a good technique and a good mindset.\n\nThe cover drive is a shot",
-        ]
-        inputs = self.tokenizer(self.input_text, return_tensors="np", padding=True)
-
-        output = self.model.generate(**inputs, params=self.params, max_new_tokens=20, do_sample=False)
-        output_text = self.tokenizer.batch_decode(output.sequences, skip_special_tokens=True)
-
-        self.assertEqual(output_text, EXPECTED_TEXTS)
-
-    def test_jit_generation(self):
-        EXPECTED_TEXTS = [
-            "The capital of France is a city of contrasts. It is a city of history, culture, and art, but it is",
-            "To play the perfect cover drive, you need to have a good technique and a good mindset.\n\nThe cover drive is a shot",
-        ]
-        inputs = self.tokenizer(self.input_text, return_tensors="np", padding=True)
-
-        def generate(input_ids, attention_mask):
-            outputs = self.model.generate(
-                input_ids, attention_mask=attention_mask, params=self.params, max_new_tokens=20, do_sample=False
-            )
-            return outputs
-
-        jit_generate = jax.jit(generate)
-        output_sequences = jit_generate(**inputs).sequences
-        output_text = self.tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
-
-        self.assertEqual(output_text, EXPECTED_TEXTS)
diff --git a/tests/models/gpt2/test_modeling_flax_gpt2.py b/tests/models/gpt2/test_modeling_flax_gpt2.py
deleted file mode 100644
index 3297a3c45d..0000000000
--- a/tests/models/gpt2/test_modeling_flax_gpt2.py
+++ /dev/null
@@ -1,254 +0,0 @@
-# Copyright 2021 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-import numpy as np
-
-from transformers import GPT2Config, GPT2Tokenizer, is_flax_available
-from transformers.testing_utils import require_flax, slow
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-
-
-if is_flax_available():
-    import jax
-    import jax.numpy as jnp
-
-    from transformers.models.gpt2.modeling_flax_gpt2 import FlaxGPT2LMHeadModel, FlaxGPT2Model
-
-
-class FlaxGPT2ModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=14,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=False,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        initializer_range=0.02,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.initializer_range = initializer_range
-        self.scope = None
-        self.bos_token_id = vocab_size - 1
-        self.eos_token_id = vocab_size - 1
-        self.pad_token_id = vocab_size - 1
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        config = GPT2Config(
-            vocab_size=self.vocab_size,
-            n_embd=self.hidden_size,
-            n_layer=self.num_hidden_layers,
-            n_head=self.num_attention_heads,
-            n_positions=self.max_position_embeddings,
-            use_cache=False,
-            bos_token_id=self.bos_token_id,
-            eos_token_id=self.eos_token_id,
-            pad_token_id=self.pad_token_id,
-        )
-
-        return (config, input_ids, input_mask)
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, attention_mask = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask}
-        return config, inputs_dict
-
-    def prepare_config_and_inputs_for_decoder(self):
-        config, input_ids, attention_mask = self.prepare_config_and_inputs()
-
-        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
-        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-        return (
-            config,
-            input_ids,
-            attention_mask,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-    def check_use_cache_forward(self, model_class_name, config, input_ids, attention_mask):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length)
-        attention_mask = jnp.ones((input_ids.shape[0], max_decoder_length), dtype="i4")
-
-        position_ids = jnp.broadcast_to(
-            jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1)
-        )
-        outputs_cache = model(
-            input_ids[:, :-1],
-            attention_mask=attention_mask,
-            past_key_values=past_key_values,
-            position_ids=position_ids,
-        )
-
-        position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model(
-            input_ids[:, -1:],
-            attention_mask=attention_mask,
-            past_key_values=outputs_cache.past_key_values,
-            position_ids=position_ids,
-        )
-
-        outputs = model(input_ids)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-    def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input_ids, attention_mask):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        attention_mask_cache = jnp.concatenate(
-            [attention_mask, jnp.zeros((attention_mask.shape[0], max_decoder_length - attention_mask.shape[1]))],
-            axis=-1,
-        )
-
-        past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length)
-        position_ids = jnp.broadcast_to(
-            jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1)
-        )
-
-        outputs_cache = model(
-            input_ids[:, :-1],
-            attention_mask=attention_mask_cache,
-            past_key_values=past_key_values,
-            position_ids=position_ids,
-        )
-        position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model(
-            input_ids[:, -1:],
-            past_key_values=outputs_cache.past_key_values,
-            attention_mask=attention_mask_cache,
-            position_ids=position_ids,
-        )
-
-        outputs = model(input_ids, attention_mask=attention_mask)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-    def check_bool_attention_mask_in_generation(self, model_class_name, config, input_ids, attention_mask):
-        model = model_class_name(config)
-
-        output_int_att_mask = model.generate(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            max_new_tokens=3,
-        )
-
-        output_bool_att_mask = model.generate(
-            input_ids=input_ids,
-            attention_mask=attention_mask.astype(bool),
-            max_new_tokens=3,
-        )
-
-        self.parent.assertTrue(
-            (output_bool_att_mask.sequences == output_int_att_mask.sequences).all(),
-            "Generated response differ between boolean and integer attention mask",
-        )
-
-
-@require_flax
-class FlaxGPT2ModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (FlaxGPT2Model, FlaxGPT2LMHeadModel) if is_flax_available() else ()
-
-    def setUp(self):
-        self.model_tester = FlaxGPT2ModelTester(self)
-
-    def test_use_cache_forward(self):
-        for model_class_name in self.all_model_classes:
-            config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs()
-            self.model_tester.check_use_cache_forward(model_class_name, config, input_ids, attention_mask)
-
-    def test_use_cache_forward_with_attn_mask(self):
-        for model_class_name in self.all_model_classes:
-            config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs()
-            self.model_tester.check_use_cache_forward_with_attn_mask(
-                model_class_name, config, input_ids, attention_mask
-            )
-
-    def test_bool_attention_mask_in_generation(self):
-        for model_class_name in self.all_generative_model_classes:
-            config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs()
-            self.model_tester.check_bool_attention_mask_in_generation(
-                model_class_name, config, input_ids, attention_mask
-            )
-
-    @slow
-    def test_batch_generation(self):
-        tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2", pad_token="</s>", padding_side="left")
-        inputs = tokenizer(["Hello this is a long string", "Hey"], return_tensors="np", padding=True, truncation=True)
-
-        model = FlaxGPT2LMHeadModel.from_pretrained("openai-community/gpt2")
-        model.do_sample = False
-        model.config.pad_token_id = model.config.eos_token_id
-
-        jit_generate = jax.jit(model.generate)
-
-        output_sequences = jit_generate(inputs["input_ids"], attention_mask=inputs["attention_mask"]).sequences
-
-        output_string = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
-
-        expected_string = [
-            "Hello this is a long string of words. I'm going to start with the first one.\n",
-            "Hey, I'm not sure if I'm going to be able to do",
-        ]
-
-        self.assertListEqual(output_string, expected_string)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("openai-community/gpt2", from_pt=True)
-            outputs = model(np.ones((1, 1)))
-            self.assertIsNotNone(outputs)
diff --git a/tests/models/gpt2/test_modeling_tf_gpt2.py b/tests/models/gpt2/test_modeling_tf_gpt2.py
deleted file mode 100644
index 76ecd6d15b..0000000000
--- a/tests/models/gpt2/test_modeling_tf_gpt2.py
+++ /dev/null
@@ -1,732 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import GPT2Config, is_tf_available
-from transformers.testing_utils import require_tf, require_tf2onnx, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-from ...utils.test_modeling_tf_core import TFCoreModelTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import GPT2Tokenizer
-    from transformers.models.gpt2.modeling_tf_gpt2 import (
-        TFGPT2DoubleHeadsModel,
-        TFGPT2ForSequenceClassification,
-        TFGPT2LMHeadModel,
-        TFGPT2Model,
-    )
-    from transformers.tf_utils import shape_list
-
-
-class TFGPT2ModelTester:
-    def __init__(
-        self,
-        parent,
-    ):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.is_training = True
-        self.use_token_type_ids = True
-        self.use_input_mask = True
-        self.use_labels = True
-        self.use_mc_token_ids = True
-        self.vocab_size = 99
-        self.hidden_size = 32
-        self.num_hidden_layers = 2
-        self.num_attention_heads = 4
-        self.intermediate_size = 37
-        self.hidden_act = "gelu"
-        self.hidden_dropout_prob = 0.1
-        self.attention_probs_dropout_prob = 0.1
-        self.max_position_embeddings = 512
-        self.type_vocab_size = 16
-        self.type_sequence_label_size = 2
-        self.initializer_range = 0.02
-        self.num_labels = 3
-        self.num_choices = 4
-        self.scope = None
-        self.bos_token_id = self.vocab_size - 1
-        self.eos_token_id = self.vocab_size - 1
-        self.pad_token_id = self.vocab_size - 1
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        mc_token_ids = None
-        if self.use_mc_token_ids:
-            mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = GPT2Config(
-            vocab_size=self.vocab_size,
-            n_embd=self.hidden_size,
-            n_layer=self.num_hidden_layers,
-            n_head=self.num_attention_heads,
-            # intermediate_size=self.intermediate_size,
-            # hidden_act=self.hidden_act,
-            # hidden_dropout_prob=self.hidden_dropout_prob,
-            # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            n_positions=self.max_position_embeddings,
-            # type_vocab_size=self.type_vocab_size,
-            # initializer_range=self.initializer_range
-            bos_token_id=self.bos_token_id,
-            eos_token_id=self.eos_token_id,
-            pad_token_id=self.pad_token_id,
-            return_dict=True,
-        )
-
-        head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
-
-        return (
-            config,
-            input_ids,
-            input_mask,
-            head_mask,
-            token_type_ids,
-            mc_token_ids,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        )
-
-    def prepare_config_and_inputs_for_decoder(self):
-        (
-            config,
-            input_ids,
-            input_mask,
-            head_mask,
-            token_type_ids,
-            mc_token_ids,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = self.prepare_config_and_inputs()
-
-        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
-        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-        return (
-            config,
-            input_ids,
-            input_mask,
-            head_mask,
-            token_type_ids,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-    def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-        model = TFGPT2Model(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        result = model(inputs)
-
-        inputs = [input_ids, None, input_mask]  # None is the input for 'past'
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-        model = TFGPT2Model(config=config)
-
-        # first forward pass
-        outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True)
-        outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids)
-        outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False)
-
-        self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
-        self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
-
-        output, past_key_values = outputs.to_tuple()
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-        next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size)
-
-        # append to next input_ids and token_type_ids
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_token_type_ids = tf.concat([token_type_ids, next_token_types], axis=-1)
-
-        output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"]
-        output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past_key_values)[
-            "last_hidden_state"
-        ]
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
-        output_from_past_slice = output_from_past[:, 0, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)
-
-    def create_and_check_gpt2_model_attention_mask_past(
-        self, config, input_ids, input_mask, head_mask, token_type_ids, *args
-    ):
-        model = TFGPT2Model(config=config)
-
-        # create attention mask
-        half_seq_length = self.seq_length // 2
-        attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32)
-        attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32)
-        attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1)
-
-        # first forward pass
-        output, past_key_values = model(input_ids, attention_mask=attn_mask).to_tuple()
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-        # change a random masked slice from input_ids
-        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1
-        random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size)
-        vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change)
-        condition = tf.transpose(
-            tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size))
-        )
-        input_ids = tf.where(condition, random_other_next_tokens, input_ids)
-
-        # append to next input_ids and attn_mask
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        attn_mask = tf.concat([attn_mask, tf.ones((shape_list(attn_mask)[0], 1), dtype=tf.int32)], axis=1)
-
-        # get two different outputs
-        output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
-        output_from_past = model(next_tokens, past_key_values=past_key_values, attention_mask=attn_mask)[
-            "last_hidden_state"
-        ]
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
-        output_from_past_slice = output_from_past[:, 0, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-12)
-
-    def create_and_check_gpt2_model_past_large_inputs(
-        self, config, input_ids, input_mask, head_mask, token_type_ids, *args
-    ):
-        model = TFGPT2Model(config=config)
-
-        input_ids = input_ids[:1, :]
-        input_mask = input_mask[:1, :]
-        token_type_ids = token_type_ids[:1, :]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, use_cache=True)
-
-        output, past_key_values = outputs.to_tuple()
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-        next_token_types = ids_tensor((self.batch_size, 3), self.type_vocab_size)
-
-        # append to next input_ids and token_type_ids
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1)
-        next_token_type_ids = tf.concat([token_type_ids, next_token_types], axis=-1)
-
-        output_from_no_past = model(
-            next_input_ids, token_type_ids=next_token_type_ids, attention_mask=next_attention_mask
-        )["last_hidden_state"]
-        output_from_past = model(
-            next_tokens,
-            token_type_ids=next_token_types,
-            attention_mask=next_attention_mask,
-            past_key_values=past_key_values,
-        )["last_hidden_state"]
-        self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-    def create_and_check_gpt2_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-        model = TFGPT2LMHeadModel(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_gpt2_double_head(
-        self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
-    ):
-        model = TFGPT2DoubleHeadsModel(config=config)
-
-        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
-        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
-
-        inputs = {
-            "input_ids": multiple_choice_inputs_ids,
-            "mc_token_ids": mc_token_ids,
-            "attention_mask": multiple_choice_input_mask,
-            "token_type_ids": multiple_choice_token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(
-            result.logits.shape, (self.batch_size, self.num_choices, self.seq_length, self.vocab_size)
-        )
-        self.parent.assertEqual(result.mc_logits.shape, (self.batch_size, self.num_choices))
-
-    def create_and_check_gpt2_for_sequence_classification(
-        self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args
-    ):
-        config.num_labels = self.num_labels
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-            "labels": sequence_labels,
-        }
-        model = TFGPT2ForSequenceClassification(config)
-
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-
-        (
-            config,
-            input_ids,
-            input_mask,
-            head_mask,
-            token_type_ids,
-            mc_token_ids,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-
-        inputs_dict = {
-            "input_ids": input_ids,
-            "token_type_ids": token_type_ids,
-            "attention_mask": input_mask,
-        }
-        return config, inputs_dict
-
-
-@require_tf
-class TFGPT2ModelTest(TFModelTesterMixin, TFCoreModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (TFGPT2Model, TFGPT2LMHeadModel, TFGPT2ForSequenceClassification, TFGPT2DoubleHeadsModel)
-        if is_tf_available()
-        else ()
-    )
-    all_generative_model_classes = (TFGPT2LMHeadModel,) if is_tf_available() else ()
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFGPT2Model,
-            "text-classification": TFGPT2ForSequenceClassification,
-            "text-generation": TFGPT2LMHeadModel,
-            "zero-shot": TFGPT2ForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_head_masking = False
-    test_onnx = True
-    onnx_min_opset = 10
-
-    def setUp(self):
-        self.model_tester = TFGPT2ModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_gpt2_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_gpt2_model(*config_and_inputs)
-
-    def test_gpt2_model_past(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_gpt2_model_past(*config_and_inputs)
-
-    def test_gpt2_model_att_mask_past(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_gpt2_model_attention_mask_past(*config_and_inputs)
-
-    def test_gpt2_model_past_large_inputs(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_gpt2_model_past_large_inputs(*config_and_inputs)
-
-    def test_gpt2_lm_head(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_gpt2_lm_head(*config_and_inputs)
-
-    def test_gpt2_double_head(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_gpt2_double_head(*config_and_inputs)
-
-    def test_gpt2_sequence_classification_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_gpt2_for_sequence_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "openai-community/gpt2"
-        model = TFGPT2Model.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    # overwrite from common since ONNX runtime optimization doesn't work with tf.gather() when the argument
-    # `batch_dims` > 0"
-    @require_tf2onnx
-    @slow
-    def test_onnx_runtime_optimize(self):
-        if not self.test_onnx:
-            return
-
-        import onnxruntime
-        import tf2onnx
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            # Skip these 2 classes which uses `tf.gather` with `batch_dims=1`
-            if model_class in [TFGPT2ForSequenceClassification, TFGPT2DoubleHeadsModel]:
-                continue
-
-            model = model_class(config)
-            model.build_in_name_scope()
-
-            onnx_model_proto, _ = tf2onnx.convert.from_keras(model, opset=self.onnx_min_opset)
-
-            onnxruntime.InferenceSession(onnx_model_proto.SerializeToString())
-
-    # TODO (Joao): fix me
-    @unittest.skip("Onnx compliance broke with TF 2.10")
-    def test_onnx_compliancy(self):
-        pass
-
-
-@require_tf
-class TFGPT2ModelLanguageGenerationTest(unittest.TestCase):
-    @slow
-    def test_lm_generate_greedy_distilgpt2_batch_special(self):
-        model = TFGPT2LMHeadModel.from_pretrained("distilbert/distilgpt2")
-        tokenizer = GPT2Tokenizer.from_pretrained("distilbert/distilgpt2")
-
-        tokenizer.pad_token = tokenizer.eos_token
-        tokenizer.padding_side = "left"
-
-        sentences = ["Today is a beautiful day and", "Yesterday was"]
-        input_ids = tokenizer(sentences, return_tensors="tf", padding=True)
-
-        generation_kwargs = {
-            "bad_words_ids": [tokenizer("is").input_ids, tokenizer("angry about").input_ids],
-            "no_repeat_ngram_size": 2,
-            "do_sample": False,
-            "repetition_penalty": 1.3,
-        }
-
-        output_ids = model.generate(**input_ids, **generation_kwargs)
-
-        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-        expected_output_string = [
-            "Today is a beautiful day and I am so happy to be able take part in this amazing event.",
-            "Yesterday was a very interesting time for the world to see how much of this is",
-        ]
-        self.assertListEqual(output_strings, expected_output_string)
-
-    @slow
-    def test_lm_generate_sample_distilgpt2_batch_special(self):
-        model = TFGPT2LMHeadModel.from_pretrained("distilbert/distilgpt2")
-        tokenizer = GPT2Tokenizer.from_pretrained("distilbert/distilgpt2")
-
-        tokenizer.pad_token = tokenizer.eos_token
-        tokenizer.padding_side = "left"
-
-        sentences = ["Today is a beautiful day and", "Yesterday was"]
-        input_ids = tokenizer(sentences, return_tensors="tf", padding=True)
-
-        generation_kwargs = {
-            "do_sample": True,
-            "bad_words_ids": [tokenizer("is").input_ids, tokenizer("angry about").input_ids],
-            "no_repeat_ngram_size": 2,
-            "repetition_penalty": 1.3,
-            "temperature": 1.5,
-            "top_k": 500,
-            "top_p": 0.9,
-            "seed": [42, 0],  # seed set -> deterministic sampling sequence -> deterministic generation
-        }
-
-        # forces the generation to happen on CPU, to avoid GPU-related quirks
-        with tf.device(":/CPU:0"):
-            output_ids = model.generate(**input_ids, **generation_kwargs)
-
-        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-
-        expected_output_string = [
-            "Today is a beautiful day and we will make you feel very hot/terrific in all your",
-            "Yesterday was known by national television networks as Le Big Show or Wild Dog Jeopard",
-        ]
-        self.assertListEqual(output_strings, expected_output_string)
-
-    @slow
-    def test_lm_generate_greedy_distilgpt2_beam_search_special(self):
-        model = TFGPT2LMHeadModel.from_pretrained("distilbert/distilgpt2")
-        tokenizer = GPT2Tokenizer.from_pretrained("distilbert/distilgpt2")
-
-        tokenizer.pad_token = tokenizer.eos_token
-        tokenizer.padding_side = "left"
-
-        sentences = ["Today is a beautiful day and", "Yesterday was"]
-        input_ids = tokenizer(sentences, return_tensors="tf", padding=True)
-
-        generation_kwargs = {
-            "bad_words_ids": [tokenizer("is").input_ids, tokenizer("angry about").input_ids],
-            "no_repeat_ngram_size": 2,
-            "do_sample": False,
-            "num_beams": 2,
-        }
-
-        output_ids = model.generate(**input_ids, **generation_kwargs)
-
-        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-        expected_output_string = [
-            "Today is a beautiful day and a great day for all of us.\n\nI’m",
-            "Yesterday was the first time that a person has been arrested in the United States for",
-        ]
-        self.assertListEqual(output_strings, expected_output_string)
-
-    @slow
-    def test_lm_generate_distilgpt2_left_padding(self):
-        """Tests that the generated text is the same, regardless of left padding"""
-        model = TFGPT2LMHeadModel.from_pretrained("distilbert/distilgpt2")
-        tokenizer = GPT2Tokenizer.from_pretrained("distilbert/distilgpt2")
-
-        tokenizer.pad_token = tokenizer.eos_token
-        tokenizer.padding_side = "left"
-
-        generation_kwargs = {
-            "bad_words_ids": [tokenizer("is").input_ids, tokenizer("angry about").input_ids],
-            "no_repeat_ngram_size": 2,
-            "do_sample": False,
-            "repetition_penalty": 1.3,
-        }
-        expected_output_string = (
-            "Today is a beautiful day and I am so happy to be able take part in this amazing event."
-        )
-
-        sentences = ["Today is a beautiful day and"]
-        input_ids = tokenizer(sentences, return_tensors="tf", padding=True)
-        # using default length
-        output_ids = model.generate(**input_ids, **generation_kwargs)
-        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-        self.assertEqual(output_strings[0], expected_output_string)
-
-        sentences = ["Today is a beautiful day and", "This is a very long input that we absolutely don't care about"]
-        input_ids = tokenizer(sentences, return_tensors="tf", padding=True)
-        # longer max length to capture the full length (remember: it is left padded)
-        output_ids = model.generate(**input_ids, **generation_kwargs, max_length=27)
-        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-        self.assertEqual(output_strings[0], expected_output_string)
-
-    @slow
-    def test_lm_generate_gpt2_greedy_xla(self):
-        model = TFGPT2LMHeadModel.from_pretrained("openai-community/gpt2")
-        tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
-
-        tokenizer.pad_token = tokenizer.eos_token
-        tokenizer.padding_side = "left"
-
-        sentences = ["The dog", "The flying machine"]
-        expected_output_strings = [
-            "The dog was found in a field near the intersection of West and West Streets.\n\nThe",
-            "The flying machine is a small, lightweight, and lightweight aircraft that can be used for any type of",
-        ]
-        input_ids = tokenizer(sentences, return_tensors="tf", padding=True)
-
-        output_ids = model.generate(**input_ids, do_sample=False)
-        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-        self.assertListEqual(output_strings, expected_output_strings)
-
-        xla_generate = tf.function(model.generate, jit_compile=True)
-        output_ids = xla_generate(**input_ids, do_sample=False)
-        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-        self.assertListEqual(output_strings, expected_output_strings)
-
-    @slow
-    def test_lm_generate_gpt2_sample_xla(self):
-        # NOTE: due to the small numerical differences that are natural when we compile to XLA, sampling the same
-        # output out of the same seed is far from guaranteed. We can, however, confirm that the results are sensible
-        # and that we can seed both versions.
-
-        # forces the generation to happen on CPU, to avoid GPU-related quirks
-        with tf.device(":/CPU:0"):
-            model = TFGPT2LMHeadModel.from_pretrained("openai-community/gpt2")
-            tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
-
-            tokenizer.pad_token = tokenizer.eos_token
-            tokenizer.padding_side = "left"
-
-            sentence = ["The dog", "The flying machine"]
-            expected_output_string = [
-                "The dog owner asked why did our vet decide there needed to be extra ventilation inside because most"
-                " puppies",
-                "The flying machine was made by an artist who found it difficult to control it as it did not use",
-            ]
-            expected_output_string_xla = [
-                "The dog has been named in connection with the murder of a 20-year-old man in",
-                "The flying machine is a new and improved system to operate and operate a new system and system "
-                "system system",
-            ]
-            input_ids = tokenizer(sentence, return_tensors="tf", padding=True)
-
-            output_ids = model.generate(**input_ids, do_sample=True, seed=[7, 0])
-            output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-            self.assertListEqual(output_strings, expected_output_string)
-
-            xla_generate = tf.function(model.generate, jit_compile=True)
-            output_ids = xla_generate(**input_ids, do_sample=True, seed=[7, 0])
-            output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-            self.assertListEqual(output_strings, expected_output_string_xla)
-
-    @slow
-    def test_lm_generate_gpt2_beam_search_xla(self):
-        model = TFGPT2LMHeadModel.from_pretrained("openai-community/gpt2")
-        tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
-
-        tokenizer.pad_token = tokenizer.eos_token
-        tokenizer.padding_side = "left"
-
-        sentences = ["The dog", "The flying machine"]
-        expected_output_strings = [
-            "The dog was found in the backyard of a home in the 6500 block of South Main Street",
-            "The flying machine is a very powerful machine, but it's not a very powerful machine. It's",
-        ]
-        input_ids = tokenizer(sentences, return_tensors="tf", padding=True)
-
-        output_ids = model.generate(**input_ids, do_sample=False, num_beams=2)
-        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-        self.assertListEqual(output_strings, expected_output_strings)
-
-        xla_generate = tf.function(model.generate, jit_compile=True)
-        output_ids = xla_generate(**input_ids, do_sample=False, num_beams=2)
-        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-        self.assertListEqual(output_strings, expected_output_strings)
-
-    @slow
-    def test_contrastive_search_gpt2(self):
-        article = (
-            "DeepMind Technologies is a British artificial intelligence subsidiary of Alphabet Inc. and research "
-            "laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based"
-        )
-
-        gpt2_tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2-large")
-        gpt2_model = TFGPT2LMHeadModel.from_pretrained("openai-community/gpt2-large")
-        input_ids = gpt2_tokenizer(article, return_tensors="tf")
-
-        outputs = gpt2_model.generate(**input_ids, penalty_alpha=0.6, top_k=4, max_length=256)
-
-        generated_text = gpt2_tokenizer.batch_decode(outputs, skip_special_tokens=True)
-
-        self.assertListEqual(
-            generated_text,
-            [
-                "DeepMind Technologies is a British artificial intelligence subsidiary of Alphabet Inc. and research "
-                "laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based in London, "
-                "United Kingdom\n\nGoogle has a lot of data on its users and uses it to improve its products, such as "
-                "Google Now, which helps users find the information they're looking for on the web. But the company "
-                "is not the only one to collect data on its users. Facebook, for example, has its own facial "
-                "recognition technology, as well as a database of millions of photos that it uses to personalize its "
-                "News Feed.\n\nFacebook's use of data is a hot topic in the tech industry, with privacy advocates "
-                "concerned about the company's ability to keep users' information private. In a blog post last "
-                'year, Facebook CEO Mark Zuckerberg said his company would "do our best to be transparent about our '
-                'data use and how we use it."\n\n"We have made it clear that we do not sell or share your data with '
-                'third parties," Zuckerberg wrote. "If you have questions or concerns, please reach out to us at '
-                'privacy@facebook.com."\n\nGoogle declined to comment on the privacy implications of its use of data, '
-                "but said in a statement to The Associated Press that"
-            ],
-        )
-
-    @slow
-    def test_contrastive_search_gpt2_xla(self):
-        article = (
-            "DeepMind Technologies is a British artificial intelligence subsidiary of Alphabet Inc. and research "
-            "laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based"
-        )
-
-        gpt2_tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2-large")
-        gpt2_model = TFGPT2LMHeadModel.from_pretrained("openai-community/gpt2-large")
-        input_ids = gpt2_tokenizer(article, return_tensors="tf")
-
-        xla_generate = tf.function(gpt2_model.generate, jit_compile=True)
-        outputs = xla_generate(**input_ids, penalty_alpha=0.6, top_k=4, max_length=256)
-
-        generated_text = gpt2_tokenizer.batch_decode(outputs, skip_special_tokens=True)
-
-        self.assertListEqual(
-            generated_text,
-            [
-                "DeepMind Technologies is a British artificial intelligence subsidiary of Alphabet Inc. and research "
-                "laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based in London, "
-                "United Kingdom\n\nGoogle has a lot of data on its users and uses it to improve its products, such as "
-                "Google Now, which helps users find the information they're looking for on the web. But the company "
-                "is not the only one to collect data on its users. Facebook, for example, has its own facial "
-                "recognition technology, as well as a database of millions of photos that it uses to personalize its "
-                "News Feed.\n\nFacebook's use of data is a hot topic in the tech industry, with privacy advocates "
-                "concerned about the company's ability to keep users' information private. In a blog post last "
-                'year, Facebook CEO Mark Zuckerberg said his company would "do our best to be transparent about our '
-                'data use and how we use it."\n\n"We have made it clear that we do not sell or share your data with '
-                'third parties," Zuckerberg wrote. "If you have questions or concerns, please reach out to us at '
-                'privacy@facebook.com."\n\nGoogle declined to comment on the privacy implications of its use of data, '
-                "but said in a statement to The Associated Press that"
-            ],
-        )
diff --git a/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py b/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py
deleted file mode 100644
index abaadc2247..0000000000
--- a/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py
+++ /dev/null
@@ -1,223 +0,0 @@
-# Copyright 2021 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-import numpy as np
-
-from transformers import GPT2Tokenizer, GPTNeoConfig, is_flax_available
-from transformers.testing_utils import require_flax, slow
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask
-
-
-if is_flax_available():
-    import jax
-    import jax.numpy as jnp
-
-    from transformers.models.gpt_neo.modeling_flax_gpt_neo import FlaxGPTNeoForCausalLM, FlaxGPTNeoModel
-
-
-class FlaxGPTNeoModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=14,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=False,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        attention_types=[[["global", "local"], 1]],
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        window_size=7,
-        initializer_range=0.02,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.attention_types = attention_types
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.window_size = window_size
-        self.initializer_range = initializer_range
-        self.scope = None
-        self.bos_token_id = vocab_size - 1
-        self.eos_token_id = vocab_size - 1
-        self.pad_token_id = vocab_size - 1
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        config = GPTNeoConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_layers=self.num_hidden_layers,
-            num_heads=self.num_attention_heads,
-            max_position_embeddings=self.max_position_embeddings,
-            use_cache=False,
-            bos_token_id=self.bos_token_id,
-            eos_token_id=self.eos_token_id,
-            pad_token_id=self.pad_token_id,
-            window_size=self.window_size,
-            attention_types=self.attention_types,
-        )
-
-        return (config, input_ids, input_mask)
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, attention_mask = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask}
-        return config, inputs_dict
-
-    def check_use_cache_forward(self, model_class_name, config, input_ids, attention_mask):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length)
-        attention_mask = jnp.ones((input_ids.shape[0], max_decoder_length), dtype="i4")
-
-        position_ids = jnp.broadcast_to(
-            jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1)
-        )
-        outputs_cache = model(
-            input_ids[:, :-1],
-            attention_mask=attention_mask,
-            past_key_values=past_key_values,
-            position_ids=position_ids,
-        )
-
-        position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model(
-            input_ids[:, -1:],
-            attention_mask=attention_mask,
-            past_key_values=outputs_cache.past_key_values,
-            position_ids=position_ids,
-        )
-
-        outputs = model(input_ids)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-    def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input_ids, attention_mask):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        attention_mask_cache = jnp.concatenate(
-            [attention_mask, jnp.zeros((attention_mask.shape[0], max_decoder_length - attention_mask.shape[1]))],
-            axis=-1,
-        )
-
-        past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length)
-        position_ids = jnp.broadcast_to(
-            jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1)
-        )
-
-        outputs_cache = model(
-            input_ids[:, :-1],
-            attention_mask=attention_mask_cache,
-            past_key_values=past_key_values,
-            position_ids=position_ids,
-        )
-        position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model(
-            input_ids[:, -1:],
-            past_key_values=outputs_cache.past_key_values,
-            attention_mask=attention_mask_cache,
-            position_ids=position_ids,
-        )
-
-        outputs = model(input_ids, attention_mask=attention_mask)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-
-@require_flax
-class FlaxGPTNeoModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (FlaxGPTNeoModel, FlaxGPTNeoForCausalLM) if is_flax_available() else ()
-
-    def setUp(self):
-        self.model_tester = FlaxGPTNeoModelTester(self)
-
-    def test_use_cache_forward(self):
-        for model_class_name in self.all_model_classes:
-            config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs()
-            self.model_tester.check_use_cache_forward(model_class_name, config, input_ids, attention_mask)
-
-    def test_use_cache_forward_with_attn_mask(self):
-        for model_class_name in self.all_model_classes:
-            config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs()
-            self.model_tester.check_use_cache_forward_with_attn_mask(
-                model_class_name, config, input_ids, attention_mask
-            )
-
-    @slow
-    def test_batch_generation(self):
-        tokenizer = GPT2Tokenizer.from_pretrained(
-            "openai-community/gpt2", pad_token="<|endoftext|>", padding_side="left"
-        )
-        inputs = tokenizer(["Hello this is a long string", "Hey"], return_tensors="np", padding=True, truncation=True)
-
-        model = FlaxGPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")
-        model.do_sample = False
-        model.config.pad_token_id = model.config.eos_token_id
-
-        jit_generate = jax.jit(model.generate)
-
-        output_sequences = jit_generate(
-            inputs["input_ids"], attention_mask=inputs["attention_mask"], pad_token_id=tokenizer.pad_token_id
-        ).sequences
-
-        output_string = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
-
-        expected_string = [
-            "Hello this is a long string of text.\n\nI'm trying to get the text of the",
-            "Hey, I'm a little late to the party. I'm going to",
-        ]
-
-        self.assertListEqual(output_string, expected_string)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("EleutherAI/gpt-neo-125M")
-            outputs = model(np.ones((1, 1)))
-            self.assertIsNotNone(outputs)
diff --git a/tests/models/gptj/test_modeling_flax_gptj.py b/tests/models/gptj/test_modeling_flax_gptj.py
deleted file mode 100644
index f92c07ab6e..0000000000
--- a/tests/models/gptj/test_modeling_flax_gptj.py
+++ /dev/null
@@ -1,220 +0,0 @@
-# Copyright 2021 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-import numpy as np
-
-from transformers import GPT2Tokenizer, GPTJConfig, is_flax_available
-from transformers.testing_utils import require_flax, tooslow
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask
-
-
-if is_flax_available():
-    import jax
-    import jax.numpy as jnp
-
-    from transformers.models.gptj.modeling_flax_gptj import FlaxGPTJForCausalLM, FlaxGPTJModel
-
-
-class FlaxGPTJModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=14,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=False,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        rotary_dim=4,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        initializer_range=0.02,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.rotary_dim = rotary_dim
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.initializer_range = initializer_range
-        self.scope = None
-        self.bos_token_id = vocab_size - 1
-        self.eos_token_id = vocab_size - 1
-        self.pad_token_id = vocab_size - 1
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        config = GPTJConfig(
-            vocab_size=self.vocab_size,
-            n_embd=self.hidden_size,
-            n_layer=self.num_hidden_layers,
-            n_head=self.num_attention_heads,
-            n_positions=self.max_position_embeddings,
-            use_cache=False,
-            bos_token_id=self.bos_token_id,
-            eos_token_id=self.eos_token_id,
-            pad_token_id=self.pad_token_id,
-            rotary_dim=self.rotary_dim,
-        )
-
-        return (config, input_ids, input_mask)
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, attention_mask = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask}
-        return config, inputs_dict
-
-    def check_use_cache_forward(self, model_class_name, config, input_ids, attention_mask):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length)
-        attention_mask = jnp.ones((input_ids.shape[0], max_decoder_length), dtype="i4")
-
-        position_ids = jnp.broadcast_to(
-            jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1)
-        )
-        outputs_cache = model(
-            input_ids[:, :-1],
-            attention_mask=attention_mask,
-            past_key_values=past_key_values,
-            position_ids=position_ids,
-        )
-
-        position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model(
-            input_ids[:, -1:],
-            attention_mask=attention_mask,
-            past_key_values=outputs_cache.past_key_values,
-            position_ids=position_ids,
-        )
-
-        outputs = model(input_ids)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-    def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input_ids, attention_mask):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        attention_mask_cache = jnp.concatenate(
-            [attention_mask, jnp.zeros((attention_mask.shape[0], max_decoder_length - attention_mask.shape[1]))],
-            axis=-1,
-        )
-
-        past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length)
-        position_ids = jnp.broadcast_to(
-            jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1)
-        )
-
-        outputs_cache = model(
-            input_ids[:, :-1],
-            attention_mask=attention_mask_cache,
-            past_key_values=past_key_values,
-            position_ids=position_ids,
-        )
-        position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model(
-            input_ids[:, -1:],
-            past_key_values=outputs_cache.past_key_values,
-            attention_mask=attention_mask_cache,
-            position_ids=position_ids,
-        )
-
-        outputs = model(input_ids, attention_mask=attention_mask)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-
-@require_flax
-class FlaxGPTJModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (FlaxGPTJModel, FlaxGPTJForCausalLM) if is_flax_available() else ()
-
-    def setUp(self):
-        self.model_tester = FlaxGPTJModelTester(self)
-
-    def test_use_cache_forward(self):
-        for model_class_name in self.all_model_classes:
-            config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs()
-            self.model_tester.check_use_cache_forward(model_class_name, config, input_ids, attention_mask)
-
-    def test_use_cache_forward_with_attn_mask(self):
-        for model_class_name in self.all_model_classes:
-            config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs()
-            self.model_tester.check_use_cache_forward_with_attn_mask(
-                model_class_name, config, input_ids, attention_mask
-            )
-
-    @tooslow
-    def test_batch_generation(self):
-        tokenizer = GPT2Tokenizer.from_pretrained(
-            "openai-community/gpt2", pad_token="<|endoftext|>", padding_side="left"
-        )
-        inputs = tokenizer(["Hello this is a long string", "Hey"], return_tensors="np", padding=True, truncation=True)
-
-        model = FlaxGPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
-        model.do_sample = False
-        model.config.pad_token_id = model.config.eos_token_id
-
-        jit_generate = jax.jit(model.generate)
-
-        output_sequences = jit_generate(
-            inputs["input_ids"], attention_mask=inputs["attention_mask"], pad_token_id=tokenizer.pad_token_id
-        ).sequences
-
-        output_string = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
-
-        expected_string = [
-            "Hello this is a long string of text.\n\nI'm trying to get the text of the",
-            "Hey, I'm a little late to the party. I'm going to",
-        ]
-
-        self.assertListEqual(output_string, expected_string)
-
-    @tooslow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("EleutherAI/gpt-j-6B")
-            outputs = model(np.ones((1, 1)))
-            self.assertIsNotNone(outputs)
diff --git a/tests/models/gptj/test_modeling_tf_gptj.py b/tests/models/gptj/test_modeling_tf_gptj.py
deleted file mode 100644
index 2103dd9c26..0000000000
--- a/tests/models/gptj/test_modeling_tf_gptj.py
+++ /dev/null
@@ -1,468 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import AutoTokenizer, GPTJConfig, is_tf_available
-from transformers.testing_utils import require_tf, slow, tooslow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-from ...utils.test_modeling_tf_core import TFCoreModelTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers.models.gptj.modeling_tf_gptj import (
-        TFGPTJForCausalLM,
-        TFGPTJForQuestionAnswering,
-        TFGPTJForSequenceClassification,
-        TFGPTJModel,
-        shape_list,
-    )
-
-
-class TFGPTJModelTester:
-    def __init__(self, parent):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.is_training = True
-        self.use_token_type_ids = True
-        self.use_input_mask = True
-        self.use_labels = True
-        self.use_mc_token_ids = True
-        self.vocab_size = 99
-        self.hidden_size = 32
-        self.rotary_dim = 4
-        self.num_hidden_layers = 2
-        self.num_attention_heads = 4
-        self.intermediate_size = 37
-        self.hidden_act = "gelu"
-        self.hidden_dropout_prob = 0.1
-        self.attention_probs_dropout_prob = 0.1
-        self.max_position_embeddings = 512
-        self.type_vocab_size = 16
-        self.type_sequence_label_size = 2
-        self.initializer_range = 0.02
-        self.num_labels = 3
-        self.num_choices = 4
-        self.scope = None
-        self.bos_token_id = self.vocab_size - 1
-        self.eos_token_id = self.vocab_size - 1
-        self.pad_token_id = self.vocab_size - 1
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        mc_token_ids = None
-        if self.use_mc_token_ids:
-            mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = GPTJConfig(
-            vocab_size=self.vocab_size,
-            n_embd=self.hidden_size,
-            n_layer=self.num_hidden_layers,
-            n_head=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            n_positions=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            initializer_range=self.initializer_range,
-            bos_token_id=self.bos_token_id,
-            eos_token_id=self.eos_token_id,
-            pad_token_id=self.pad_token_id,
-            rotary_dim=self.rotary_dim,
-            return_dict=True,
-        )
-
-        head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
-
-        return (
-            config,
-            input_ids,
-            input_mask,
-            head_mask,
-            token_type_ids,
-            mc_token_ids,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        )
-
-    def create_and_check_gptj_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-        model = TFGPTJModel(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        result = model(inputs)
-
-        inputs = [input_ids, None, input_mask]  # None is the input for 'past'
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_gptj_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-        model = TFGPTJModel(config=config)
-
-        # first forward pass
-        outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True)
-        outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids)
-        outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False)
-
-        self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
-        self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
-
-        output, past_key_values = outputs.to_tuple()
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-        next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size)
-
-        # append to next input_ids and token_type_ids
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_token_type_ids = tf.concat([token_type_ids, next_token_types], axis=-1)
-
-        output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"]
-        output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past_key_values)[
-            "last_hidden_state"
-        ]
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
-        output_from_past_slice = output_from_past[:, 0, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)
-
-    def create_and_check_gptj_model_attention_mask_past(
-        self, config, input_ids, input_mask, head_mask, token_type_ids, *args
-    ):
-        model = TFGPTJModel(config=config)
-
-        # create attention mask
-        half_seq_length = self.seq_length // 2
-        attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32)
-        attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32)
-        attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1)
-
-        # first forward pass
-        output, past_key_values = model(input_ids, attention_mask=attn_mask).to_tuple()
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-        # change a random masked slice from input_ids
-        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1
-        random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size)
-        vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change)
-        condition = tf.transpose(
-            tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size))
-        )
-        input_ids = tf.where(condition, random_other_next_tokens, input_ids)
-
-        # append to next input_ids and attn_mask
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        attn_mask = tf.concat([attn_mask, tf.ones((shape_list(attn_mask)[0], 1), dtype=tf.int32)], axis=1)
-
-        # get two different outputs
-        output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
-        output_from_past = model(next_tokens, past_key_values=past_key_values, attention_mask=attn_mask)[
-            "last_hidden_state"
-        ]
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
-        output_from_past_slice = output_from_past[:, 0, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-12)
-
-    def create_and_check_gptj_model_past_large_inputs(
-        self, config, input_ids, input_mask, head_mask, token_type_ids, *args
-    ):
-        model = TFGPTJModel(config=config)
-
-        input_ids = input_ids[:1, :]
-        input_mask = input_mask[:1, :]
-        token_type_ids = token_type_ids[:1, :]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, use_cache=True)
-
-        output, past_key_values = outputs.to_tuple()
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-        next_token_types = ids_tensor((self.batch_size, 3), self.type_vocab_size)
-
-        # append to next input_ids and token_type_ids
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1)
-        next_token_type_ids = tf.concat([token_type_ids, next_token_types], axis=-1)
-
-        output_from_no_past = model(
-            next_input_ids, token_type_ids=next_token_type_ids, attention_mask=next_attention_mask
-        )["last_hidden_state"]
-        output_from_past = model(
-            next_tokens,
-            token_type_ids=next_token_types,
-            attention_mask=next_attention_mask,
-            past_key_values=past_key_values,
-        )["last_hidden_state"]
-        self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-    def create_and_check_gptj_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-        model = TFGPTJForCausalLM(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-
-        (
-            config,
-            input_ids,
-            input_mask,
-            head_mask,
-            token_type_ids,
-            mc_token_ids,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-
-        inputs_dict = {
-            "input_ids": input_ids,
-            "token_type_ids": token_type_ids,
-            "attention_mask": input_mask,
-        }
-        return config, inputs_dict
-
-
-@require_tf
-class TFGPTJModelTest(TFModelTesterMixin, TFCoreModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (TFGPTJForCausalLM, TFGPTJForSequenceClassification, TFGPTJForQuestionAnswering, TFGPTJModel)
-        if is_tf_available()
-        else ()
-    )
-
-    all_generative_model_classes = (TFGPTJForCausalLM,) if is_tf_available() else ()
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFGPTJModel,
-            "question-answering": TFGPTJForQuestionAnswering,
-            "text-classification": TFGPTJForSequenceClassification,
-            "text-generation": TFGPTJForCausalLM,
-            "zero-shot": TFGPTJForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_onnx = False
-    test_pruning = False
-    test_missing_keys = False
-    test_head_masking = False
-
-    # TODO: Fix the failed tests
-    def is_pipeline_test_to_skip(
-        self,
-        pipeline_test_case_name,
-        config_class,
-        model_architecture,
-        tokenizer_name,
-        image_processor_name,
-        feature_extractor_name,
-        processor_name,
-    ):
-        if (
-            pipeline_test_case_name == "QAPipelineTests"
-            and tokenizer_name is not None
-            and not tokenizer_name.endswith("Fast")
-        ):
-            # `QAPipelineTests` fails for a few models when the slower tokenizer are used.
-            # (The slower tokenizers were never used for pipeline tests before the pipeline testing rework)
-            # TODO: check (and possibly fix) the `QAPipelineTests` with slower tokenizer
-            return True
-
-        return False
-
-    def setUp(self):
-        self.model_tester = TFGPTJModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=GPTJConfig, n_embd=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_gptj_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_gptj_model(*config_and_inputs)
-
-    def test_gptj_model_past(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_gptj_model_past(*config_and_inputs)
-
-    def test_gptj_model_att_mask_past(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_gptj_model_attention_mask_past(*config_and_inputs)
-
-    def test_gptj_model_past_large_inputs(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_gptj_model_past_large_inputs(*config_and_inputs)
-
-    def test_gptj_lm_head_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_gptj_lm_head_model(*config_and_inputs)
-
-    @slow
-    @unittest.skipIf(
-        not is_tf_available() or len(tf.config.list_physical_devices("GPU")) > 0,
-        "skip testing on GPU for now to avoid GPU OOM.",
-    )
-    def test_model_from_pretrained(self):
-        model = TFGPTJModel.from_pretrained("EleutherAI/gpt-j-6B", from_pt=True)
-        self.assertIsNotNone(model)
-
-    @unittest.skip(reason="Currently, model embeddings are going to undergo a major refactor.")
-    def test_resize_token_embeddings(self):
-        super().test_resize_token_embeddings()
-
-
-@require_tf
-@tooslow
-# Marked as @tooslow due to GPU OOM -- but still useful to run locally. Requires ~39GB of RAM.
-class TFGPTJModelLanguageGenerationTest(unittest.TestCase):
-    def test_lm_generate_gptj(self):
-        model = TFGPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", from_pt=True)
-        input_ids = tf.convert_to_tensor([[464, 3290]], dtype=tf.int32)  # The dog
-        # The dog is a man's best friend. It is a loyal companion, and it is a friend
-        expected_output_ids = [464, 3290, 318, 257, 582, 338, 1266, 1545, 13, 632, 318, 257, 9112, 15185, 11, 290, 340, 318, 257, 1545]  # fmt: skip
-        output_ids = model.generate(input_ids, do_sample=False)
-        self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids)
-
-    def test_gptj_sample(self):
-        tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", revision="float16")
-        model = TFGPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision="float16", from_pt=True)
-
-        tokenized = tokenizer("Today is a nice day and", return_tensors="tf")
-        # forces the generation to happen on CPU, to avoid GPU-related quirks
-        with tf.device(":/CPU:0"):
-            output_ids = model.generate(**tokenized, do_sample=True, seed=[42, 0])
-        output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-
-        EXPECTED_OUTPUT_STR = "Today is a nice day and I’m going to go for a walk. I’"
-        self.assertEqual(output_str, EXPECTED_OUTPUT_STR)
-
-    def _get_beam_search_test_objects(self):
-        model = TFGPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision="float16", from_pt=True)
-        tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", revision="float16")
-
-        tokenizer.padding_side = "left"
-
-        # Define PAD Token = EOS Token = 50256
-        tokenizer.pad_token = tokenizer.eos_token
-        model.config.pad_token_id = model.config.eos_token_id
-
-        # use different length sentences to test batching
-        sentences = [
-            "Hello, my dog is a little",
-            "Today, I",
-        ]
-        expected_output_sentences = [
-            "Hello, my dog is a little over a year old and has been diagnosed with hip dysplasia",
-            "Today, I’m going to be talking about a topic that’",
-        ]
-        return model, tokenizer, sentences, expected_output_sentences
-
-    def test_batch_beam_search(self):
-        # Confirms that we get the expected results with left-padded beam search
-        model, tokenizer, sentences, expected_output_sentences = self._get_beam_search_test_objects()
-
-        inputs = tokenizer(sentences, return_tensors="tf", padding=True)
-        outputs = model.generate(**inputs, do_sample=False, num_beams=2)
-        batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-        self.assertListEqual(expected_output_sentences, batch_out_sentence)
-
-    def test_batch_left_padding(self):
-        # Confirms that left-padding is working properly
-        model, tokenizer, sentences, expected_output_sentences = self._get_beam_search_test_objects()
-
-        inputs = tokenizer(sentences, return_tensors="tf", padding=True)
-        inputs_non_padded = tokenizer(sentences[0], return_tensors="tf")
-        output_non_padded = model.generate(**inputs_non_padded, do_sample=False, num_beams=2)
-        num_paddings = (
-            shape_list(inputs_non_padded["input_ids"])[-1]
-            - tf.reduce_sum(tf.cast(inputs["attention_mask"][-1], tf.int64)).numpy()
-        )
-        inputs_padded = tokenizer(sentences[1], return_tensors="tf")
-        output_padded = model.generate(
-            **inputs_padded, do_sample=False, num_beams=2, max_length=model.config.max_length - num_paddings
-        )
-        non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
-        padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)
-        self.assertListEqual(expected_output_sentences, [non_padded_sentence, padded_sentence])
-
-    def test_xla_beam_search(self):
-        # Confirms that XLA is working properly
-        model, tokenizer, sentences, expected_output_sentences = self._get_beam_search_test_objects()
-
-        inputs = tokenizer(sentences, return_tensors="tf", padding=True)
-        xla_generate = tf.function(model.generate, jit_compile=True)
-        outputs_xla = xla_generate(**inputs, do_sample=False, num_beams=2)
-        xla_sentence = tokenizer.batch_decode(outputs_xla, skip_special_tokens=True)
-        self.assertListEqual(expected_output_sentences, xla_sentence)
diff --git a/tests/models/groupvit/test_modeling_tf_groupvit.py b/tests/models/groupvit/test_modeling_tf_groupvit.py
deleted file mode 100644
index 24ffc88a82..0000000000
--- a/tests/models/groupvit/test_modeling_tf_groupvit.py
+++ /dev/null
@@ -1,695 +0,0 @@
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TensorFlow GroupViT model."""
-
-from __future__ import annotations
-
-import inspect
-import os
-import random
-import tempfile
-import unittest
-from importlib import import_module
-
-import requests
-
-from transformers import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig
-from transformers.testing_utils import (
-    require_tensorflow_probability,
-    require_tf,
-    require_vision,
-    slow,
-)
-from transformers.utils import is_tf_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import TFGroupViTModel, TFGroupViTTextModel, TFGroupViTVisionModel, TFSharedEmbeddings
-    from transformers.modeling_tf_utils import keras
-
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import CLIPProcessor
-
-
-class TFGroupViTVisionModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=12,
-        image_size=30,
-        patch_size=2,
-        num_channels=3,
-        is_training=True,
-        hidden_size=32,
-        depths=[6, 3, 3],
-        num_group_tokens=[64, 8, 0],
-        num_output_groups=[64, 8, 8],
-        num_attention_heads=4,
-        intermediate_size=37,
-        dropout=0.1,
-        attention_dropout=0.1,
-        initializer_range=0.02,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.is_training = is_training
-        self.hidden_size = hidden_size
-        self.depths = depths
-        self.num_hidden_layers = sum(depths)
-        self.expected_num_hidden_layers = len(depths) + 1
-        self.num_group_tokens = num_group_tokens
-        self.num_output_groups = num_output_groups
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.initializer_range = initializer_range
-        self.scope = scope
-
-        num_patches = (image_size // patch_size) ** 2
-        # no [CLS] token for GroupViT
-        self.seq_length = num_patches
-
-    def prepare_config_and_inputs(self):
-        rng = random.Random(0)
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size], rng=rng)
-        config = self.get_config()
-
-        return config, pixel_values
-
-    def get_config(self):
-        return GroupViTVisionConfig(
-            image_size=self.image_size,
-            patch_size=self.patch_size,
-            num_channels=self.num_channels,
-            hidden_size=self.hidden_size,
-            depths=self.depths,
-            num_group_tokens=self.num_group_tokens,
-            num_output_groups=self.num_output_groups,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            dropout=self.dropout,
-            attention_dropout=self.attention_dropout,
-            initializer_range=self.initializer_range,
-        )
-
-    def create_and_check_model(self, config, pixel_values):
-        model = TFGroupViTVisionModel(config=config)
-        result = model(pixel_values, training=False)
-        self.parent.assertEqual(
-            result.last_hidden_state.shape, (self.batch_size, self.num_output_groups[-1], self.hidden_size)
-        )
-        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_tf
-class TFGroupViTVisionModelTest(TFModelTesterMixin, unittest.TestCase):
-    """
-    Here we also overwrite some of the tests of test_modeling_common.py, as GroupViT does not use input_ids, inputs_embeds,
-    attention_mask and seq_length.
-    """
-
-    all_model_classes = (TFGroupViTVisionModel,) if is_tf_available() else ()
-
-    test_pruning = False
-    test_resize_embeddings = False
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFGroupViTVisionModelTester(self)
-        self.config_tester = ConfigTester(
-            self, config_class=GroupViTVisionConfig, has_text_modality=False, hidden_size=37
-        )
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    @unittest.skip(reason="GroupViT does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    """
-    During saving, TensorFlow will also run with `training=True` which trigger `gumbel_softmax` that requires
-    `tensorflow-probability`.
-    """
-
-    @require_tensorflow_probability
-    @slow
-    def test_saved_model_creation(self):
-        super().test_saved_model_creation()
-
-    @unittest.skip(reason="GroupViT does not use inputs_embeds")
-    def test_graph_mode_with_inputs_embeds(self):
-        pass
-
-    def test_model_common_attributes(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer))
-            x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, keras.layers.Layer))
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        seq_len = getattr(self.model_tester, "seq_length", None)
-
-        expected_num_attention_outputs = sum(g > 0 for g in self.model_tester.num_group_tokens)
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = False
-            config.return_dict = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
-            attentions = outputs.attentions
-            # GroupViT returns attention grouping of each stage
-            self.assertEqual(len(attentions), sum(g > 0 for g in self.model_tester.num_group_tokens))
-
-            # check that output_attentions also work using config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
-            attentions = outputs.attentions
-            # GroupViT returns attention grouping of each stage
-            self.assertEqual(len(attentions), expected_num_attention_outputs)
-
-            out_len = len(outputs)
-
-            # Check attention is always last and order is fine
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
-
-            added_hidden_states = 1
-            self.assertEqual(out_len + added_hidden_states, len(outputs))
-
-            self_attentions = outputs.attentions
-
-            # GroupViT returns attention grouping of each stage
-            self.assertEqual(len(self_attentions), expected_num_attention_outputs)
-            for i, self_attn in enumerate(self_attentions):
-                if self_attn is None:
-                    continue
-
-                self.assertListEqual(
-                    list(self_attentions[i].shape[-2:]),
-                    [
-                        self.model_tester.num_output_groups[i],
-                        self.model_tester.num_output_groups[i - 1] if i > 0 else seq_len,
-                    ],
-                )
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
-
-            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
-
-            expected_num_layers = getattr(
-                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-            )
-            self.assertEqual(len(hidden_states), expected_num_layers)
-
-            seq_length = getattr(self.model_tester, "seq_length", None)
-
-            self.assertListEqual(
-                list(hidden_states[0].shape[-2:]),
-                [seq_length, self.model_tester.hidden_size],
-            )
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "nvidia/groupvit-gcc-yfcc"
-        model = TFGroupViTVisionModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    @unittest.skip(
-        "TFGroupViTVisionModel does not convert `hidden_states` and `attentions` to tensors as they are all of"
-        " different dimensions, and we get `Got a non-Tensor value` error when saving the model."
-    )
-    @slow
-    def test_saved_model_creation_extended(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.output_hidden_states = True
-        config.output_attentions = True
-
-        if hasattr(config, "use_cache"):
-            config.use_cache = True
-
-        seq_len = getattr(self.model_tester, "seq_length", None)
-
-        for model_class in self.all_model_classes:
-            class_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-            model = model_class(config)
-            num_out = len(model(class_inputs_dict))
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname, saved_model=True)
-                saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
-                model = keras.models.load_model(saved_model_dir)
-                outputs = model(class_inputs_dict)
-                output_hidden_states = outputs["hidden_states"]
-                output_attentions = outputs["attentions"]
-
-                # Check num outputs
-                self.assertEqual(len(outputs), num_out)
-
-                # Check num layers
-                expected_num_layers = getattr(
-                    self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-                )
-
-                self.assertEqual(len(output_hidden_states), expected_num_layers)
-                self.assertEqual(len(output_attentions), self.model_tester.num_hidden_layers)
-
-                # Check attention outputs
-                image_size = (self.model_tester.image_size, self.model_tester.image_size)
-                patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
-                num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
-                seq_len = num_patches + 1
-
-                self.assertListEqual(
-                    list(output_attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads, seq_len, seq_len],
-                )
-
-                # Check hidden states
-                self.assertListEqual(
-                    list(output_hidden_states[0].shape[-2:]),
-                    [seq_len, self.model_tester.hidden_size],
-                )
-
-
-class TFGroupViTTextModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=12,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        dropout=0.1,
-        attention_dropout=0.1,
-        max_position_embeddings=512,
-        initializer_range=0.02,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.max_position_embeddings = max_position_embeddings
-        self.initializer_range = initializer_range
-        self.scope = scope
-
-    def prepare_config_and_inputs(self):
-        rng = random.Random(0)
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size, rng=rng)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-            # make sure the first token has attention mask `1` to ensure that, after combining the causal mask, there
-            # is still at least one token being attended to for each batch.
-            # TODO: Change `random_attention_mask` in PT/TF/Flax common test file, after a discussion with the team.
-            input_mask = tf.concat(
-                [tf.ones_like(input_mask[:, :1], dtype=input_mask.dtype), input_mask[:, 1:]], axis=-1
-            )
-
-        config = self.get_config()
-
-        return config, input_ids, input_mask
-
-    def get_config(self):
-        return GroupViTTextConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            dropout=self.dropout,
-            attention_dropout=self.attention_dropout,
-            max_position_embeddings=self.max_position_embeddings,
-            initializer_range=self.initializer_range,
-        )
-
-    def create_and_check_model(self, config, input_ids, input_mask):
-        model = TFGroupViTTextModel(config=config)
-        result = model(input_ids, attention_mask=input_mask, training=False)
-        result = model(input_ids, training=False)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, input_mask = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class TFGroupViTTextModelTest(TFModelTesterMixin, unittest.TestCase):
-    all_model_classes = (TFGroupViTTextModel,) if is_tf_available() else ()
-    test_pruning = False
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFGroupViTTextModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=GroupViTTextConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    @unittest.skip(reason="GroupViTTextModel does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "nvidia/groupvit-gcc-yfcc"
-        model = TFGroupViTTextModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    @slow
-    def test_saved_model_creation_extended(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.output_hidden_states = True
-        config.output_attentions = True
-
-        if hasattr(config, "use_cache"):
-            config.use_cache = True
-
-        for model_class in self.all_model_classes:
-            class_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-            model = model_class(config)
-            num_out = len(model(class_inputs_dict))
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname, saved_model=True)
-                saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
-                model = keras.models.load_model(saved_model_dir)
-                outputs = model(class_inputs_dict)
-                output_hidden_states = outputs["hidden_states"]
-                output_attentions = outputs["attentions"]
-
-                # Check number of outputs
-                self.assertEqual(len(outputs), num_out)
-
-                # Check number of layers
-                expected_num_layers = getattr(
-                    self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-                )
-
-                # Check hidden states
-                self.assertEqual(len(output_hidden_states), expected_num_layers)
-                self.assertListEqual(
-                    list(output_hidden_states[0].shape[-2:]),
-                    [self.model_tester.seq_length, self.model_tester.hidden_size],
-                )
-
-                # Check attention outputs
-                self.assertEqual(len(output_attentions), self.model_tester.num_hidden_layers)
-
-                seq_length = self.model_tester.seq_length
-                key_length = getattr(self.model_tester, "key_length", seq_length)
-
-                self.assertListEqual(
-                    list(output_attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads, seq_length, key_length],
-                )
-
-
-class TFGroupViTModelTester:
-    def __init__(self, parent, is_training=True):
-        self.parent = parent
-        self.text_model_tester = TFGroupViTTextModelTester(parent)
-        self.vision_model_tester = TFGroupViTVisionModelTester(parent)
-        self.is_training = is_training
-
-    def prepare_config_and_inputs(self):
-        text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
-        vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
-
-        config = self.get_config()
-
-        return config, input_ids, attention_mask, pixel_values
-
-    def get_config(self):
-        return GroupViTConfig.from_text_vision_configs(
-            self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64
-        )
-
-    def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
-        model = TFGroupViTModel(config)
-        result = model(input_ids, pixel_values, attention_mask, training=False)
-        self.parent.assertEqual(
-            result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size)
-        )
-        self.parent.assertEqual(
-            result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size)
-        )
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, attention_mask, pixel_values = config_and_inputs
-        inputs_dict = {
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-            "pixel_values": pixel_values,
-            "return_loss": True,
-        }
-        return config, inputs_dict
-
-
-@require_tf
-class TFGroupViTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (TFGroupViTModel,) if is_tf_available() else ()
-    pipeline_model_mapping = {"feature-extraction": TFGroupViTModel} if is_tf_available() else {}
-    test_head_masking = False
-    test_pruning = False
-    test_resize_embeddings = False
-    test_attention_outputs = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFGroupViTModelTester(self)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    @unittest.skip(reason="hidden_states are tested in individual model tests")
-    def test_hidden_states_output(self):
-        pass
-
-    @unittest.skip(reason="input_embeds are tested in individual model tests")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="CLIPModel does not have input/output embeddings")
-    def test_model_common_attributes(self):
-        pass
-
-    @require_tensorflow_probability
-    @slow
-    def test_keras_fit(self):
-        super().test_keras_fit()
-
-    # overwrite from common since `TFGroupViTModelTester` set `return_loss` to `True` and causes the preparation of
-    # `symbolic_inputs` failed.
-    def test_keras_save_load(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        # remove `return_loss` to make code work
-        if self.__class__.__name__ == "TFGroupViTModelTest":
-            inputs_dict.pop("return_loss", None)
-
-        tf_main_layer_classes = {
-            module_member
-            for model_class in self.all_model_classes
-            for module in (import_module(model_class.__module__),)
-            for module_member_name in dir(module)
-            if module_member_name.endswith("MainLayer")
-            # This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`.
-            and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")]
-            for module_member in (getattr(module, module_member_name),)
-            if isinstance(module_member, type)
-            and keras.layers.Layer in module_member.__bases__
-            and getattr(module_member, "_keras_serializable", False)
-        }
-        for main_layer_class in tf_main_layer_classes:
-            # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter
-            if "T5" in main_layer_class.__name__:
-                # Take the same values than in TFT5ModelTester for this shared layer
-                shared = TFSharedEmbeddings(99, 32, name="shared")
-                config.use_cache = inputs_dict.pop("use_cache", None)
-                main_layer = main_layer_class(config, embed_tokens=shared)
-            else:
-                main_layer = main_layer_class(config)
-
-            symbolic_inputs = {
-                name: keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items()
-            }
-
-            model = keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs))
-            outputs = model(inputs_dict)
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                filepath = os.path.join(tmpdirname, "keras_model.h5")
-                model.save(filepath)
-                if "T5" in main_layer_class.__name__:
-                    model = keras.models.load_model(
-                        filepath,
-                        custom_objects={
-                            main_layer_class.__name__: main_layer_class,
-                            "TFSharedEmbeddings": TFSharedEmbeddings,
-                        },
-                    )
-                else:
-                    model = keras.models.load_model(
-                        filepath, custom_objects={main_layer_class.__name__: main_layer_class}
-                    )
-                assert isinstance(model, keras.Model)
-                after_outputs = model(inputs_dict)
-                self.assert_outputs_same(after_outputs, outputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "nvidia/groupvit-gcc-yfcc"
-        model = TFGroupViTModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    @unittest.skip(reason="Currently `saved_model` doesn't work with nested outputs.")
-    @slow
-    def test_saved_model_creation(self):
-        pass
-
-    @unittest.skip(reason="`saved_model` doesn't work with nested outputs so no preparation happens.")
-    @slow
-    def test_prepare_serving_output(self):
-        pass
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-    im = Image.open(requests.get(url, stream=True).raw)
-    return im
-
-
-@require_vision
-@require_tf
-class TFGroupViTModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference(self):
-        model_name = "nvidia/groupvit-gcc-yfcc"
-        model = TFGroupViTModel.from_pretrained(model_name)
-        processor = CLIPProcessor.from_pretrained(model_name)
-
-        image = prepare_img()
-        inputs = processor(
-            text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="tf"
-        )
-
-        outputs = model(**inputs, training=False)
-
-        # verify the logits
-        self.assertEqual(
-            outputs.logits_per_image.shape,
-            tf.TensorShape((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
-        )
-        self.assertEqual(
-            outputs.logits_per_text.shape,
-            tf.TensorShape((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
-        )
-
-        expected_logits = tf.constant([[13.3523, 6.3629]])
-
-        tf.debugging.assert_near(outputs.logits_per_image, expected_logits, atol=1e-3)
diff --git a/tests/models/hubert/test_modeling_tf_hubert.py b/tests/models/hubert/test_modeling_tf_hubert.py
deleted file mode 100644
index 8d377ae885..0000000000
--- a/tests/models/hubert/test_modeling_tf_hubert.py
+++ /dev/null
@@ -1,562 +0,0 @@
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import copy
-import inspect
-import math
-import unittest
-
-import numpy as np
-import pytest
-
-from transformers import is_tf_available
-from transformers.testing_utils import require_soundfile, require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import HubertConfig, TFHubertForCTC, TFHubertModel, Wav2Vec2Processor
-    from transformers.models.hubert.modeling_tf_hubert import _compute_mask_indices
-
-
-@require_tf
-class TFHubertModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=1024,
-        is_training=False,
-        hidden_size=16,
-        feat_extract_norm="group",
-        feat_extract_dropout=0.0,
-        feat_extract_activation="gelu",
-        conv_dim=(32, 32, 32),
-        conv_stride=(4, 4, 4),
-        conv_kernel=(8, 8, 8),
-        conv_bias=False,
-        num_conv_pos_embeddings=16,
-        num_conv_pos_embedding_groups=2,
-        num_hidden_layers=2,
-        num_attention_heads=2,
-        hidden_dropout_prob=0.1,  # this is most likely not correctly set yet
-        intermediate_size=20,
-        layer_norm_eps=1e-5,
-        hidden_act="gelu",
-        initializer_range=0.02,
-        vocab_size=32,
-        do_stable_layer_norm=False,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.hidden_size = hidden_size
-        self.feat_extract_norm = feat_extract_norm
-        self.feat_extract_dropout = feat_extract_dropout
-        self.feat_extract_activation = feat_extract_activation
-        self.conv_dim = conv_dim
-        self.conv_stride = conv_stride
-        self.conv_kernel = conv_kernel
-        self.conv_bias = conv_bias
-        self.num_conv_pos_embeddings = num_conv_pos_embeddings
-        self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.intermediate_size = intermediate_size
-        self.layer_norm_eps = layer_norm_eps
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.vocab_size = vocab_size
-        self.do_stable_layer_norm = do_stable_layer_norm
-        self.scope = scope
-
-        output_seq_length = self.seq_length
-        for kernel, stride in zip(self.conv_kernel, self.conv_stride):
-            output_seq_length = (output_seq_length - (kernel - 1)) / stride
-        self.output_seq_length = int(math.ceil(output_seq_length))
-        self.encoder_seq_length = self.output_seq_length
-
-    def prepare_config_and_inputs(self):
-        input_values = tf.cast(ids_tensor([self.batch_size, self.seq_length], 32768), tf.float32) / 32768.0
-        attention_mask = tf.ones_like(input_values)
-
-        config = HubertConfig(
-            hidden_size=self.hidden_size,
-            feat_extract_norm=self.feat_extract_norm,
-            feat_extract_dropout=self.feat_extract_dropout,
-            feat_extract_activation=self.feat_extract_activation,
-            conv_dim=self.conv_dim,
-            conv_stride=self.conv_stride,
-            conv_kernel=self.conv_kernel,
-            conv_bias=self.conv_bias,
-            num_conv_pos_embeddings=self.num_conv_pos_embeddings,
-            num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            intermediate_size=self.intermediate_size,
-            layer_norm_eps=self.layer_norm_eps,
-            hidden_act=self.hidden_act,
-            initializer_range=self.initializer_range,
-            vocab_size=self.vocab_size,
-            do_stable_layer_norm=self.do_stable_layer_norm,
-        )
-
-        return config, input_values, attention_mask
-
-    def create_and_check_model(self, config, input_values, attention_mask):
-        model = TFHubertModel(config)
-        result = model(input_values, attention_mask=attention_mask)
-        self.parent.assertEqual(
-            result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size)
-        )
-
-    def create_and_check_batch_inference(self, config, input_values, *args):
-        # test does not pass for models making use of `group_norm`
-        # check: https://github.com/pytorch/fairseq/issues/3227
-        config.layerdrop = 0.0
-        model = TFHubertModel(config)
-
-        input_values = input_values[:3]
-        attention_mask = tf.ones_like(input_values)
-
-        input_lengths = tf.constant([input_values.shape[-1] // i for i in [4, 2, 1]])
-        length_mask = tf.sequence_mask(input_lengths, dtype=tf.float32)
-
-        # convert values that are over input_lengths to padding
-        input_values = input_values * length_mask
-        attention_mask = attention_mask * length_mask
-
-        batch_outputs = model(input_values, attention_mask=attention_mask, training=False).last_hidden_state
-
-        for i in range(input_values.shape[0]):
-            input_slice = input_values[i : i + 1, : input_lengths[i]]
-            output = model(input_slice, training=False).last_hidden_state
-
-            batch_output = batch_outputs[i : i + 1, : output.shape[1]]
-            self.parent.assertTrue(np.allclose(output, batch_output, atol=1e-3))
-
-    def check_ctc_loss(self, config, input_values, *args):
-        model = TFHubertForCTC(config)
-
-        input_values = input_values[:3]
-        attention_mask = tf.ones_like(input_values)
-
-        input_lengths = tf.constant([input_values.shape[-1] // i for i in [4, 2, 1]])
-        max_length_labels = model.hubert._get_feat_extract_output_lengths(input_lengths)
-        labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size)
-
-        length_mask = tf.sequence_mask(input_lengths, dtype=tf.float32)
-
-        # convert values that are over input_lengths to padding
-        input_values = input_values * length_mask
-        attention_mask = attention_mask * length_mask
-
-        model.config.ctc_loss_reduction = "sum"
-        sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss
-
-        model.config.ctc_loss_reduction = "mean"
-        mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss
-
-        self.parent.assertTrue(abs(labels.shape[0] * mean_loss - sum_loss) < 1e-2)
-
-    def check_training(self, config, input_values, *args):
-        model = TFHubertForCTC(config)
-
-        # freeze feature encoder
-        model.freeze_feature_encoder()
-
-        input_values = input_values[:3]
-
-        input_lengths = tf.constant([input_values.shape[-1] // i for i in [4, 2, 1]])
-        max_length_labels = model.hubert._get_feat_extract_output_lengths(input_lengths)
-        labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size)
-
-        length_mask = tf.sequence_mask(input_lengths, dtype=tf.float32)
-
-        input_values = input_values * length_mask
-
-        pad_size = max(max_length_labels) - labels.shape[1]
-        labels = tf.pad(labels, ((0, 0), (0, pad_size)), constant_values=-100)
-
-        loss = model(input_values, labels=labels, training=True).loss
-
-        self.parent.assertFalse(tf.math.is_inf(loss))
-
-    def check_labels_out_of_vocab(self, config, input_values, *args):
-        model = TFHubertForCTC(config)
-        input_lengths = tf.constant([input_values.shape[-1] // i for i in [4, 2, 1]])
-        max_length_labels = model.hubert._get_feat_extract_output_lengths(input_lengths)
-        labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size + 100)
-        with pytest.raises(ValueError):
-            model(input_values, labels=labels)
-
-    def prepare_config_and_inputs_for_common(self):
-        config, input_values, attention_mask = self.prepare_config_and_inputs()
-        inputs_dict = {"input_values": input_values, "attention_mask": attention_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class TFHubertModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (TFHubertModel, TFHubertForCTC) if is_tf_available() else ()
-    pipeline_model_mapping = {"feature-extraction": TFHubertModel} if is_tf_available() else {}
-    test_resize_embeddings = False
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFHubertModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=HubertConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    # overwrite because input_values != input_ids
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["input_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    # overwrite because input_values != input_ids
-    def test_keyword_and_dict_args(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            inputs = self._prepare_for_class(inputs_dict, model_class)
-
-            outputs_dict = model(inputs)
-
-            inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
-            input_values = inputs_keywords.pop("input_values", None)
-            outputs_keywords = model(input_values, **inputs_keywords)
-            output_dict = outputs_dict[0].numpy()
-            output_keywords = outputs_keywords[0].numpy()
-
-            self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_hidden_states_output(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        def check_hidden_states_output(config, inputs_dict, model_class):
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-            expected_num_layers = getattr(
-                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-            )
-
-            hidden_states = outputs.hidden_states
-            self.assertEqual(config.output_attentions, False)
-            self.assertEqual(len(hidden_states), expected_num_layers)
-            self.assertListEqual(
-                list(hidden_states[0].shape[-2:]),
-                [self.model_tester.output_seq_length, self.model_tester.hidden_size],
-            )
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(config, inputs_dict, model_class)
-
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-            check_hidden_states_output(config, inputs_dict, model_class)
-
-    def test_ctc_loss_inference(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_ctc_loss(*config_and_inputs)
-
-    def test_train(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_training(*config_and_inputs)
-
-    def test_labels_out_of_vocab(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_labels_out_of_vocab(*config_and_inputs)
-
-    @unittest.skip(reason="Hubert has no input embeddings")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="Hubert has no tokens embeddings")
-    def test_resize_tokens_embeddings(self):
-        pass
-
-    @unittest.skip(reason="Hubert has no input embeddings")
-    def test_model_common_attributes(self):
-        pass
-
-    @slow
-    def test_model_from_pretrained(self):
-        model = TFHubertModel.from_pretrained("facebook/hubert-base-ls960")
-        self.assertIsNotNone(model)
-
-    @unittest.skip(reason="Fix me! Hubert hits OOM errors when loss is computed on full batch")
-    def test_dataset_conversion(self):
-        # TODO: (Amy) - check whether skipping CTC model resolves this issue and possible resolutions for CTC
-        pass
-
-    @unittest.skip(reason="Fix me! Hubert hits OOM errors when loss is computed on full batch")
-    def test_keras_fit(self):
-        # TODO: (Amy) - check whether skipping CTC model resolves this issue and possible resolutions for CTC
-        pass
-
-
-@require_tf
-class TFHubertRobustModelTest(TFModelTesterMixin, unittest.TestCase):
-    all_model_classes = (TFHubertModel, TFHubertForCTC) if is_tf_available() else ()
-    test_resize_embeddings = False
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFHubertModelTester(
-            self,
-            conv_stride=(3, 3, 3),
-            feat_extract_norm="layer",
-            do_stable_layer_norm=True,
-            scope="robust",
-        )
-        self.config_tester = ConfigTester(self, config_class=HubertConfig, hidden_size=37)
-
-    # overwrite because input_values != input_ids
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["input_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    # overwrite because input_values != input_ids
-    def test_keyword_and_dict_args(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            inputs = self._prepare_for_class(inputs_dict, model_class)
-
-            outputs_dict = model(inputs)
-
-            inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
-            input_values = inputs_keywords.pop("input_values", None)
-            outputs_keywords = model(input_values, **inputs_keywords)
-            output_dict = outputs_dict[0].numpy()
-            output_keywords = outputs_keywords[0].numpy()
-
-            self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_hidden_states_output(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        def check_hidden_states_output(config, inputs_dict, model_class):
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-            expected_num_layers = getattr(
-                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-            )
-
-            hidden_states = outputs.hidden_states
-            self.assertEqual(config.output_attentions, False)
-            self.assertEqual(len(hidden_states), expected_num_layers)
-            self.assertListEqual(
-                list(hidden_states[0].shape[-2:]),
-                [self.model_tester.output_seq_length, self.model_tester.hidden_size],
-            )
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(config, inputs_dict, model_class)
-
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-            check_hidden_states_output(config, inputs_dict, model_class)
-
-    def test_batched_inference(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_batch_inference(*config_and_inputs)
-
-    def test_ctc_loss_inference(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_ctc_loss(*config_and_inputs)
-
-    def test_train(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_training(*config_and_inputs)
-
-    def test_labels_out_of_vocab(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_labels_out_of_vocab(*config_and_inputs)
-
-    @unittest.skip(reason="Hubert has no input embeddings")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="Hubert has no tokens embeddings")
-    def test_resize_tokens_embeddings(self):
-        pass
-
-    @unittest.skip(reason="Hubert has no input embeddings or get_input_embeddings method")
-    def test_model_common_attributes(self):
-        pass
-
-    @slow
-    def test_model_from_pretrained(self):
-        model = TFHubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
-        self.assertIsNotNone(model)
-
-    @unittest.skip(reason="Fix me! Hubert hits OOM errors when loss is computed on full batch")
-    def test_dataset_conversion(self):
-        # TODO: (Amy) - check whether skipping CTC model resolves this issue and possible resolutions for CTC
-        pass
-
-    @unittest.skip(reason="Fix me! Hubert hits OOM errors when loss is computed on full batch")
-    def test_keras_fit(self):
-        # TODO: (Amy) - check whether skipping CTC model resolves this issue and possible resolutions for CTC
-        pass
-
-
-@require_tf
-class TFHubertUtilsTest(unittest.TestCase):
-    def test_compute_mask_indices(self):
-        batch_size = 4
-        sequence_length = 60
-        mask_prob = 0.5
-        mask_length = 1
-
-        mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length)
-
-        self.assertListEqual(
-            tf.reduce_sum(mask, -1).numpy().tolist(), [mask_prob * sequence_length for _ in range(batch_size)]
-        )
-
-    def test_compute_mask_indices_overlap(self):
-        batch_size = 4
-        sequence_length = 80
-        mask_prob = 0.5
-        mask_length = 4
-
-        mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length)
-
-        # because of overlap mask don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal
-        for batch_sum in tf.reduce_sum(mask, -1):
-            self.assertTrue(int(batch_sum) <= mask_prob * sequence_length)
-
-
-@require_tf
-@slow
-@require_soundfile
-class TFHubertModelIntegrationTest(unittest.TestCase):
-    def _load_datasamples(self, num_samples):
-        from datasets import load_dataset
-
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        # automatic decoding with librispeech
-        speech_samples = ds.sort("id").filter(
-            lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
-        )[:num_samples]["audio"]
-
-        return [x["array"] for x in speech_samples]
-
-    def test_inference_ctc_normal(self):
-        model = TFHubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
-        processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft", do_lower_case=True)
-        input_speech = self._load_datasamples(1)
-
-        input_values = processor(input_speech, return_tensors="tf", sampling_rate=16000).input_values
-
-        logits = model(input_values).logits
-
-        predicted_ids = tf.argmax(logits, axis=-1)
-        predicted_trans = processor.batch_decode(predicted_ids)
-
-        EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"]
-        self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
-
-    def test_inference_ctc_normal_batched(self):
-        model = TFHubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
-        processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft", do_lower_case=True)
-
-        input_speech = self._load_datasamples(2)
-
-        input_values = processor(input_speech, return_tensors="tf", padding=True, sampling_rate=16000).input_values
-
-        logits = model(input_values).logits
-
-        predicted_ids = tf.argmax(logits, axis=-1)
-        predicted_trans = processor.batch_decode(predicted_ids)
-
-        EXPECTED_TRANSCRIPTIONS = [
-            "a man said to the universe sir i exist",
-            "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
-        ]
-        self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
-
-    def test_inference_ctc_robust_batched(self):
-        model = TFHubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
-        processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft", do_lower_case=True)
-
-        input_speech = self._load_datasamples(4)
-
-        inputs = processor(input_speech, return_tensors="tf", padding=True, sampling_rate=16000)
-
-        input_values = inputs.input_values
-        attention_mask = inputs.attention_mask
-
-        logits = model(input_values, attention_mask=attention_mask).logits
-
-        predicted_ids = tf.argmax(logits, axis=-1)
-        predicted_trans = processor.batch_decode(predicted_ids)
-
-        EXPECTED_TRANSCRIPTIONS = [
-            "a man said to the universe sir i exist",
-            "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
-            "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around"
-            " him with the thousands of spectators were trivialities not worth thinking about",
-            "his instant of panic was followed by a small sharp blow high on his chest",
-        ]
-        self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
diff --git a/tests/models/idefics/test_modeling_tf_idefics.py b/tests/models/idefics/test_modeling_tf_idefics.py
deleted file mode 100644
index bd7c9b06c2..0000000000
--- a/tests/models/idefics/test_modeling_tf_idefics.py
+++ /dev/null
@@ -1,559 +0,0 @@
-# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TF Idefics model."""
-
-import os
-import tempfile
-import unittest
-from importlib import import_module
-
-from transformers import IdeficsConfig, is_tf_available, is_vision_available
-from transformers.testing_utils import TestCasePlus, require_tf, require_vision, slow
-from transformers.utils import cached_property
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import IdeficsProcessor, TFIdeficsForVisionText2Text, TFIdeficsModel
-    from transformers.modeling_tf_utils import keras
-    from transformers.models.idefics.configuration_idefics import IdeficsPerceiverConfig, IdeficsVisionConfig
-
-if is_vision_available():
-    from PIL import Image
-
-
-IDEFICS_TINY_RANDOM_MODEL = "HuggingFaceM4/tiny-random-idefics"
-
-
-class IdeficsModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=1,
-        seq_length=7,
-        image_size=30,
-        patch_size=2,
-        num_channels=3,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=5,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_labels=3,
-        scope=None,
-        modality_type_vocab_size=2,
-        vision_embed_dim=32,
-        vision_patch_size=2,
-        vision_image_size=30,
-        vision_num_attention_heads=4,
-        vision_num_hidden_layers=5,
-        vision_intermediate_size=37,
-        perceiver_qk_layer_norms_perceiver=False,
-        perceiver_resampler_depth=2,
-        perceiver_resampler_head_dim=8,
-        perceiver_resampler_n_heads=2,
-        perceiver_resampler_n_latents=16,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.num_labels = num_labels
-        self.scope = scope
-        self.modality_type_vocab_size = modality_type_vocab_size
-
-        self.vision_embed_dim = vision_embed_dim
-        self.vision_patch_size = vision_patch_size
-        self.vision_image_size = vision_image_size
-        self.vision_num_attention_heads = vision_num_attention_heads
-        self.vision_num_hidden_layers = vision_num_hidden_layers
-        self.vision_intermediate_size = vision_intermediate_size
-
-        self.vision_config = IdeficsVisionConfig(
-            embed_dim=self.vision_embed_dim,
-            patch_size=self.vision_patch_size,
-            image_size=self.vision_image_size,
-            num_attention_heads=self.vision_num_attention_heads,
-            num_hidden_layers=self.vision_num_hidden_layers,
-            intermediate_size=self.vision_intermediate_size,
-        )
-
-        self.perceiver_qk_layer_norms_perceiver = perceiver_qk_layer_norms_perceiver
-        self.perceiver_resampler_depth = perceiver_resampler_depth
-        self.perceiver_resampler_head_dim = perceiver_resampler_head_dim
-        self.perceiver_resampler_n_heads = perceiver_resampler_n_heads
-        self.perceiver_resampler_n_latents = perceiver_resampler_n_latents
-
-        self.perceiver_config = IdeficsPerceiverConfig(
-            qk_layer_norms_perceiver=self.perceiver_qk_layer_norms_perceiver,
-            resampler_depth=self.perceiver_resampler_depth,
-            resampler_head_dim=self.perceiver_resampler_head_dim,
-            resampler_n_heads=self.perceiver_resampler_n_heads,
-            resampler_n_latents=self.perceiver_resampler_n_latents,
-        )
-
-        # we set the expected sequence length (which is used in several tests)
-        # this is equal to the seq length of the text tokens + number of image patches + 1 for the CLS token
-        self.expected_seq_len = self.seq_length + (self.image_size // self.patch_size) ** 2 + 1
-
-    def prepare_config_and_inputs(self, num_images=1, interpolate_pos_encoding=False, image_expansion=0):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        pixel_values = floats_tensor(
-            [
-                self.batch_size,
-                num_images,
-                self.num_channels,
-                self.image_size + image_expansion,
-                self.image_size + image_expansion,
-            ]
-        )
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        image_attention_mask = random_attention_mask([self.batch_size, self.seq_length, num_images])
-
-        config = self.get_config()
-        return (config, input_ids, input_mask, pixel_values, image_attention_mask, interpolate_pos_encoding)
-
-    def get_config(self):
-        return IdeficsConfig(
-            image_size=self.image_size,
-            patch_size=self.patch_size,
-            num_channels=self.num_channels,
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-            num_labels=self.num_labels,
-            modality_type_vocab_size=self.modality_type_vocab_size,
-            vision_config=self.vision_config,
-        )
-
-    def create_and_check_model(
-        self,
-        config,
-        input_ids,
-        input_mask,
-        pixel_values,
-        image_attention_mask,
-        interpolate_pos_encoding,
-    ):
-        model = TFIdeficsModel(config=config)
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            pixel_values=pixel_values,
-            image_attention_mask=image_attention_mask,
-            interpolate_pos_encoding=interpolate_pos_encoding,
-        )
-        self.parent.assertEqual(
-            result.last_hidden_state.shape, (self.batch_size, input_ids.shape[1], self.hidden_size)
-        )
-
-    def create_and_check_model_gen(
-        self,
-        config,
-        input_ids,
-        input_mask,
-        pixel_values,
-        image_attention_mask,
-        interpolate_pos_encoding,
-    ):
-        model = TFIdeficsForVisionText2Text(config)
-        model.generate(
-            input_ids,
-            attention_mask=input_mask,
-            pixel_values=pixel_values,
-            image_attention_mask=image_attention_mask,
-            interpolate_pos_encoding=interpolate_pos_encoding,
-            max_length=self.seq_length + 2,
-        )
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            input_mask,
-            pixel_values,
-            image_attention_mask,
-            interpolate_pos_encoding,
-        ) = config_and_inputs
-        inputs_dict = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "pixel_values": pixel_values,
-            "image_attention_mask": image_attention_mask,
-            "interpolate_pos_encoding": interpolate_pos_encoding,
-        }
-        return config, inputs_dict
-
-    def prepare_pixel_values(self):
-        return floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-
-@require_tf
-class TFIdeficsModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (TFIdeficsModel, TFIdeficsForVisionText2Text) if is_tf_available() else ()
-    pipeline_model_mapping = {"feature-extraction": TFIdeficsModel} if is_tf_available() else {}
-    test_pruning = False
-    test_headmasking = False
-    test_onnx = False
-    test_resize_embeddings = False
-
-    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
-        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
-        # XXX: IdeficsForVisionText2TextTest has no MODEL_FOR group yet, but it should be the same
-        # as MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, so for now manually changing to do the right thing
-        # as super won't do it
-        if return_labels:
-            inputs_dict["labels"] = tf.zeros(
-                (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int64
-            )
-        return inputs_dict
-
-    def test_model_outputs_equivalence(self):
-        try:
-            orig = self.all_model_classes
-            # IdeficsModel.forward doesn't have labels input arg - only IdeficsForVisionText2Text does
-            self.all_model_classes = (TFIdeficsForVisionText2Text,) if is_tf_available() else ()
-            super().test_model_outputs_equivalence()
-        finally:
-            self.all_model_classes = orig
-
-    def setUp(self):
-        self.model_tester = IdeficsModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=IdeficsConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model_single_image(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs(
-            num_images=1, interpolate_pos_encoding=False, image_expansion=0
-        )
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_model_multiple_images(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs(
-            num_images=2, interpolate_pos_encoding=False, image_expansion=0
-        )
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_model_with_image_pos_embeddings_interpolation_single_image(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs(
-            num_images=1, interpolate_pos_encoding=True, image_expansion=2
-        )
-        self.model_tester.create_and_check_model(*config_and_inputs)
-        config_and_inputs = self.model_tester.prepare_config_and_inputs(
-            num_images=1, interpolate_pos_encoding=True, image_expansion=0
-        )
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_model_with_image_pos_embeddings_interpolation_multiple_images(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs(
-            num_images=2, interpolate_pos_encoding=True, image_expansion=2
-        )
-        self.model_tester.create_and_check_model(*config_and_inputs)
-        config_and_inputs = self.model_tester.prepare_config_and_inputs(
-            num_images=2, interpolate_pos_encoding=True, image_expansion=0
-        )
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_generate_with_image_pos_embeddings_interpolation_single_image(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs(
-            num_images=1, interpolate_pos_encoding=True, image_expansion=2
-        )
-        self.model_tester.create_and_check_model_gen(*config_and_inputs)
-
-    def test_generate_with_image_pos_embeddings_interpolation_multiple_images(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs(
-            num_images=2, interpolate_pos_encoding=True, image_expansion=2
-        )
-        self.model_tester.create_and_check_model_gen(*config_and_inputs)
-
-    def test_training_gradient_checkpointing(self):
-        pass
-
-    @unittest.skip(reason="""IDEFICS does not support retaining the gradients of the hidden states and attention""")
-    def test_retain_grad_hidden_states_attentions(self):
-        return
-
-    @unittest.skip(reason="IDEFICS uses out-of-bounds embeddings deliberately.")
-    def test_embeddings_out_of_bounds_raise_exception(self):
-        pass
-
-    @unittest.skip(reason="IDEFICS attention weights are not extracted in scaled_dot_product_attention")
-    def test_prepare_serving_output(self):
-        pass
-
-    def test_model_common_attributes(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer))
-            x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer))
-
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = False
-            config.return_dict = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.attentions
-
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-            # check that output_attentions also work using config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.attentions
-            # IDEFICS does not support outputting attention score because it uses SDPA under the hood
-            self.assertTrue(attentions[0] is None)
-            out_len = len(outputs)
-
-            # Check attention is always last and order is fine
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            self.assertEqual(out_len + 1, len(outputs))
-
-            self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-
-            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
-            # IDEFICS does not support outputting attention score because it uses SDPA under the hood
-            self.assertTrue(self_attentions[0] is None)
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
-
-            expected_num_layers = getattr(
-                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-            )
-            self.assertEqual(len(hidden_states), expected_num_layers)
-
-            seq_length = self.model_tester.seq_length
-
-            self.assertListEqual(
-                list(hidden_states[0].shape[-2:]),
-                [seq_length, self.model_tester.hidden_size],
-            )
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-    def test_keras_save_load(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        tf_main_layer_classes = {
-            module_member
-            for model_class in self.all_model_classes
-            for module in (import_module(model_class.__module__),)
-            for module_member_name in dir(module)
-            if module_member_name.endswith("MainLayer")
-            for module_member in (getattr(module, module_member_name),)
-            if isinstance(module_member, type)
-            and keras.layers.Layer in module_member.__bases__
-            and getattr(module_member, "_keras_serializable", False)
-        }
-
-        for main_layer_class in tf_main_layer_classes:
-            main_layer = main_layer_class(config)
-
-            symbolic_inputs = {
-                name: keras.Input(tensor.shape[1:], dtype=tensor.dtype, batch_size=2)
-                for name, tensor in inputs_dict.items()
-                if tf.is_tensor(tensor)
-            }
-            model = keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs))
-            outputs = model(inputs_dict)
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                filepath = os.path.join(tmpdirname, "keras_model.h5")
-                model.save(filepath)
-                model = keras.models.load_model(filepath, custom_objects={main_layer_class.__name__: main_layer_class})
-                assert isinstance(model, keras.Model)
-                after_outputs = model(inputs_dict)
-                self.assert_outputs_same(after_outputs, outputs)
-
-    @unittest.skip(reason="IDEFICS test_keras_fit testing done in TFIdeficsForVisionText2TextTest")
-    def test_keras_fit(self):
-        pass
-
-    @slow
-    def test_model_from_pretrained(self):
-        model = TFIdeficsModel.from_pretrained(IDEFICS_TINY_RANDOM_MODEL, from_pt=True)
-        self.assertIsNotNone(model)
-
-    @unittest.skip(reason="Currently `saved_model` doesn't work with nested outputs.")
-    def test_saved_model_creation(self):
-        pass
-
-    @unittest.skip(reason="""IDEFICS loss computation not implemented yet""")
-    def test_loss_computation(self):
-        pass
-
-
-@require_tf
-class TFIdeficsForVisionText2TextTest(TFIdeficsModelTest, unittest.TestCase):
-    all_model_classes = (TFIdeficsForVisionText2Text,) if is_tf_available() else ()
-    test_resize_embeddings = False
-
-    def setUp(self):
-        self.model_tester = IdeficsModelTester(
-            self,
-            modality_type_vocab_size=3,
-        )
-        self.config_tester = ConfigTester(self, config_class=IdeficsConfig, hidden_size=37)
-
-    @unittest.skip("We only test the model that takes in multiple images")
-    def test_model(self):
-        pass
-
-    @unittest.skip("We only test the model that takes in multiple images")
-    def test_for_token_classification(self):
-        pass
-
-    @unittest.skip(reason="""IDEFICS does not support retaining the gradients of the hidden states and attention""")
-    def test_retain_grad_hidden_states_attentions(self):
-        pass
-
-    @unittest.skip(reason="""IDEFICS loss computation not implemented yet""")
-    def test_loss_computation(self):
-        pass
-
-    @slow
-    def test_keras_fit(self):
-        super().test_keras_fit()
-
-
-# Below is the expected output for the integration test TFIdeficsModelIntegrationTest.
-# Since we are using tiny-random to be able to fit it on the CI GPU,it is better to assert on the
-# ids because the generated text is gibberish
-
-# fmt: off
-EXPECTED_GENERATED_IDS = [[0, 0, 1, 4911, 29901, 32000, 32001, 32000, 20355, 915, 445, 1967, 29889, 13, 7900, 22137, 29901, 530, 1967, 310, 1023, 26361, 29889, 13, 2659, 29901, 32000, 32001, 32000, 20355, 915, 445, 1967, 29889, 13, 7900, 22137, 29901, 25519, 22326, 8071, 26357, 28004, 4428, 5916, 14383, 1033, 12358, 10536, 21834, 10447, 21201, 18102, 16886, 8875, 25388, 25914, 28304, 8558, 31048, 1322, 25952, 189, 31600, 3600, 12824, 7045, 28090, 20228, 32001, 5385, 29186, 2165, 11822, 13825, 23077, 7883, 22504, 2078, 18893, 2179, 10556, 9515, 7672, 3491, 12403, 5398, 27299, 6463, 16349, 23037, 28956, 16960, 22664, 7724, 17587, 17424, 10175, 17417, 5930, 30855, 17695, 16170, 14474, 29996, 313, 14502, 3241, 13618, 32001, 5385, 29186, 2165, 11822, 13825, 19934, 4875, 27142, 3230, 2709, 28054, 3270, 19148, 10917, 1060, 26443, 12259, 1347, 28482, 3830, 25519, 199, 12782, 9144, 12289, 1142, 18400, 21390, 19129, 7292, 28430, 24711, 5551, 30349, 30533, 13271, 17697, 4982, 8713, 5380, 17869, 12490, 5398, 27299, 11593, 19918, 15924, 29430, 10175, 17417, 5930, 30855, 17695, 16170, 14474, 19234],
-                          [1, 4911, 29901, 32000, 32001, 32000, 20355, 915, 445, 1967, 29889, 13, 7900, 22137, 29901, 530, 1967, 310, 1023, 413, 986, 575, 29889, 13, 2659, 29901, 32000, 32001, 32000, 20355, 915, 445, 1967, 29889, 13, 7900, 22137, 29901, 25519, 22326, 8071, 26357, 28004, 4428, 17554, 20500, 21714, 27834, 4798, 12195, 30379, 5427, 20228, 10473, 14351, 8049, 15605, 14491, 212, 2711, 32000, 21714, 31259, 24368, 19036, 22970, 26083, 19394, 20372, 7672, 9939, 25388, 30533, 8200, 30271, 2114, 24749, 13224, 10603, 21118, 2179, 3759, 16515, 6587, 1287, 23998, 17793, 32001, 5385, 29186, 2165, 11822, 13825, 29732, 17503, 2729, 6722, 2943, 1221, 16043, 18244, 24965, 14383, 19840, 5980, 13488, 28531, 735, 26146, 22504, 2078, 18893, 20372, 7672, 32001, 5385, 29186, 2165, 11822, 13825, 29732, 17503, 2729, 6722, 19551, 220, 10528, 28940, 4453, 28266, 15416, 18693, 8199, 1153, 27706, 29231, 29186, 2165, 11822, 13825, 29732, 17503, 2729, 6722, 19551, 8231, 10739, 31992, 25906, 22254, 23127, 7689, 19614, 1149, 18844, 23037, 28956, 16960, 22664, 6975, 28938, 24002, 11026, 15020, 21964, 16307], ]
-
-@require_tf
-@require_vision
-class TFIdeficsModelIntegrationTest(TestCasePlus):
-    @cached_property
-    def default_processor(self):
-        return IdeficsProcessor.from_pretrained(IDEFICS_TINY_RANDOM_MODEL) if is_vision_available() else None
-
-    @slow
-    def test_inference_natural_language_visual_reasoning(self):
-        cat_image_path = self.tests_dir / "fixtures/tests_samples/COCO/000000039769.png"
-        cats_image_obj = Image.open(cat_image_path)  # 2 cats
-        dogs_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image1.jpeg"
-
-        prompts = [
-            [
-                "User:",
-                dogs_image_url,
-                "Describe this image.\nAssistant: An image of two dogs.\n",
-                "User:",
-                cats_image_obj,
-                "Describe this image.\nAssistant:",
-            ],
-            [
-                "User:",
-                cats_image_obj,
-                "Describe this image.\nAssistant: An image of two kittens.\n",
-                "User:",
-                dogs_image_url,
-                "Describe this image.\nAssistant:",
-            ],
-        ]
-
-        model = TFIdeficsForVisionText2Text.from_pretrained(IDEFICS_TINY_RANDOM_MODEL, from_pt=True)
-        processor = self.default_processor
-        inputs = processor(prompts, return_tensors="tf")
-        generated_ids = model.generate(**inputs, max_length=100)
-        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
-
-        # keep for debugging
-        for i, t in enumerate(generated_text):
-            t = bytes(t, "utf-8").decode("unicode_escape")
-            print(f"{i}:\n{t}\n")
-
-        self.assertListEqual(EXPECTED_GENERATED_IDS[0], generated_ids[0].numpy().tolist())
-        self.assertListEqual(EXPECTED_GENERATED_IDS[1], generated_ids[1].numpy().tolist())
diff --git a/tests/models/layoutlm/test_modeling_tf_layoutlm.py b/tests/models/layoutlm/test_modeling_tf_layoutlm.py
deleted file mode 100644
index f2690b14fb..0000000000
--- a/tests/models/layoutlm/test_modeling_tf_layoutlm.py
+++ /dev/null
@@ -1,369 +0,0 @@
-# Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors, The Hugging Face Team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import unittest
-
-import numpy as np
-
-from transformers import LayoutLMConfig, is_tf_available
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers.models.layoutlm.modeling_tf_layoutlm import (
-        TFLayoutLMForMaskedLM,
-        TFLayoutLMForQuestionAnswering,
-        TFLayoutLMForSequenceClassification,
-        TFLayoutLMForTokenClassification,
-        TFLayoutLMModel,
-    )
-
-
-class TFLayoutLMModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_labels=3,
-        num_choices=4,
-        scope=None,
-        range_bbox=1000,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.num_labels = num_labels
-        self.num_choices = num_choices
-        self.scope = scope
-        self.range_bbox = range_bbox
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        # convert bbox to numpy since TF does not support item assignment
-        bbox = ids_tensor([self.batch_size, self.seq_length, 4], self.range_bbox).numpy()
-        # Ensure that bbox is legal
-        for i in range(bbox.shape[0]):
-            for j in range(bbox.shape[1]):
-                if bbox[i, j, 3] < bbox[i, j, 1]:
-                    t = bbox[i, j, 3]
-                    bbox[i, j, 3] = bbox[i, j, 1]
-                    bbox[i, j, 1] = t
-                if bbox[i, j, 2] < bbox[i, j, 0]:
-                    t = bbox[i, j, 2]
-                    bbox[i, j, 2] = bbox[i, j, 0]
-                    bbox[i, j, 0] = t
-        bbox = tf.convert_to_tensor(bbox)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = LayoutLMConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            initializer_range=self.initializer_range,
-        )
-
-        return config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def create_and_check_model(
-        self, config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFLayoutLMModel(config=config)
-
-        result = model(input_ids, bbox, attention_mask=input_mask, token_type_ids=token_type_ids)
-        result = model(input_ids, bbox, token_type_ids=token_type_ids)
-        result = model(input_ids, bbox)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
-    def create_and_check_for_masked_lm(
-        self, config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFLayoutLMForMaskedLM(config=config)
-
-        result = model(input_ids, bbox, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_for_sequence_classification(
-        self, config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFLayoutLMForSequenceClassification(config=config)
-
-        result = model(input_ids, bbox, attention_mask=input_mask, token_type_ids=token_type_ids)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_for_token_classification(
-        self, config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFLayoutLMForTokenClassification(config=config)
-
-        result = model(input_ids, bbox, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def create_and_check_for_question_answering(
-        self, config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFLayoutLMForQuestionAnswering(config=config)
-
-        result = model(input_ids, bbox, attention_mask=input_mask, token_type_ids=token_type_ids)
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            bbox,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {
-            "input_ids": input_ids,
-            "bbox": bbox,
-            "token_type_ids": token_type_ids,
-            "attention_mask": input_mask,
-        }
-        return config, inputs_dict
-
-
-@require_tf
-class TFLayoutLMModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFLayoutLMModel,
-            TFLayoutLMForMaskedLM,
-            TFLayoutLMForTokenClassification,
-            TFLayoutLMForSequenceClassification,
-            TFLayoutLMForQuestionAnswering,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFLayoutLMModel,
-            "fill-mask": TFLayoutLMForMaskedLM,
-            "text-classification": TFLayoutLMForSequenceClassification,
-            "token-classification": TFLayoutLMForTokenClassification,
-            "zero-shot": TFLayoutLMForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_head_masking = False
-    test_onnx = True
-    onnx_min_opset = 10
-
-    def setUp(self):
-        self.model_tester = TFLayoutLMModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=LayoutLMConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "microsoft/layoutlm-base-uncased"
-        model = TFLayoutLMModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    # TODO (Joao): fix me
-    @unittest.skip("Onnx compliancy broke with TF 2.10")
-    def test_onnx_compliancy(self):
-        pass
-
-
-def prepare_layoutlm_batch_inputs():
-    # Here we prepare a batch of 2 sequences to test a LayoutLM forward pass on:
-    # fmt: off
-    input_ids = tf.convert_to_tensor([[101,1019,1014,1016,1037,12849,4747,1004,14246,2278,5439,4524,5002,2930,2193,2930,4341,3208,1005,1055,2171,2848,11300,3531,102],[101,4070,4034,7020,1024,3058,1015,1013,2861,1013,6070,19274,2772,6205,27814,16147,16147,4343,2047,10283,10969,14389,1012,2338,102]])  # noqa: E231
-    attention_mask = tf.convert_to_tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],])  # noqa: E231
-    bbox = tf.convert_to_tensor([[[0,0,0,0],[423,237,440,251],[427,272,441,287],[419,115,437,129],[961,885,992,912],[256,38,330,58],[256,38,330,58],[336,42,353,57],[360,39,401,56],[360,39,401,56],[411,39,471,59],[479,41,528,59],[533,39,630,60],[67,113,134,131],[141,115,209,132],[68,149,133,166],[141,149,187,164],[195,148,287,165],[195,148,287,165],[195,148,287,165],[295,148,349,165],[441,149,492,166],[497,149,546,164],[64,201,125,218],[1000,1000,1000,1000]],[[0,0,0,0],[662,150,754,166],[665,199,742,211],[519,213,554,228],[519,213,554,228],[134,433,187,454],[130,467,204,480],[130,467,204,480],[130,467,204,480],[130,467,204,480],[130,467,204,480],[314,469,376,482],[504,684,582,706],[941,825,973,900],[941,825,973,900],[941,825,973,900],[941,825,973,900],[610,749,652,765],[130,659,168,672],[176,657,237,672],[238,657,312,672],[443,653,628,672],[443,653,628,672],[716,301,825,317],[1000,1000,1000,1000]]])  # noqa: E231
-    token_type_ids = tf.convert_to_tensor([[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]])  # noqa: E231
-    # these are sequence labels (i.e. at the token level)
-    labels = tf.convert_to_tensor([[-100,10,10,10,9,1,-100,7,7,-100,7,7,4,2,5,2,8,8,-100,-100,5,0,3,2,-100],[-100,12,12,12,-100,12,10,-100,-100,-100,-100,10,12,9,-100,-100,-100,10,10,10,9,12,-100,10,-100]])  # noqa: E231
-    # fmt: on
-
-    return input_ids, attention_mask, bbox, token_type_ids, labels
-
-
-@require_tf
-class TFLayoutLMModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_forward_pass_no_head(self):
-        model = TFLayoutLMModel.from_pretrained("microsoft/layoutlm-base-uncased")
-
-        input_ids, attention_mask, bbox, token_type_ids, labels = prepare_layoutlm_batch_inputs()
-
-        # forward pass
-        outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids)
-
-        # test the sequence output on [0, :3, :3]
-        expected_slice = tf.convert_to_tensor(
-            [[0.1785, -0.1947, -0.0425], [-0.3254, -0.2807, 0.2553], [-0.5391, -0.3322, 0.3364]],
-        )
-
-        self.assertTrue(np.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-3))
-
-        # test the pooled output on [1, :3]
-        expected_slice = tf.convert_to_tensor([-0.6580, -0.0214, 0.8552])
-
-        self.assertTrue(np.allclose(outputs.pooler_output[1, :3], expected_slice, atol=1e-3))
-
-    @slow
-    def test_forward_pass_sequence_classification(self):
-        # initialize model with randomly initialized sequence classification head
-        model = TFLayoutLMForSequenceClassification.from_pretrained("microsoft/layoutlm-base-uncased", num_labels=2)
-
-        input_ids, attention_mask, bbox, token_type_ids, _ = prepare_layoutlm_batch_inputs()
-
-        # forward pass
-        outputs = model(
-            input_ids=input_ids,
-            bbox=bbox,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            labels=tf.convert_to_tensor([1, 1]),
-        )
-
-        # test whether we get a loss as a scalar
-        loss = outputs.loss
-        expected_shape = (2,)
-        self.assertEqual(loss.shape, expected_shape)
-
-        # test the shape of the logits
-        logits = outputs.logits
-        expected_shape = (2, 2)
-        self.assertEqual(logits.shape, expected_shape)
-
-    @slow
-    def test_forward_pass_token_classification(self):
-        # initialize model with randomly initialized token classification head
-        model = TFLayoutLMForTokenClassification.from_pretrained("microsoft/layoutlm-base-uncased", num_labels=13)
-
-        input_ids, attention_mask, bbox, token_type_ids, labels = prepare_layoutlm_batch_inputs()
-
-        # forward pass
-        outputs = model(
-            input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels
-        )
-
-        # test the shape of the logits
-        logits = outputs.logits
-        expected_shape = tf.convert_to_tensor((2, 25, 13))
-        self.assertEqual(logits.shape, expected_shape)
-
-    @slow
-    def test_forward_pass_question_answering(self):
-        # initialize model with randomly initialized token classification head
-        model = TFLayoutLMForQuestionAnswering.from_pretrained("microsoft/layoutlm-base-uncased")
-
-        input_ids, attention_mask, bbox, token_type_ids, labels = prepare_layoutlm_batch_inputs()
-
-        # forward pass
-        outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids)
-
-        # test the shape of the logits
-        expected_shape = tf.convert_to_tensor((2, 25))
-        self.assertEqual(outputs.start_logits.shape, expected_shape)
-        self.assertEqual(outputs.end_logits.shape, expected_shape)
diff --git a/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py b/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py
deleted file mode 100644
index 5ceea057bb..0000000000
--- a/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py
+++ /dev/null
@@ -1,515 +0,0 @@
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TensorFlow LayoutLMv3 model."""
-
-from __future__ import annotations
-
-import copy
-import inspect
-import unittest
-
-import numpy as np
-
-from transformers import is_tf_available, is_vision_available
-from transformers.models.auto import get_values
-from transformers.testing_utils import require_tf, slow
-from transformers.utils import cached_property
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import (
-        TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
-        TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
-        TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
-        TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
-        LayoutLMv3Config,
-        TFLayoutLMv3ForQuestionAnswering,
-        TFLayoutLMv3ForSequenceClassification,
-        TFLayoutLMv3ForTokenClassification,
-        TFLayoutLMv3Model,
-    )
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import LayoutLMv3ImageProcessor
-
-
-class TFLayoutLMv3ModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=2,
-        num_channels=3,
-        image_size=4,
-        patch_size=2,
-        text_seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=36,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        coordinate_size=6,
-        shape_size=6,
-        num_labels=3,
-        num_choices=4,
-        scope=None,
-        range_bbox=1000,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.num_channels = num_channels
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.coordinate_size = coordinate_size
-        self.shape_size = shape_size
-        self.num_labels = num_labels
-        self.num_choices = num_choices
-        self.scope = scope
-        self.range_bbox = range_bbox
-
-        # LayoutLMv3's sequence length equals the number of text tokens + number of patches + 1 (we add 1 for the CLS token)
-        self.text_seq_length = text_seq_length
-        self.image_seq_length = (image_size // patch_size) ** 2 + 1
-        self.seq_length = self.text_seq_length + self.image_seq_length
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.text_seq_length], self.vocab_size)
-
-        bbox = ids_tensor([self.batch_size, self.text_seq_length, 4], self.range_bbox)
-        bbox = bbox.numpy()
-        # Ensure that bbox is legal
-        for i in range(bbox.shape[0]):
-            for j in range(bbox.shape[1]):
-                if bbox[i, j, 3] < bbox[i, j, 1]:
-                    tmp_coordinate = bbox[i, j, 3]
-                    bbox[i, j, 3] = bbox[i, j, 1]
-                    bbox[i, j, 1] = tmp_coordinate
-                if bbox[i, j, 2] < bbox[i, j, 0]:
-                    tmp_coordinate = bbox[i, j, 2]
-                    bbox[i, j, 2] = bbox[i, j, 0]
-                    bbox[i, j, 0] = tmp_coordinate
-        bbox = tf.constant(bbox)
-
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.text_seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.text_seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.text_seq_length], self.num_labels)
-
-        config = LayoutLMv3Config(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            initializer_range=self.initializer_range,
-            coordinate_size=self.coordinate_size,
-            shape_size=self.shape_size,
-            input_size=self.image_size,
-            patch_size=self.patch_size,
-        )
-
-        return config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels, token_labels
-
-    def create_and_check_model(self, config, input_ids, bbox, pixel_values, token_type_ids, input_mask):
-        model = TFLayoutLMv3Model(config=config)
-
-        # text + image
-        result = model(input_ids, pixel_values=pixel_values, training=False)
-        result = model(
-            input_ids,
-            bbox=bbox,
-            pixel_values=pixel_values,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            training=False,
-        )
-        result = model(input_ids, bbox=bbox, pixel_values=pixel_values, training=False)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-        # text only
-        result = model(input_ids, training=False)
-        self.parent.assertEqual(
-            result.last_hidden_state.shape, (self.batch_size, self.text_seq_length, self.hidden_size)
-        )
-
-        # image only
-        result = model({"pixel_values": pixel_values}, training=False)
-        self.parent.assertEqual(
-            result.last_hidden_state.shape, (self.batch_size, self.image_seq_length, self.hidden_size)
-        )
-
-    def create_and_check_for_sequence_classification(
-        self, config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFLayoutLMv3ForSequenceClassification(config=config)
-        result = model(
-            input_ids,
-            bbox=bbox,
-            pixel_values=pixel_values,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            labels=sequence_labels,
-            training=False,
-        )
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_for_token_classification(
-        self, config, input_ids, bbox, pixel_values, token_type_ids, input_mask, token_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFLayoutLMv3ForTokenClassification(config=config)
-        result = model(
-            input_ids,
-            bbox=bbox,
-            pixel_values=pixel_values,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            labels=token_labels,
-            training=False,
-        )
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.text_seq_length, self.num_labels))
-
-    def create_and_check_for_question_answering(
-        self, config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels
-    ):
-        config.num_labels = 2
-        model = TFLayoutLMv3ForQuestionAnswering(config=config)
-        result = model(
-            input_ids,
-            bbox=bbox,
-            pixel_values=pixel_values,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            start_positions=sequence_labels,
-            end_positions=sequence_labels,
-            training=False,
-        )
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (config, input_ids, bbox, pixel_values, token_type_ids, input_mask, _, _) = config_and_inputs
-        inputs_dict = {
-            "input_ids": input_ids,
-            "bbox": bbox,
-            "pixel_values": pixel_values,
-            "token_type_ids": token_type_ids,
-            "attention_mask": input_mask,
-        }
-        return config, inputs_dict
-
-
-@require_tf
-class TFLayoutLMv3ModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFLayoutLMv3Model,
-            TFLayoutLMv3ForQuestionAnswering,
-            TFLayoutLMv3ForSequenceClassification,
-            TFLayoutLMv3ForTokenClassification,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {"document-question-answering": TFLayoutLMv3ForQuestionAnswering, "feature-extraction": TFLayoutLMv3Model}
-        if is_tf_available()
-        else {}
-    )
-
-    test_pruning = False
-    test_resize_embeddings = False
-    test_onnx = False
-
-    # TODO: Fix the failed tests
-    def is_pipeline_test_to_skip(
-        self,
-        pipeline_test_case_name,
-        config_class,
-        model_architecture,
-        tokenizer_name,
-        image_processor_name,
-        feature_extractor_name,
-        processor_name,
-    ):
-        return True
-
-    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> dict:
-        inputs_dict = copy.deepcopy(inputs_dict)
-
-        if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
-            inputs_dict = {
-                k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1))
-                if isinstance(v, tf.Tensor) and v.ndim > 0
-                else v
-                for k, v in inputs_dict.items()
-            }
-
-        if return_labels:
-            if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
-                inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32)
-            elif model_class in get_values(TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING):
-                inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-                inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-            elif model_class in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
-                inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-            elif model_class in get_values(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING):
-                inputs_dict["labels"] = tf.zeros(
-                    (self.model_tester.batch_size, self.model_tester.text_seq_length), dtype=tf.int32
-                )
-
-        return inputs_dict
-
-    def setUp(self):
-        self.model_tester = TFLayoutLMv3ModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=LayoutLMv3Config, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_loss_computation(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            if getattr(model, "hf_compute_loss", None):
-                # The number of elements in the loss should be the same as the number of elements in the label
-                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-                added_label = prepared_for_class[
-                    sorted(prepared_for_class.keys() - inputs_dict.keys(), reverse=True)[0]
-                ]
-                expected_loss_size = added_label.shape.as_list()[:1]
-
-                # Test that model correctly compute the loss with kwargs
-                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-                input_ids = prepared_for_class.pop("input_ids")
-
-                loss = model(input_ids, **prepared_for_class)[0]
-                self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
-
-                # Test that model correctly compute the loss when we mask some positions
-                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-                input_ids = prepared_for_class.pop("input_ids")
-                if "labels" in prepared_for_class:
-                    labels = prepared_for_class["labels"].numpy()
-                    if len(labels.shape) > 1 and labels.shape[1] != 1:
-                        labels[0] = -100
-                        prepared_for_class["labels"] = tf.convert_to_tensor(labels)
-                        loss = model(input_ids, **prepared_for_class)[0]
-                        self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
-                        self.assertTrue(not np.any(np.isnan(loss.numpy())))
-
-                # Test that model correctly compute the loss with a dict
-                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-                loss = model(prepared_for_class)[0]
-                self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
-
-                # Test that model correctly compute the loss with a tuple
-                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-
-                # Get keys that were added with the _prepare_for_class function
-                label_keys = prepared_for_class.keys() - inputs_dict.keys()
-                signature = inspect.signature(model.call).parameters
-                signature_names = list(signature.keys())
-
-                # Create a dictionary holding the location of the tensors in the tuple
-                tuple_index_mapping = {0: "input_ids"}
-                for label_key in label_keys:
-                    label_key_index = signature_names.index(label_key)
-                    tuple_index_mapping[label_key_index] = label_key
-                sorted_tuple_index_mapping = sorted(tuple_index_mapping.items())
-                # Initialize a list with their default values, update the values and convert to a tuple
-                list_input = []
-
-                for name in signature_names:
-                    if name != "kwargs":
-                        list_input.append(signature[name].default)
-
-                for index, value in sorted_tuple_index_mapping:
-                    list_input[index] = prepared_for_class[value]
-
-                tuple_input = tuple(list_input)
-
-                # Send to model
-                loss = model(tuple_input[:-1])[0]
-
-                self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
-
-    def test_model(self):
-        (
-            config,
-            input_ids,
-            bbox,
-            pixel_values,
-            token_type_ids,
-            input_mask,
-            _,
-            _,
-        ) = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(config, input_ids, bbox, pixel_values, token_type_ids, input_mask)
-
-    def test_model_various_embeddings(self):
-        (
-            config,
-            input_ids,
-            bbox,
-            pixel_values,
-            token_type_ids,
-            input_mask,
-            _,
-            _,
-        ) = self.model_tester.prepare_config_and_inputs()
-        for type in ["absolute", "relative_key", "relative_key_query"]:
-            config.position_embedding_type = type
-            self.model_tester.create_and_check_model(config, input_ids, bbox, pixel_values, token_type_ids, input_mask)
-
-    def test_for_sequence_classification(self):
-        (
-            config,
-            input_ids,
-            bbox,
-            pixel_values,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            _,
-        ) = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_sequence_classification(
-            config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels
-        )
-
-    def test_for_token_classification(self):
-        (
-            config,
-            input_ids,
-            bbox,
-            pixel_values,
-            token_type_ids,
-            input_mask,
-            _,
-            token_labels,
-        ) = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(
-            config, input_ids, bbox, pixel_values, token_type_ids, input_mask, token_labels
-        )
-
-    def test_for_question_answering(self):
-        (
-            config,
-            input_ids,
-            bbox,
-            pixel_values,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            _,
-        ) = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(
-            config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels
-        )
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "microsoft/layoutlmv3-base"
-        model = TFLayoutLMv3Model.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_tf
-class TFLayoutLMv3ModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return LayoutLMv3ImageProcessor(apply_ocr=False) if is_vision_available() else None
-
-    @slow
-    def test_inference_no_head(self):
-        model = TFLayoutLMv3Model.from_pretrained("microsoft/layoutlmv3-base")
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        pixel_values = image_processor(images=image, return_tensors="tf").pixel_values
-
-        input_ids = tf.constant([[1, 2]])
-        bbox = tf.expand_dims(tf.constant([[1, 2, 3, 4], [5, 6, 7, 8]]), axis=0)
-
-        # forward pass
-        outputs = model(input_ids=input_ids, bbox=bbox, pixel_values=pixel_values, training=False)
-
-        # verify the logits
-        expected_shape = (1, 199, 768)
-        self.assertEqual(outputs.last_hidden_state.shape, expected_shape)
-
-        expected_slice = tf.constant(
-            [[-0.0529, 0.3618, 0.1632], [-0.1587, -0.1667, -0.0400], [-0.1557, -0.1671, -0.0505]]
-        )
-
-        self.assertTrue(np.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4))
diff --git a/tests/models/led/test_modeling_tf_led.py b/tests/models/led/test_modeling_tf_led.py
deleted file mode 100644
index e63b376d58..0000000000
--- a/tests/models/led/test_modeling_tf_led.py
+++ /dev/null
@@ -1,342 +0,0 @@
-# Copyright Iz Beltagy, Matthew E. Peters, Arman Cohan and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import LEDConfig, is_tf_available
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import TFLEDForConditionalGeneration, TFLEDModel
-
-
-@require_tf
-class TFLEDModelTester:
-    config_cls = LEDConfig
-    config_updates = {}
-    hidden_act = "gelu"
-
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_labels=False,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=20,
-        eos_token_id=2,
-        pad_token_id=1,
-        bos_token_id=0,
-        attention_window=4,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.attention_window = attention_window
-
-        # `ModelTesterMixin.test_attention_outputs` is expecting attention tensors to be of size
-        # [num_attention_heads, encoder_seq_length, encoder_key_length], but TFLongformerSelfAttention
-        # returns attention of shape [num_attention_heads, encoder_seq_length, self.attention_window + 1]
-        # because its local attention only attends to `self.attention_window` and one before and one after
-        self.key_length = self.attention_window + 2
-
-        # because of padding `encoder_seq_length`, is different from `seq_length`. Relevant for
-        # the `test_attention_outputs` and `test_hidden_states_output` tests
-        self.encoder_seq_length = (
-            self.seq_length + (self.attention_window - self.seq_length % self.attention_window) % self.attention_window
-        )
-
-    def prepare_config_and_inputs_for_common(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size)
-        eos_tensor = tf.expand_dims(tf.constant([self.eos_token_id] * self.batch_size), 1)
-        input_ids = tf.concat([input_ids, eos_tensor], axis=1)
-
-        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        config = self.config_cls(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            encoder_layers=self.num_hidden_layers,
-            decoder_layers=self.num_hidden_layers,
-            encoder_attention_heads=self.num_attention_heads,
-            decoder_attention_heads=self.num_attention_heads,
-            encoder_ffn_dim=self.intermediate_size,
-            decoder_ffn_dim=self.intermediate_size,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            eos_token_ids=[2],
-            bos_token_id=self.bos_token_id,
-            pad_token_id=self.pad_token_id,
-            decoder_start_token_id=self.pad_token_id,
-            attention_window=self.attention_window,
-            **self.config_updates,
-        )
-        inputs_dict = prepare_led_inputs_dict(config, input_ids, decoder_input_ids)
-        global_attention_mask = tf.concat(
-            [tf.zeros_like(input_ids)[:, :-1], tf.ones_like(input_ids)[:, -1:]],
-            axis=-1,
-        )
-        inputs_dict["global_attention_mask"] = global_attention_mask
-        return config, inputs_dict
-
-    def check_decoder_model_past_large_inputs(self, config, inputs_dict):
-        model = TFLEDModel(config=config).get_decoder()
-        input_ids = inputs_dict["input_ids"]
-
-        input_ids = input_ids[:1, :]
-        attention_mask = inputs_dict["attention_mask"][:1, :]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
-
-        output, past_key_values = outputs.to_tuple()
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = tf.cast(ids_tensor((self.batch_size, 3), 2), tf.int8)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([attention_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)[0]
-        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[0]
-
-        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-
-def prepare_led_inputs_dict(
-    config,
-    input_ids,
-    decoder_input_ids,
-    attention_mask=None,
-    decoder_attention_mask=None,
-    head_mask=None,
-    decoder_head_mask=None,
-):
-    if attention_mask is None:
-        attention_mask = tf.cast(tf.math.not_equal(input_ids, config.pad_token_id), tf.int8)
-    if decoder_attention_mask is None:
-        decoder_attention_mask = tf.concat(
-            [
-                tf.ones(decoder_input_ids[:, :1].shape, dtype=tf.int8),
-                tf.cast(tf.math.not_equal(decoder_input_ids[:, 1:], config.pad_token_id), tf.int8),
-            ],
-            axis=-1,
-        )
-    if head_mask is None:
-        head_mask = tf.ones((config.encoder_layers, config.encoder_attention_heads))
-    if decoder_head_mask is None:
-        decoder_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads))
-    return {
-        "input_ids": input_ids,
-        "attention_mask": attention_mask,
-        "decoder_input_ids": decoder_input_ids,
-        "decoder_attention_mask": decoder_attention_mask,
-        "head_mask": head_mask,
-        "decoder_head_mask": decoder_head_mask,
-    }
-
-
-@require_tf
-class TFLEDModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (TFLEDForConditionalGeneration, TFLEDModel) if is_tf_available() else ()
-    all_generative_model_classes = (TFLEDForConditionalGeneration,) if is_tf_available() else ()
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFLEDModel,
-            "summarization": TFLEDForConditionalGeneration,
-            "text2text-generation": TFLEDForConditionalGeneration,
-            "translation": TFLEDForConditionalGeneration,
-        }
-        if is_tf_available()
-        else {}
-    )
-    is_encoder_decoder = True
-    test_pruning = False
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFLEDModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=LEDConfig)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_decoder_model_past_large_inputs(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
-        self.model_tester.check_decoder_model_past_large_inputs(*config_and_inputs)
-
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        inputs_dict["global_attention_mask"] = tf.zeros_like(inputs_dict["attention_mask"])
-        num_global_attn_indices = 2
-        inputs_dict["global_attention_mask"] = tf.where(
-            tf.range(self.model_tester.seq_length)[None, :] < num_global_attn_indices,
-            1,
-            inputs_dict["global_attention_mask"],
-        )
-
-        config.return_dict = True
-        seq_length = self.model_tester.seq_length
-        encoder_seq_length = self.model_tester.encoder_seq_length
-
-        def check_decoder_attentions_output(outputs):
-            decoder_attentions = outputs.decoder_attentions
-            self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(decoder_attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, seq_length, seq_length],
-            )
-
-        def check_encoder_attentions_output(outputs):
-            attentions = [t.numpy() for t in outputs.encoder_attentions]
-            global_attentions = [t.numpy() for t in outputs.encoder_global_attentions]
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-            self.assertEqual(len(global_attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, seq_length, seq_length],
-            )
-            self.assertListEqual(
-                list(global_attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, encoder_seq_length, num_global_attn_indices],
-            )
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["use_cache"] = False
-            config.output_hidden_states = False
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-            out_len = len(outputs)
-            self.assertEqual(config.output_hidden_states, False)
-            check_encoder_attentions_output(outputs)
-
-            if self.is_encoder_decoder:
-                model = model_class(config)
-                outputs = model(self._prepare_for_class(inputs_dict, model_class))
-                self.assertEqual(config.output_hidden_states, False)
-                check_decoder_attentions_output(outputs)
-
-            # Check that output attentions can also be changed via the config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-            self.assertEqual(config.output_hidden_states, False)
-            check_encoder_attentions_output(outputs)
-
-            # Check attention is always last and order is fine
-            inputs_dict["output_attentions"] = True
-            config.output_hidden_states = True
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-
-            self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
-            self.assertEqual(model.config.output_hidden_states, True)
-            check_encoder_attentions_output(outputs)
-
-    @unittest.skip("LED keeps using potentially symbolic tensors in conditionals and breaks tracing.")
-    def test_saved_model_creation(self):
-        pass
-
-    def test_generate_with_headmasking(self):
-        # TODO: Head-masking not yet implement
-        pass
-
-
-def _long_tensor(tok_lst):
-    return tf.constant(tok_lst, dtype=tf.int32)
-
-
-TOLERANCE = 1e-4
-
-
-@slow
-@require_tf
-class TFLEDModelIntegrationTest(unittest.TestCase):
-    def test_inference_no_head(self):
-        model = TFLEDForConditionalGeneration.from_pretrained("allenai/led-base-16384").led
-
-        # change to intended input here
-        input_ids = _long_tensor([512 * [0, 31414, 232, 328, 740, 1140, 12695, 69]])
-        decoder_input_ids = _long_tensor([128 * [0, 31414, 232, 328, 740, 1140, 12695, 69]])
-        inputs_dict = prepare_led_inputs_dict(model.config, input_ids, decoder_input_ids)
-        output = model(**inputs_dict)[0]
-        expected_shape = (1, 1024, 768)
-        self.assertEqual(output.shape, expected_shape)
-        # change to expected output here
-        expected_slice = tf.convert_to_tensor(
-            [[2.3050, 2.8279, 0.6531], [-1.8457, -0.1455, -3.5661], [-1.0186, 0.4586, -2.2043]],
-        )
-        tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-3)
-
-    def test_inference_with_head(self):
-        model = TFLEDForConditionalGeneration.from_pretrained("allenai/led-base-16384")
-
-        # change to intended input here
-        input_ids = _long_tensor([512 * [0, 31414, 232, 328, 740, 1140, 12695, 69]])
-        decoder_input_ids = _long_tensor([128 * [0, 31414, 232, 328, 740, 1140, 12695, 69]])
-        inputs_dict = prepare_led_inputs_dict(model.config, input_ids, decoder_input_ids)
-        output = model(**inputs_dict)[0]
-        expected_shape = (1, 1024, model.config.vocab_size)
-        self.assertEqual(output.shape, expected_shape)
-        # change to expected output here
-        expected_slice = tf.convert_to_tensor(
-            [[33.6507, 6.4572, 16.8089], [5.8739, -2.4238, 11.2902], [-3.2139, -4.3149, 4.2783]],
-        )
-        tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-3, rtol=1e-3)
diff --git a/tests/models/llama/test_modeling_flax_llama.py b/tests/models/llama/test_modeling_flax_llama.py
deleted file mode 100644
index 7091dadf58..0000000000
--- a/tests/models/llama/test_modeling_flax_llama.py
+++ /dev/null
@@ -1,259 +0,0 @@
-# Copyright 2023 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-import numpy as np
-
-from transformers import LlamaConfig, is_flax_available, is_tokenizers_available
-from transformers.testing_utils import require_flax, slow
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor
-
-
-if is_flax_available():
-    import jax.numpy as jnp
-
-    from transformers.models.llama.modeling_flax_llama import FlaxLlamaForCausalLM, FlaxLlamaModel
-
-
-if is_tokenizers_available():
-    from transformers import LlamaTokenizerFast
-
-
-class FlaxLlamaModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=2,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=False,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=16,
-        num_hidden_layers=2,
-        num_attention_heads=2,
-        intermediate_size=64,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        window_size=7,
-        initializer_range=0.02,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.window_size = window_size
-        self.initializer_range = initializer_range
-        self.scope = None
-        self.bos_token_id = vocab_size - 1
-        self.eos_token_id = vocab_size - 1
-        self.pad_token_id = vocab_size - 1
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = np.tril(np.ones((self.batch_size, self.seq_length)))
-
-        config = LlamaConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            use_cache=True,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-        )
-
-        return (config, input_ids, input_mask)
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, attention_mask = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask}
-        return config, inputs_dict
-
-    def check_use_cache_forward(self, model_class_name, config, input_ids, attention_mask):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length)
-        attention_mask = jnp.ones((input_ids.shape[0], max_decoder_length), dtype="i4")
-
-        position_ids = jnp.broadcast_to(
-            jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1)
-        )
-        outputs_cache = model(
-            input_ids[:, :-1],
-            attention_mask=attention_mask,
-            past_key_values=past_key_values,
-            position_ids=position_ids,
-        )
-
-        position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model(
-            input_ids[:, -1:],
-            attention_mask=attention_mask,
-            past_key_values=outputs_cache.past_key_values,
-            position_ids=position_ids,
-        )
-
-        outputs = model(input_ids)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-    def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input_ids, attention_mask):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        attention_mask_cache = jnp.concatenate(
-            [attention_mask, jnp.zeros((attention_mask.shape[0], max_decoder_length - attention_mask.shape[1]))],
-            axis=-1,
-        )
-
-        past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length)
-        position_ids = jnp.broadcast_to(
-            jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1)
-        )
-
-        outputs_cache = model(
-            input_ids[:, :-1],
-            attention_mask=attention_mask_cache,
-            past_key_values=past_key_values,
-            position_ids=position_ids,
-        )
-        position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model(
-            input_ids[:, -1:],
-            past_key_values=outputs_cache.past_key_values,
-            attention_mask=attention_mask_cache,
-            position_ids=position_ids,
-        )
-
-        outputs = model(input_ids, attention_mask=attention_mask)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-
-@require_flax
-class FlaxLlamaModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (FlaxLlamaModel, FlaxLlamaForCausalLM) if is_flax_available() else ()
-
-    def setUp(self):
-        self.model_tester = FlaxLlamaModelTester(self)
-
-    def test_use_cache_forward(self):
-        for model_class_name in self.all_model_classes:
-            config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs()
-            self.model_tester.check_use_cache_forward(model_class_name, config, input_ids, attention_mask)
-
-    def test_use_cache_forward_with_attn_mask(self):
-        for model_class_name in self.all_model_classes:
-            config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs()
-            self.model_tester.check_use_cache_forward_with_attn_mask(
-                model_class_name, config, input_ids, attention_mask
-            )
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("openlm-research/open_llama_3b_v2", from_pt=True)
-            outputs = model(np.ones((1, 1)))
-            self.assertIsNotNone(outputs)
-
-
-@slow
-@require_flax
-class FlaxLlamaIntegrationTest(unittest.TestCase):
-    def setUp(self):
-        self.model_id = "openlm-research/open_llama_3b_v2"
-        self.model = FlaxLlamaForCausalLM.from_pretrained(self.model_id, from_pt=True)
-        self.test_batch = jnp.arange(32).reshape(4, 8) + 1911
-
-    def test_model_logits(self):
-        flax_logits = self.model(self.test_batch).logits
-
-        # fmt: off
-        EXPECTED_LOGITS = [-74.4243, -74.0680, -65.2507, -79.1658, -77.7460, -69.2379, -86.4588, -84.8933, -77.8456]
-        EXPECTED_MIN, EXPECTED_MAX, EXPECTED_MEAN = -96.9952
-        EXPECTED_MAX = -18.4571
-        EXPECTED_MEAN = -65.0608
-        # fmt: on
-
-        self.assertTrue(np.allclose(flax_logits[0, :3, :3].flatten(), EXPECTED_LOGITS, atol=1e-4))
-        self.assertAlmostEqual(flax_logits.min(), EXPECTED_MIN, places=3)
-        self.assertAlmostEqual(flax_logits.max(), EXPECTED_MAX, places=3)
-        self.assertAlmostEqual(flax_logits.mean(), EXPECTED_MEAN, places=3)
-
-    def test_model_hidden_states(self):
-        flax_hidden_states = self.model(self.test_batch, output_hidden_states=True).hidden_states
-        flax_hidden_means = [h.mean() for h in flax_hidden_states]
-
-        # fmt: off
-        EXPECTED_HIDDEN_MEANS = [
-            -0.00007,-0.00049,-0.00169,-0.00253,-0.00271,
-            -0.00290,-0.00252,0.00230,0.00230,0.00198,
-            0.00196,0.00174,0.00246,0.00205,0.00242,
-            0.00171,0.00092,0.00054,0.00102,0.00024,
-            0.00029,0.00037,-0.00101,-0.00062,-0.00341,-0.00636,-0.00357
-        ]
-        # fmt: on
-
-        self.assertTrue(np.allclose(flax_hidden_means, EXPECTED_HIDDEN_MEANS, atol=1e-4))
-
-    def test_generated_text(self):
-        tokenizer = LlamaTokenizerFast.from_pretrained(self.model_id)
-        tokenizer.pad_token_id = 2
-        test_batch = ["Aloha, World! ", "2 + 2 = ", "Paris is the capital of ", "我很高興認識"]
-
-        inputs = tokenizer(test_batch, return_tensors="np", truncation=True, padding=True)
-        generated_ids = self.model.generate(**inputs, max_length=15).sequences
-        generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
-
-        # fmt: off
-        EXPECTED_GENERATION = [
-            "Aloha, World! 201",
-            "2 + 2 = 4\n2",
-            "Paris is the capital of Île-",
-            "我很高興認識你，我"
-        ]
-        # fmt: on
-
-        self.assertListEqual(generated_text, EXPECTED_GENERATION)
diff --git a/tests/models/longformer/test_modeling_tf_longformer.py b/tests/models/longformer/test_modeling_tf_longformer.py
deleted file mode 100644
index 19b4197943..0000000000
--- a/tests/models/longformer/test_modeling_tf_longformer.py
+++ /dev/null
@@ -1,736 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import is_tf_available
-from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import (
-        LongformerConfig,
-        TFLongformerForMaskedLM,
-        TFLongformerForMultipleChoice,
-        TFLongformerForQuestionAnswering,
-        TFLongformerForSequenceClassification,
-        TFLongformerForTokenClassification,
-        TFLongformerModel,
-    )
-    from transformers.models.longformer.modeling_tf_longformer import TFLongformerSelfAttention
-    from transformers.tf_utils import shape_list
-
-
-class TFLongformerModelTester:
-    def __init__(
-        self,
-        parent,
-    ):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.is_training = True
-        self.use_input_mask = True
-        self.use_token_type_ids = True
-        self.use_labels = True
-        self.vocab_size = 99
-        self.hidden_size = 32
-        self.num_hidden_layers = 2
-        self.num_attention_heads = 4
-        self.intermediate_size = 37
-        self.hidden_act = "gelu"
-        self.hidden_dropout_prob = 0.1
-        self.attention_probs_dropout_prob = 0.1
-        self.max_position_embeddings = 512
-        self.type_vocab_size = 16
-        self.type_sequence_label_size = 2
-        self.initializer_range = 0.02
-        self.num_labels = 3
-        self.num_choices = 4
-        self.scope = None
-        self.attention_window = 4
-
-        # `ModelTesterMixin.test_attention_outputs` is expecting attention tensors to be of size
-        # [num_attention_heads, encoder_seq_length, encoder_key_length], but TFLongformerSelfAttention
-        # returns attention of shape [num_attention_heads, encoder_seq_length, self.attention_window + 1]
-        # because its local attention only attends to `self.attention_window` and one before and one after
-        self.key_length = self.attention_window + 2
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = LongformerConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            initializer_range=self.initializer_range,
-            attention_window=self.attention_window,
-        )
-
-        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def create_and_check_attention_mask_determinism(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFLongformerModel(config=config)
-
-        attention_mask = tf.ones(input_ids.shape, dtype=tf.int64)
-        output_with_mask = model(input_ids, attention_mask=attention_mask)[0]
-        output_without_mask = model(input_ids)[0]
-        tf.debugging.assert_near(output_with_mask[0, 0, :5], output_without_mask[0, 0, :5], rtol=1e-4)
-
-    def create_and_check_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.return_dict = True
-        model = TFLongformerModel(config=config)
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-        result = model(input_ids, token_type_ids=token_type_ids)
-        result = model(input_ids)
-
-        self.parent.assertListEqual(
-            shape_list(result.last_hidden_state), [self.batch_size, self.seq_length, self.hidden_size]
-        )
-        self.parent.assertListEqual(shape_list(result.pooler_output), [self.batch_size, self.hidden_size])
-
-    def create_and_check_model_with_global_attention_mask(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.return_dict = True
-        model = TFLongformerModel(config=config)
-        half_input_mask_length = shape_list(input_mask)[-1] // 2
-        global_attention_mask = tf.concat(
-            [
-                tf.zeros_like(input_mask)[:, :half_input_mask_length],
-                tf.ones_like(input_mask)[:, half_input_mask_length:],
-            ],
-            axis=-1,
-        )
-
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            global_attention_mask=global_attention_mask,
-            token_type_ids=token_type_ids,
-        )
-        result = model(input_ids, token_type_ids=token_type_ids, global_attention_mask=global_attention_mask)
-        result = model(input_ids, global_attention_mask=global_attention_mask)
-
-        self.parent.assertListEqual(
-            shape_list(result.last_hidden_state), [self.batch_size, self.seq_length, self.hidden_size]
-        )
-        self.parent.assertListEqual(shape_list(result.pooler_output), [self.batch_size, self.hidden_size])
-
-    def create_and_check_for_masked_lm(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.return_dict = True
-        model = TFLongformerForMaskedLM(config=config)
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
-        self.parent.assertListEqual(shape_list(result.logits), [self.batch_size, self.seq_length, self.vocab_size])
-
-    def create_and_check_for_question_answering(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.return_dict = True
-        model = TFLongformerForQuestionAnswering(config=config)
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            start_positions=sequence_labels,
-            end_positions=sequence_labels,
-        )
-
-        self.parent.assertListEqual(shape_list(result.start_logits), [self.batch_size, self.seq_length])
-        self.parent.assertListEqual(shape_list(result.end_logits), [self.batch_size, self.seq_length])
-
-    def create_and_check_for_sequence_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFLongformerForSequenceClassification(config=config)
-        output = model(
-            input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels
-        ).logits
-        self.parent.assertListEqual(shape_list(output), [self.batch_size, self.num_labels])
-
-    def create_and_check_for_token_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFLongformerForTokenClassification(config=config)
-        output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels).logits
-        self.parent.assertListEqual(shape_list(output), [self.batch_size, self.seq_length, self.num_labels])
-
-    def create_and_check_for_multiple_choice(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_choices = self.num_choices
-        model = TFLongformerForMultipleChoice(config=config)
-        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
-        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
-        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-        output = model(
-            multiple_choice_inputs_ids,
-            attention_mask=multiple_choice_input_mask,
-            global_attention_mask=multiple_choice_input_mask,
-            token_type_ids=multiple_choice_token_type_ids,
-            labels=choice_labels,
-        ).logits
-        self.parent.assertListEqual(list(output.shape), [self.batch_size, self.num_choices])
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-
-        # global attention mask has to be partly defined
-        # to trace all weights
-        global_attention_mask = tf.concat(
-            [tf.zeros_like(input_ids)[:, :-1], tf.ones_like(input_ids)[:, -1:]],
-            axis=-1,
-        )
-
-        inputs_dict = {
-            "input_ids": input_ids,
-            "token_type_ids": token_type_ids,
-            "attention_mask": input_mask,
-            "global_attention_mask": global_attention_mask,
-        }
-        return config, inputs_dict
-
-    def prepare_config_and_inputs_for_question_answering(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-
-        # Replace sep_token_id by some random id
-        input_ids = tf.where(input_ids == config.sep_token_id, 0, input_ids)
-        # Make sure there are exactly three sep_token_id
-        input_ids = tf.concat([input_ids[:, :-3], tf.ones_like(input_ids)[:, -3:] * config.sep_token_id], axis=-1)
-        input_mask = tf.ones_like(input_ids)
-
-        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-
-@require_tf
-class TFLongformerModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFLongformerModel,
-            TFLongformerForMaskedLM,
-            TFLongformerForQuestionAnswering,
-            TFLongformerForSequenceClassification,
-            TFLongformerForMultipleChoice,
-            TFLongformerForTokenClassification,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFLongformerModel,
-            "fill-mask": TFLongformerForMaskedLM,
-            "question-answering": TFLongformerForQuestionAnswering,
-            "text-classification": TFLongformerForSequenceClassification,
-            "token-classification": TFLongformerForTokenClassification,
-            "zero-shot": TFLongformerForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_head_masking = False
-    test_onnx = False
-
-    # TODO: Fix the failed tests
-    def is_pipeline_test_to_skip(
-        self,
-        pipeline_test_case_name,
-        config_class,
-        model_architecture,
-        tokenizer_name,
-        image_processor_name,
-        feature_extractor_name,
-        processor_name,
-    ):
-        if (
-            pipeline_test_case_name == "QAPipelineTests"
-            and tokenizer_name is not None
-            and not tokenizer_name.endswith("Fast")
-        ):
-            # `QAPipelineTests` fails for a few models when the slower tokenizer are used.
-            # (The slower tokenizers were never used for pipeline tests before the pipeline testing rework)
-            # TODO: check (and possibly fix) the `QAPipelineTests` with slower tokenizer
-            return True
-
-        return False
-
-    def setUp(self):
-        self.model_tester = TFLongformerModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=LongformerConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model_attention_mask_determinism(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_attention_mask_determinism(*config_and_inputs)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_model_global_attention_mask(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model_with_global_attention_mask(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_question_answering()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
-    @unittest.skip("Longformer keeps using potentially symbolic tensors in conditionals and breaks tracing.")
-    def test_saved_model_creation(self):
-        pass
-
-    @unittest.skip("Longformer keeps using potentially symbolic tensors in conditionals and breaks tracing.")
-    def test_compile_tf_model(self):
-        pass
-
-
-@require_tf
-@require_sentencepiece
-@require_tokenizers
-class TFLongformerModelIntegrationTest(unittest.TestCase):
-    def _get_hidden_states(self):
-        return tf.convert_to_tensor(
-            [
-                [
-                    [
-                        4.98332758e-01,
-                        2.69175139e00,
-                        -7.08081422e-03,
-                        1.04915401e00,
-                        -1.83476661e00,
-                        7.67220476e-01,
-                        2.98580543e-01,
-                        2.84803992e-02,
-                    ],
-                    [
-                        -7.58357372e-01,
-                        4.20635998e-01,
-                        -4.04739919e-02,
-                        1.59924145e-01,
-                        2.05135748e00,
-                        -1.15997978e00,
-                        5.37166397e-01,
-                        2.62873606e-01,
-                    ],
-                    [
-                        -1.69438001e00,
-                        4.17574660e-01,
-                        -1.49196962e00,
-                        -1.76483717e00,
-                        -1.94566312e-01,
-                        -1.71183858e00,
-                        7.72903565e-01,
-                        -1.11557056e00,
-                    ],
-                    [
-                        5.44028163e-01,
-                        2.05466114e-01,
-                        -3.63045868e-01,
-                        2.41865062e-01,
-                        3.20348382e-01,
-                        -9.05611176e-01,
-                        -1.92690727e-01,
-                        -1.19917547e00,
-                    ],
-                ]
-            ],
-            dtype=tf.float32,
-        )
-
-    def test_diagonalize(self):
-        hidden_states = self._get_hidden_states()
-        hidden_states = tf.reshape(hidden_states, (1, 8, 4))  # set seq length = 8, hidden dim = 4
-        chunked_hidden_states = TFLongformerSelfAttention._chunk(hidden_states, window_overlap=2)
-        window_overlap_size = shape_list(chunked_hidden_states)[2]
-        self.assertTrue(window_overlap_size == 4)
-
-        padded_hidden_states = TFLongformerSelfAttention._pad_and_diagonalize(chunked_hidden_states)
-
-        self.assertTrue(
-            shape_list(padded_hidden_states)[-1] == shape_list(chunked_hidden_states)[-1] + window_overlap_size - 1
-        )
-
-        # first row => [0.4983,  2.6918, -0.0071,  1.0492, 0.0000,  0.0000,  0.0000]
-        tf.debugging.assert_near(padded_hidden_states[0, 0, 0, :4], chunked_hidden_states[0, 0, 0], rtol=1e-3)
-        tf.debugging.assert_near(padded_hidden_states[0, 0, 0, 4:], tf.zeros((3,), dtype=tf.float32), rtol=1e-3)
-
-        # last row => [0.0000,  0.0000,  0.0000, 2.0514, -1.1600,  0.5372,  0.2629]
-        tf.debugging.assert_near(padded_hidden_states[0, 0, -1, 3:], chunked_hidden_states[0, 0, -1], rtol=1e-3)
-        tf.debugging.assert_near(padded_hidden_states[0, 0, -1, :3], tf.zeros((3,), dtype=tf.float32), rtol=1e-3)
-
-    def test_pad_and_transpose_last_two_dims(self):
-        hidden_states = self._get_hidden_states()
-        self.assertEqual(shape_list(hidden_states), [1, 4, 8])
-
-        # pad along seq length dim
-        paddings = tf.constant([[0, 0], [0, 0], [0, 1], [0, 0]], dtype=tf.int64)
-
-        hidden_states = TFLongformerSelfAttention._chunk(hidden_states, window_overlap=2)
-        padded_hidden_states = TFLongformerSelfAttention._pad_and_transpose_last_two_dims(hidden_states, paddings)
-        self.assertTrue(shape_list(padded_hidden_states) == [1, 1, 8, 5])
-
-        expected_added_dim = tf.zeros((5,), dtype=tf.float32)
-        tf.debugging.assert_near(expected_added_dim, padded_hidden_states[0, 0, -1, :], rtol=1e-6)
-        tf.debugging.assert_near(
-            hidden_states[0, 0, -1, :], tf.reshape(padded_hidden_states, (1, -1))[0, 24:32], rtol=1e-6
-        )
-
-    def test_mask_invalid_locations(self):
-        hidden_states = self._get_hidden_states()
-        batch_size = 1
-        seq_length = 8
-        hidden_size = 4
-        hidden_states = tf.reshape(hidden_states, (batch_size, seq_length, hidden_size))
-        hidden_states = TFLongformerSelfAttention._chunk(hidden_states, window_overlap=2)
-
-        hid_states_1 = TFLongformerSelfAttention._mask_invalid_locations(hidden_states, 1)
-        hid_states_2 = TFLongformerSelfAttention._mask_invalid_locations(hidden_states, 2)
-        hid_states_3 = TFLongformerSelfAttention._mask_invalid_locations(hidden_states[:, :, :, :3], 2)
-        hid_states_4 = TFLongformerSelfAttention._mask_invalid_locations(hidden_states[:, :, 2:, :], 2)
-
-        self.assertTrue(tf.math.reduce_sum(tf.cast(tf.math.is_inf(hid_states_1), tf.int64)) == 8)
-        self.assertTrue(tf.math.reduce_sum(tf.cast(tf.math.is_inf(hid_states_2), tf.int64)) == 24)
-        self.assertTrue(tf.math.reduce_sum(tf.cast(tf.math.is_inf(hid_states_3), tf.int64)) == 24)
-        self.assertTrue(tf.math.reduce_sum(tf.cast(tf.math.is_inf(hid_states_4), tf.int64)) == 12)
-
-    def test_chunk(self):
-        hidden_states = self._get_hidden_states()
-        batch_size = 1
-        seq_length = 8
-        hidden_size = 4
-        hidden_states = tf.reshape(hidden_states, (batch_size, seq_length, hidden_size))
-
-        chunked_hidden_states = TFLongformerSelfAttention._chunk(hidden_states, window_overlap=2)
-
-        # expected slices across chunk and seq length dim
-        expected_slice_along_seq_length = tf.convert_to_tensor([0.4983, -0.7584, -1.6944], dtype=tf.float32)
-        expected_slice_along_chunk = tf.convert_to_tensor([0.4983, -1.8348, -0.7584, 2.0514], dtype=tf.float32)
-
-        self.assertTrue(shape_list(chunked_hidden_states) == [1, 3, 4, 4])
-        tf.debugging.assert_near(
-            chunked_hidden_states[0, :, 0, 0], expected_slice_along_seq_length, rtol=1e-3, atol=1e-4
-        )
-        tf.debugging.assert_near(chunked_hidden_states[0, 0, :, 0], expected_slice_along_chunk, rtol=1e-3, atol=1e-4)
-
-    def test_layer_local_attn(self):
-        model = TFLongformerModel.from_pretrained("patrickvonplaten/longformer-random-tiny")
-        layer = model.longformer.encoder.layer[0].attention.self_attention
-        hidden_states = self._get_hidden_states()
-        batch_size, seq_length, hidden_size = hidden_states.shape
-
-        attention_mask = tf.zeros((batch_size, seq_length), dtype=tf.float32)
-        is_index_global_attn = tf.math.greater(attention_mask, 1)
-        is_global_attn = tf.math.reduce_any(is_index_global_attn)
-
-        attention_mask = tf.where(tf.range(4)[None, :, None, None] > 1, -10000.0, attention_mask[:, :, None, None])
-        is_index_masked = tf.math.less(attention_mask[:, :, 0, 0], 0)
-
-        layer_head_mask = None
-
-        output_hidden_states = layer(
-            [hidden_states, attention_mask, layer_head_mask, is_index_masked, is_index_global_attn, is_global_attn]
-        )[0]
-
-        expected_slice = tf.convert_to_tensor(
-            [0.00188, 0.012196, -0.017051, -0.025571, -0.02996, 0.017297, -0.011521, 0.004848], dtype=tf.float32
-        )
-
-        self.assertEqual(output_hidden_states.shape, (1, 4, 8))
-        tf.debugging.assert_near(output_hidden_states[0, 1], expected_slice, rtol=1e-3, atol=1e-4)
-
-    def test_layer_global_attn(self):
-        model = TFLongformerModel.from_pretrained("patrickvonplaten/longformer-random-tiny")
-        layer = model.longformer.encoder.layer[0].attention.self_attention
-        hidden_states = self._get_hidden_states()
-
-        hidden_states = tf.concat([self._get_hidden_states(), self._get_hidden_states() - 0.5], axis=0)
-        batch_size, seq_length, hidden_size = hidden_states.shape
-
-        # create attn mask
-        attention_mask_1 = tf.zeros((1, 1, 1, seq_length), dtype=tf.float32)
-        attention_mask_2 = tf.zeros((1, 1, 1, seq_length), dtype=tf.float32)
-
-        attention_mask_1 = tf.where(tf.range(4)[None, :, None, None] > 1, 10000.0, attention_mask_1)
-        attention_mask_1 = tf.where(tf.range(4)[None, :, None, None] > 2, -10000.0, attention_mask_1)
-        attention_mask_2 = tf.where(tf.range(4)[None, :, None, None] > 0, 10000.0, attention_mask_2)
-        attention_mask = tf.concat([attention_mask_1, attention_mask_2], axis=0)
-
-        is_index_masked = tf.math.less(attention_mask[:, :, 0, 0], 0)
-        is_index_global_attn = tf.math.greater(attention_mask[:, :, 0, 0], 0)
-        is_global_attn = tf.math.reduce_any(is_index_global_attn)
-
-        layer_head_mask = None
-
-        output_hidden_states = layer(
-            [
-                hidden_states,
-                -tf.math.abs(attention_mask),
-                layer_head_mask,
-                is_index_masked,
-                is_index_global_attn,
-                is_global_attn,
-            ]
-        )[0]
-
-        self.assertEqual(output_hidden_states.shape, (2, 4, 8))
-        expected_slice_0 = tf.convert_to_tensor(
-            [-0.06508, -0.039306, 0.030934, -0.03417, -0.00656, -0.01553, -0.02088, -0.04938], dtype=tf.float32
-        )
-
-        expected_slice_1 = tf.convert_to_tensor(
-            [-0.04055, -0.038399, 0.0396, -0.03735, -0.03415, 0.01357, 0.00145, -0.05709], dtype=tf.float32
-        )
-
-        tf.debugging.assert_near(output_hidden_states[0, 2], expected_slice_0, rtol=1e-3, atol=1e-4)
-        tf.debugging.assert_near(output_hidden_states[1, -2], expected_slice_1, rtol=1e-3, atol=1e-4)
-
-    def test_layer_attn_probs(self):
-        model = TFLongformerModel.from_pretrained("patrickvonplaten/longformer-random-tiny")
-        layer = model.longformer.encoder.layer[0].attention.self_attention
-        hidden_states = tf.concat([self._get_hidden_states(), self._get_hidden_states() - 0.5], axis=0)
-        batch_size, seq_length, hidden_size = hidden_states.shape
-
-        # create attn mask
-        attention_mask_1 = tf.zeros((1, 1, 1, seq_length), dtype=tf.float32)
-        attention_mask_2 = tf.zeros((1, 1, 1, seq_length), dtype=tf.float32)
-
-        attention_mask_1 = tf.where(tf.range(4)[None, :, None, None] > 1, 10000.0, attention_mask_1)
-        attention_mask_1 = tf.where(tf.range(4)[None, :, None, None] > 2, -10000.0, attention_mask_1)
-        attention_mask_2 = tf.where(tf.range(4)[None, :, None, None] > 0, 10000.0, attention_mask_2)
-        attention_mask = tf.concat([attention_mask_1, attention_mask_2], axis=0)
-
-        is_index_masked = tf.math.less(attention_mask[:, :, 0, 0], 0)
-        is_index_global_attn = tf.math.greater(attention_mask[:, :, 0, 0], 0)
-        is_global_attn = tf.math.reduce_any(is_index_global_attn)
-
-        layer_head_mask = None
-
-        output_hidden_states, local_attentions, global_attentions = layer(
-            [
-                hidden_states,
-                -tf.math.abs(attention_mask),
-                layer_head_mask,
-                is_index_masked,
-                is_index_global_attn,
-                is_global_attn,
-            ]
-        )
-
-        self.assertEqual(local_attentions.shape, (2, 4, 2, 8))
-        self.assertEqual(global_attentions.shape, (2, 2, 3, 4))
-
-        self.assertTrue((local_attentions[0, 2:4, :, :] == 0).numpy().tolist())
-        self.assertTrue((local_attentions[1, 1:4, :, :] == 0).numpy().tolist())
-
-        #
-        # The weight of all tokens with local attention must sum to 1.
-        self.assertTrue(
-            (tf.math.abs(tf.math.reduce_sum(global_attentions[0, :, :2, :], axis=-1) - 1) < 1e-6).numpy().tolist()
-        )
-        self.assertTrue(
-            (tf.math.abs(tf.math.reduce_sum(global_attentions[1, :, :1, :], axis=-1) - 1) < 1e-6).numpy().tolist()
-        )
-
-        tf.debugging.assert_near(
-            local_attentions[0, 0, 0, :],
-            tf.convert_to_tensor([0.3328, 0.0000, 0.0000, 0.0000, 0.0000, 0.3355, 0.3318, 0.0000], dtype=tf.float32),
-            rtol=1e-3,
-            atol=1e-4,
-        )
-
-        tf.debugging.assert_near(
-            local_attentions[1, 0, 0, :],
-            tf.convert_to_tensor([0.2492, 0.2502, 0.2502, 0.0000, 0.0000, 0.2505, 0.0000, 0.0000], dtype=tf.float32),
-            rtol=1e-3,
-            atol=1e-4,
-        )
-
-        # All the global attention weights must sum to 1.
-        self.assertTrue((tf.math.abs(tf.math.reduce_sum(global_attentions, axis=-1) - 1) < 1e-6).numpy().tolist())
-
-        tf.debugging.assert_near(
-            global_attentions[0, 0, 1, :],
-            tf.convert_to_tensor([0.2500, 0.2500, 0.2500, 0.2500], dtype=tf.float32),
-            rtol=1e-3,
-            atol=1e-4,
-        )
-        tf.debugging.assert_near(
-            global_attentions[1, 0, 0, :],
-            tf.convert_to_tensor([0.2497, 0.2500, 0.2499, 0.2504], dtype=tf.float32),
-            rtol=1e-3,
-            atol=1e-4,
-        )
-
-    @slow
-    def test_inference_no_head(self):
-        model = TFLongformerModel.from_pretrained("allenai/longformer-base-4096")
-
-        # 'Hello world!'
-        input_ids = tf.convert_to_tensor([[0, 20920, 232, 328, 1437, 2]], dtype=tf.int64)
-        attention_mask = tf.ones(shape_list(input_ids), dtype=tf.int64)
-
-        output = model(input_ids, attention_mask=attention_mask)[0]
-        output_without_mask = model(input_ids)[0]
-
-        expected_output_slice = tf.convert_to_tensor([0.0549, 0.1087, -0.1119, -0.0368, 0.0250], dtype=tf.float32)
-
-        tf.debugging.assert_near(output[0, 0, -5:], expected_output_slice, rtol=1e-3, atol=1e-4)
-        tf.debugging.assert_near(output_without_mask[0, 0, -5:], expected_output_slice, rtol=1e-3, atol=1e-4)
-
-    @slow
-    def test_inference_no_head_long(self):
-        model = TFLongformerModel.from_pretrained("allenai/longformer-base-4096")
-
-        # 'Hello world! ' repeated 1000 times
-        input_ids = tf.convert_to_tensor([[0] + [20920, 232, 328, 1437] * 1000 + [2]], dtype=tf.int64)
-
-        attention_mask = tf.ones(shape_list(input_ids), dtype=tf.int64)
-        global_attention_mask = tf.zeros(shape_list(input_ids), dtype=tf.int64)
-        # Set global attention on a few random positions
-        global_attention_mask = tf.tensor_scatter_nd_update(
-            global_attention_mask,
-            tf.constant([[0, 1], [0, 4], [0, 21]], dtype=tf.int64),
-            tf.constant([1, 1, 1], dtype=tf.int64),
-        )
-
-        output = model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)[0]
-
-        expected_output_sum = tf.constant(74585.875)
-        expected_output_mean = tf.constant(0.024267)
-
-        # assert close
-        tf.debugging.assert_near(tf.reduce_sum(output), expected_output_sum, rtol=1e-4, atol=1e-4)
-        tf.debugging.assert_near(tf.reduce_mean(output), expected_output_mean, rtol=1e-4, atol=1e-4)
-
-    @slow
-    def test_inference_masked_lm_long(self):
-        model = TFLongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096")
-
-        # 'Hello world! ' repeated 1000 times
-        input_ids = tf.convert_to_tensor([[0] + [20920, 232, 328, 1437] * 1000 + [2]], dtype=tf.int64)
-
-        output = model(input_ids, labels=input_ids)
-        loss = output.loss
-        prediction_scores = output.logits
-
-        expected_loss = tf.constant(0.0073798)
-        expected_prediction_scores_sum = tf.constant(-610476600.0)
-        expected_prediction_scores_mean = tf.constant(-3.03477)
-
-        # assert close
-        tf.debugging.assert_near(tf.reduce_mean(loss), expected_loss, rtol=1e-4, atol=1e-4)
-        tf.debugging.assert_near(
-            tf.reduce_sum(prediction_scores), expected_prediction_scores_sum, rtol=1e-4, atol=1e-4
-        )
-        tf.debugging.assert_near(
-            tf.reduce_mean(prediction_scores), expected_prediction_scores_mean, rtol=1e-4, atol=1e-4
-        )
-
-    @slow
-    def test_inference_masked_lm(self):
-        model = TFLongformerForMaskedLM.from_pretrained("lysandre/tiny-longformer-random")
-        input_ids = tf.constant([[0, 1, 2, 3, 4, 5]])
-        output = model(input_ids)[0]
-
-        expected_shape = [1, 6, 10]
-        self.assertEqual(output.shape, expected_shape)
-
-        print(output[:, :3, :3])
-
-        expected_slice = tf.constant(
-            [
-                [
-                    [-0.04926379, 0.0367098, 0.02099686],
-                    [0.03940692, 0.01547744, -0.01448723],
-                    [0.03495252, -0.05900355, -0.01675752],
-                ]
-            ]
-        )
-        tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4)
diff --git a/tests/models/longt5/test_modeling_flax_longt5.py b/tests/models/longt5/test_modeling_flax_longt5.py
deleted file mode 100644
index e4f872bd43..0000000000
--- a/tests/models/longt5/test_modeling_flax_longt5.py
+++ /dev/null
@@ -1,661 +0,0 @@
-# Copyright 2022 Google LongT5 Authors and HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import tempfile
-import unittest
-
-import numpy as np
-
-from transformers import is_flax_available
-from transformers.models.auto import get_values
-from transformers.testing_utils import (
-    require_flax,
-    require_sentencepiece,
-    require_tokenizers,
-    slow,
-)
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor
-
-
-if is_flax_available():
-    import os
-
-    # The slow tests are often failing with OOM error on GPU
-    # This makes JAX allocate exactly what is needed on demand, and deallocate memory that is no longer needed
-    # but will be slower as stated here https://jax.readthedocs.io/en/latest/gpu_memory_allocation.html
-    os.environ["XLA_PYTHON_CLIENT_ALLOCATOR"] = "platform"
-
-    import jax
-    import jax.numpy as jnp
-    from flax.core.frozen_dict import unfreeze
-    from flax.traverse_util import flatten_dict
-
-    from transformers import FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING, FLAX_MODEL_MAPPING, AutoTokenizer, LongT5Config
-    from transformers.models.longt5.modeling_flax_longt5 import (
-        FlaxLongT5ForConditionalGeneration,
-        FlaxLongT5Model,
-        shift_tokens_right,
-    )
-
-
-class FlaxLongT5ModelTester:
-    def __init__(
-        self,
-        parent,
-        vocab_size=99,
-        batch_size=13,
-        encoder_seq_length=7,
-        decoder_seq_length=9,
-        local_radius=5,
-        encoder_attention_type="local",
-        global_block_size=3,
-        # For common tests
-        is_training=True,
-        use_attention_mask=True,
-        use_labels=True,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        d_ff=37,
-        relative_attention_num_buckets=8,
-        dropout_rate=0.1,
-        initializer_factor=0.002,
-        eos_token_id=1,
-        pad_token_id=0,
-        decoder_start_token_id=0,
-        scope=None,
-        decoder_layers=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.encoder_seq_length = encoder_seq_length
-        self.decoder_seq_length = decoder_seq_length
-        self.local_radius = local_radius
-        self.block_len = local_radius + 1
-        self.encoder_attention_type = encoder_attention_type
-        self.global_block_size = global_block_size
-        # For common tests
-        self.seq_length = self.decoder_seq_length
-        self.is_training = is_training
-        self.use_attention_mask = use_attention_mask
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.d_ff = d_ff
-        self.relative_attention_num_buckets = relative_attention_num_buckets
-        self.dropout_rate = dropout_rate
-        self.initializer_factor = initializer_factor
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.decoder_start_token_id = decoder_start_token_id
-        self.scope = None
-        self.decoder_layers = decoder_layers
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
-        decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
-
-        attention_mask = None
-        decoder_attention_mask = None
-        if self.use_attention_mask:
-            attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
-            decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
-
-        config = LongT5Config(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            d_ff=self.d_ff,
-            d_kv=self.hidden_size // self.num_attention_heads,
-            num_layers=self.num_hidden_layers,
-            num_decoder_layers=self.decoder_layers,
-            num_heads=self.num_attention_heads,
-            relative_attention_num_buckets=self.relative_attention_num_buckets,
-            dropout_rate=self.dropout_rate,
-            initializer_factor=self.initializer_factor,
-            eos_token_id=self.eos_token_id,
-            bos_token_id=self.pad_token_id,
-            pad_token_id=self.pad_token_id,
-            decoder_start_token_id=self.decoder_start_token_id,
-            local_radius=self.local_radius,
-            encoder_attention_type=self.encoder_attention_type,
-            global_block_size=self.global_block_size,
-        )
-
-        return (
-            config,
-            input_ids,
-            decoder_input_ids,
-            attention_mask,
-            decoder_attention_mask,
-        )
-
-    def create_and_check_model(
-        self,
-        config,
-        input_ids,
-        decoder_input_ids,
-        attention_mask,
-        decoder_attention_mask,
-    ):
-        model = FlaxLongT5Model(config=config)
-        result = model(
-            input_ids=input_ids,
-            decoder_input_ids=decoder_input_ids,
-            attention_mask=attention_mask,
-            decoder_attention_mask=decoder_attention_mask,
-        )
-        result = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
-        decoder_output = result.last_hidden_state
-        encoder_output = result.encoder_last_hidden_state
-
-        self.parent.assertEqual(encoder_output.shape, (self.batch_size, self.encoder_seq_length, self.hidden_size))
-        self.parent.assertEqual(decoder_output.shape, (self.batch_size, self.decoder_seq_length, self.hidden_size))
-
-    def check_use_cache_forward_with_attn_mask(
-        self,
-        model_class_name,
-        config,
-        input_ids,
-        decoder_input_ids,
-        attention_mask,
-        decoder_attention_mask,
-    ):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        encoder_outputs = model.encode(input_ids)
-
-        # prevent fully zero'd out attention mask
-        decoder_attention_mask = jnp.ones_like(decoder_attention_mask)
-
-        decoder_attention_mask_cache = jnp.concatenate(
-            [
-                decoder_attention_mask,
-                jnp.zeros((decoder_attention_mask.shape[0], max_decoder_length - decoder_attention_mask.shape[1])),
-            ],
-            axis=-1,
-        )
-
-        past_key_values = model.init_cache(decoder_input_ids.shape[0], max_decoder_length, encoder_outputs)
-
-        outputs_cache = model.decode(
-            decoder_input_ids[:, :-1],
-            encoder_outputs,
-            decoder_attention_mask=decoder_attention_mask_cache,
-            past_key_values=past_key_values,
-        )
-        outputs_cache_next = model.decode(
-            decoder_input_ids[:, -1:],
-            encoder_outputs,
-            past_key_values=outputs_cache.past_key_values,
-            decoder_attention_mask=decoder_attention_mask_cache,
-        )
-
-        outputs = model.decode(decoder_input_ids, encoder_outputs, decoder_attention_mask=decoder_attention_mask)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            decoder_input_ids,
-            attention_mask,
-            decoder_attention_mask,
-        ) = config_and_inputs
-
-        inputs_dict = {
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-            "decoder_input_ids": decoder_input_ids,
-            "decoder_attention_mask": decoder_attention_mask,
-        }
-        return config, inputs_dict
-
-
-@require_flax
-class FlaxLongT5ModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (FlaxLongT5Model, FlaxLongT5ForConditionalGeneration) if is_flax_available() else ()
-    is_encoder_decoder = True
-
-    def setUp(self):
-        self.model_tester = FlaxLongT5ModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=LongT5Config, d_model=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_model_v1_1(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        # check that gated gelu feed forward and different word embeddings work
-        config = config_and_inputs[0]
-        config.tie_word_embeddings = False
-        config.feed_forward_proj = "gated-gelu"
-        self.model_tester.create_and_check_model(config, *config_and_inputs[1:])
-
-    def test_use_cache_forward_with_attn_mask(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        for model_class in self.all_model_classes:
-            self.model_tester.check_use_cache_forward_with_attn_mask(model_class, *config_and_inputs)
-
-    def test_encode(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                model = model_class(config)
-
-                @jax.jit
-                def encode_jitted(input_ids, attention_mask=None, **kwargs):
-                    return model.encode(input_ids=input_ids, attention_mask=attention_mask)
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = encode_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = encode_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    def test_decode(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                model = model_class(config)
-                encoder_outputs = model.encode(inputs_dict["input_ids"], inputs_dict["attention_mask"])
-
-                prepared_inputs_dict = {
-                    "decoder_input_ids": inputs_dict["decoder_input_ids"],
-                    "decoder_attention_mask": inputs_dict["decoder_attention_mask"],
-                    "encoder_outputs": encoder_outputs,
-                }
-
-                @jax.jit
-                def decode_jitted(decoder_input_ids, decoder_attention_mask, encoder_outputs):
-                    return model.decode(
-                        decoder_input_ids=decoder_input_ids,
-                        decoder_attention_mask=decoder_attention_mask,
-                        encoder_outputs=encoder_outputs,
-                    )
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = decode_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = decode_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    def test_shift_right(self):
-        decoder_start_token_id = 0
-        pad_token_id = 1
-        labels = np.arange(2, 102).reshape(5, 20)
-        labels[:2, 15:] = -100
-
-        decoder_input_ids = shift_tokens_right(labels, pad_token_id, decoder_start_token_id)
-        np_decoder_input_ids = np.array(decoder_input_ids)
-
-        padded_slice = np_decoder_input_ids[:2, (15 + 1) :]
-        self.assertTrue((padded_slice == 1).all())
-
-        not_padded_slice = np_decoder_input_ids[2:, 1:]
-        rolled_labels = np.roll(labels[2:], 1)[:, 1:]
-        self.assertTrue((not_padded_slice == rolled_labels).all())
-        self.assertTrue((np_decoder_input_ids[:, 0] == 0).all())
-
-    # overwrite since special base model prefix is used
-    def test_save_load_from_base(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = FLAX_MODEL_MAPPING[config.__class__]
-
-        for model_class in self.all_model_classes:
-            if model_class == base_class:
-                continue
-
-            model = base_class(config)
-            base_params = flatten_dict(unfreeze(model.params))
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                head_model = model_class.from_pretrained(tmpdirname)
-
-                base_param_from_head = flatten_dict(unfreeze(head_model.params))
-
-                for key in base_param_from_head.keys():
-                    max_diff = (base_params[key] - base_param_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-    # overwrite since special base model prefix is used
-    def test_save_load_to_base(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = FLAX_MODEL_MAPPING[config.__class__]
-
-        for model_class in self.all_model_classes:
-            if model_class == base_class:
-                continue
-
-            model = model_class(config)
-            base_params_from_head = flatten_dict(unfreeze(model.params))
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                base_model = base_class.from_pretrained(tmpdirname)
-
-                base_params = flatten_dict(unfreeze(base_model.params))
-
-                for key in base_params_from_head.keys():
-                    max_diff = (base_params[key] - base_params_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        seq_length = getattr(self.model_tester, "seq_length", None)
-        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_length)
-        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length)
-        decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length)
-        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
-        block_len = getattr(self.model_tester, "block_len", None)
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = False
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-            # check that output_attentions also work using config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-            self.assertListEqual(
-                list(attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, block_len, 3 * block_len],
-            )
-            out_len = len(outputs)
-
-            if self.is_encoder_decoder:
-                correct_outlen = 5
-
-                # Question Answering model returns start_logits and end_logits
-                if model_class in get_values(FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING):
-                    correct_outlen += 1  # start_logits and end_logits instead of only 1 output
-
-                self.assertEqual(out_len, correct_outlen)
-
-                # decoder attentions
-                decoder_attentions = outputs.decoder_attentions
-                self.assertIsInstance(decoder_attentions, (list, tuple))
-                self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
-                self.assertListEqual(
-                    list(decoder_attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
-                )
-
-                # cross attentions
-                cross_attentions = outputs.cross_attentions
-                self.assertIsInstance(cross_attentions, (list, tuple))
-                self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
-                self.assertListEqual(
-                    list(cross_attentions[0].shape[-3:]),
-                    [
-                        self.model_tester.num_attention_heads,
-                        decoder_seq_length,
-                        encoder_key_length,
-                    ],
-                )
-
-            # Check attention is always last and order is fine
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            if hasattr(self.model_tester, "num_hidden_states_types"):
-                added_hidden_states = self.model_tester.num_hidden_states_types
-            elif self.is_encoder_decoder:
-                added_hidden_states = 2
-            else:
-                added_hidden_states = 1
-            self.assertEqual(out_len + added_hidden_states, len(outputs))
-
-            self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
-
-            self.assertListEqual(
-                list(self_attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, block_len, 3 * block_len],
-            )
-
-
-class FlaxLongT5TGlobalModelTest(FlaxLongT5ModelTest):
-    def setUp(self):
-        self.model_tester = FlaxLongT5ModelTester(self, encoder_attention_type="transient-global")
-        self.config_tester = ConfigTester(self, config_class=LongT5Config, d_model=37)
-
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        seq_length = getattr(self.model_tester, "seq_length", None)
-        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_length)
-        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length)
-        decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length)
-        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
-        block_len = getattr(self.model_tester, "block_len", None)
-        global_block_size = getattr(self.model_tester, "global_block_size", None)
-        global_seq_len = encoder_seq_length // global_block_size
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = False
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-            # check that output_attentions also work using config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-            self.assertListEqual(
-                list(attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, block_len, 3 * block_len + global_seq_len],
-            )
-            out_len = len(outputs)
-
-            if self.is_encoder_decoder:
-                correct_outlen = 5
-
-                # Question Answering model returns start_logits and end_logits
-                if model_class in get_values(FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING):
-                    correct_outlen += 1  # start_logits and end_logits instead of only 1 output
-
-                self.assertEqual(out_len, correct_outlen)
-
-                # decoder attentions
-                decoder_attentions = outputs.decoder_attentions
-                self.assertIsInstance(decoder_attentions, (list, tuple))
-                self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
-                self.assertListEqual(
-                    list(decoder_attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
-                )
-
-                # cross attentions
-                cross_attentions = outputs.cross_attentions
-                self.assertIsInstance(cross_attentions, (list, tuple))
-                self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
-                self.assertListEqual(
-                    list(cross_attentions[0].shape[-3:]),
-                    [
-                        self.model_tester.num_attention_heads,
-                        decoder_seq_length,
-                        encoder_key_length,
-                    ],
-                )
-
-            # Check attention is always last and order is fine
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            if hasattr(self.model_tester, "num_hidden_states_types"):
-                added_hidden_states = self.model_tester.num_hidden_states_types
-            elif self.is_encoder_decoder:
-                added_hidden_states = 2
-            else:
-                added_hidden_states = 1
-            self.assertEqual(out_len + added_hidden_states, len(outputs))
-
-            self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
-
-            self.assertListEqual(
-                list(self_attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, block_len, 3 * block_len + global_seq_len],
-            )
-
-
-@require_sentencepiece
-@require_tokenizers
-@require_flax
-class FlaxLongT5ModelIntegrationTests(unittest.TestCase):
-    model_path = "Stancld/longt5-tglobal-large-16384-pubmed-3k_steps"
-
-    def expected_summary(self):
-        return [
-            "background : coronary artery disease ( cad ) is the emerging cause of morbidity and mortality in"
-            " developing world . it provides an excellent resolution for visualization of the coronary arteries for"
-            " catheter - based or operating interventions . although the association of this technique with major"
-            " complications such as mortality is highly uncommon , it is frequently associated with various cardiac"
-            " and noncardiac complications . computed tomography coronary angiography is a promising technique for the"
-            " evaluation of cad noninvasively . it assesses disease within the coronary artery and provides"
-            " qualitative and quantitative information about nonobstructive atherosclerotic plaque"
-        ]
-
-    @slow
-    def test_summarization(self):
-        model = FlaxLongT5ForConditionalGeneration.from_pretrained(self.model_path)
-        tok = AutoTokenizer.from_pretrained(self.model_path)
-
-        ARTICLE = """coronary artery disease ( cad ) is the emerging cause of morbidity and mortality in developing world . \n it provides an excellent resolution for visualization of the coronary arteries for catheter - based or operating interventions . \n
-            although the association of this technique with major complications such as mortality is highly uncommon , it is frequently associated with various cardiac and noncardiac complications . computed tomography ( ct ) coronary angiography is
-            a promising technique for the evaluation of cad noninvasively . \n it assesses disease within the coronary artery and provides qualitative and quantitative information about nonobstructive atherosclerotic plaque burden within the vessel
-            wall . \n thus , ct angiography - based disease evaluation may provide clinically more significant information than conventional angiography . the introduction of multi - slice computed tomography ( msct ) technology such as 64-slice , 12
-            8-slice , 256-slice , and now 320-slice msct has produced a high diagnostic accuracy of ct coronary angiography . \n it has consistently showed to have a very high negative predictive value ( well above 90% ) in ruling out patients with s
-            ignificant cad defined as coronary luminal stenosis of > 50% . \n the american college of cardiology / american heart association recommends that coronary angiography should be performed before valve surgery in men aged > 40 years , women
-            aged > 35 years with coronary risk factors and in postmenopausal women . \n the prevalence of cad in patients undergoing valve replacement is 2040% in developed countries . in the previous studies , \n the incidence of angiographically p
-            roven cad in acquired valvular diseases has been shown to vary widely from 9% to 41% . in aortic stenosis , \n we aimed to report the diagnostic performance of 128-slice ct coronary angiography in 50 patients undergoing for major noncoron
-            ary cardiac surgery referred for diagnostic invasive coronary angiography to assess the extent and severity of coronary stenosis . \n during january 2013 to december 2014 , we enrolled fifty major noncoronary cardiac surgery patients sche
-            duled for invasive coronary angiography who fulfilled the following inclusion criteria of age 40 years , having low or intermediate probability of cad , left ventricular ejection fraction ( lvef ) > 35% , and patient giving informed conse
-            nt for undergoing msct and conventional coronary angiography . \n those having any contraindication for contrast injection , lvef < 35% , high pretest probability of cad , and hemodynamic instability were excluded from the study . \n pati
-            ents with heart rates of > 70 bpm received ( unless they had known overt heart failure or electrocardiogram ( ecg ) atrioventricular conduction abnormalities ) a single oral dose of 100 mg metoprolol 45 min before the scan . \n patients w
-            ith heart rates of > 80 bpm received an additional oral dose of metoprolol if not contraindicated . \n all patients were scanned with a 128-slice ct scanner ( siemens , somatom definition as ) equipped with a new feature in msct technolog
-            y , so - called z - axis flying - focus technology . \n the central 32 detector rows acquire 0.6-mm slices , and the flying - focus spot switches back and forth between 2 z positions between each reading . \n two slices per detector row a
-            re acquired , which results in a higher oversampling rate in the z - axis , thereby reducing artifacts related to the spiral acquisition and improving spatial resolution down to 0.4 mm . \n a bolus of 6580 ml contrast material ( omnipaque
-            ) was injected through an arm vein at a flow rate of 5 ml / s . \n a bolus tracking technique was used to synchronize the arrival of contrast in the coronary arteries with the initiation of the scan . to monitor the arrival of contrast m
-            aterial , \n axial scans were obtained at the level of the ascending aorta with a delay of 10 s after the start of the contrast injection . \n the scan was automatically started when a threshold of 150 hounsfield units was reached in a re
-            gion of interest positioned in the ascending aorta . \n images were reconstructed with ecg gating to obtain optimal , motion - free image quality . \n all scans were performed within 2 weeks of the msct coronary diagnostic angiogram . a s
-            ingle observer unaware of the multi - slice ct results identified coronary lesion as a single vessel , double vessel , or triple vessel disease . \n all lesion , regardless of size , were included for comparison with ct coronary angiograp
-            hy . \n lesions were classified as having nonsignificant disease ( luminal irregularities or < 50% stenosis ) or as having significant stenosis . \n stenosis was evaluated in two orthogonal views and classified as significant if the mean
-            lumen diameter reduction was 50% using a validated quantitative coronary angiography ( qca ) . \n all scans were analyzed independently by a radiologist and a cardiologist who were unaware of the results of conventional coronary angiograp
-            hy . \n total calcium scores of all patients were calculated with dedicated software and expressed as agatston scores . \n the agatston score is a commonly used scoring method that calculates the total amount of calcium on the basis of th
-            e number , areas , and peak hounsfield units of the detected calcified lesions . \n all available coronary segments were visually scored for the presence of > 50% considered as significant stenosis . \n maximum intensity projections were
-            used to identify coronary lesions and ( curved ) multiplanar reconstructions to classify lesions as significant or nonsignificant . \n data were analyzed using statistical system spss version 20 software ( chicago , il , usa ) . \n the di
-            agnostic performance of ct coronary angiography for the detection of significant lesions in coronary arteries with qca as the standard of reference is presented as sensitivity , specificity , positive and negative predictive values , and
-            positive and negative likelihood ratios with the corresponding exact 95% of confidence interval ( cis ) . \n comparison between ct and conventional coronary angiography was performed on the two level vessel by vessel ( no or any disease p
-            er vessel ) , and patient by patient ( no or any disease per patient ) . \n all scans were performed within 2 weeks of the msct coronary diagnostic angiogram . a single observer unaware of the multi - slice ct results identified coronary
-            lesion as a single vessel , double vessel , or triple vessel disease . \n all lesion , regardless of size , were included for comparison with ct coronary angiography . \n lesions were classified as having nonsignificant disease ( luminal
-            irregularities or < 50% stenosis ) or as having significant stenosis . \n stenosis was evaluated in two orthogonal views and classified as significant if the mean lumen diameter reduction was 50% using a validated quantitative coronary an
-            giography ( qca ) . \n all scans were analyzed independently by a radiologist and a cardiologist who were unaware of the results of conventional coronary angiography . \n total calcium scores of all patients were calculated with dedicated
-            software and expressed as agatston scores . \n the agatston score is a commonly used scoring method that calculates the total amount of calcium on the basis of the number , areas , and peak hounsfield units of the detected calcified lesi
-            ons . \n all available coronary segments were visually scored for the presence of > 50% considered as significant stenosis . \n maximum intensity projections were used to identify coronary lesions and ( curved ) multiplanar reconstruction
-            s to classify lesions as significant or nonsignificant . \n data were analyzed using statistical system spss version 20 software ( chicago , il , usa ) . \n the diagnostic performance of ct coronary angiography for the detection of signif
-            icant lesions in coronary arteries with qca as the standard of reference is presented as sensitivity , specificity , positive and negative predictive values , and positive and negative likelihood ratios with the corresponding exact 95% of
-            confidence interval ( cis ) . \n comparison between ct and conventional coronary angiography was performed on the two level vessel by vessel ( no or any disease per vessel ) , and patient by patient ( no or any disease per patient ) . \n
-            in this study , 29 ( 58% ) subjects were female , and 21 ( 42% ) were male showing an average age of 50.36  8.39 years . \n of fifty patients 24 ( 48% ) , 13 ( 26% ) , eight ( 16% ) , and five ( 10% ) underwent mitral valve replacement ,
-            double valve replacement ( dvr ) , aortic valve replacement , and other surgeries , respectively . \n high distribution of cad risk factors such as hypertension ( 24% ) , smoking ( 22% ) , and dyslipidemia ( 18% ) was observed in the stu
-            dy group . \n the mean creatinine level was 0.766  0.17 and average dye used in conventional angiography was 48.5  26.6 whereas for ct angiography it was 72.8  6.32 . \n average radiation dose in conventional coronary angiography and msct
-            coronary angiography was 5.2 msv and 9.2 msv , respectively . \n the majority of the patients had sinus rhythm ( 68% ) , whereas atrial fibrillation was found in 32% of the subjects . \n patients included in the study had low to intermed
-            iate probability of cad . in this study , three patients had complications after conventional angiography . \n complications were of local site hematoma , acute kidney injury managed conservatively , and acute heart failure . \n a patient
-            who developed hematoma was obese female patients with body mass index > 30 kg / m . \n the patient suffered from pseudoaneurysm , had hospitalized for 9 days , which leads to increased morbidity and cost of hospital stay . \n the diagnos
-            tic accuracy of ct coronary angiography was evaluated regarding true positive , true negative values and is presented in table 1 . the overall sensitivity and \n specificity of ct angiography technique was 100% ( 95% ci : 39.76%100% ) and
-            91.30% ( 95% ci : 79.21%97.58% ) , respectively [ table 2 ] . \n the positive predictive value ( 50% ; 95% ci : 15.70%84.30% ) and negative predictive value ( 100% ; 95% ci : 91.59%100% ) of ct angiography were also fairly high in these
-            patients . \n recent reports from multiple studies demonstrated that recent - generation msct scanners showed promise for noninvasive detection of coronary stenosis however , until now no studies were found regarding the clinical efficacy
-            or prognostic value of 128-slice ct coronary angiography versus conventional invasive coronary angiography in the diagnosis of patients planned for major noncoronary surgeries such as dvr , bentall , atrial septal defect closure , etc .
-            in our study , we reported 8% cad prevalence in patients planned for major noncoronary cardiac surgery . \n we performed conventional and msct coronary angiography in all patients and the results showed that ct coronary angiography with i
-            nvasive coronary angiography as the reference standard had a considerably high sensitivity ( 100% ) and specificity ( 95.65% ) . \n the health economic model using invasive coronary angiography as the reference standard showed that at a p
-            retest probability of cad of 70% or lower , ct coronary angiography resulted in lower cost per patient with a true positive diagnosis . at a pretest probability of cad of 70% or higher , invasive coronary angiography was associated with a
-            lower cost per patient with a true positive diagnosis . in our study population , \n two patients developed local site complications in the form of hematoma and pseudoaneurysm after conventional angiography . \n hence , msct coronary ang
-            iography will be more favorable in female obese patients with intermediate likelihood of cad . \n hence , msct coronary angiography will be cost - effective in patients of valvular heart diseases . \n however , ct angiography suffers from
-            a drawback that average amount of dye used in msct coronary angiography were 72.8  6.32 ml which is higher than average amount of dye required for conventional angiography ( 48.6  26.6 ml ) . \n hence , the use of ct coronary angiography
-            could not be used in patients with known renal dysfunction , where reduction of contrast dye load is highly advocated . \n our results show that 128-slice ct coronary angiography is a reliable technique to detect coronary stenosis in pat
-            ients planned for noncoronary cardiac surgery . \n although there has been important technological progress in the development of ct coronary angiography , its clinical application remains limited . \n a study wth large numbers of patient
-            s is required for the recommendation of only ct coronary angiography for the coronary evaluation in major non - cardiac surgeries . \n mehta institute of cardiology and research center ( affiliated to bj medical college , ahmedabad , guja
-            rat , india ) . \n u.n . mehta institute of cardiology and research center ( affiliated to bj medical college , ahmedabad , gujarat , india ) . \n """
-
-        dct = tok(
-            [ARTICLE],
-            max_length=1024,
-            padding="max_length",
-            truncation=True,
-            return_tensors="np",
-        )
-
-        hypotheses_batch = model.generate(
-            **dct,
-            num_beams=4,
-            length_penalty=2.0,
-            max_length=142,
-            min_length=56,
-            do_sample=False,
-            early_stopping=True,
-        ).sequences
-
-        decoded = tok.batch_decode(hypotheses_batch, skip_special_tokens=True, clean_up_tokenization_spaces=False)
-        self.assertListEqual(
-            self.expected_summary(),
-            decoded,
-        )
diff --git a/tests/models/lxmert/test_modeling_tf_lxmert.py b/tests/models/lxmert/test_modeling_tf_lxmert.py
deleted file mode 100644
index 27fa146fb0..0000000000
--- a/tests/models/lxmert/test_modeling_tf_lxmert.py
+++ /dev/null
@@ -1,558 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import tempfile
-import unittest
-
-import numpy as np
-
-from transformers import LxmertConfig, is_tf_available
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers.models.lxmert.modeling_tf_lxmert import TFLxmertForPreTraining, TFLxmertModel
-
-
-class TFLxmertModelTester:
-    def __init__(
-        self,
-        parent,
-        vocab_size=300,
-        hidden_size=28,
-        num_attention_heads=2,
-        num_labels=2,
-        intermediate_size=64,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=2,
-        initializer_range=0.02,
-        layer_norm_eps=1e-12,
-        pad_token_id=0,
-        num_qa_labels=30,
-        num_object_labels=16,
-        num_attr_labels=4,
-        num_visual_features=10,
-        l_layers=2,
-        x_layers=1,
-        r_layers=1,
-        visual_feat_dim=128,
-        visual_pos_dim=4,
-        visual_loss_normalizer=6.67,
-        seq_length=20,
-        batch_size=8,
-        is_training=True,
-        task_matched=True,
-        task_mask_lm=True,
-        task_obj_predict=True,
-        task_qa=True,
-        visual_obj_loss=True,
-        visual_attr_loss=True,
-        visual_feat_loss=True,
-        use_token_type_ids=True,
-        use_lang_mask=True,
-        output_attentions=False,
-        output_hidden_states=False,
-        scope=None,
-    ):
-        self.parent = parent
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_attention_heads = num_attention_heads
-        self.num_labels = num_labels
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.initializer_range = initializer_range
-        self.layer_norm_eps = layer_norm_eps
-        self.pad_token_id = pad_token_id
-        self.num_qa_labels = num_qa_labels
-        self.num_object_labels = num_object_labels
-        self.num_attr_labels = num_attr_labels
-        self.l_layers = l_layers
-        self.x_layers = x_layers
-        self.r_layers = r_layers
-        self.visual_feat_dim = visual_feat_dim
-        self.visual_pos_dim = visual_pos_dim
-        self.visual_loss_normalizer = visual_loss_normalizer
-        self.seq_length = seq_length
-        self.batch_size = batch_size
-        self.is_training = is_training
-        self.use_lang_mask = use_lang_mask
-        self.task_matched = task_matched
-        self.task_mask_lm = task_mask_lm
-        self.task_obj_predict = task_obj_predict
-        self.task_qa = task_qa
-        self.visual_obj_loss = visual_obj_loss
-        self.visual_attr_loss = visual_attr_loss
-        self.visual_feat_loss = visual_feat_loss
-        self.num_visual_features = num_visual_features
-        self.use_token_type_ids = use_token_type_ids
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
-        self.scope = scope
-        self.num_hidden_layers = {"vision": r_layers, "cross_encoder": x_layers, "language": l_layers}
-
-    def prepare_config_and_inputs(self):
-        output_attentions = self.output_attentions
-        input_ids = ids_tensor([self.batch_size, self.seq_length], vocab_size=self.vocab_size)
-        visual_feats = tf.random.uniform((self.batch_size, self.num_visual_features, self.visual_feat_dim))
-        bounding_boxes = tf.random.uniform((self.batch_size, self.num_visual_features, 4))
-
-        input_mask = None
-        if self.use_lang_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-        obj_labels = None
-        if self.task_obj_predict:
-            obj_labels = {}
-        if self.visual_attr_loss and self.task_obj_predict:
-            obj_labels["attr"] = (
-                ids_tensor([self.batch_size, self.num_visual_features], self.num_attr_labels),
-                ids_tensor([self.batch_size, self.num_visual_features], self.num_attr_labels),
-            )
-        if self.visual_feat_loss and self.task_obj_predict:
-            obj_labels["feat"] = (
-                ids_tensor(
-                    [self.batch_size, self.num_visual_features, self.visual_feat_dim], self.num_visual_features
-                ),
-                ids_tensor([self.batch_size, self.num_visual_features], self.num_visual_features),
-            )
-        if self.visual_obj_loss and self.task_obj_predict:
-            obj_labels["obj"] = (
-                ids_tensor([self.batch_size, self.num_visual_features], self.num_object_labels),
-                ids_tensor([self.batch_size, self.num_visual_features], self.num_object_labels),
-            )
-        ans = None
-        if self.task_qa:
-            ans = ids_tensor([self.batch_size], self.num_qa_labels)
-        masked_lm_labels = None
-        if self.task_mask_lm:
-            masked_lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-        matched_label = None
-        if self.task_matched:
-            matched_label = ids_tensor([self.batch_size], self.num_labels)
-
-        config = LxmertConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_attention_heads=self.num_attention_heads,
-            num_labels=self.num_labels,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            initializer_range=self.initializer_range,
-            layer_norm_eps=self.layer_norm_eps,
-            pad_token_id=self.pad_token_id,
-            num_qa_labels=self.num_qa_labels,
-            num_object_labels=self.num_object_labels,
-            num_attr_labels=self.num_attr_labels,
-            l_layers=self.l_layers,
-            x_layers=self.x_layers,
-            r_layers=self.r_layers,
-            visual_feat_dim=self.visual_feat_dim,
-            visual_pos_dim=self.visual_pos_dim,
-            visual_loss_normalizer=self.visual_loss_normalizer,
-            task_matched=self.task_matched,
-            task_mask_lm=self.task_mask_lm,
-            task_obj_predict=self.task_obj_predict,
-            task_qa=self.task_qa,
-            visual_obj_loss=self.visual_obj_loss,
-            visual_attr_loss=self.visual_attr_loss,
-            visual_feat_loss=self.visual_feat_loss,
-            output_attentions=self.output_attentions,
-            output_hidden_states=self.output_hidden_states,
-        )
-
-        return (
-            config,
-            input_ids,
-            visual_feats,
-            bounding_boxes,
-            token_type_ids,
-            input_mask,
-            obj_labels,
-            masked_lm_labels,
-            matched_label,
-            ans,
-            output_attentions,
-        )
-
-    def create_and_check_lxmert_model(
-        self,
-        config,
-        input_ids,
-        visual_feats,
-        bounding_boxes,
-        token_type_ids,
-        input_mask,
-        obj_labels,
-        masked_lm_labels,
-        matched_label,
-        ans,
-        output_attentions,
-    ):
-        model = TFLxmertModel(config=config)
-        result = model(
-            input_ids,
-            visual_feats,
-            bounding_boxes,
-            token_type_ids=token_type_ids,
-            attention_mask=input_mask,
-            output_attentions=output_attentions,
-        )
-        result = model(
-            input_ids,
-            visual_feats,
-            bounding_boxes,
-            token_type_ids=token_type_ids,
-            attention_mask=input_mask,
-            output_attentions=not output_attentions,
-        )
-        result = model(input_ids, visual_feats, bounding_boxes, return_dict=False)
-        result = model(input_ids, visual_feats, bounding_boxes, return_dict=True)
-
-        self.parent.assertEqual(result.language_output.shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertEqual(
-            result.vision_output.shape, (self.batch_size, self.num_visual_features, self.hidden_size)
-        )
-        self.parent.assertEqual(result.pooled_output.shape, (self.batch_size, self.hidden_size))
-
-    def prepare_config_and_inputs_for_common(self, return_obj_labels=False):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            visual_feats,
-            bounding_boxes,
-            token_type_ids,
-            input_mask,
-            obj_labels,
-            masked_lm_labels,
-            matched_label,
-            ans,
-            output_attentions,
-        ) = config_and_inputs
-
-        inputs_dict = {
-            "input_ids": input_ids,
-            "visual_feats": visual_feats,
-            "visual_pos": bounding_boxes,
-            "token_type_ids": token_type_ids,
-            "attention_mask": input_mask,
-        }
-
-        if return_obj_labels:
-            inputs_dict["obj_labels"] = obj_labels
-        else:
-            config.task_obj_predict = False
-
-        return config, inputs_dict
-
-    def create_and_check_lxmert_for_pretraining(
-        self,
-        config,
-        input_ids,
-        visual_feats,
-        bounding_boxes,
-        token_type_ids,
-        input_mask,
-        obj_labels,
-        masked_lm_labels,
-        matched_label,
-        ans,
-        output_attentions,
-    ):
-        model = TFLxmertForPreTraining(config=config)
-        result = model(
-            input_ids,
-            visual_feats,
-            bounding_boxes,
-            token_type_ids=token_type_ids,
-            attention_mask=input_mask,
-            masked_lm_labels=masked_lm_labels,
-            obj_labels=obj_labels,
-            matched_label=matched_label,
-            ans=ans,
-            output_attentions=output_attentions,
-        )
-        result = model(
-            input_ids,
-            visual_feats,
-            bounding_boxes,
-            token_type_ids=token_type_ids,
-            attention_mask=input_mask,
-            masked_lm_labels=masked_lm_labels,
-            output_attentions=not output_attentions,
-            return_dict=False,
-        )
-        result = model(
-            input_ids,
-            visual_feats,
-            bounding_boxes,
-            token_type_ids=token_type_ids,
-            attention_mask=input_mask,
-            masked_lm_labels=masked_lm_labels,
-        )
-        result = model(
-            input_ids,
-            visual_feats,
-            bounding_boxes,
-            token_type_ids=token_type_ids,
-            attention_mask=input_mask,
-            obj_labels=obj_labels,
-        )
-        result = model(
-            input_ids,
-            visual_feats,
-            bounding_boxes,
-            token_type_ids=token_type_ids,
-            attention_mask=input_mask,
-            matched_label=matched_label,
-        )
-        result = model(
-            input_ids,
-            visual_feats,
-            bounding_boxes,
-            token_type_ids=token_type_ids,
-            attention_mask=input_mask,
-            ans=ans,
-        )
-        result = model(
-            input_ids,
-            visual_feats,
-            bounding_boxes,
-            token_type_ids=token_type_ids,
-            attention_mask=input_mask,
-            masked_lm_labels=masked_lm_labels,
-            obj_labels=obj_labels,
-            matched_label=matched_label,
-            ans=ans,
-            output_attentions=not output_attentions,
-        )
-
-        self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-
-@require_tf
-class TFLxmertModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (TFLxmertModel, TFLxmertForPreTraining) if is_tf_available() else ()
-    pipeline_model_mapping = {"feature-extraction": TFLxmertModel} if is_tf_available() else {}
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFLxmertModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=LxmertConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_lxmert_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_lxmert_model(*config_and_inputs)
-
-    def test_lxmert_for_pretraining(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_lxmert_for_pretraining(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in ["unc-nlp/lxmert-base-uncased"]:
-            model = TFLxmertModel.from_pretrained(model_name)
-            self.assertIsNotNone(model)
-
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        encoder_seq_length = (
-            self.model_tester.encoder_seq_length
-            if hasattr(self.model_tester, "encoder_seq_length")
-            else self.model_tester.seq_length
-        )
-        encoder_key_length = (
-            self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length
-        )
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = False
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-            language_attentions, vision_attentions, cross_encoder_attentions = (outputs[-3], outputs[-2], outputs[-1])
-
-            self.assertEqual(model.config.output_hidden_states, False)
-
-            self.assertEqual(len(language_attentions), self.model_tester.num_hidden_layers["language"])
-            self.assertEqual(len(vision_attentions), self.model_tester.num_hidden_layers["vision"])
-            self.assertEqual(len(cross_encoder_attentions), self.model_tester.num_hidden_layers["cross_encoder"])
-
-            attentions = [language_attentions, vision_attentions, cross_encoder_attentions]
-            attention_shapes = [
-                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
-                [
-                    self.model_tester.num_attention_heads,
-                    self.model_tester.num_visual_features,
-                    self.model_tester.num_visual_features,
-                ],
-                [self.model_tester.num_attention_heads, encoder_key_length, self.model_tester.num_visual_features],
-            ]
-
-            for attention, attention_shape in zip(attentions, attention_shapes):
-                self.assertListEqual(list(attention[0].shape[-3:]), attention_shape)
-            out_len = len(outputs)
-
-            # Check attention is always last and order is fine
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = True
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-
-            # 2 hidden states were added
-            self.assertEqual(out_len + 2, len(outputs))
-            language_attentions, vision_attentions, cross_encoder_attentions = (outputs[-3], outputs[-2], outputs[-1])
-            self.assertEqual(len(language_attentions), self.model_tester.num_hidden_layers["language"])
-            self.assertEqual(len(vision_attentions), self.model_tester.num_hidden_layers["vision"])
-            self.assertEqual(len(cross_encoder_attentions), self.model_tester.num_hidden_layers["cross_encoder"])
-
-            attentions = [language_attentions, vision_attentions, cross_encoder_attentions]
-            attention_shapes = [
-                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
-                [
-                    self.model_tester.num_attention_heads,
-                    self.model_tester.num_visual_features,
-                    self.model_tester.num_visual_features,
-                ],
-                [self.model_tester.num_attention_heads, encoder_key_length, self.model_tester.num_visual_features],
-            ]
-
-            for attention, attention_shape in zip(attentions, attention_shapes):
-                self.assertListEqual(list(attention[0].shape[-3:]), attention_shape)
-
-    def test_hidden_states_output(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        def check_hidden_states_output(config, inputs_dict, model_class):
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-            language_hidden_states, vision_hidden_states = outputs[-2], outputs[-1]
-
-            self.assertEqual(len(language_hidden_states), self.model_tester.num_hidden_layers["language"] + 1)
-            self.assertEqual(len(vision_hidden_states), self.model_tester.num_hidden_layers["vision"] + 1)
-
-            seq_length = self.model_tester.seq_length
-            num_visual_features = self.model_tester.num_visual_features
-
-            self.assertListEqual(
-                list(language_hidden_states[0].shape[-2:]),
-                [seq_length, self.model_tester.hidden_size],
-            )
-            self.assertListEqual(
-                list(vision_hidden_states[0].shape[-2:]),
-                [num_visual_features, self.model_tester.hidden_size],
-            )
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(config, inputs_dict, model_class)
-
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-            check_hidden_states_output(config, inputs_dict, model_class)
-
-    def prepare_pt_inputs_from_tf_inputs(self, tf_inputs_dict):
-        import torch
-
-        pt_inputs_dict = {}
-        for key, value in tf_inputs_dict.items():
-            if isinstance(value, dict):
-                pt_inputs_dict[key] = self.prepare_pt_inputs_from_tf_inputs(value)
-            elif isinstance(value, (list, tuple)):
-                pt_inputs_dict[key] = (self.prepare_pt_inputs_from_tf_inputs(iter_value) for iter_value in value)
-            elif isinstance(key, bool):
-                pt_inputs_dict[key] = value
-            elif key == "input_values":
-                pt_inputs_dict[key] = torch.from_numpy(value.numpy()).to(torch.float32)
-            elif key == "pixel_values":
-                pt_inputs_dict[key] = torch.from_numpy(value.numpy()).to(torch.float32)
-            elif key == "input_features":
-                pt_inputs_dict[key] = torch.from_numpy(value.numpy()).to(torch.float32)
-            # other general float inputs
-            elif tf_inputs_dict[key].dtype.is_floating:
-                pt_inputs_dict[key] = torch.from_numpy(value.numpy()).to(torch.float32)
-            else:
-                pt_inputs_dict[key] = torch.from_numpy(value.numpy()).to(torch.long)
-
-        return pt_inputs_dict
-
-    def test_save_load(self):
-        for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
-                return_obj_labels="PreTraining" in model_class.__name__
-            )
-
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                model = model_class.from_pretrained(tmpdirname)
-                after_outputs = model(self._prepare_for_class(inputs_dict, model_class))
-
-                self.assert_outputs_same(after_outputs, outputs)
-
-
-@require_tf
-class TFLxmertModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_masked_lm(self):
-        model = TFLxmertModel.from_pretrained("unc-nlp/lxmert-base-uncased")
-        input_ids = tf.constant([[101, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 102]])
-
-        num_visual_features = 10
-        _, visual_feats = np.random.seed(0), np.random.rand(1, num_visual_features, model.config.visual_feat_dim)
-        _, visual_pos = np.random.seed(0), np.random.rand(1, num_visual_features, 4)
-        visual_feats = tf.convert_to_tensor(visual_feats, dtype=tf.float32)
-        visual_pos = tf.convert_to_tensor(visual_pos, dtype=tf.float32)
-        output = model(input_ids, visual_feats=visual_feats, visual_pos=visual_pos)[0]
-        expected_shape = [1, 11, 768]
-        self.assertEqual(expected_shape, output.shape)
-        expected_slice = tf.constant(
-            [
-                [
-                    [0.24170142, -0.98075, 0.14797261],
-                    [1.2540525, -0.83198136, 0.5112344],
-                    [1.4070463, -1.1051831, 0.6990401],
-                ]
-            ]
-        )
-        tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4)
diff --git a/tests/models/marian/test_modeling_flax_marian.py b/tests/models/marian/test_modeling_flax_marian.py
deleted file mode 100644
index 73cd489761..0000000000
--- a/tests/models/marian/test_modeling_flax_marian.py
+++ /dev/null
@@ -1,488 +0,0 @@
-# Copyright 2021 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-import timeout_decorator  # noqa
-
-from transformers import MarianConfig, is_flax_available
-from transformers.testing_utils import require_flax, require_sentencepiece, require_tokenizers, slow
-from transformers.utils import cached_property
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor
-
-
-if is_flax_available():
-    import os
-
-    # The slow tests are often failing with OOM error on GPU
-    # This makes JAX allocate exactly what is needed on demand, and deallocate memory that is no longer needed
-    # but will be slower as stated here https://jax.readthedocs.io/en/latest/gpu_memory_allocation.html
-    os.environ["XLA_PYTHON_CLIENT_ALLOCATOR"] = "platform"
-
-    import jax
-    import jax.numpy as jnp
-
-    from transformers import MarianTokenizer
-    from transformers.models.marian.modeling_flax_marian import FlaxMarianModel, FlaxMarianMTModel, shift_tokens_right
-
-
-def prepare_marian_inputs_dict(
-    config,
-    input_ids,
-    decoder_input_ids=None,
-    attention_mask=None,
-    decoder_attention_mask=None,
-    head_mask=None,
-    decoder_head_mask=None,
-    cross_attn_head_mask=None,
-):
-    if attention_mask is None:
-        attention_mask = np.where(input_ids != config.pad_token_id, 1, 0)
-    if decoder_attention_mask is None:
-        decoder_attention_mask = np.where(decoder_input_ids != config.pad_token_id, 1, 0)
-    if head_mask is None:
-        head_mask = np.ones((config.encoder_layers, config.encoder_attention_heads))
-    if decoder_head_mask is None:
-        decoder_head_mask = np.ones((config.decoder_layers, config.decoder_attention_heads))
-    if cross_attn_head_mask is None:
-        cross_attn_head_mask = np.ones((config.decoder_layers, config.decoder_attention_heads))
-    return {
-        "input_ids": input_ids,
-        "decoder_input_ids": decoder_input_ids,
-        "attention_mask": attention_mask,
-        "decoder_attention_mask": attention_mask,
-    }
-
-
-class FlaxMarianModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_labels=False,
-        vocab_size=99,
-        hidden_size=16,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=4,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=32,
-        eos_token_id=2,
-        pad_token_id=1,
-        bos_token_id=0,
-        initializer_range=0.02,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.initializer_range = initializer_range
-
-    def prepare_config_and_inputs(self):
-        input_ids = np.clip(ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size), 3, self.vocab_size)
-        input_ids = np.concatenate((input_ids, 2 * np.ones((self.batch_size, 1), dtype=np.int64)), -1)
-
-        decoder_input_ids = shift_tokens_right(input_ids, 1, 2)
-
-        config = MarianConfig(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            encoder_layers=self.num_hidden_layers,
-            decoder_layers=self.num_hidden_layers,
-            encoder_attention_heads=self.num_attention_heads,
-            decoder_attention_heads=self.num_attention_heads,
-            encoder_ffn_dim=self.intermediate_size,
-            decoder_ffn_dim=self.intermediate_size,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            eos_token_id=self.eos_token_id,
-            bos_token_id=self.bos_token_id,
-            pad_token_id=self.pad_token_id,
-            initializer_range=self.initializer_range,
-            use_cache=False,
-        )
-        inputs_dict = prepare_marian_inputs_dict(config, input_ids, decoder_input_ids)
-        return config, inputs_dict
-
-    def prepare_config_and_inputs_for_common(self):
-        config, inputs_dict = self.prepare_config_and_inputs()
-        return config, inputs_dict
-
-    def check_use_cache_forward(self, model_class_name, config, inputs_dict):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        encoder_outputs = model.encode(inputs_dict["input_ids"])
-
-        decoder_input_ids, decoder_attention_mask = (
-            inputs_dict["decoder_input_ids"],
-            inputs_dict["decoder_attention_mask"],
-        )
-
-        past_key_values = model.init_cache(decoder_input_ids.shape[0], max_decoder_length, encoder_outputs)
-        decoder_attention_mask = jnp.ones((decoder_input_ids.shape[0], max_decoder_length), dtype="i4")
-
-        decoder_position_ids = jnp.broadcast_to(
-            jnp.arange(decoder_input_ids.shape[-1] - 1)[None, :],
-            (decoder_input_ids.shape[0], decoder_input_ids.shape[-1] - 1),
-        )
-        outputs_cache = model.decode(
-            decoder_input_ids[:, :-1],
-            encoder_outputs,
-            decoder_attention_mask=decoder_attention_mask,
-            past_key_values=past_key_values,
-            decoder_position_ids=decoder_position_ids,
-        )
-
-        decoder_position_ids = jnp.array(decoder_input_ids.shape[0] * [[decoder_input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model.decode(
-            decoder_input_ids[:, -1:],
-            encoder_outputs,
-            decoder_attention_mask=decoder_attention_mask,
-            past_key_values=outputs_cache.past_key_values,
-            decoder_position_ids=decoder_position_ids,
-        )
-
-        outputs = model.decode(decoder_input_ids, encoder_outputs)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-    def check_use_cache_forward_with_attn_mask(self, model_class_name, config, inputs_dict):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        encoder_outputs = model.encode(inputs_dict["input_ids"])
-
-        decoder_input_ids, decoder_attention_mask = (
-            inputs_dict["decoder_input_ids"],
-            inputs_dict["decoder_attention_mask"],
-        )
-
-        decoder_attention_mask_cache = jnp.concatenate(
-            [
-                decoder_attention_mask,
-                jnp.zeros((decoder_attention_mask.shape[0], max_decoder_length - decoder_attention_mask.shape[1])),
-            ],
-            axis=-1,
-        )
-
-        past_key_values = model.init_cache(decoder_input_ids.shape[0], max_decoder_length, encoder_outputs)
-        decoder_position_ids = jnp.broadcast_to(
-            jnp.arange(decoder_input_ids.shape[-1] - 1)[None, :],
-            (decoder_input_ids.shape[0], decoder_input_ids.shape[-1] - 1),
-        )
-
-        outputs_cache = model.decode(
-            decoder_input_ids[:, :-1],
-            encoder_outputs,
-            decoder_attention_mask=decoder_attention_mask_cache,
-            past_key_values=past_key_values,
-            decoder_position_ids=decoder_position_ids,
-        )
-        decoder_position_ids = jnp.array(decoder_input_ids.shape[0] * [[decoder_input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model.decode(
-            decoder_input_ids[:, -1:],
-            encoder_outputs,
-            past_key_values=outputs_cache.past_key_values,
-            decoder_attention_mask=decoder_attention_mask_cache,
-            decoder_position_ids=decoder_position_ids,
-        )
-
-        outputs = model.decode(decoder_input_ids, encoder_outputs, decoder_attention_mask=decoder_attention_mask)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-
-@require_flax
-class FlaxMarianModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    is_encoder_decoder = True
-    all_model_classes = (FlaxMarianModel, FlaxMarianMTModel) if is_flax_available() else ()
-
-    def setUp(self):
-        self.model_tester = FlaxMarianModelTester(self)
-
-    def test_use_cache_forward(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
-        for model_class in self.all_model_classes:
-            self.model_tester.check_use_cache_forward(model_class, config, inputs_dict)
-
-    def test_use_cache_forward_with_attn_mask(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
-        for model_class in self.all_model_classes:
-            self.model_tester.check_use_cache_forward_with_attn_mask(model_class, config, inputs_dict)
-
-    def test_encode(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                model = model_class(config)
-
-                @jax.jit
-                def encode_jitted(input_ids, attention_mask=None, **kwargs):
-                    return model.encode(input_ids=input_ids, attention_mask=attention_mask)
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = encode_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = encode_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    def test_decode(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                model = model_class(config)
-                encoder_outputs = model.encode(inputs_dict["input_ids"], inputs_dict["attention_mask"])
-
-                prepared_inputs_dict = {
-                    "decoder_input_ids": inputs_dict["decoder_input_ids"],
-                    "decoder_attention_mask": inputs_dict["decoder_attention_mask"],
-                    "encoder_outputs": encoder_outputs,
-                }
-
-                @jax.jit
-                def decode_jitted(decoder_input_ids, decoder_attention_mask, encoder_outputs):
-                    return model.decode(
-                        decoder_input_ids=decoder_input_ids,
-                        decoder_attention_mask=decoder_attention_mask,
-                        encoder_outputs=encoder_outputs,
-                    )
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = decode_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = decode_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("Helsinki-NLP/opus-mt-en-de")
-            # FlaxMarianForSequenceClassification expects eos token in input_ids
-            input_ids = np.ones((1, 1)) * model.config.eos_token_id
-            outputs = model(input_ids)
-            self.assertIsNotNone(outputs)
-
-
-@require_flax
-@require_sentencepiece
-@require_tokenizers
-class MarianIntegrationTest(unittest.TestCase):
-    src = None
-    tgt = None
-
-    @classmethod
-    def setUpClass(cls) -> None:
-        cls.model_name = f"Helsinki-NLP/opus-mt-{cls.src}-{cls.tgt}"
-        return cls
-
-    @cached_property
-    def tokenizer(self):
-        return MarianTokenizer.from_pretrained(self.model_name)
-
-    @property
-    def eos_token_id(self) -> int:
-        return self.tokenizer.eos_token_id
-
-    @cached_property
-    def model(self):
-        model: FlaxMarianMTModel = FlaxMarianMTModel.from_pretrained(self.model_name)
-        self.assertEqual(model.config.decoder_start_token_id, model.config.pad_token_id)
-        return model
-
-    def _assert_generated_batch_equal_expected(self, **tokenizer_kwargs):
-        generated_words = self.translate_src_text(**tokenizer_kwargs)
-        self.assertListEqual(self.expected_text, generated_words)
-
-    def translate_src_text(self, **tokenizer_kwargs):
-        model_inputs = self.tokenizer(self.src_text, padding=True, return_tensors="np", **tokenizer_kwargs)
-        generated_ids = self.model.generate(
-            model_inputs.input_ids,
-            attention_mask=model_inputs.attention_mask,
-            num_beams=2,
-            max_length=128,
-        ).sequences
-        generated_words = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
-        return generated_words
-
-
-@require_flax
-@require_sentencepiece
-@require_tokenizers
-class TestMarian_EN_FR(MarianIntegrationTest):
-    src = "en"
-    tgt = "fr"
-    src_text = [
-        "I am a small frog.",
-        "Now I can forget the 100 words of german that I know.",
-    ]
-    expected_text = [
-        "Je suis une petite grenouille.",
-        "Maintenant, je peux oublier les 100 mots d'allemand que je connais.",
-    ]
-
-    @slow
-    def test_batch_generation_en_fr(self):
-        self._assert_generated_batch_equal_expected()
-
-
-@require_flax
-@require_sentencepiece
-@require_tokenizers
-class TestMarian_FR_EN(MarianIntegrationTest):
-    src = "fr"
-    tgt = "en"
-    src_text = [
-        "Donnez moi le micro.",
-        "Tom et Mary étaient assis à une table.",  # Accents
-    ]
-    expected_text = [
-        "Give me the microphone.",
-        "Tom and Mary were sitting at a table.",
-    ]
-
-    @slow
-    def test_batch_generation_fr_en(self):
-        self._assert_generated_batch_equal_expected()
-
-
-@require_flax
-@require_sentencepiece
-@require_tokenizers
-class TestMarian_MT_EN(MarianIntegrationTest):
-    """Cover low resource/high perplexity setting. This breaks without adjust_logits_generation overwritten"""
-
-    src = "mt"
-    tgt = "en"
-    src_text = ["Billi messu b'mod ġentili, Ġesù fejjaq raġel li kien milqut bil - marda kerha tal - ġdiem."]
-    expected_text = ["Touching gently, Jesus healed a man who was affected by the sad disease of leprosy."]
-
-    @slow
-    def test_batch_generation_mt_en(self):
-        self._assert_generated_batch_equal_expected()
-
-
-@require_flax
-@require_sentencepiece
-@require_tokenizers
-class TestMarian_EN_DE(MarianIntegrationTest):
-    src = "en"
-    tgt = "de"
-    src_text = [
-        "I am a small frog.",
-        "Now I can forget the 100 words of german that I know.",
-        "Tom asked his teacher for advice.",
-        "That's how I would do it.",
-        "Tom really admired Mary's courage.",
-        "Turn around and close your eyes.",
-    ]
-    expected_text = [
-        "Ich bin ein kleiner Frosch.",
-        "Jetzt kann ich die 100 Wörter des Deutschen vergessen, die ich kenne.",
-        "Tom bat seinen Lehrer um Rat.",
-        "So würde ich das machen.",
-        "Tom bewunderte Marias Mut wirklich.",
-        "Drehen Sie sich um und schließen Sie die Augen.",
-    ]
-
-    @slow
-    def test_batch_generation_en_de(self):
-        self._assert_generated_batch_equal_expected()
-
-
-@require_flax
-@require_sentencepiece
-@require_tokenizers
-class TestMarian_en_zh(MarianIntegrationTest):
-    src = "en"
-    tgt = "zh"
-    src_text = ["My name is Wolfgang and I live in Berlin"]
-    expected_text = ["我叫沃尔夫冈 我住在柏林"]
-
-    @slow
-    def test_batch_generation_eng_zho(self):
-        self._assert_generated_batch_equal_expected()
-
-
-@require_flax
-@require_sentencepiece
-@require_tokenizers
-class TestMarian_RU_FR(MarianIntegrationTest):
-    src = "ru"
-    tgt = "fr"
-    src_text = ["Он показал мне рукопись своей новой пьесы."]
-    expected_text = ["Il m'a montré le manuscrit de sa nouvelle pièce."]
-
-    @slow
-    def test_batch_generation_ru_fr(self):
-        self._assert_generated_batch_equal_expected()
-
-
-@require_flax
-@require_sentencepiece
-@require_tokenizers
-class TestMarian_en_ROMANCE(MarianIntegrationTest):
-    """Multilingual on target side."""
-
-    src = "en"
-    tgt = "ROMANCE"
-    src_text = [
-        ">>fr<< Don't spend so much time watching TV.",
-        ">>pt<< Your message has been sent.",
-        ">>es<< He's two years older than me.",
-    ]
-    expected_text = [
-        "Ne passez pas autant de temps à regarder la télé.",
-        "A sua mensagem foi enviada.",
-        "Es dos años más viejo que yo.",
-    ]
-
-    @slow
-    def test_batch_generation_en_ROMANCE_multi(self):
-        self._assert_generated_batch_equal_expected()
diff --git a/tests/models/marian/test_modeling_tf_marian.py b/tests/models/marian/test_modeling_tf_marian.py
deleted file mode 100644
index d6f0d06405..0000000000
--- a/tests/models/marian/test_modeling_tf_marian.py
+++ /dev/null
@@ -1,312 +0,0 @@
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import unittest
-import warnings
-
-from transformers import AutoTokenizer, MarianConfig, MarianTokenizer, TranslationPipeline, is_tf_available
-from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
-from transformers.utils import cached_property
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import TFAutoModelForSeq2SeqLM, TFMarianModel, TFMarianMTModel
-
-
-@require_tf
-class TFMarianModelTester:
-    config_cls = MarianConfig
-    config_updates = {}
-    hidden_act = "gelu"
-
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_labels=False,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=20,
-        eos_token_id=2,
-        pad_token_id=1,
-        bos_token_id=0,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-
-    def prepare_config_and_inputs_for_common(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size)
-        eos_tensor = tf.expand_dims(tf.constant([self.eos_token_id] * self.batch_size), 1)
-        input_ids = tf.concat([input_ids, eos_tensor], axis=1)
-
-        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        config = self.config_cls(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            encoder_layers=self.num_hidden_layers,
-            decoder_layers=self.num_hidden_layers,
-            encoder_attention_heads=self.num_attention_heads,
-            decoder_attention_heads=self.num_attention_heads,
-            encoder_ffn_dim=self.intermediate_size,
-            decoder_ffn_dim=self.intermediate_size,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            eos_token_ids=[2],
-            bos_token_id=self.bos_token_id,
-            pad_token_id=self.pad_token_id,
-            decoder_start_token_id=self.pad_token_id,
-            **self.config_updates,
-        )
-        inputs_dict = prepare_marian_inputs_dict(config, input_ids, decoder_input_ids)
-        return config, inputs_dict
-
-    def check_decoder_model_past_large_inputs(self, config, inputs_dict):
-        model = TFMarianModel(config=config).get_decoder()
-        input_ids = inputs_dict["input_ids"]
-
-        input_ids = input_ids[:1, :]
-        attention_mask = inputs_dict["attention_mask"][:1, :]
-        head_mask = inputs_dict["head_mask"]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True)
-
-        output, past_key_values = outputs.to_tuple()
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = tf.cast(ids_tensor((self.batch_size, 3), 2), tf.int8)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([attention_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)[0]
-        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[0]
-
-        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-
-def prepare_marian_inputs_dict(
-    config,
-    input_ids,
-    decoder_input_ids,
-    attention_mask=None,
-    decoder_attention_mask=None,
-    head_mask=None,
-    decoder_head_mask=None,
-    cross_attn_head_mask=None,
-):
-    if attention_mask is None:
-        attention_mask = tf.cast(tf.math.not_equal(input_ids, config.pad_token_id), tf.int8)
-    if decoder_attention_mask is None:
-        decoder_attention_mask = tf.concat(
-            [
-                tf.ones(decoder_input_ids[:, :1].shape, dtype=tf.int8),
-                tf.cast(tf.math.not_equal(decoder_input_ids[:, 1:], config.pad_token_id), tf.int8),
-            ],
-            axis=-1,
-        )
-    if head_mask is None:
-        head_mask = tf.ones((config.encoder_layers, config.encoder_attention_heads))
-    if decoder_head_mask is None:
-        decoder_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads))
-    if cross_attn_head_mask is None:
-        cross_attn_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads))
-    return {
-        "input_ids": input_ids,
-        "decoder_input_ids": decoder_input_ids,
-        "attention_mask": attention_mask,
-        "decoder_attention_mask": decoder_attention_mask,
-        "head_mask": head_mask,
-        "decoder_head_mask": decoder_head_mask,
-        "cross_attn_head_mask": cross_attn_head_mask,
-    }
-
-
-@require_tf
-class TFMarianModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (TFMarianMTModel, TFMarianModel) if is_tf_available() else ()
-    all_generative_model_classes = (TFMarianMTModel,) if is_tf_available() else ()
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFMarianModel,
-            "summarization": TFMarianMTModel,
-            "text2text-generation": TFMarianMTModel,
-            "translation": TFMarianMTModel,
-        }
-        if is_tf_available()
-        else {}
-    )
-    is_encoder_decoder = True
-    test_pruning = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFMarianModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=MarianConfig)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_decoder_model_past_large_inputs(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
-        self.model_tester.check_decoder_model_past_large_inputs(*config_and_inputs)
-
-
-@require_tf
-class AbstractMarianIntegrationTest(unittest.TestCase):
-    maxDiff = 1000  # show more chars for failing integration tests
-
-    @classmethod
-    def setUpClass(cls) -> None:
-        cls.model_name = f"Helsinki-NLP/opus-mt-{cls.src}-{cls.tgt}"
-        return cls
-
-    @cached_property
-    def tokenizer(self) -> MarianTokenizer:
-        return AutoTokenizer.from_pretrained(self.model_name)
-
-    @property
-    def eos_token_id(self) -> int:
-        return self.tokenizer.eos_token_id
-
-    @cached_property
-    def model(self):
-        warnings.simplefilter("error")
-        model: TFMarianMTModel = TFAutoModelForSeq2SeqLM.from_pretrained(self.model_name)
-        assert isinstance(model, TFMarianMTModel)
-        c = model.config
-        self.assertListEqual(c.bad_words_ids, [[c.pad_token_id]])
-        self.assertEqual(c.max_length, 512)
-        self.assertEqual(c.decoder_start_token_id, c.pad_token_id)
-        return model
-
-    def _assert_generated_batch_equal_expected(self, **tokenizer_kwargs):
-        generated_words = self.translate_src_text(**tokenizer_kwargs)
-        self.assertListEqual(self.expected_text, generated_words)
-
-    def translate_src_text(self, **tokenizer_kwargs):
-        model_inputs = self.tokenizer(self.src_text, **tokenizer_kwargs, padding=True, return_tensors="tf")
-        generated_ids = self.model.generate(
-            model_inputs.input_ids, attention_mask=model_inputs.attention_mask, num_beams=2, max_length=128
-        )
-        generated_words = self.tokenizer.batch_decode(generated_ids.numpy(), skip_special_tokens=True)
-        return generated_words
-
-
-@require_sentencepiece
-@require_tokenizers
-@require_tf
-class TestMarian_MT_EN(AbstractMarianIntegrationTest):
-    """Cover low resource/high perplexity setting. This breaks if pad_token_id logits not set to LARGE_NEGATIVE."""
-
-    src = "mt"
-    tgt = "en"
-    src_text = ["Billi messu b'mod ġentili, Ġesù fejjaq raġel li kien milqut bil - marda kerha tal - ġdiem."]
-    expected_text = ["Touching gently, Jesus healed a man who was affected by the sad disease of leprosy."]
-
-    @unittest.skip("Skipping until #12647 is resolved.")
-    @slow
-    def test_batch_generation_mt_en(self):
-        self._assert_generated_batch_equal_expected()
-
-
-@require_sentencepiece
-@require_tokenizers
-@require_tf
-class TestMarian_en_zh(AbstractMarianIntegrationTest):
-    src = "en"
-    tgt = "zh"
-    src_text = ["My name is Wolfgang and I live in Berlin"]
-    expected_text = ["我叫沃尔夫冈 我住在柏林"]
-
-    @unittest.skip("Skipping until #12647 is resolved.")
-    @slow
-    def test_batch_generation_en_zh(self):
-        self._assert_generated_batch_equal_expected()
-
-
-@require_sentencepiece
-@require_tokenizers
-@require_tf
-class TestMarian_en_ROMANCE(AbstractMarianIntegrationTest):
-    """Multilingual on target side."""
-
-    src = "en"
-    tgt = "ROMANCE"
-    src_text = [
-        ">>fr<< Don't spend so much time watching TV.",
-        ">>pt<< Your message has been sent.",
-        ">>es<< He's two years older than me.",
-    ]
-    expected_text = [
-        "Ne passez pas autant de temps à regarder la télé.",
-        "A sua mensagem foi enviada.",
-        "Es dos años más viejo que yo.",
-    ]
-
-    @unittest.skip("Skipping until #12647 is resolved.")
-    @slow
-    def test_batch_generation_en_ROMANCE_multi(self):
-        self._assert_generated_batch_equal_expected()
-
-    @unittest.skip("Skipping until #12647 is resolved.")
-    @slow
-    def test_pipeline(self):
-        pipeline = TranslationPipeline(self.model, self.tokenizer, framework="tf")
-        output = pipeline(self.src_text)
-        self.assertEqual(self.expected_text, [x["translation_text"] for x in output])
diff --git a/tests/models/mbart/test_modeling_flax_mbart.py b/tests/models/mbart/test_modeling_flax_mbart.py
deleted file mode 100644
index eaeea3f50f..0000000000
--- a/tests/models/mbart/test_modeling_flax_mbart.py
+++ /dev/null
@@ -1,463 +0,0 @@
-# Copyright 2021 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-import timeout_decorator  # noqa
-
-from transformers import MBartConfig, is_flax_available
-from transformers.testing_utils import require_flax, require_sentencepiece, require_tokenizers, slow
-from transformers.utils import cached_property
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor
-
-
-if is_flax_available():
-    import os
-
-    # The slow tests are often failing with OOM error on GPU
-    # This makes JAX allocate exactly what is needed on demand, and deallocate memory that is no longer needed
-    # but will be slower as stated here https://jax.readthedocs.io/en/latest/gpu_memory_allocation.html
-    os.environ["XLA_PYTHON_CLIENT_ALLOCATOR"] = "platform"
-
-    import jax
-    import jax.numpy as jnp
-
-    from transformers import AutoTokenizer
-    from transformers.models.mbart.modeling_flax_mbart import (
-        FlaxMBartForConditionalGeneration,
-        FlaxMBartForQuestionAnswering,
-        FlaxMBartForSequenceClassification,
-        FlaxMBartModel,
-        shift_tokens_right,
-    )
-
-
-def prepare_mbart_inputs_dict(
-    config,
-    input_ids,
-    decoder_input_ids=None,
-    attention_mask=None,
-    decoder_attention_mask=None,
-    head_mask=None,
-    decoder_head_mask=None,
-    cross_attn_head_mask=None,
-):
-    if attention_mask is None:
-        attention_mask = np.where(input_ids != config.pad_token_id, 1, 0)
-    if decoder_attention_mask is None:
-        decoder_attention_mask = np.where(decoder_input_ids != config.pad_token_id, 1, 0)
-    if head_mask is None:
-        head_mask = np.ones((config.encoder_layers, config.encoder_attention_heads))
-    if decoder_head_mask is None:
-        decoder_head_mask = np.ones((config.decoder_layers, config.decoder_attention_heads))
-    if cross_attn_head_mask is None:
-        cross_attn_head_mask = np.ones((config.decoder_layers, config.decoder_attention_heads))
-    return {
-        "input_ids": input_ids,
-        "decoder_input_ids": decoder_input_ids,
-        "attention_mask": attention_mask,
-        "decoder_attention_mask": decoder_attention_mask,
-    }
-
-
-class FlaxMBartModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_labels=False,
-        vocab_size=99,
-        hidden_size=16,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=4,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=32,
-        eos_token_id=2,
-        pad_token_id=1,
-        bos_token_id=0,
-        decoder_start_token_id=2,
-        initializer_range=0.02,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.decoder_start_token_id = decoder_start_token_id
-        self.initializer_range = initializer_range
-
-    def prepare_config_and_inputs(self):
-        input_ids = np.clip(ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size), 3, self.vocab_size)
-        input_ids = np.concatenate((input_ids, 2 * np.ones((self.batch_size, 1), dtype=np.int64)), -1)
-
-        decoder_input_ids = shift_tokens_right(input_ids, 1)
-
-        config = MBartConfig(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            encoder_layers=self.num_hidden_layers,
-            decoder_layers=self.num_hidden_layers,
-            encoder_attention_heads=self.num_attention_heads,
-            decoder_attention_heads=self.num_attention_heads,
-            encoder_ffn_dim=self.intermediate_size,
-            decoder_ffn_dim=self.intermediate_size,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            eos_token_id=self.eos_token_id,
-            bos_token_id=self.bos_token_id,
-            pad_token_id=self.pad_token_id,
-            decoder_start_token_id=self.decoder_start_token_id,
-            initializer_range=self.initializer_range,
-            use_cache=False,
-        )
-        inputs_dict = prepare_mbart_inputs_dict(config, input_ids, decoder_input_ids)
-        return config, inputs_dict
-
-    def prepare_config_and_inputs_for_common(self):
-        config, inputs_dict = self.prepare_config_and_inputs()
-        return config, inputs_dict
-
-    def check_use_cache_forward(self, model_class_name, config, inputs_dict):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        encoder_outputs = model.encode(inputs_dict["input_ids"])
-
-        decoder_input_ids, decoder_attention_mask = (
-            inputs_dict["decoder_input_ids"],
-            inputs_dict["decoder_attention_mask"],
-        )
-
-        past_key_values = model.init_cache(decoder_input_ids.shape[0], max_decoder_length, encoder_outputs)
-        decoder_attention_mask = jnp.ones((decoder_input_ids.shape[0], max_decoder_length), dtype="i4")
-
-        decoder_position_ids = jnp.broadcast_to(
-            jnp.arange(decoder_input_ids.shape[-1] - 1)[None, :],
-            (decoder_input_ids.shape[0], decoder_input_ids.shape[-1] - 1),
-        )
-        outputs_cache = model.decode(
-            decoder_input_ids[:, :-1],
-            encoder_outputs,
-            decoder_attention_mask=decoder_attention_mask,
-            past_key_values=past_key_values,
-            decoder_position_ids=decoder_position_ids,
-        )
-
-        decoder_position_ids = jnp.array(decoder_input_ids.shape[0] * [[decoder_input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model.decode(
-            decoder_input_ids[:, -1:],
-            encoder_outputs,
-            decoder_attention_mask=decoder_attention_mask,
-            past_key_values=outputs_cache.past_key_values,
-            decoder_position_ids=decoder_position_ids,
-        )
-
-        outputs = model.decode(decoder_input_ids, encoder_outputs)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-    def check_use_cache_forward_with_attn_mask(self, model_class_name, config, inputs_dict):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        encoder_outputs = model.encode(inputs_dict["input_ids"])
-
-        decoder_input_ids, decoder_attention_mask = (
-            inputs_dict["decoder_input_ids"],
-            inputs_dict["decoder_attention_mask"],
-        )
-
-        decoder_attention_mask_cache = jnp.concatenate(
-            [
-                decoder_attention_mask,
-                jnp.zeros((decoder_attention_mask.shape[0], max_decoder_length - decoder_attention_mask.shape[1])),
-            ],
-            axis=-1,
-        )
-
-        past_key_values = model.init_cache(decoder_input_ids.shape[0], max_decoder_length, encoder_outputs)
-        decoder_position_ids = jnp.broadcast_to(
-            jnp.arange(decoder_input_ids.shape[-1] - 1)[None, :],
-            (decoder_input_ids.shape[0], decoder_input_ids.shape[-1] - 1),
-        )
-
-        outputs_cache = model.decode(
-            decoder_input_ids[:, :-1],
-            encoder_outputs,
-            decoder_attention_mask=decoder_attention_mask_cache,
-            past_key_values=past_key_values,
-            decoder_position_ids=decoder_position_ids,
-        )
-        decoder_position_ids = jnp.array(decoder_input_ids.shape[0] * [[decoder_input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model.decode(
-            decoder_input_ids[:, -1:],
-            encoder_outputs,
-            past_key_values=outputs_cache.past_key_values,
-            decoder_attention_mask=decoder_attention_mask_cache,
-            decoder_position_ids=decoder_position_ids,
-        )
-
-        outputs = model.decode(decoder_input_ids, encoder_outputs, decoder_attention_mask=decoder_attention_mask)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-
-@require_flax
-class MBartHeadTests(unittest.TestCase):
-    vocab_size = 99
-
-    def _get_config_and_data(self):
-        input_ids = np.array(
-            [
-                [71, 82, 18, 33, 46, 91, 2],
-                [68, 34, 26, 58, 30, 82, 2],
-                [5, 97, 17, 39, 94, 40, 2],
-                [76, 83, 94, 25, 70, 78, 2],
-                [87, 59, 41, 35, 48, 66, 2],
-                [55, 13, 16, 58, 5, 2, 1],  # note padding
-                [64, 27, 31, 51, 12, 75, 2],
-                [52, 64, 86, 17, 83, 39, 2],
-                [48, 61, 9, 24, 71, 82, 2],
-                [26, 1, 60, 48, 22, 13, 2],
-                [21, 5, 62, 28, 14, 76, 2],
-                [45, 98, 37, 86, 59, 48, 2],
-                [70, 70, 50, 9, 28, 0, 2],
-            ],
-            dtype=np.int64,
-        )
-
-        batch_size = input_ids.shape[0]
-        config = MBartConfig(
-            vocab_size=self.vocab_size,
-            d_model=24,
-            encoder_layers=2,
-            decoder_layers=2,
-            encoder_attention_heads=2,
-            decoder_attention_heads=2,
-            encoder_ffn_dim=32,
-            decoder_ffn_dim=32,
-            max_position_embeddings=48,
-            eos_token_id=2,
-            pad_token_id=1,
-            bos_token_id=0,
-        )
-        return config, input_ids, batch_size
-
-    def test_sequence_classification_forward(self):
-        config, input_ids, batch_size = self._get_config_and_data()
-        model = FlaxMBartForSequenceClassification(config)
-        outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
-        expected_shape = (batch_size, config.num_labels)
-        self.assertEqual(outputs["logits"].shape, expected_shape)
-
-    def test_question_answering_forward(self):
-        config, input_ids, batch_size = self._get_config_and_data()
-        model = FlaxMBartForQuestionAnswering(config)
-        outputs = model(input_ids=input_ids)
-
-        self.assertEqual(outputs["start_logits"].shape, input_ids.shape)
-        self.assertEqual(outputs["end_logits"].shape, input_ids.shape)
-
-    # @timeout_decorator.timeout(1)  # not working with the decorator so far
-    def test_lm_forward(self):
-        config, input_ids, batch_size = self._get_config_and_data()
-        lm_model = FlaxMBartForConditionalGeneration(config)
-        outputs = lm_model(input_ids=input_ids)
-        expected_shape = (batch_size, input_ids.shape[1], config.vocab_size)
-        self.assertEqual(outputs["logits"].shape, expected_shape)
-
-    def test_lm_uneven_forward(self):
-        config = MBartConfig(
-            vocab_size=self.vocab_size,
-            d_model=14,
-            encoder_layers=2,
-            decoder_layers=2,
-            encoder_attention_heads=2,
-            decoder_attention_heads=2,
-            encoder_ffn_dim=8,
-            decoder_ffn_dim=8,
-            max_position_embeddings=48,
-        )
-        lm_model = FlaxMBartForConditionalGeneration(config)
-        context = np.array([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]], dtype=np.int64)
-        summary = np.array([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]], dtype=np.int64)
-        outputs = lm_model(input_ids=context, decoder_input_ids=summary)
-        expected_shape = (*summary.shape, config.vocab_size)
-        self.assertEqual(outputs["logits"].shape, expected_shape)
-
-    def test_shift_tokens_right(self):
-        input_ids = np.array([[71, 82, 18, 33, 2, 1, 1], [68, 34, 26, 58, 30, 82, 2]], dtype=np.int64)
-        shifted = shift_tokens_right(input_ids, 1)
-        n_pad_before = np.equal(input_ids, 1).astype(np.float32).sum()
-        n_pad_after = np.equal(shifted, 1).astype(np.float32).sum()
-        self.assertEqual(shifted.shape, input_ids.shape)
-        self.assertEqual(n_pad_after, n_pad_before - 1)
-        self.assertTrue(np.equal(shifted[:, 0], 2).all())
-
-
-@require_flax
-class FlaxMBartModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    is_encoder_decoder = True
-    all_model_classes = (
-        (
-            FlaxMBartModel,
-            FlaxMBartForConditionalGeneration,
-            FlaxMBartForSequenceClassification,
-            FlaxMBartForQuestionAnswering,
-        )
-        if is_flax_available()
-        else ()
-    )
-
-    def setUp(self):
-        self.model_tester = FlaxMBartModelTester(self)
-
-    def test_use_cache_forward(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
-        for model_class in self.all_model_classes:
-            self.model_tester.check_use_cache_forward(model_class, config, inputs_dict)
-
-    def test_use_cache_forward_with_attn_mask(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
-        for model_class in self.all_model_classes:
-            self.model_tester.check_use_cache_forward_with_attn_mask(model_class, config, inputs_dict)
-
-    def test_encode(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                model = model_class(config)
-
-                @jax.jit
-                def encode_jitted(input_ids, attention_mask=None, **kwargs):
-                    return model.encode(input_ids=input_ids, attention_mask=attention_mask)
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = encode_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = encode_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    def test_decode(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                model = model_class(config)
-                encoder_outputs = model.encode(inputs_dict["input_ids"], inputs_dict["attention_mask"])
-
-                prepared_inputs_dict = {
-                    "decoder_input_ids": inputs_dict["decoder_input_ids"],
-                    "decoder_attention_mask": inputs_dict["decoder_attention_mask"],
-                    "encoder_outputs": encoder_outputs,
-                }
-
-                @jax.jit
-                def decode_jitted(decoder_input_ids, decoder_attention_mask, encoder_outputs):
-                    return model.decode(
-                        decoder_input_ids=decoder_input_ids,
-                        decoder_attention_mask=decoder_attention_mask,
-                        encoder_outputs=encoder_outputs,
-                    )
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = decode_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = decode_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("facebook/mbart-large-cc25", from_pt=True)
-            # FlaxMBartForSequenceClassification expects eos token in input_ids
-            input_ids = np.ones((1, 1)) * model.config.eos_token_id
-            outputs = model(input_ids)
-            self.assertIsNotNone(outputs)
-
-
-@require_flax
-@require_sentencepiece
-@require_tokenizers
-class FlaxMBartModelIntegrationTest(unittest.TestCase):
-    src_text = [
-        " UN Chief Says There Is No Military Solution in Syria",
-    ]
-    expected_text = [
-        "Şeful ONU declară că nu există o soluţie militară în Siria",
-    ]
-    model_name = "facebook/mbart-large-en-ro"
-
-    @cached_property
-    def tokenizer(self):
-        return AutoTokenizer.from_pretrained(self.model_name)
-
-    @cached_property
-    def model(self):
-        model = FlaxMBartForConditionalGeneration.from_pretrained(self.model_name, from_pt=True)
-        return model
-
-    def _assert_generated_batch_equal_expected(self, **tokenizer_kwargs):
-        generated_words = self.translate_src_text(**tokenizer_kwargs)
-        self.assertListEqual(self.expected_text, generated_words)
-
-    def translate_src_text(self, **tokenizer_kwargs):
-        model_inputs = self.tokenizer(self.src_text, **tokenizer_kwargs, return_tensors="np")
-        generated_ids = self.model.generate(
-            model_inputs.input_ids,
-            attention_mask=model_inputs.attention_mask,
-            decoder_start_token_id=self.tokenizer.lang_code_to_id["ro_RO"],
-            early_stopping=True,
-            num_beams=2,
-        ).sequences
-        generated_words = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
-        return generated_words
-
-    @slow
-    def test_batch_generation_en_ro(self):
-        self._assert_generated_batch_equal_expected()
diff --git a/tests/models/mbart/test_modeling_tf_mbart.py b/tests/models/mbart/test_modeling_tf_mbart.py
deleted file mode 100644
index 1e2986f7b5..0000000000
--- a/tests/models/mbart/test_modeling_tf_mbart.py
+++ /dev/null
@@ -1,226 +0,0 @@
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import AutoTokenizer, MBartConfig, is_tf_available
-from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
-from transformers.utils import cached_property
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import TFAutoModelForSeq2SeqLM, TFMBartForConditionalGeneration, TFMBartModel
-
-
-@require_tf
-class TFMBartModelTester:
-    config_cls = MBartConfig
-    config_updates = {}
-    hidden_act = "gelu"
-
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_labels=False,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=20,
-        eos_token_id=2,
-        pad_token_id=1,
-        bos_token_id=0,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-
-    def prepare_config_and_inputs_for_common(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size)
-        eos_tensor = tf.expand_dims(tf.constant([self.eos_token_id] * self.batch_size), 1)
-        input_ids = tf.concat([input_ids, eos_tensor], axis=1)
-
-        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        config = self.config_cls(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            encoder_layers=self.num_hidden_layers,
-            decoder_layers=self.num_hidden_layers,
-            encoder_attention_heads=self.num_attention_heads,
-            decoder_attention_heads=self.num_attention_heads,
-            encoder_ffn_dim=self.intermediate_size,
-            decoder_ffn_dim=self.intermediate_size,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            eos_token_ids=[2],
-            bos_token_id=self.bos_token_id,
-            pad_token_id=self.pad_token_id,
-            decoder_start_token_id=self.pad_token_id,
-            **self.config_updates,
-        )
-        inputs_dict = prepare_mbart_inputs_dict(config, input_ids, decoder_input_ids)
-        return config, inputs_dict
-
-    def check_decoder_model_past_large_inputs(self, config, inputs_dict):
-        model = TFMBartModel(config=config).get_decoder()
-        input_ids = inputs_dict["input_ids"]
-
-        input_ids = input_ids[:1, :]
-        attention_mask = inputs_dict["attention_mask"][:1, :]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
-
-        output, past_key_values = outputs.to_tuple()
-        past_key_values = past_key_values[1]
-
-
-def prepare_mbart_inputs_dict(
-    config,
-    input_ids,
-    decoder_input_ids,
-    attention_mask=None,
-    decoder_attention_mask=None,
-):
-    if attention_mask is None:
-        attention_mask = tf.cast(tf.math.not_equal(input_ids, config.pad_token_id), tf.int8)
-    if decoder_attention_mask is None:
-        decoder_attention_mask = tf.concat(
-            [
-                tf.ones(decoder_input_ids[:, :1].shape, dtype=tf.int8),
-                tf.cast(tf.math.not_equal(decoder_input_ids[:, 1:], config.pad_token_id), tf.int8),
-            ],
-            axis=-1,
-        )
-    return {
-        "input_ids": input_ids,
-        "decoder_input_ids": decoder_input_ids,
-        "attention_mask": attention_mask,
-        "decoder_attention_mask": decoder_attention_mask,
-    }
-
-
-@require_tf
-class TFMBartModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (TFMBartForConditionalGeneration, TFMBartModel) if is_tf_available() else ()
-    all_generative_model_classes = (TFMBartForConditionalGeneration,) if is_tf_available() else ()
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFMBartModel,
-            "summarization": TFMBartForConditionalGeneration,
-            "text2text-generation": TFMBartForConditionalGeneration,
-            "translation": TFMBartForConditionalGeneration,
-        }
-        if is_tf_available()
-        else {}
-    )
-    is_encoder_decoder = True
-    test_pruning = False
-    test_onnx = False
-
-    # TODO: Fix the failed tests
-    def is_pipeline_test_to_skip(
-        self,
-        pipeline_test_case_name,
-        config_class,
-        model_architecture,
-        tokenizer_name,
-        image_processor_name,
-        feature_extractor_name,
-        processor_name,
-    ):
-        if pipeline_test_case_name != "FeatureExtractionPipelineTests":
-            # Exception encountered when calling layer '...'
-            return True
-
-        return False
-
-    def setUp(self):
-        self.model_tester = TFMBartModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=MBartConfig)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_decoder_model_past_large_inputs(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
-        self.model_tester.check_decoder_model_past_large_inputs(*config_and_inputs)
-
-
-@require_sentencepiece
-@require_tokenizers
-@require_tf
-class TFMBartModelIntegrationTest(unittest.TestCase):
-    src_text = [
-        " UN Chief Says There Is No Military Solution in Syria",
-    ]
-    expected_text = [
-        "Şeful ONU declară că nu există o soluţie militară în Siria",
-    ]
-    model_name = "facebook/mbart-large-en-ro"
-
-    @cached_property
-    def tokenizer(self):
-        return AutoTokenizer.from_pretrained(self.model_name)
-
-    @cached_property
-    def model(self):
-        model = TFAutoModelForSeq2SeqLM.from_pretrained(self.model_name)
-        return model
-
-    def _assert_generated_batch_equal_expected(self, **tokenizer_kwargs):
-        generated_words = self.translate_src_text(**tokenizer_kwargs)
-        self.assertListEqual(self.expected_text, generated_words)
-
-    def translate_src_text(self, **tokenizer_kwargs):
-        model_inputs = self.tokenizer(self.src_text, **tokenizer_kwargs, return_tensors="tf")
-        generated_ids = self.model.generate(
-            model_inputs.input_ids, attention_mask=model_inputs.attention_mask, num_beams=2
-        )
-        generated_words = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
-        return generated_words
-
-    @slow
-    def test_batch_generation_en_ro(self):
-        self._assert_generated_batch_equal_expected()
diff --git a/tests/models/mistral/test_modeling_flax_mistral.py b/tests/models/mistral/test_modeling_flax_mistral.py
deleted file mode 100644
index 1993c58366..0000000000
--- a/tests/models/mistral/test_modeling_flax_mistral.py
+++ /dev/null
@@ -1,241 +0,0 @@
-# Copyright 2023 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-import numpy as np
-
-from transformers import MistralConfig, is_flax_available, is_tokenizers_available
-from transformers.testing_utils import require_flax, slow
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor
-
-
-if is_flax_available():
-    import jax.numpy as jnp
-
-    from transformers.models.mistral.modeling_flax_mistral import (
-        FlaxMistralForCausalLM,
-        FlaxMistralModel,
-    )
-
-
-if is_tokenizers_available():
-    from transformers import LlamaTokenizerFast
-
-
-class FlaxMistralModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=2,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=False,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        num_key_value_heads=2,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        window_size=7,
-        initializer_range=0.02,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.num_key_value_heads = num_key_value_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.window_size = window_size
-        self.initializer_range = initializer_range
-        self.scope = None
-        self.bos_token_id = vocab_size - 1
-        self.eos_token_id = vocab_size - 1
-        self.pad_token_id = vocab_size - 1
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = np.tril(np.ones((self.batch_size, self.seq_length)))
-
-        config = MistralConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            num_key_value_heads=self.num_key_value_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            use_cache=True,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-            sliding_window=self.window_size,
-        )
-        config.pad_token_id = config.eos_token_id
-
-        return (config, input_ids, input_mask)
-
-    # Copied from tests.models.gpt_neo.test_modeling_flax_gpt_neo.FlaxGPTNeoModelTester.prepare_config_and_inputs_for_common
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, attention_mask = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask}
-        return config, inputs_dict
-
-    # Copied from tests.models.gpt_neo.test_modeling_flax_gpt_neo.FlaxGPTNeoModelTester.check_use_cache_forward
-    def check_use_cache_forward(self, model_class_name, config, input_ids, attention_mask):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length)
-        attention_mask = jnp.ones((input_ids.shape[0], max_decoder_length), dtype="i4")
-
-        position_ids = jnp.broadcast_to(
-            jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1)
-        )
-        outputs_cache = model(
-            input_ids[:, :-1],
-            attention_mask=attention_mask,
-            past_key_values=past_key_values,
-            position_ids=position_ids,
-        )
-
-        position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model(
-            input_ids[:, -1:],
-            attention_mask=attention_mask,
-            past_key_values=outputs_cache.past_key_values,
-            position_ids=position_ids,
-        )
-
-        outputs = model(input_ids)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-    # Copied from tests.models.gpt_neo.test_modeling_flax_gpt_neo.FlaxGPTNeoModelTester.check_use_cache_forward_with_attn_mask
-    def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input_ids, attention_mask):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        attention_mask_cache = jnp.concatenate(
-            [attention_mask, jnp.zeros((attention_mask.shape[0], max_decoder_length - attention_mask.shape[1]))],
-            axis=-1,
-        )
-
-        past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length)
-        position_ids = jnp.broadcast_to(
-            jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1)
-        )
-
-        outputs_cache = model(
-            input_ids[:, :-1],
-            attention_mask=attention_mask_cache,
-            past_key_values=past_key_values,
-            position_ids=position_ids,
-        )
-        position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model(
-            input_ids[:, -1:],
-            past_key_values=outputs_cache.past_key_values,
-            attention_mask=attention_mask_cache,
-            position_ids=position_ids,
-        )
-
-        outputs = model(input_ids, attention_mask=attention_mask)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-
-@require_flax
-class FlaxMistralModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (FlaxMistralModel, FlaxMistralForCausalLM) if is_flax_available() else ()
-
-    def setUp(self):
-        self.model_tester = FlaxMistralModelTester(self)
-
-    def test_use_cache_forward(self):
-        for model_class_name in self.all_model_classes:
-            config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs()
-            self.model_tester.check_use_cache_forward(model_class_name, config, input_ids, attention_mask)
-
-    def test_use_cache_forward_with_attn_mask(self):
-        for model_class_name in self.all_model_classes:
-            config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs()
-            self.model_tester.check_use_cache_forward_with_attn_mask(
-                model_class_name, config, input_ids, attention_mask
-            )
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("mistralai/Mistral-7B-v0.1", from_pt=True)
-            outputs = model(np.ones((1, 1)))
-            self.assertIsNotNone(outputs)
-
-
-@slow
-@require_flax
-class FlaxMistralIntegrationTest(unittest.TestCase):
-    def setUp(self):
-        self.model_id = "mistralai/Mistral-7B-v0.1"
-        self.model = FlaxMistralForCausalLM.from_pretrained(self.model_id, from_pt=True)
-        self.test_batch = jnp.arange(32).reshape(4, 8) + 1911
-
-    def test_model_logits(self):
-        input_ids = jnp.array([[1, 306, 4658, 278, 6593, 310, 2834, 338]])
-        EXPECTED_MEAN = np.array([[-2.5548, -2.5737, -3.0600, -2.5906, -2.8478, -2.8118, -2.9325, -2.7694]])
-        EXPECTED_SLICE = np.array([-5.8781,-5.8616,-0.1052,-4.7200,-5.8781,-5.8774,-5.8773,-5.8777,-5.8781,-5.8780,-5.8781,-5.8779,-1.0787,1.7583,-5.8779,-5.8780,-5.8783,-5.8778,-5.8776,-5.8781,-5.8784,-5.8778,-5.8778,-5.8777,-5.8779,-5.8778,-5.8776,-5.8780,-5.8779,-5.8781])  # fmt: skip
-
-        flax_logits = self.model(input_ids).logits
-        diff_mean = jnp.abs(flax_logits.mean(-1) - EXPECTED_MEAN).max()
-        diff_slice = jnp.abs(flax_logits[0, 0, :30] - EXPECTED_SLICE).max()
-
-        self.assertAlmostEqual(diff_mean, 0, places=3)
-        self.assertAlmostEqual(diff_slice, 0, places=3)
-
-    def test_generated_text(self):
-        tokenizer = LlamaTokenizerFast.from_pretrained(self.model_id)
-        tokenizer.pad_token_id = 2
-        EXPECTED_TEXT_COMPLETION = """My favourite condiment is 100% ketchup. I love it on everything. I’m not a big"""
-        prompt = "My favourite condiment is "
-        inputs = tokenizer(prompt, return_tensors="np", truncation=True, padding=True)
-        generated_ids = self.model.generate(**inputs, max_new_tokens=20, temperature=0).sequences
-        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
-        self.assertEqual(generated_text, EXPECTED_TEXT_COMPLETION)
diff --git a/tests/models/mistral/test_modeling_tf_mistral.py b/tests/models/mistral/test_modeling_tf_mistral.py
deleted file mode 100644
index aec7c6f23f..0000000000
--- a/tests/models/mistral/test_modeling_tf_mistral.py
+++ /dev/null
@@ -1,364 +0,0 @@
-# Copyright 2024 Mistral AI and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TF 2.0 Mistral model."""
-
-import unittest
-
-import numpy as np
-
-from transformers import AutoTokenizer, MistralConfig, is_tf_available
-from transformers.testing_utils import (
-    require_tf,
-    slow,
-)
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers.models.mistral.modeling_tf_mistral import (
-        TFMistralForCausalLM,
-        TFMistralForSequenceClassification,
-        TFMistralModel,
-    )
-
-
-class TFMistralModelTester:
-    def __init__(self, parent):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.is_training = True
-        self.use_input_mask = True
-        self.use_token_type_ids = False
-        self.use_labels = True
-        self.vocab_size = 99
-        self.hidden_size = 32
-        self.num_hidden_layers = 2
-        self.num_attention_heads = 4
-        self.num_key_value_heads = 2
-        self.intermediate_size = 37
-        self.hidden_act = "gelu"
-        self.hidden_dropout_prob = 0.1
-        self.attention_probs_dropout_prob = 0.1
-        self.max_position_embeddings = 512
-        self.type_vocab_size = 16
-        self.type_sequence_label_size = 2
-        self.initializer_range = 0.02
-        self.num_labels = 3
-        self.num_choices = 4
-        self.pad_token_id = 0
-        self.scope = None
-        self.bos_token_id = self.vocab_size - 1
-        self.eos_token_id = self.vocab_size - 1
-        self.pad_token_id = self.vocab_size - 1
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length], self.vocab_size)
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = MistralConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            num_key_value_heads=self.num_key_value_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-            pad_token_id=self.pad_token_id,
-        )
-
-        return (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        )
-
-    def create_and_check_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFMistralModel(config=config)
-        result = model(input_ids, attention_mask=input_mask)
-        result = model(input_ids)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_model_as_decoder(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-        model = TFMistralModel(config)
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-        )
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            encoder_hidden_states=encoder_hidden_states,
-        )
-        result = model(input_ids, attention_mask=input_mask)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_for_causal_lm(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        model = TFMistralForCausalLM(config=config)
-        result = model(input_ids, attention_mask=input_mask, labels=token_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_decoder_model_past_large_inputs(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.is_decoder = True
-        config.add_cross_attention = True
-        model = TFMistralForCausalLM(config=config)
-
-        # first forward pass
-        outputs = model(
-            input_ids,
-            attention_mask=input_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            use_cache=True,
-        )
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical multiple next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([input_mask, next_mask], axis=-1)
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=next_attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            output_hidden_states=True,
-        )["hidden_states"][0]
-        output_from_past = model(
-            next_tokens,
-            attention_mask=next_attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            past_key_values=past_key_values,
-            output_hidden_states=True,
-        )["hidden_states"][0]
-
-        # select random slice
-        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
-        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
-
-        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
-
-        # test that outputs are equal for slice
-        self.parent.assertTrue(np.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class TFMistralModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (TFMistralModel, TFMistralForCausalLM, TFMistralForSequenceClassification) if is_tf_available() else ()
-    )
-    all_generative_model_classes = (TFMistralForCausalLM,) if is_tf_available() else ()
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFMistralModel,
-            "text-classification": TFMistralForSequenceClassification,
-            "text-generation": TFMistralForCausalLM,
-            "zero-shot": TFMistralForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_onnx = False
-    test_pruning = False
-    test_missing_keys = False
-    test_head_masking = False
-
-    # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146
-    def is_pipeline_test_to_skip(
-        self,
-        pipeline_test_case_name,
-        config_class,
-        model_architecture,
-        tokenizer_name,
-        image_processor_name,
-        feature_extractor_name,
-        processor_name,
-    ):
-        return True
-
-    def setUp(self):
-        self.model_tester = TFMistralModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=MistralConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_model_various_embeddings(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        for type in ["absolute", "relative_key", "relative_key_query"]:
-            config_and_inputs[0].position_embedding_type = type
-            self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_Mistral_sequence_classification_model(self):
-        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.num_labels = 3
-        input_ids = input_dict["input_ids"]
-        attention_mask = tf.not_equal(input_ids, 1)
-        sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size)
-        model = TFMistralForSequenceClassification(config)
-        result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
-        self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
-
-    def test_Mistral_sequence_classification_model_for_single_label(self):
-        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.num_labels = 3
-        config.problem_type = "single_label_classification"
-        input_ids = input_dict["input_ids"]
-        attention_mask = tf.not_equal(input_ids, 1)
-        sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size)
-        model = TFMistralForSequenceClassification(config)
-        result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
-        self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
-
-    def test_Mistral_sequence_classification_model_for_multi_label(self):
-        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.num_labels = 3
-        config.problem_type = "multi_label_classification"
-        input_ids = input_dict["input_ids"]
-        attention_mask = tf.not_equal(input_ids, 1)
-        sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size)
-        model = TFMistralForSequenceClassification(config)
-        result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
-        self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
-
-    @unittest.skip("Vocab resizing is not supported")
-    def test_save_load_after_resize_token_embeddings(self):
-        pass
-
-
-@require_tf
-class TFMistralIntegrationTest(unittest.TestCase):
-    @slow
-    def test_model_7b_logits(self):
-        input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338]
-        model = TFMistralForCausalLM.from_pretrained(
-            "hf-internal-testing/tiny-random-MistralForCausalLM", from_pt=True
-        )
-        input_ids = tf.constant([input_ids])
-        out = model(input_ids).logits
-        # Expected mean on dim = -1
-        EXPECTED_MEAN = tf.constant(
-            [[-1.281e-04, -2.869e-04, -9.989e-05, -8.995e-05, 2.494e-04, -3.083e-04, -2.672e-04, -1.239e-04]]
-        )
-        tf.debugging.assert_near(tf.reduce_mean(out, axis=-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2)
-        # slicing logits[0, 0, 0:30]
-        EXPECTED_SLICE = tf.constant([0.1033,  0.1493, -0.0041, -0.0021, -0.1686,  0.0356,  0.0812,  0.2218, -0.1257,  0.1920,  0.0929,  0.1181,  0.0111,  0.0395, -0.0064,  0.1712, -0.0751,  0.0625, -0.2409,  0.1541, -0.1271, -0.2296, -0.0099, -0.0160, 0.0311, -0.0824, -0.1518,  0.0722,  0.0187,  0.0484])  # fmt: skip
-        tf.debugging.assert_near(out[0, 0, :30], EXPECTED_SLICE, atol=1e-4, rtol=1e-4)
-
-    @slow
-    def test_model_7b_generation(self):
-        EXPECTED_TEXT_COMPLETION = """My favourite condiment is  Werk a EgyadjustPrintfigiousPDFPHPct guns Ein motor conceti barSequ内 infrastructure millretval"""
-        prompt = "My favourite condiment is "
-        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM", use_fast=False)
-        model = TFMistralForCausalLM.from_pretrained(
-            "hf-internal-testing/tiny-random-MistralForCausalLM", from_pt=True
-        )
-        input_ids = tokenizer.encode(prompt, return_tensors="tf")
-
-        # greedy generation outputs
-        generated_ids = model.generate(input_ids, max_new_tokens=20, temperature=0)
-        text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
-        self.assertEqual(EXPECTED_TEXT_COMPLETION, text)
diff --git a/tests/models/mobilebert/test_modeling_tf_mobilebert.py b/tests/models/mobilebert/test_modeling_tf_mobilebert.py
deleted file mode 100644
index 93a0ccad1f..0000000000
--- a/tests/models/mobilebert/test_modeling_tf_mobilebert.py
+++ /dev/null
@@ -1,341 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import MobileBertConfig, is_tf_available
-from transformers.models.auto import get_values
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import (
-        TF_MODEL_FOR_PRETRAINING_MAPPING,
-        TFMobileBertForMaskedLM,
-        TFMobileBertForMultipleChoice,
-        TFMobileBertForNextSentencePrediction,
-        TFMobileBertForPreTraining,
-        TFMobileBertForQuestionAnswering,
-        TFMobileBertForSequenceClassification,
-        TFMobileBertForTokenClassification,
-        TFMobileBertModel,
-    )
-
-
-@require_tf
-class TFMobileBertModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFMobileBertModel,
-            TFMobileBertForMaskedLM,
-            TFMobileBertForNextSentencePrediction,
-            TFMobileBertForPreTraining,
-            TFMobileBertForQuestionAnswering,
-            TFMobileBertForSequenceClassification,
-            TFMobileBertForTokenClassification,
-            TFMobileBertForMultipleChoice,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFMobileBertModel,
-            "fill-mask": TFMobileBertForMaskedLM,
-            "question-answering": TFMobileBertForQuestionAnswering,
-            "text-classification": TFMobileBertForSequenceClassification,
-            "token-classification": TFMobileBertForTokenClassification,
-            "zero-shot": TFMobileBertForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_head_masking = False
-    test_onnx = False
-
-    # special case for ForPreTraining model, same as BERT tests
-    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
-        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
-
-        if return_labels:
-            if model_class in get_values(TF_MODEL_FOR_PRETRAINING_MAPPING):
-                inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-
-        return inputs_dict
-
-    class TFMobileBertModelTester:
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_input_mask=True,
-            use_token_type_ids=True,
-            use_labels=True,
-            vocab_size=99,
-            hidden_size=32,
-            embedding_size=32,
-            num_hidden_layers=2,
-            num_attention_heads=4,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_input_mask = use_input_mask
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-            self.embedding_size = embedding_size
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-            config = MobileBertConfig(
-                vocab_size=self.vocab_size,
-                hidden_size=self.hidden_size,
-                num_hidden_layers=self.num_hidden_layers,
-                num_attention_heads=self.num_attention_heads,
-                intermediate_size=self.intermediate_size,
-                hidden_act=self.hidden_act,
-                hidden_dropout_prob=self.hidden_dropout_prob,
-                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-                max_position_embeddings=self.max_position_embeddings,
-                type_vocab_size=self.type_vocab_size,
-                initializer_range=self.initializer_range,
-                embedding_size=self.embedding_size,
-            )
-
-            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-        def create_and_check_mobilebert_model(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = TFMobileBertModel(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            result = model(inputs)
-
-            inputs = [input_ids, input_mask]
-            result = model(inputs)
-
-            result = model(input_ids)
-
-            self.parent.assertEqual(
-                result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)
-            )
-            self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
-        def create_and_check_mobilebert_for_masked_lm(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = TFMobileBertForMaskedLM(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            result = model(inputs)
-            self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-        def create_and_check_mobilebert_for_next_sequence_prediction(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = TFMobileBertForNextSentencePrediction(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            result = model(inputs)
-            self.parent.assertEqual(result.logits.shape, (self.batch_size, 2))
-
-        def create_and_check_mobilebert_for_pretraining(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = TFMobileBertForPreTraining(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            result = model(inputs)
-            self.parent.assertEqual(
-                result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size)
-            )
-            self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2))
-
-        def create_and_check_mobilebert_for_sequence_classification(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_labels = self.num_labels
-            model = TFMobileBertForSequenceClassification(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            result = model(inputs)
-            self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-        def create_and_check_mobilebert_for_multiple_choice(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_choices = self.num_choices
-            model = TFMobileBertForMultipleChoice(config=config)
-            multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
-            multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-            multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
-            inputs = {
-                "input_ids": multiple_choice_inputs_ids,
-                "attention_mask": multiple_choice_input_mask,
-                "token_type_ids": multiple_choice_token_type_ids,
-            }
-            result = model(inputs)
-            self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-        def create_and_check_mobilebert_for_token_classification(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_labels = self.num_labels
-            model = TFMobileBertForTokenClassification(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            result = model(inputs)
-            self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-        def create_and_check_mobilebert_for_question_answering(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = TFMobileBertForQuestionAnswering(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            result = model(inputs)
-            self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-            self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (
-                config,
-                input_ids,
-                token_type_ids,
-                input_mask,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            ) = config_and_inputs
-            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = TFMobileBertModelTest.TFMobileBertModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=MobileBertConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_mobilebert_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_mobilebert_model(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_mobilebert_for_masked_lm(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_mobilebert_for_multiple_choice(*config_and_inputs)
-
-    def test_for_next_sequence_prediction(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_mobilebert_for_next_sequence_prediction(*config_and_inputs)
-
-    def test_for_pretraining(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_mobilebert_for_pretraining(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_mobilebert_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_mobilebert_for_sequence_classification(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_mobilebert_for_token_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        #     model_name = 'google/mobilebert-uncased'
-        for model_name in ["google/mobilebert-uncased"]:
-            model = TFMobileBertModel.from_pretrained(model_name)
-            self.assertIsNotNone(model)
-
-
-@require_tf
-class TFMobileBertModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_masked_lm(self):
-        model = TFMobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")
-        input_ids = tf.constant([[0, 1, 2, 3, 4, 5]])
-        output = model(input_ids)[0]
-
-        expected_shape = [1, 6, 30522]
-        self.assertEqual(output.shape, expected_shape)
-
-        expected_slice = tf.constant(
-            [
-                [
-                    [-4.5919547, -9.248295, -9.645256],
-                    [-6.7306175, -6.440284, -6.6052837],
-                    [-7.2743506, -6.7847915, -6.024673],
-                ]
-            ]
-        )
-        tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4)
diff --git a/tests/models/mobilevit/test_modeling_tf_mobilevit.py b/tests/models/mobilevit/test_modeling_tf_mobilevit.py
deleted file mode 100644
index 13e3ab092b..0000000000
--- a/tests/models/mobilevit/test_modeling_tf_mobilevit.py
+++ /dev/null
@@ -1,424 +0,0 @@
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TensorFlow MobileViT model."""
-
-from __future__ import annotations
-
-import inspect
-import unittest
-
-from transformers import MobileViTConfig
-from transformers.file_utils import is_tf_available, is_vision_available
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import numpy as np
-    import tensorflow as tf
-
-    from transformers import TFMobileViTForImageClassification, TFMobileViTForSemanticSegmentation, TFMobileViTModel
-
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import MobileViTImageProcessor
-
-
-class TFMobileViTConfigTester(ConfigTester):
-    def create_and_test_config_common_properties(self):
-        config = self.config_class(**self.inputs_dict)
-        self.parent.assertTrue(hasattr(config, "hidden_sizes"))
-        self.parent.assertTrue(hasattr(config, "neck_hidden_sizes"))
-        self.parent.assertTrue(hasattr(config, "num_attention_heads"))
-
-
-class TFMobileViTModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        image_size=32,
-        patch_size=2,
-        num_channels=3,
-        last_hidden_size=32,
-        num_attention_heads=4,
-        hidden_act="silu",
-        conv_kernel_size=3,
-        output_stride=32,
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        classifier_dropout_prob=0.1,
-        initializer_range=0.02,
-        is_training=True,
-        use_labels=True,
-        num_labels=10,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.last_hidden_size = last_hidden_size
-        self.num_attention_heads = num_attention_heads
-        self.hidden_act = hidden_act
-        self.conv_kernel_size = conv_kernel_size
-        self.output_stride = output_stride
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.classifier_dropout_prob = classifier_dropout_prob
-        self.use_labels = use_labels
-        self.is_training = is_training
-        self.num_labels = num_labels
-        self.initializer_range = initializer_range
-        self.scope = scope
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        labels = None
-        pixel_labels = None
-        if self.use_labels:
-            labels = ids_tensor([self.batch_size], self.num_labels)
-            pixel_labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels)
-
-        config = self.get_config()
-
-        return config, pixel_values, labels, pixel_labels
-
-    def get_config(self):
-        return MobileViTConfig(
-            image_size=self.image_size,
-            patch_size=self.patch_size,
-            num_channels=self.num_channels,
-            num_attention_heads=self.num_attention_heads,
-            hidden_act=self.hidden_act,
-            conv_kernel_size=self.conv_kernel_size,
-            output_stride=self.output_stride,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            classifier_dropout_prob=self.classifier_dropout_prob,
-            initializer_range=self.initializer_range,
-            hidden_sizes=[12, 16, 20],
-            neck_hidden_sizes=[8, 8, 16, 16, 32, 32, 32],
-        )
-
-    def create_and_check_model(self, config, pixel_values, labels, pixel_labels):
-        model = TFMobileViTModel(config=config)
-        result = model(pixel_values, training=False)
-        expected_height = expected_width = self.image_size // self.output_stride
-        self.parent.assertEqual(
-            result.last_hidden_state.shape, (self.batch_size, self.last_hidden_size, expected_height, expected_width)
-        )
-
-    def create_and_check_for_image_classification(self, config, pixel_values, labels, pixel_labels):
-        config.num_labels = self.num_labels
-        model = TFMobileViTForImageClassification(config)
-        result = model(pixel_values, labels=labels, training=False)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_for_semantic_segmentation(self, config, pixel_values, labels, pixel_labels):
-        config.num_labels = self.num_labels
-        model = TFMobileViTForSemanticSegmentation(config)
-        expected_height = expected_width = self.image_size // self.output_stride
-
-        result = model(pixel_values, training=False)
-        self.parent.assertEqual(
-            result.logits.shape, (self.batch_size, self.num_labels, expected_height, expected_width)
-        )
-
-        result = model(pixel_values, labels=pixel_labels, training=False)
-        self.parent.assertEqual(
-            result.logits.shape, (self.batch_size, self.num_labels, expected_height, expected_width)
-        )
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values, labels, pixel_labels = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_tf
-class TFMobileViTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    """
-    Here we also overwrite some of the tests of test_modeling_common.py, as MobileViT does not use input_ids, inputs_embeds,
-    attention_mask and seq_length.
-    """
-
-    all_model_classes = (
-        (TFMobileViTModel, TFMobileViTForImageClassification, TFMobileViTForSemanticSegmentation)
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {"feature-extraction": TFMobileViTModel, "image-classification": TFMobileViTForImageClassification}
-        if is_tf_available()
-        else {}
-    )
-
-    test_pruning = False
-    test_resize_embeddings = False
-    test_head_masking = False
-    has_attentions = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFMobileViTModelTester(self)
-        self.config_tester = TFMobileViTConfigTester(self, config_class=MobileViTConfig, has_text_modality=False)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    @unittest.skip(reason="MobileViT does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="MobileViT does not support input and output embeddings")
-    def test_model_common_attributes(self):
-        pass
-
-    @unittest.skip(reason="MobileViT does not output attentions")
-    def test_attention_outputs(self):
-        pass
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            hidden_states = outputs.hidden_states
-
-            expected_num_stages = 5
-            self.assertEqual(len(hidden_states), expected_num_stages)
-
-            # MobileViT's feature maps are of shape (batch_size, num_channels, height, width)
-            # with the width and height being successively divided by 2.
-            divisor = 2
-            for i in range(len(hidden_states)):
-                self.assertListEqual(
-                    list(hidden_states[i].shape[-2:]),
-                    [self.model_tester.image_size // divisor, self.model_tester.image_size // divisor],
-                )
-                divisor *= 2
-
-            self.assertEqual(self.model_tester.output_stride, divisor // 2)
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
-    def test_for_semantic_segmentation(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_semantic_segmentation(*config_and_inputs)
-
-    @unittest.skipIf(
-        not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0,
-        reason="TF does not support backprop for grouped convolutions on CPU.",
-    )
-    def test_dataset_conversion(self):
-        super().test_dataset_conversion()
-
-    def check_keras_fit_results(self, val_loss1, val_loss2, atol=2e-1, rtol=2e-1):
-        self.assertTrue(np.allclose(val_loss1, val_loss2, atol=atol, rtol=rtol))
-
-    @unittest.skipIf(
-        not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0,
-        reason="TF does not support backprop for grouped convolutions on CPU.",
-    )
-    @slow
-    def test_keras_fit(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            # Since `TFMobileViTModel` cannot operate with the default `fit()` method.
-            if model_class.__name__ != "TFMobileViTModel":
-                model = model_class(config)
-                if getattr(model, "hf_compute_loss", None):
-                    super().test_keras_fit()
-
-    # The default test_loss_computation() uses -100 as a proxy ignore_index
-    # to test masked losses. Overriding to avoid -100 since semantic segmentation
-    #  models use `semantic_loss_ignore_index` from the config.
-    def test_loss_computation(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            # set an ignore index to correctly test the masked loss used in
-            # `TFMobileViTForSemanticSegmentation`.
-            if model_class.__name__ != "TFMobileViTForSemanticSegmentation":
-                config.semantic_loss_ignore_index = 5
-
-            model = model_class(config)
-            if getattr(model, "hf_compute_loss", None):
-                # The number of elements in the loss should be the same as the number of elements in the label
-                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-                added_label = prepared_for_class[
-                    sorted(prepared_for_class.keys() - inputs_dict.keys(), reverse=True)[0]
-                ]
-                expected_loss_size = added_label.shape.as_list()[:1]
-
-                # Test that model correctly compute the loss with kwargs
-                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-                possible_input_names = {"input_ids", "pixel_values", "input_features"}
-                input_name = possible_input_names.intersection(set(prepared_for_class)).pop()
-                model_input = prepared_for_class.pop(input_name)
-
-                loss = model(model_input, **prepared_for_class)[0]
-                self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
-
-                # Test that model correctly compute the loss when we mask some positions
-                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-                possible_input_names = {"input_ids", "pixel_values", "input_features"}
-                input_name = possible_input_names.intersection(set(prepared_for_class)).pop()
-                model_input = prepared_for_class.pop(input_name)
-                if "labels" in prepared_for_class:
-                    labels = prepared_for_class["labels"].numpy()
-                    if len(labels.shape) > 1 and labels.shape[1] != 1:
-                        # labels[0] = -100
-                        prepared_for_class["labels"] = tf.convert_to_tensor(labels)
-                        loss = model(model_input, **prepared_for_class)[0]
-                        self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
-                        self.assertTrue(not np.any(np.isnan(loss.numpy())))
-
-                # Test that model correctly compute the loss with a dict
-                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-                loss = model(prepared_for_class)[0]
-                self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
-
-                # Test that model correctly compute the loss with a tuple
-                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-
-                # Get keys that were added with the _prepare_for_class function
-                label_keys = prepared_for_class.keys() - inputs_dict.keys()
-                signature = inspect.signature(model.call).parameters
-                signature_names = list(signature.keys())
-
-                # Create a dictionary holding the location of the tensors in the tuple
-                tuple_index_mapping = {0: input_name}
-                for label_key in label_keys:
-                    label_key_index = signature_names.index(label_key)
-                    tuple_index_mapping[label_key_index] = label_key
-                sorted_tuple_index_mapping = sorted(tuple_index_mapping.items())
-                # Initialize a list with their default values, update the values and convert to a tuple
-                list_input = []
-
-                for name in signature_names:
-                    if name != "kwargs":
-                        list_input.append(signature[name].default)
-
-                for index, value in sorted_tuple_index_mapping:
-                    list_input[index] = prepared_for_class[value]
-
-                tuple_input = tuple(list_input)
-
-                # Send to model
-                loss = model(tuple_input[:-1])[0]
-
-                self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "apple/mobilevit-small"
-        model = TFMobileViTModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_tf
-class TFMobileViTModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_image_classification_head(self):
-        model = TFMobileViTForImageClassification.from_pretrained("apple/mobilevit-xx-small")
-
-        image_processor = MobileViTImageProcessor.from_pretrained("apple/mobilevit-xx-small")
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="tf")
-
-        # forward pass
-        outputs = model(**inputs, training=False)
-
-        # verify the logits
-        expected_shape = tf.TensorShape((1, 1000))
-        self.assertEqual(outputs.logits.shape, expected_shape)
-
-        expected_slice = tf.constant([-1.9364, -1.2327, -0.4653])
-
-        tf.debugging.assert_near(outputs.logits[0, :3], expected_slice, atol=1e-4, rtol=1e-04)
-
-    @slow
-    def test_inference_semantic_segmentation(self):
-        # `from_pt` will be removed
-        model = TFMobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-xx-small")
-
-        image_processor = MobileViTImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-xx-small")
-
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="tf")
-
-        # forward pass
-        outputs = model(inputs.pixel_values, training=False)
-        logits = outputs.logits
-
-        # verify the logits
-        expected_shape = tf.TensorShape((1, 21, 32, 32))
-        self.assertEqual(logits.shape, expected_shape)
-
-        expected_slice = tf.constant(
-            [
-                [[6.9713, 6.9786, 7.2422], [7.2893, 7.2825, 7.4446], [7.6580, 7.8797, 7.9420]],
-                [[-10.6869, -10.3250, -10.3471], [-10.4228, -9.9868, -9.7132], [-11.0405, -11.0221, -10.7318]],
-                [[-3.3089, -2.8539, -2.6740], [-3.2706, -2.5621, -2.5108], [-3.2534, -2.6615, -2.6651]],
-            ]
-        )
-
-        tf.debugging.assert_near(logits[0, :3, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)
diff --git a/tests/models/mpnet/test_modeling_tf_mpnet.py b/tests/models/mpnet/test_modeling_tf_mpnet.py
deleted file mode 100644
index 4a9ea86588..0000000000
--- a/tests/models/mpnet/test_modeling_tf_mpnet.py
+++ /dev/null
@@ -1,275 +0,0 @@
-# Copyright 2020 The HuggingFace Inc. team, Microsoft Corporation.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import MPNetConfig, is_tf_available
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers.models.mpnet.modeling_tf_mpnet import (
-        TFMPNetForMaskedLM,
-        TFMPNetForMultipleChoice,
-        TFMPNetForQuestionAnswering,
-        TFMPNetForSequenceClassification,
-        TFMPNetForTokenClassification,
-        TFMPNetModel,
-    )
-
-
-class TFMPNetModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=False,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=64,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=64,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_labels=3,
-        num_choices=4,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.num_labels = num_labels
-        self.num_choices = num_choices
-        self.scope = scope
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = MPNetConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            initializer_range=self.initializer_range,
-        )
-        return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def create_and_check_mpnet_model(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFMPNetModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask}
-        result = model(inputs)
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_mpnet_for_masked_lm(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFMPNetForMaskedLM(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_mpnet_for_question_answering(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFMPNetForQuestionAnswering(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def create_and_check_mpnet_for_sequence_classification(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFMPNetForSequenceClassification(config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_mpnet_for_multiple_choice(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_choices = self.num_choices
-        model = TFMPNetForMultipleChoice(config)
-        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
-        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-        inputs = {
-            "input_ids": multiple_choice_inputs_ids,
-            "attention_mask": multiple_choice_input_mask,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-    def create_and_check_mpnet_for_token_classification(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFMPNetForTokenClassification(config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class TFMPNetModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFMPNetForMaskedLM,
-            TFMPNetForMultipleChoice,
-            TFMPNetForQuestionAnswering,
-            TFMPNetForSequenceClassification,
-            TFMPNetForTokenClassification,
-            TFMPNetModel,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFMPNetModel,
-            "fill-mask": TFMPNetForMaskedLM,
-            "question-answering": TFMPNetForQuestionAnswering,
-            "text-classification": TFMPNetForSequenceClassification,
-            "token-classification": TFMPNetForTokenClassification,
-            "zero-shot": TFMPNetForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFMPNetModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=MPNetConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_mpnet_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_mpnet_model(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_mpnet_for_masked_lm(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_mpnet_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_mpnet_for_sequence_classification(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_mpnet_for_multiple_choice(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_mpnet_for_token_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in ["microsoft/mpnet-base"]:
-            model = TFMPNetModel.from_pretrained(model_name)
-            self.assertIsNotNone(model)
-
-
-@require_tf
-class TFMPNetModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_masked_lm(self):
-        model = TFMPNetModel.from_pretrained("microsoft/mpnet-base")
-        input_ids = tf.constant([[0, 1, 2, 3, 4, 5]])
-        output = model(input_ids)[0]
-
-        expected_shape = [1, 6, 768]
-        self.assertEqual(output.shape, expected_shape)
-
-        expected_slice = tf.constant(
-            [
-                [
-                    [-0.1067172, 0.08216473, 0.0024543],
-                    [-0.03465879, 0.8354118, -0.03252288],
-                    [-0.06569476, -0.12424111, -0.0494436],
-                ]
-            ]
-        )
-        tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4)
diff --git a/tests/models/mt5/test_modeling_flax_mt5.py b/tests/models/mt5/test_modeling_flax_mt5.py
deleted file mode 100644
index 7b98d740b3..0000000000
--- a/tests/models/mt5/test_modeling_flax_mt5.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# Copyright 2021 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-from transformers import is_flax_available
-from transformers.testing_utils import require_flax, require_sentencepiece, require_tokenizers, require_torch, slow
-
-
-if is_flax_available():
-    import optax
-    from flax.training.common_utils import onehot
-
-    from transformers import AutoTokenizer, FlaxMT5ForConditionalGeneration
-    from transformers.models.t5.modeling_flax_t5 import shift_tokens_right
-
-
-@require_torch
-@require_sentencepiece
-@require_tokenizers
-@require_flax
-class MT5IntegrationTest(unittest.TestCase):
-    @slow
-    def test_small_integration_test(self):
-        """
-        For comparison run:
-        >>> import t5  # pip install t5==0.7.1
-        >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary
-
-        >>> path_to_mtf_small_mt5_checkpoint = '<fill_in>'
-        >>> path_to_mtf_small_mt5_spm_model_path = '<fill_in>'
-        >>> t5_model = t5.models.MtfModel(model_dir=path_to_mtf_small_mt5_checkpoint, batch_size=1, tpu=None)
-        >>> vocab = SentencePieceVocabulary(path_to_mtf_small_mt5_spm_model_path)
-        >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab)
-        """
-
-        model = FlaxMT5ForConditionalGeneration.from_pretrained("google/mt5-small")
-        tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
-
-        input_ids = tokenizer("Hello there", return_tensors="np").input_ids
-        labels = tokenizer("Hi I am", return_tensors="np").input_ids
-
-        decoder_input_ids = shift_tokens_right(labels, model.config.pad_token_id, model.config.decoder_start_token_id)
-
-        logits = model(input_ids, decoder_input_ids=decoder_input_ids).logits
-        loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])).mean()
-
-        mtf_score = -(labels.shape[-1] * loss.item())
-
-        EXPECTED_SCORE = -84.9127
-        self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 1e-4)
diff --git a/tests/models/mt5/test_modeling_tf_mt5.py b/tests/models/mt5/test_modeling_tf_mt5.py
deleted file mode 100644
index f7b77014d4..0000000000
--- a/tests/models/mt5/test_modeling_tf_mt5.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import is_tf_available
-from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
-
-
-@require_tf
-@require_sentencepiece
-@require_tokenizers
-class TFMT5ModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_small_integration_test(self):
-        """
-        For comparison run:
-        >>> import t5  # pip install t5==0.7.1
-        >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary
-
-        >>> path_to_mtf_small_mt5_checkpoint = '<fill_in>'
-        >>> path_to_mtf_small_mt5_spm_model_path = '<fill_in>'
-        >>> t5_model = t5.models.MtfModel(model_dir=path_to_mtf_small_mt5_checkpoint, batch_size=1, tpu=None)
-        >>> vocab = SentencePieceVocabulary(path_to_mtf_small_mt5_spm_model_path, extra_ids=100)
-        >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab)
-        """
-
-        model = TFAutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")
-        tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
-
-        input_ids = tokenizer("Hello there", return_tensors="tf").input_ids
-        labels = tokenizer("Hi I am", return_tensors="tf").input_ids
-
-        loss = model(input_ids, labels=labels).loss
-        mtf_score = -tf.math.reduce_mean(loss).numpy()
-
-        EXPECTED_SCORE = -21.228168
-        self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 2e-4)
diff --git a/tests/models/openai/test_modeling_tf_openai.py b/tests/models/openai/test_modeling_tf_openai.py
deleted file mode 100644
index 44dc5a4b63..0000000000
--- a/tests/models/openai/test_modeling_tf_openai.py
+++ /dev/null
@@ -1,296 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import OpenAIGPTConfig, is_tf_available
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers.models.openai.modeling_tf_openai import (
-        TFOpenAIGPTDoubleHeadsModel,
-        TFOpenAIGPTForSequenceClassification,
-        TFOpenAIGPTLMHeadModel,
-        TFOpenAIGPTModel,
-    )
-
-
-class TFOpenAIGPTModelTester:
-    def __init__(
-        self,
-        parent,
-    ):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.is_training = True
-        self.use_token_type_ids = True
-        self.use_input_mask = True
-        self.use_labels = True
-        self.use_mc_token_ids = True
-        self.vocab_size = 99
-        self.hidden_size = 32
-        self.num_hidden_layers = 2
-        self.num_attention_heads = 4
-        self.intermediate_size = 37
-        self.hidden_act = "gelu"
-        self.hidden_dropout_prob = 0.1
-        self.attention_probs_dropout_prob = 0.1
-        self.max_position_embeddings = 512
-        self.type_vocab_size = 16
-        self.type_sequence_label_size = 2
-        self.initializer_range = 0.02
-        self.num_labels = 3
-        self.num_choices = 4
-        self.scope = None
-        self.pad_token_id = self.vocab_size - 1
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        mc_token_ids = None
-        if self.use_mc_token_ids:
-            mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = OpenAIGPTConfig(
-            vocab_size=self.vocab_size,
-            n_embd=self.hidden_size,
-            n_layer=self.num_hidden_layers,
-            n_head=self.num_attention_heads,
-            # intermediate_size=self.intermediate_size,
-            # hidden_act=self.hidden_act,
-            # hidden_dropout_prob=self.hidden_dropout_prob,
-            # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            n_positions=self.max_position_embeddings,
-            # type_vocab_size=self.type_vocab_size,
-            # initializer_range=self.initializer_range,
-            pad_token_id=self.pad_token_id,
-        )
-
-        head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
-
-        return (
-            config,
-            input_ids,
-            input_mask,
-            head_mask,
-            token_type_ids,
-            mc_token_ids,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        )
-
-    def create_and_check_openai_gpt_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-        model = TFOpenAIGPTModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_openai_gpt_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-        model = TFOpenAIGPTLMHeadModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_openai_gpt_double_head(
-        self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
-    ):
-        model = TFOpenAIGPTDoubleHeadsModel(config=config)
-
-        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
-        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
-
-        inputs = {
-            "input_ids": multiple_choice_inputs_ids,
-            "mc_token_ids": mc_token_ids,
-            "attention_mask": multiple_choice_input_mask,
-            "token_type_ids": multiple_choice_token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(
-            result.logits.shape, (self.batch_size, self.num_choices, self.seq_length, self.vocab_size)
-        )
-        self.parent.assertEqual(result.mc_logits.shape, (self.batch_size, self.num_choices))
-
-    def create_and_check_openai_gpt_for_sequence_classification(
-        self, config, input_ids, input_mask, head_mask, token_type_ids, *args
-    ):
-        config.num_labels = self.num_labels
-        sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-            "labels": sequence_labels,
-        }
-        model = TFOpenAIGPTForSequenceClassification(config)
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-
-        (
-            config,
-            input_ids,
-            input_mask,
-            head_mask,
-            token_type_ids,
-            mc_token_ids,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class TFOpenAIGPTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel, TFOpenAIGPTForSequenceClassification)
-        if is_tf_available()
-        else ()
-    )
-    all_generative_model_classes = (
-        (TFOpenAIGPTLMHeadModel,) if is_tf_available() else ()
-    )  # TODO (PVP): Add Double HeadsModel when generate() function is changed accordingly
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFOpenAIGPTModel,
-            "text-classification": TFOpenAIGPTForSequenceClassification,
-            "text-generation": TFOpenAIGPTLMHeadModel,
-            "zero-shot": TFOpenAIGPTForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_head_masking = False
-    test_onnx = False
-
-    # TODO: Fix the failed tests
-    def is_pipeline_test_to_skip(
-        self,
-        pipeline_test_case_name,
-        config_class,
-        model_architecture,
-        tokenizer_name,
-        image_processor_name,
-        feature_extractor_name,
-        processor_name,
-    ):
-        if pipeline_test_case_name == "ZeroShotClassificationPipelineTests":
-            # Get `tokenizer does not have a padding token` error for both fast/slow tokenizers.
-            # `OpenAIGPTConfig` was never used in pipeline tests, either because of a missing checkpoint or because a
-            # tiny config could not be created.
-            return True
-
-        return False
-
-    def setUp(self):
-        self.model_tester = TFOpenAIGPTModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_openai_gpt_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_openai_gpt_model(*config_and_inputs)
-
-    def test_openai_gpt_lm_head(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_openai_gpt_lm_head(*config_and_inputs)
-
-    def test_openai_gpt_double_head(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_openai_gpt_double_head(*config_and_inputs)
-
-    def test_openai_gpt_sequence_classification_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_openai_gpt_for_sequence_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "openai-community/openai-gpt"
-        model = TFOpenAIGPTModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-@require_tf
-class TFOPENAIGPTModelLanguageGenerationTest(unittest.TestCase):
-    @slow
-    def test_lm_generate_openai_gpt(self):
-        model = TFOpenAIGPTLMHeadModel.from_pretrained("openai-community/openai-gpt")
-        input_ids = tf.convert_to_tensor([[481, 4735, 544]], dtype=tf.int32)  # the president is
-        expected_output_ids = [
-            481,
-            4735,
-            544,
-            246,
-            963,
-            870,
-            762,
-            239,
-            244,
-            40477,
-            244,
-            249,
-            719,
-            881,
-            487,
-            544,
-            240,
-            244,
-            603,
-            481,
-        ]  # the president is a very good man. " \n " i\'m sure he is, " said the
-
-        output_ids = model.generate(input_ids, do_sample=False)
-        self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids)
diff --git a/tests/models/opt/test_modeling_flax_opt.py b/tests/models/opt/test_modeling_flax_opt.py
deleted file mode 100644
index d922775628..0000000000
--- a/tests/models/opt/test_modeling_flax_opt.py
+++ /dev/null
@@ -1,404 +0,0 @@
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-
-import numpy as np
-import timeout_decorator  # noqa
-
-from transformers import OPTConfig, is_flax_available
-from transformers.testing_utils import require_flax, require_sentencepiece, slow
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor
-
-
-if is_flax_available():
-    import os
-
-    # The slow tests are often failing with OOM error on GPU
-    # This makes JAX allocate exactly what is needed on demand, and deallocate memory that is no longer needed
-    # but will be slower as stated here https://jax.readthedocs.io/en/latest/gpu_memory_allocation.html
-    os.environ["XLA_PYTHON_CLIENT_ALLOCATOR"] = "platform"
-
-    import jax
-    import jax.numpy as jnp
-
-    from transformers import FlaxOPTForCausalLM, FlaxOPTModel, GPT2Tokenizer
-
-
-def prepare_opt_inputs_dict(config, input_ids, attention_mask=None, head_mask=None):
-    if attention_mask is None:
-        attention_mask = np.where(input_ids != config.pad_token_id, 1, 0)
-    return {
-        "input_ids": input_ids,
-        "attention_mask": attention_mask,
-    }
-
-
-@require_flax
-class FlaxOPTModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_labels=False,
-        vocab_size=99,
-        hidden_size=16,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=4,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=20,
-        eos_token_id=2,
-        pad_token_id=1,
-        bos_token_id=0,
-        embed_dim=16,
-        word_embed_proj_dim=16,
-        initializer_range=0.02,
-        attn_implementation="eager",
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.embed_dim = embed_dim
-        self.word_embed_proj_dim = word_embed_proj_dim
-        self.initializer_range = initializer_range
-        self.is_encoder_decoder = False
-        self.attn_implementation = attn_implementation
-
-    def prepare_config_and_inputs(self):
-        input_ids = np.clip(ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size), 3, self.vocab_size)
-        input_ids = np.concatenate((input_ids, 2 * np.ones((self.batch_size, 1), dtype=np.int64)), -1)
-
-        config = OPTConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            ffn_dim=self.intermediate_size,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            eos_token_id=self.eos_token_id,
-            bos_token_id=self.bos_token_id,
-            pad_token_id=self.pad_token_id,
-            embed_dim=self.embed_dim,
-            is_encoder_decoder=False,
-            word_embed_proj_dim=self.word_embed_proj_dim,
-            initializer_range=self.initializer_range,
-            use_cache=False,
-            attn_implementation=self.attn_implementation,
-        )
-        inputs_dict = prepare_opt_inputs_dict(config, input_ids)
-        return config, inputs_dict
-
-    def prepare_config_and_inputs_for_common(self):
-        config, inputs_dict = self.prepare_config_and_inputs()
-        return config, inputs_dict
-
-    def check_use_cache_forward(self, model_class_name, config, inputs_dict):
-        max_length = 20
-        model = model_class_name(config)
-
-        input_ids = inputs_dict["input_ids"]
-        attention_mask = inputs_dict["attention_mask"]
-
-        past_key_values = model.init_cache(input_ids.shape[0], max_length)
-        attention_mask = jnp.ones((input_ids.shape[0], max_length), dtype="i4")
-
-        position_ids = jnp.broadcast_to(
-            jnp.arange(input_ids.shape[-1] - 1)[None, :],
-            (input_ids.shape[0], input_ids.shape[-1] - 1),
-        )
-        outputs_cache = model(
-            input_ids[:, :-1],
-            attention_mask=attention_mask,
-            past_key_values=past_key_values,
-            position_ids=position_ids,
-        )
-
-        position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model(
-            input_ids[:, -1:],
-            attention_mask=attention_mask,
-            past_key_values=outputs_cache.past_key_values,
-            position_ids=position_ids,
-        )
-
-        outputs = model(input_ids)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-    def check_use_cache_forward_with_attn_mask(self, model_class_name, config, inputs_dict):
-        max_length = 20
-        model = model_class_name(config)
-
-        input_ids, attention_mask = (
-            inputs_dict["input_ids"],
-            inputs_dict["attention_mask"],
-        )
-
-        attention_mask_cache = jnp.concatenate(
-            [
-                attention_mask,
-                jnp.zeros((attention_mask.shape[0], max_length - attention_mask.shape[1])),
-            ],
-            axis=-1,
-        )
-
-        past_key_values = model.init_cache(input_ids.shape[0], max_length)
-        position_ids = jnp.broadcast_to(
-            jnp.arange(input_ids.shape[-1] - 1)[None, :],
-            (input_ids.shape[0], input_ids.shape[-1] - 1),
-        )
-
-        outputs_cache = model(
-            input_ids[:, :-1],
-            attention_mask=attention_mask_cache,
-            past_key_values=past_key_values,
-            position_ids=position_ids,
-        )
-        position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model(
-            input_ids[:, -1:],
-            past_key_values=outputs_cache.past_key_values,
-            attention_mask=attention_mask_cache,
-            position_ids=position_ids,
-        )
-
-        outputs = model(input_ids, attention_mask=attention_mask)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-
-@require_flax
-class FlaxOPTModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (FlaxOPTModel, FlaxOPTForCausalLM) if is_flax_available() else ()
-
-    def setUp(self):
-        self.model_tester = FlaxOPTModelTester(self)
-
-    def test_use_cache_forward(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
-        for model_class in self.all_model_classes:
-            self.model_tester.check_use_cache_forward(model_class, config, inputs_dict)
-
-    def test_use_cache_forward_with_attn_mask(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
-        for model_class in self.all_model_classes:
-            self.model_tester.check_use_cache_forward_with_attn_mask(model_class, config, inputs_dict)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("facebook/opt-125m")
-            input_ids = np.ones((1, 1)) * model.config.eos_token_id
-            outputs = model(input_ids)
-            self.assertIsNotNone(outputs)
-
-
-@require_sentencepiece
-@require_flax
-class FlaxOPTModelIntegrationTests(unittest.TestCase):
-    @slow
-    def test_inference_no_head(self):
-        model = FlaxOPTModel.from_pretrained("facebook/opt-350m")
-        input_ids = jnp.array([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        output = model(input_ids=input_ids).last_hidden_state
-        expected_shape = (1, 11, 512)
-        self.assertEqual(output.shape, expected_shape)
-        expected_slice = jnp.array(
-            [[-0.2867, -1.9256, -0.3062], [-1.2711, -0.1337, -0.1897], [0.4109, 0.1187, -1.3142]]
-        )
-        self.assertTrue(jnp.allclose(output[:, :3, :3], expected_slice, atol=4e-2))
-
-
-@require_flax
-@slow
-class FlaxOPTEmbeddingsTest(unittest.TestCase):
-    def setUp(self):
-        super().setUp()
-        self.path_model = "facebook/opt-350m"
-
-    def test_logits(self):
-        model = FlaxOPTForCausalLM.from_pretrained(self.path_model)
-        tokenizer = GPT2Tokenizer.from_pretrained(self.path_model)
-
-        prompts = [
-            "Today is a beautiful day and I want to",
-            "In the city of",
-            "Paris is the capital of France and",
-            "Computers and mobile phones have taken",
-        ]
-        # verify that prompt without BOS token is identical to Metaseq -> add_special_tokens=False
-        inputs = tokenizer(prompts, return_tensors="jax", padding=True, add_special_tokens=False)
-        logits = model(inputs.input_ids, attention_mask=inputs.attention_mask)[0].mean(axis=-1)
-        logits_meta = jnp.array(
-            [
-                [1.3851, -13.8923, -10.5229, -10.7533, -0.2309, -10.2384, -0.5365, -9.0947, -5.1670],
-                [-4.7073, -10.6276, -3.9415, -21.5242, -0.2822, -0.2822, -0.2822, -0.2822, -0.2822],
-                [0.6247, -3.4229, -8.9179, -1.4297, -14.1650, 1.4146, -9.0218, -0.2703, -0.2703],
-                [6.4783, -1.9913, -10.7926, -2.3336, 1.5092, -0.9974, -6.8213, 1.3477, 1.3477],
-            ]
-        )
-        self.assertTrue(jnp.allclose(logits, logits_meta, atol=4e-2))
-
-        model = jax.jit(model)
-        logits = model(inputs.input_ids, attention_mask=inputs.attention_mask)[0].mean(axis=-1)
-        self.assertTrue(jnp.allclose(logits, logits_meta, atol=4e-2))
-
-
-@require_flax
-@slow
-class FlaxOPTGenerationTest(unittest.TestCase):
-    @property
-    def prompts(self):
-        return [
-            "Today is a beautiful day and I want",
-            "In the city of",
-            "Paris is the capital of France and",
-            "Computers and mobile phones have taken",
-        ]
-
-    def test_generation_pre_attn_layer_norm(self):
-        model_id = "facebook/opt-125m"
-
-        EXPECTED_OUTPUTS = [
-            "Today is a beautiful day and I want to",
-            "In the city of New York, the city",
-            "Paris is the capital of France and the capital",
-            "Computers and mobile phones have taken over the",
-        ]
-
-        predicted_outputs = []
-
-        model = FlaxOPTForCausalLM.from_pretrained(model_id)
-        tokenizer = GPT2Tokenizer.from_pretrained(model_id)
-
-        for prompt in self.prompts:
-            input_ids = tokenizer(prompt, return_tensors="jax").input_ids
-
-            generated_ids = model.generate(input_ids, max_length=10)
-            generated_ids = generated_ids[0]
-
-            generated_string = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
-            predicted_outputs += generated_string
-
-        self.assertListEqual(predicted_outputs, EXPECTED_OUTPUTS)
-
-    def test_generation_post_attn_layer_norm(self):
-        model_id = "facebook/opt-350m"
-
-        EXPECTED_OUTPUTS = [
-            "Today is a beautiful day and I want to",
-            "In the city of San Francisco, the city",
-            "Paris is the capital of France and the capital",
-            "Computers and mobile phones have taken over the",
-        ]
-
-        predicted_outputs = []
-        model = FlaxOPTForCausalLM.from_pretrained(model_id)
-        tokenizer = GPT2Tokenizer.from_pretrained(model_id)
-
-        for prompt in self.prompts:
-            input_ids = tokenizer(prompt, return_tensors="jax").input_ids
-
-            generated_ids = model.generate(input_ids, max_length=10)
-            generated_ids = generated_ids[0]
-
-            generated_string = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
-            predicted_outputs += generated_string
-
-        self.assertListEqual(predicted_outputs, EXPECTED_OUTPUTS)
-
-    def test_jitted_batch_generation(self):
-        model_id = "facebook/opt-125m"
-        EXPECTED_OUTPUTS = [
-            "Today is a beautiful day and I want to thank",
-            "In the city of Rome Canaver Canaver Canaver Canaver",
-        ]
-        model = FlaxOPTForCausalLM.from_pretrained(model_id)
-        tokenizer = GPT2Tokenizer.from_pretrained(model_id)
-        inputs = tokenizer(
-            [
-                "Today is a beautiful day and I want to",
-                "In the city of",
-            ],
-            return_tensors="jax",
-            padding=True,
-        )
-
-        jit_generate = jax.jit(model.generate)
-
-        output_sequences = jit_generate(inputs["input_ids"], attention_mask=inputs["attention_mask"]).sequences
-
-        output_string = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
-
-        self.assertIsNotNone(output_string, EXPECTED_OUTPUTS)
-
-    def test_batch_generation(self):
-        model_id = "facebook/opt-350m"
-
-        tokenizer = GPT2Tokenizer.from_pretrained(model_id)
-        model = FlaxOPTForCausalLM.from_pretrained(model_id)
-
-        tokenizer.padding_side = "left"
-
-        # use different length sentences to test batching
-        sentences = [
-            "Hello, my dog is a little",
-            "Today, I",
-        ]
-
-        inputs = tokenizer(sentences, return_tensors="jax", padding=True)
-        input_ids = inputs["input_ids"]
-
-        outputs = model.generate(input_ids=input_ids, attention_mask=inputs["attention_mask"], trace=False)
-
-        inputs_non_padded = tokenizer(sentences[0], return_tensors="jax").input_ids
-        output_non_padded = model.generate(input_ids=inputs_non_padded)
-
-        num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].sum()
-        inputs_padded = tokenizer(sentences[1], return_tensors="jax").input_ids
-        output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)
-
-        batch_out_sentence = tokenizer.batch_decode(outputs[0], skip_special_tokens=True)
-        non_padded_sentence = tokenizer.decode(output_non_padded[0][0], skip_special_tokens=True)
-        padded_sentence = tokenizer.decode(output_padded[0][0], skip_special_tokens=True)
-
-        expected_output_sentence = [
-            "Hello, my dog is a little bit of a dork.\nI'm a little bit",
-            "Today, I was in the middle of a conversation with a friend about the",
-        ]
-        self.assertListEqual(expected_output_sentence, batch_out_sentence)
-        self.assertListEqual(batch_out_sentence, [non_padded_sentence, padded_sentence])
diff --git a/tests/models/opt/test_modeling_tf_opt.py b/tests/models/opt/test_modeling_tf_opt.py
deleted file mode 100644
index d36fa885c9..0000000000
--- a/tests/models/opt/test_modeling_tf_opt.py
+++ /dev/null
@@ -1,405 +0,0 @@
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import unittest
-
-import numpy as np
-
-from transformers import OPTConfig, is_tf_available
-from transformers.testing_utils import require_sentencepiece, require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import GPT2Tokenizer, TFOPTForCausalLM, TFOPTModel
-
-
-def prepare_opt_inputs_dict(config, input_ids, attention_mask=None, head_mask=None):
-    if attention_mask is None:
-        attention_mask = tf.cast(tf.math.not_equal(input_ids, config.pad_token_id), tf.int8)
-    return {"input_ids": input_ids, "attention_mask": attention_mask}
-
-
-@require_tf
-class TFOPTModelTester:
-    config_cls = OPTConfig
-    config_updates = {}
-    hidden_act = "gelu"
-
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_labels=False,
-        vocab_size=99,
-        hidden_size=16,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=4,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=20,
-        eos_token_id=2,
-        pad_token_id=1,
-        bos_token_id=0,
-        embed_dim=16,
-        word_embed_proj_dim=16,
-        attn_implementation="eager",
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.embed_dim = embed_dim
-        self.word_embed_proj_dim = word_embed_proj_dim
-        self.is_encoder_decoder = False
-        self.attn_implementation = attn_implementation
-
-    def prepare_config_and_inputs_for_common(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size)
-        eos_tensor = tf.expand_dims(tf.constant([self.eos_token_id] * self.batch_size), 1)
-        input_ids = tf.concat([input_ids, eos_tensor], axis=1)
-
-        config = self.config_cls(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            ffn_dim=self.intermediate_size,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            eos_token_id=self.eos_token_id,
-            bos_token_id=self.bos_token_id,
-            pad_token_id=self.pad_token_id,
-            embed_dim=self.embed_dim,
-            word_embed_proj_dim=self.word_embed_proj_dim,
-            is_encoder_decoder=False,
-            attn_implementation=self.attn_implementation,
-            **self.config_updates,
-        )
-        inputs_dict = prepare_opt_inputs_dict(config, input_ids)
-        return config, inputs_dict
-
-    def check_decoder_model_past_large_inputs(self, config, inputs_dict):
-        model = TFOPTModel(config=config)
-        input_ids = inputs_dict["input_ids"]
-
-        input_ids = input_ids[:1, :]
-        attention_mask = inputs_dict["attention_mask"][:1, :]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
-
-        output, past_key_values = outputs.to_tuple()
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = tf.cast(ids_tensor((self.batch_size, 3), 2), tf.int8)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([attention_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)[0]
-        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[0]
-
-        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-
-@require_tf
-class TFOPTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (TFOPTModel, TFOPTForCausalLM) if is_tf_available() else ()
-    all_generative_model_classes = (TFOPTForCausalLM,) if is_tf_available() else ()
-    pipeline_model_mapping = (
-        {"feature-extraction": TFOPTModel, "text-generation": TFOPTForCausalLM} if is_tf_available() else {}
-    )
-    is_encoder_decoder = False
-    test_pruning = False
-    test_onnx = False
-    onnx_min_opset = 10
-
-    def setUp(self):
-        self.model_tester = TFOPTModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=OPTConfig)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_decoder_model_past_large_inputs(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
-        self.model_tester.check_decoder_model_past_large_inputs(*config_and_inputs)
-
-    def test_resize_token_embeddings(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        def _get_word_embedding_weight(model, embedding_layer):
-            if hasattr(embedding_layer, "weight"):
-                return embedding_layer.weight
-            else:
-                # Here we build the word embeddings weights if not exists.
-                # And then we retry to get the attribute once built.
-                model.build_in_name_scope()
-                if hasattr(embedding_layer, "weight"):
-                    return embedding_layer.weight
-                else:
-                    return None
-
-        for model_class in self.all_model_classes:
-            for size in [config.vocab_size - 10, config.vocab_size + 10]:
-                # build the embeddings
-                model = model_class(config=config)
-                old_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings())
-                old_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings())
-
-                # reshape the embeddings
-                model.resize_token_embeddings(size)
-                new_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings())
-                new_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings())
-
-                # check that the resized embeddings size matches the desired size.
-                assert_size = size if size is not None else config.vocab_size
-
-                self.assertEqual(new_input_embeddings.shape[0], assert_size)
-
-                # check that weights remain the same after resizing
-                models_equal = True
-                for p1, p2 in zip(old_input_embeddings.value(), new_input_embeddings.value()):
-                    if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
-                        models_equal = False
-                self.assertTrue(models_equal)
-
-                if old_output_embeddings is not None and new_output_embeddings is not None:
-                    self.assertEqual(new_output_embeddings.shape[0], assert_size)
-
-                    models_equal = True
-                    for p1, p2 in zip(old_output_embeddings.value(), new_output_embeddings.value()):
-                        if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
-                            models_equal = False
-                    self.assertTrue(models_equal)
-
-
-def _long_tensor(tok_lst):
-    return tf.constant(tok_lst, dtype=tf.int32)
-
-
-@require_tf
-class TFOPTHeadTests(unittest.TestCase):
-    vocab_size = 99
-
-    def _get_config_and_data(self):
-        eos_column_vector = tf.ones((4, 1), dtype=tf.int32) * 2
-        input_ids = tf.concat([ids_tensor((4, 6), self.vocab_size - 3) + 3, eos_column_vector], axis=1)
-        batch_size = input_ids.shape[0]
-        config = OPTConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=24,
-            num_hidden_layers=2,
-            num_attention_heads=2,
-            ffn_dim=32,
-            max_position_embeddings=48,
-            eos_token_id=2,
-            pad_token_id=1,
-            bos_token_id=0,
-        )
-        return config, input_ids, batch_size
-
-
-@require_sentencepiece
-@require_tf
-class OPTModelIntegrationTests(unittest.TestCase):
-    @slow
-    def test_inference_no_head(self):
-        model = TFOPTModel.from_pretrained("facebook/opt-350m")
-        input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        attention_mask = tf.not_equal(input_ids, model.config.pad_token_id)
-        with tf.GradientTape():
-            output = model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
-        expected_shape = (1, 11, 512)
-        self.assertEqual(output.shape, expected_shape)
-        expected_slice = tf.constant(
-            [[-0.2873, -1.9218, -0.3033], [-1.2710, -0.1338, -0.1902], [0.4095, 0.1214, -1.3121]]
-        )
-        self.assertTrue(np.allclose(output[:, :3, :3], expected_slice, atol=4e-3))
-
-        xla_generate = tf.function(model, jit_compile=True)
-        output = xla_generate(input_ids, attention_mask)[0]
-        self.assertTrue(np.allclose(output[:, :3, :3], expected_slice, atol=4e-2))
-
-
-@require_tf
-@slow
-class TFOPTEmbeddingsTest(unittest.TestCase):
-    def setUp(self):
-        super().setUp()
-        self.path_model = "facebook/opt-350m"
-
-    def test_logits(self):
-        model = TFOPTForCausalLM.from_pretrained(self.path_model)
-        tokenizer = GPT2Tokenizer.from_pretrained(self.path_model)
-
-        prompts = [
-            "Today is a beautiful day and I want to",
-            "In the city of",
-            "Paris is the capital of France and",
-            "Computers and mobile phones have taken",
-        ]
-        # verify that prompt without BOS token is identical to Metaseq -> add_special_tokens=False
-        inputs = tokenizer(prompts, return_tensors="tf", padding=True, add_special_tokens=False)
-        logits = tf.math.reduce_mean(model(inputs.input_ids, attention_mask=inputs.attention_mask)[0], axis=-1)
-        logits_meta = tf.constant(
-            [
-                [1.3851, -13.8923, -10.5229, -10.7533, -0.2309, -10.2384, -0.5365, -9.0947, -5.1670],
-                [-4.7073, -10.6276, -3.9415, -21.5242, -0.2822, -0.2822, -0.2822, -0.2822, -0.2822],
-                [0.6247, -3.4229, -8.9179, -1.4297, -14.1650, 1.4146, -9.0218, -0.2703, -0.2703],
-                [6.4783, -1.9913, -10.7926, -2.3336, 1.5092, -0.9974, -6.8213, 1.3477, 1.3477],
-            ]
-        )
-        self.assertTrue(np.allclose(logits, logits_meta, atol=1e-4))
-
-        xla_generate = tf.function(model, jit_compile=True)
-        logits = tf.math.reduce_mean(xla_generate(inputs.input_ids, attention_mask=inputs.attention_mask)[0], axis=-1)
-        self.assertTrue(np.allclose(logits, logits_meta, atol=1e-4))
-
-
-@require_tf
-@slow
-class TFOPTGenerationTest(unittest.TestCase):
-    @property
-    def prompts(self):
-        return [
-            "Today is a beautiful day and I want",
-            "In the city of",
-            "Paris is the capital of France and",
-            "Computers and mobile phones have taken",
-        ]
-
-    def test_generation_pre_attn_layer_norm(self):
-        model_id = "facebook/opt-125m"
-
-        EXPECTED_OUTPUTS = [
-            "Today is a beautiful day and I want to",
-            "In the city of New York, the city",
-            "Paris is the capital of France and the capital",
-            "Computers and mobile phones have taken over the",
-        ]
-
-        predicted_outputs = []
-        tokenizer = GPT2Tokenizer.from_pretrained(model_id)
-        model = TFOPTForCausalLM.from_pretrained(model_id)
-
-        for prompt in self.prompts:
-            input_ids = tokenizer(prompt, return_tensors="tf").input_ids
-
-            generated_ids = model.generate(input_ids, max_length=10)
-
-            generated_string = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
-            predicted_outputs += generated_string
-
-        self.assertListEqual(predicted_outputs, EXPECTED_OUTPUTS)
-
-    def test_batch_generation(self):
-        model_id = "facebook/opt-350m"
-
-        tokenizer = GPT2Tokenizer.from_pretrained(model_id)
-        model = TFOPTForCausalLM.from_pretrained(model_id)
-
-        tokenizer.padding_side = "left"
-
-        # use different length sentences to test batching
-        sentences = [
-            "Hello, my dog is a little",
-            "Today, I",
-        ]
-
-        inputs = tokenizer(sentences, return_tensors="tf", padding=True)
-        input_ids = inputs["input_ids"]
-
-        outputs = model.generate(input_ids=input_ids, attention_mask=inputs["attention_mask"])
-
-        inputs_non_padded = tokenizer(sentences[0], return_tensors="tf").input_ids
-        output_non_padded = model.generate(input_ids=inputs_non_padded)
-
-        num_paddings = inputs_non_padded.shape[-1] - tf.math.reduce_sum(
-            tf.cast(inputs["attention_mask"][-1], tf.int64)
-        )
-        inputs_padded = tokenizer(sentences[1], return_tensors="tf").input_ids
-        output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)
-
-        batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-        non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
-        padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)
-
-        expected_output_sentence = [
-            "Hello, my dog is a little bit of a dork.\nI'm a little bit",
-            "Today, I was in the middle of a conversation with a friend about the",
-        ]
-        self.assertListEqual(expected_output_sentence, batch_out_sentence)
-        self.assertListEqual(batch_out_sentence, [non_padded_sentence, padded_sentence])
-
-    def test_generation_post_attn_layer_norm(self):
-        model_id = "facebook/opt-350m"
-
-        EXPECTED_OUTPUTS = [
-            "Today is a beautiful day and I want to",
-            "In the city of San Francisco, the city",
-            "Paris is the capital of France and the capital",
-            "Computers and mobile phones have taken over the",
-        ]
-
-        predicted_outputs = []
-        tokenizer = GPT2Tokenizer.from_pretrained(model_id)
-        model = TFOPTForCausalLM.from_pretrained(model_id)
-
-        for prompt in self.prompts:
-            input_ids = tokenizer(prompt, return_tensors="tf").input_ids
-
-            generated_ids = model.generate(input_ids, max_length=10)
-
-            generated_string = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
-            predicted_outputs += generated_string
-
-        self.assertListEqual(predicted_outputs, EXPECTED_OUTPUTS)
diff --git a/tests/models/pegasus/test_modeling_flax_pegasus.py b/tests/models/pegasus/test_modeling_flax_pegasus.py
deleted file mode 100644
index 9201bdf5d3..0000000000
--- a/tests/models/pegasus/test_modeling_flax_pegasus.py
+++ /dev/null
@@ -1,336 +0,0 @@
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import PegasusConfig, PegasusTokenizer, is_flax_available
-from transformers.testing_utils import require_flax, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor
-
-
-if is_flax_available():
-    import os
-
-    # The slow tests are often failing with OOM error on GPU
-    # This makes JAX allocate exactly what is needed on demand, and deallocate memory that is no longer needed
-    # but will be slower as stated here https://jax.readthedocs.io/en/latest/gpu_memory_allocation.html
-    os.environ["XLA_PYTHON_CLIENT_ALLOCATOR"] = "platform"
-    import jax
-    import jax.numpy as jnp
-    import numpy as np
-
-    from transformers import FlaxPegasusForConditionalGeneration, FlaxPegasusModel
-
-
-@require_flax
-class FlaxPegasusModelTester:
-    config_cls = PegasusConfig
-    config_updates = {}
-    hidden_act = "gelu"
-
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_labels=False,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=20,
-        eos_token_id=2,
-        pad_token_id=1,
-        bos_token_id=0,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-
-    def prepare_config_and_inputs_for_common(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size).clip(3, self.vocab_size)
-        eos_tensor = np.expand_dims(np.array([self.eos_token_id] * self.batch_size), 1)
-        input_ids = np.concatenate([input_ids, eos_tensor], axis=1)
-
-        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        config = self.config_cls(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            encoder_layers=self.num_hidden_layers,
-            decoder_layers=self.num_hidden_layers,
-            encoder_attention_heads=self.num_attention_heads,
-            decoder_attention_heads=self.num_attention_heads,
-            encoder_ffn_dim=self.intermediate_size,
-            decoder_ffn_dim=self.intermediate_size,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            eos_token_ids=[2],
-            bos_token_id=self.bos_token_id,
-            pad_token_id=self.pad_token_id,
-            decoder_start_token_id=self.pad_token_id,
-            **self.config_updates,
-        )
-        inputs_dict = prepare_pegasus_inputs_dict(config, input_ids, decoder_input_ids)
-        return config, inputs_dict
-
-    def check_use_cache_forward(self, model_class_name, config, inputs_dict):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        encoder_outputs = model.encode(inputs_dict["input_ids"])
-
-        decoder_input_ids, decoder_attention_mask = (
-            inputs_dict["decoder_input_ids"],
-            inputs_dict["decoder_attention_mask"],
-        )
-
-        past_key_values = model.init_cache(decoder_input_ids.shape[0], max_decoder_length, encoder_outputs)
-        decoder_attention_mask = jnp.ones((decoder_input_ids.shape[0], max_decoder_length), dtype="i4")
-
-        decoder_position_ids = jnp.broadcast_to(
-            jnp.arange(decoder_input_ids.shape[-1] - 1)[None, :],
-            (decoder_input_ids.shape[0], decoder_input_ids.shape[-1] - 1),
-        )
-        outputs_cache = model.decode(
-            decoder_input_ids[:, :-1],
-            encoder_outputs,
-            decoder_attention_mask=decoder_attention_mask,
-            past_key_values=past_key_values,
-            decoder_position_ids=decoder_position_ids,
-        )
-
-        decoder_position_ids = jnp.array(decoder_input_ids.shape[0] * [[decoder_input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model.decode(
-            decoder_input_ids[:, -1:],
-            encoder_outputs,
-            decoder_attention_mask=decoder_attention_mask,
-            past_key_values=outputs_cache.past_key_values,
-            decoder_position_ids=decoder_position_ids,
-        )
-
-        outputs = model.decode(decoder_input_ids, encoder_outputs)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-    def check_use_cache_forward_with_attn_mask(self, model_class_name, config, inputs_dict):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        encoder_outputs = model.encode(inputs_dict["input_ids"])
-
-        decoder_input_ids, decoder_attention_mask = (
-            inputs_dict["decoder_input_ids"],
-            inputs_dict["decoder_attention_mask"],
-        )
-
-        decoder_attention_mask_cache = jnp.concatenate(
-            [
-                decoder_attention_mask,
-                jnp.zeros((decoder_attention_mask.shape[0], max_decoder_length - decoder_attention_mask.shape[1])),
-            ],
-            axis=-1,
-        )
-
-        past_key_values = model.init_cache(decoder_input_ids.shape[0], max_decoder_length, encoder_outputs)
-        decoder_position_ids = jnp.broadcast_to(
-            jnp.arange(decoder_input_ids.shape[-1] - 1)[None, :],
-            (decoder_input_ids.shape[0], decoder_input_ids.shape[-1] - 1),
-        )
-
-        outputs_cache = model.decode(
-            decoder_input_ids[:, :-1],
-            encoder_outputs,
-            decoder_attention_mask=decoder_attention_mask_cache,
-            past_key_values=past_key_values,
-            decoder_position_ids=decoder_position_ids,
-        )
-        decoder_position_ids = jnp.array(decoder_input_ids.shape[0] * [[decoder_input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model.decode(
-            decoder_input_ids[:, -1:],
-            encoder_outputs,
-            past_key_values=outputs_cache.past_key_values,
-            decoder_attention_mask=decoder_attention_mask_cache,
-            decoder_position_ids=decoder_position_ids,
-        )
-
-        outputs = model.decode(decoder_input_ids, encoder_outputs, decoder_attention_mask=decoder_attention_mask)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-
-def prepare_pegasus_inputs_dict(
-    config,
-    input_ids,
-    decoder_input_ids,
-    attention_mask=None,
-    decoder_attention_mask=None,
-):
-    if attention_mask is None:
-        attention_mask = np.not_equal(input_ids, config.pad_token_id).astype(np.int8)
-    if decoder_attention_mask is None:
-        decoder_attention_mask = np.concatenate(
-            [
-                np.ones(decoder_input_ids[:, :1].shape, dtype=np.int8),
-                np.not_equal(decoder_input_ids[:, 1:], config.pad_token_id).astype(np.int8),
-            ],
-            axis=-1,
-        )
-    return {
-        "input_ids": input_ids,
-        "decoder_input_ids": decoder_input_ids,
-        "attention_mask": attention_mask,
-        "decoder_attention_mask": decoder_attention_mask,
-    }
-
-
-@require_flax
-class FlaxPegasusModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            FlaxPegasusForConditionalGeneration,
-            FlaxPegasusModel,
-        )
-        if is_flax_available()
-        else ()
-    )
-    is_encoder_decoder = True
-    test_pruning = False
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = FlaxPegasusModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=PegasusConfig)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_use_cache_forward(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            self.model_tester.check_use_cache_forward(model_class, config, inputs_dict)
-
-    def test_use_cache_forward_with_attn_mask(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            self.model_tester.check_use_cache_forward_with_attn_mask(model_class, config, inputs_dict)
-
-    def test_encode(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                model = model_class(config)
-
-                @jax.jit
-                def encode_jitted(input_ids, attention_mask=None, **kwargs):
-                    return model.encode(input_ids=input_ids, attention_mask=attention_mask)
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = encode_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = encode_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    def test_decode(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                model = model_class(config)
-                encoder_outputs = model.encode(inputs_dict["input_ids"], inputs_dict["attention_mask"])
-
-                prepared_inputs_dict = {
-                    "decoder_input_ids": inputs_dict["decoder_input_ids"],
-                    "decoder_attention_mask": inputs_dict["decoder_attention_mask"],
-                    "encoder_outputs": encoder_outputs,
-                }
-
-                @jax.jit
-                def decode_jitted(decoder_input_ids, decoder_attention_mask, encoder_outputs):
-                    return model.decode(
-                        decoder_input_ids=decoder_input_ids,
-                        decoder_attention_mask=decoder_attention_mask,
-                        encoder_outputs=encoder_outputs,
-                    )
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = decode_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = decode_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("google/pegasus-large", from_pt=True)
-            input_ids = np.ones((1, 1))
-            outputs = model(input_ids)
-            self.assertIsNotNone(outputs)
-
-    @slow
-    def test_pegasus_xsum_summary(self):
-        model = FlaxPegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
-        tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
-
-        src_text = [
-            """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.""",
-            """ The London trio are up for best UK act and best album, as well as getting two nominations in the best song category."We got told like this morning 'Oh I think you're nominated'", said Dappy."And I was like 'Oh yeah, which one?' And now we've got nominated for four awards. I mean, wow!"Bandmate Fazer added: "We thought it's best of us to come down and mingle with everyone and say hello to the cameras. And now we find we've got four nominations."The band have two shots at the best song prize, getting the nod for their Tynchy Stryder collaboration Number One, and single Strong Again.Their album Uncle B will also go up against records by the likes of Beyonce and Kanye West.N-Dubz picked up the best newcomer Mobo in 2007, but female member Tulisa said they wouldn't be too disappointed if they didn't win this time around."At the end of the day we're grateful to be where we are in our careers."If it don't happen then it don't happen - live to fight another day and keep on making albums and hits for the fans."Dappy also revealed they could be performing live several times on the night.The group will be doing Number One and also a possible rendition of the War Child single, I Got Soul.The charity song is a  re-working of The Killers' All These Things That I've Done and is set to feature artists like Chipmunk, Ironik and Pixie Lott.This year's Mobos will be held outside of London for the first time, in Glasgow on 30 September.N-Dubz said they were looking forward to performing for their Scottish fans and boasted about their recent shows north of the border."We just done Edinburgh the other day," said Dappy."We smashed up an N-Dubz show over there. We done Aberdeen about three or four months ago - we smashed up that show over there! Everywhere we go we smash it up!" """,
-        ]
-
-        tgt_text = [
-            "California's largest electricity provider has turned off power to hundreds of thousands of customers.",
-            "Pop group N-Dubz have revealed they were surprised to get four nominations for this year's Mobo Awards.",
-        ]
-
-        inputs = tokenizer(src_text, return_tensors="np", truncation=True, max_length=512, padding=True)
-        translated_tokens = model.generate(**inputs, num_beams=2).sequences
-        decoded = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
-        assert tgt_text == decoded
diff --git a/tests/models/pegasus/test_modeling_tf_pegasus.py b/tests/models/pegasus/test_modeling_tf_pegasus.py
deleted file mode 100644
index 61f5fec2ef..0000000000
--- a/tests/models/pegasus/test_modeling_tf_pegasus.py
+++ /dev/null
@@ -1,249 +0,0 @@
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import AutoTokenizer, PegasusConfig, is_tf_available
-from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
-from transformers.utils import cached_property
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import TFAutoModelForSeq2SeqLM, TFPegasusForConditionalGeneration, TFPegasusModel
-
-
-@require_tf
-class TFPegasusModelTester:
-    config_cls = PegasusConfig
-    config_updates = {}
-    hidden_act = "gelu"
-
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_labels=False,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=40,
-        eos_token_id=2,
-        pad_token_id=1,
-        bos_token_id=0,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-
-    def prepare_config_and_inputs_for_common(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size)
-        eos_tensor = tf.expand_dims(tf.constant([self.eos_token_id] * self.batch_size), 1)
-        input_ids = tf.concat([input_ids, eos_tensor], axis=1)
-
-        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        config = self.config_cls(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            encoder_layers=self.num_hidden_layers,
-            decoder_layers=self.num_hidden_layers,
-            encoder_attention_heads=self.num_attention_heads,
-            decoder_attention_heads=self.num_attention_heads,
-            encoder_ffn_dim=self.intermediate_size,
-            decoder_ffn_dim=self.intermediate_size,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            eos_token_ids=[2],
-            bos_token_id=self.bos_token_id,
-            pad_token_id=self.pad_token_id,
-            decoder_start_token_id=self.pad_token_id,
-            **self.config_updates,
-        )
-        inputs_dict = prepare_pegasus_inputs_dict(config, input_ids, decoder_input_ids)
-        return config, inputs_dict
-
-    def check_decoder_model_past_large_inputs(self, config, inputs_dict):
-        model = TFPegasusModel(config=config).get_decoder()
-        input_ids = inputs_dict["input_ids"]
-
-        input_ids = input_ids[:1, :]
-        attention_mask = inputs_dict["attention_mask"][:1, :]
-        head_mask = inputs_dict["head_mask"]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True)
-
-        output, past_key_values = outputs.to_tuple()
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = tf.cast(ids_tensor((self.batch_size, 3), 2), tf.int8)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([attention_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)[0]
-        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[0]
-
-        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-
-def prepare_pegasus_inputs_dict(
-    config,
-    input_ids,
-    decoder_input_ids,
-    attention_mask=None,
-    decoder_attention_mask=None,
-    head_mask=None,
-    decoder_head_mask=None,
-    cross_attn_head_mask=None,
-):
-    if attention_mask is None:
-        attention_mask = tf.cast(tf.math.not_equal(input_ids, config.pad_token_id), tf.int8)
-    if decoder_attention_mask is None:
-        decoder_attention_mask = tf.concat(
-            [
-                tf.ones(decoder_input_ids[:, :1].shape, dtype=tf.int8),
-                tf.cast(tf.math.not_equal(decoder_input_ids[:, 1:], config.pad_token_id), tf.int8),
-            ],
-            axis=-1,
-        )
-    if head_mask is None:
-        head_mask = tf.ones((config.encoder_layers, config.encoder_attention_heads))
-    if decoder_head_mask is None:
-        decoder_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads))
-    if cross_attn_head_mask is None:
-        cross_attn_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads))
-    return {
-        "input_ids": input_ids,
-        "decoder_input_ids": decoder_input_ids,
-        "attention_mask": attention_mask,
-        "decoder_attention_mask": decoder_attention_mask,
-        "head_mask": head_mask,
-        "decoder_head_mask": decoder_head_mask,
-        "cross_attn_head_mask": cross_attn_head_mask,
-    }
-
-
-@require_tf
-class TFPegasusModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (TFPegasusForConditionalGeneration, TFPegasusModel) if is_tf_available() else ()
-    all_generative_model_classes = (TFPegasusForConditionalGeneration,) if is_tf_available() else ()
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFPegasusModel,
-            "summarization": TFPegasusForConditionalGeneration,
-            "text2text-generation": TFPegasusForConditionalGeneration,
-            "translation": TFPegasusForConditionalGeneration,
-        }
-        if is_tf_available()
-        else {}
-    )
-    is_encoder_decoder = True
-    test_pruning = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFPegasusModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=PegasusConfig)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_decoder_model_past_large_inputs(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
-        self.model_tester.check_decoder_model_past_large_inputs(*config_and_inputs)
-
-
-@require_sentencepiece
-@require_tokenizers
-@require_tf
-class TFPegasusIntegrationTests(unittest.TestCase):
-    src_text = [
-        """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.""",
-        """ The London trio are up for best UK act and best album, as well as getting two nominations in the best song category."We got told like this morning 'Oh I think you're nominated'", said Dappy."And I was like 'Oh yeah, which one?' And now we've got nominated for four awards. I mean, wow!"Bandmate Fazer added: "We thought it's best of us to come down and mingle with everyone and say hello to the cameras. And now we find we've got four nominations."The band have two shots at the best song prize, getting the nod for their Tynchy Stryder collaboration Number One, and single Strong Again.Their album Uncle B will also go up against records by the likes of Beyonce and Kanye West.N-Dubz picked up the best newcomer Mobo in 2007, but female member Tulisa said they wouldn't be too disappointed if they didn't win this time around."At the end of the day we're grateful to be where we are in our careers."If it don't happen then it don't happen - live to fight another day and keep on making albums and hits for the fans."Dappy also revealed they could be performing live several times on the night.The group will be doing Number One and also a possible rendition of the War Child single, I Got Soul.The charity song is a  re-working of The Killers' All These Things That I've Done and is set to feature artists like Chipmunk, Ironik and Pixie Lott.This year's Mobos will be held outside of London for the first time, in Glasgow on 30 September.N-Dubz said they were looking forward to performing for their Scottish fans and boasted about their recent shows north of the border."We just done Edinburgh the other day," said Dappy."We smashed up an N-Dubz show over there. We done Aberdeen about three or four months ago - we smashed up that show over there! Everywhere we go we smash it up!" """,
-    ]
-    expected_text = [
-        "California's largest electricity provider has cut power to hundreds of thousands of customers in an effort to"
-        " reduce the risk of wildfires.",
-        'N-Dubz have revealed they\'re "grateful" to have been nominated for four Mobo Awards.',
-    ]  # differs slightly from pytorch, likely due to numerical differences in linear layers
-    model_name = "google/pegasus-xsum"
-
-    @cached_property
-    def tokenizer(self):
-        return AutoTokenizer.from_pretrained(self.model_name)
-
-    @cached_property
-    def model(self):
-        model = TFAutoModelForSeq2SeqLM.from_pretrained(self.model_name)
-        return model
-
-    def _assert_generated_batch_equal_expected(self, **tokenizer_kwargs):
-        generated_words = self.translate_src_text(**tokenizer_kwargs)
-        assert self.expected_text == generated_words
-
-    def translate_src_text(self, **tokenizer_kwargs):
-        model_inputs = self.tokenizer(self.src_text, **tokenizer_kwargs, padding=True, return_tensors="tf")
-        generated_ids = self.model.generate(
-            model_inputs.input_ids,
-            attention_mask=model_inputs.attention_mask,
-            num_beams=2,
-            use_cache=True,
-        )
-        generated_words = self.tokenizer.batch_decode(generated_ids.numpy(), skip_special_tokens=True)
-        return generated_words
-
-    @slow
-    def test_batch_generation(self):
-        self._assert_generated_batch_equal_expected()
diff --git a/tests/models/rag/test_modeling_tf_rag.py b/tests/models/rag/test_modeling_tf_rag.py
deleted file mode 100644
index ed15cfd7b6..0000000000
--- a/tests/models/rag/test_modeling_tf_rag.py
+++ /dev/null
@@ -1,1091 +0,0 @@
-from __future__ import annotations
-
-import json
-import os
-import shutil
-import tempfile
-import unittest
-from unittest.mock import patch
-
-import numpy as np
-
-from transformers import BartTokenizer
-from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES
-from transformers.models.dpr.tokenization_dpr import DPRQuestionEncoderTokenizer
-from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES
-from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
-from transformers.utils import cached_property, is_datasets_available, is_faiss_available, is_tf_available
-
-
-if is_tf_available() and is_datasets_available() and is_faiss_available():
-    import faiss
-    import tensorflow as tf
-    from datasets import Dataset
-
-    from transformers import (
-        AutoConfig,
-        RagConfig,
-        RagRetriever,
-        RagTokenizer,
-        TFAutoModel,
-        TFAutoModelForSeq2SeqLM,
-        TFRagModel,
-        TFRagSequenceForGeneration,
-        TFRagTokenForGeneration,
-    )
-    from transformers.modeling_tf_outputs import TFBaseModelOutput
-
-from ..bart.test_modeling_tf_bart import TFBartModelTester
-from ..dpr.test_modeling_tf_dpr import TFDPRModelTester
-
-
-TOLERANCE = 1e-3
-
-
-def require_retrieval(test_case):
-    """
-    Decorator marking a test that requires a set of dependencies necessary for pefrorm retrieval with
-    [`RagRetriever`].
-
-    These tests are skipped when respective libraries are not installed.
-
-    """
-    if not (is_tf_available() and is_datasets_available() and is_faiss_available()):
-        test_case = unittest.skip("test requires tensorflow, datasets and faiss")(test_case)
-    return test_case
-
-
-@require_tf
-@require_retrieval
-@require_sentencepiece
-class TFRagTestMixin:
-    all_model_classes = (
-        (TFRagModel, TFRagTokenForGeneration, TFRagSequenceForGeneration)
-        if is_tf_available() and is_datasets_available() and is_faiss_available()
-        else ()
-    )
-    all_generative_model_classes = (
-        (TFRagTokenForGeneration, TFRagSequenceForGeneration)
-        if is_tf_available() and is_datasets_available() and is_faiss_available()
-        else ()
-    )
-
-    retrieval_vector_size = 32
-    n_docs = 3
-    max_combined_length = 16
-
-    def setUp(self):
-        self.tmpdirname = tempfile.mkdtemp()
-
-        # DPR tok
-        vocab_tokens = [
-            "[UNK]",
-            "[CLS]",
-            "[SEP]",
-            "[PAD]",
-            "[MASK]",
-            "want",
-            "##want",
-            "##ed",
-            "wa",
-            "un",
-            "runn",
-            "##ing",
-            ",",
-            "low",
-            "lowest",
-        ]
-        dpr_tokenizer_path = os.path.join(self.tmpdirname, "dpr_tokenizer")
-        os.makedirs(dpr_tokenizer_path, exist_ok=True)
-        self.vocab_file = os.path.join(dpr_tokenizer_path, DPR_VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
-            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-
-        # BART tok
-        vocab = [
-            "l",
-            "o",
-            "w",
-            "e",
-            "r",
-            "s",
-            "t",
-            "i",
-            "d",
-            "n",
-            "\u0120",
-            "\u0120l",
-            "\u0120n",
-            "\u0120lo",
-            "\u0120low",
-            "er",
-            "\u0120lowest",
-            "\u0120newer",
-            "\u0120wider",
-            "<unk>",
-        ]
-        vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-        self.special_tokens_map = {"unk_token": "<unk>"}
-
-        bart_tokenizer_path = os.path.join(self.tmpdirname, "bart_tokenizer")
-        os.makedirs(bart_tokenizer_path, exist_ok=True)
-        self.vocab_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
-            fp.write("\n".join(merges))
-
-    @cached_property
-    def dpr_tokenizer(self) -> DPRQuestionEncoderTokenizer:
-        return DPRQuestionEncoderTokenizer.from_pretrained(os.path.join(self.tmpdirname, "dpr_tokenizer"))
-
-    @cached_property
-    def bart_tokenizer(self) -> BartTokenizer:
-        return BartTokenizer.from_pretrained(os.path.join(self.tmpdirname, "bart_tokenizer"))
-
-    def tearDown(self):
-        shutil.rmtree(self.tmpdirname)
-
-    def get_retriever(self, config):
-        dataset = Dataset.from_dict(
-            {
-                "id": ["0", "1", "3"],
-                "text": ["foo", "bar", "qux"],
-                "title": ["Foo", "Bar", "Qux"],
-                "embeddings": [
-                    np.ones(self.retrieval_vector_size),
-                    2 * np.ones(self.retrieval_vector_size),
-                    3 * np.ones(self.retrieval_vector_size),
-                ],
-            }
-        )
-        dataset.add_faiss_index("embeddings", string_factory="Flat", metric_type=faiss.METRIC_INNER_PRODUCT)
-        tokenizer = self.bart_tokenizer
-        with patch("transformers.models.rag.retrieval_rag.load_dataset") as mock_load_dataset:
-            mock_load_dataset.return_value = dataset
-            retriever = RagRetriever(
-                config,
-                question_encoder_tokenizer=self.dpr_tokenizer,
-                generator_tokenizer=tokenizer,
-            )
-        return retriever
-
-    def check_model_with_retriever(
-        self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs
-    ):
-        self.assertIsNotNone(config.question_encoder)
-        self.assertIsNotNone(config.generator)
-
-        for model_class in self.all_model_classes:
-            model = model_class(config, retriever=self.get_retriever(config))
-
-            self.assertTrue(model.config.is_encoder_decoder)
-
-            outputs = model(
-                input_ids=input_ids,
-                attention_mask=attention_mask,
-                decoder_input_ids=decoder_input_ids,
-                decoder_attention_mask=decoder_attention_mask,
-            )
-
-            # logits
-            self.assertEqual(
-                outputs.logits.shape,
-                (self.n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size),
-            )
-            # generator encoder last hidden states
-            self.assertEqual(
-                outputs.generator_enc_last_hidden_state.shape,
-                (self.n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size),
-            )
-            # doc scores
-            self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], self.n_docs))
-
-    def check_model_generate_from_context_input_ids(
-        self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs
-    ):
-        self.assertIsNotNone(config.question_encoder)
-        self.assertIsNotNone(config.generator)
-
-        retriever = self.get_retriever(config)
-
-        for i, model_class in enumerate(self.all_generative_model_classes):
-            model = model_class(config)
-            self.assertTrue(model.config.is_encoder_decoder)
-
-            question_hidden_states = model.question_encoder(input_ids, attention_mask=attention_mask)[0]
-
-            out = retriever(
-                input_ids,
-                question_hidden_states.numpy(),
-                prefix=config.generator.prefix,
-                return_tensors="tf",
-            )
-
-            context_input_ids, context_attention_mask, retrieved_doc_embeds = (
-                out["context_input_ids"],
-                out["context_attention_mask"],
-                out["retrieved_doc_embeds"],
-            )
-            retrieved_doc_embeds = tf.cast(retrieved_doc_embeds, tf.float32)
-
-            # compute doc_scores
-            doc_scores = tf.squeeze(
-                tf.matmul(tf.expand_dims(question_hidden_states, axis=[1]), retrieved_doc_embeds, transpose_b=True),
-                axis=[1],
-            )
-
-            outputs = model.generate(
-                context_input_ids=context_input_ids,
-                context_attention_mask=context_attention_mask,
-                doc_scores=doc_scores,
-            )
-
-            self.assertIsNotNone(outputs)
-
-    def check_model_generate(
-        self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs
-    ):
-        self.assertIsNotNone(config.question_encoder)
-        self.assertIsNotNone(config.generator)
-
-        for model_class in self.all_generative_model_classes:
-            model = model_class(config, retriever=self.get_retriever(config))
-
-            self.assertTrue(model.config.is_encoder_decoder)
-
-            input_ids = tf.cast(input_ids, tf.int32)
-            outputs = model.generate(
-                input_ids=input_ids,
-                num_beams=2,
-                num_return_sequences=2,
-                decoder_start_token_id=config.generator.eos_token_id,
-                max_new_tokens=5,
-            )
-
-            self.assertIsNotNone(outputs)
-
-    def check_model_without_retriever(
-        self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs
-    ):
-        self.assertIsNotNone(config.question_encoder)
-        self.assertIsNotNone(config.generator)
-
-        retriever = self.get_retriever(config)
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            self.assertTrue(model.config.is_encoder_decoder)
-
-            question_hidden_states = model.question_encoder(input_ids, attention_mask=attention_mask)[0]
-
-            out = retriever(
-                input_ids,
-                question_hidden_states.numpy(),
-                prefix=config.generator.prefix,
-                return_tensors="tf",
-            )
-
-            context_input_ids, context_attention_mask, retrieved_doc_embeds = (
-                out["context_input_ids"],
-                out["context_attention_mask"],
-                out["retrieved_doc_embeds"],
-            )
-
-            retrieved_doc_embeds = tf.cast(retrieved_doc_embeds, tf.float32)
-
-            # compute doc_scores
-            doc_scores = tf.squeeze(
-                tf.matmul(tf.expand_dims(question_hidden_states, axis=[1]), retrieved_doc_embeds, transpose_b=True),
-                axis=[1],
-            )
-
-            outputs = model(
-                input_ids=None,
-                context_input_ids=context_input_ids,
-                context_attention_mask=context_attention_mask,
-                doc_scores=doc_scores,
-                decoder_input_ids=decoder_input_ids,
-                decoder_attention_mask=decoder_attention_mask,
-            )
-
-            # logits
-            self.assertEqual(
-                outputs.logits.shape,
-                (self.n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size),
-            )
-
-            # generator encoder last hidden states
-            self.assertEqual(
-                outputs.generator_enc_last_hidden_state.shape,
-                (self.n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size),
-            )
-            # doc scores
-            self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], self.n_docs))
-
-    def check_model_custom_n_docs(
-        self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, n_docs, **kwargs
-    ):
-        self.assertIsNotNone(config.question_encoder)
-        self.assertIsNotNone(config.generator)
-
-        retriever = self.get_retriever(config)
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            self.assertTrue(model.config.is_encoder_decoder)
-
-            question_hidden_states = model.question_encoder(input_ids, attention_mask=attention_mask)[0]
-
-            out = retriever(
-                input_ids,
-                question_hidden_states.numpy(),
-                prefix=config.generator.prefix,
-                return_tensors="tf",
-                n_docs=n_docs,
-            )
-
-            context_input_ids, context_attention_mask, retrieved_doc_embeds = (
-                out["context_input_ids"],
-                out["context_attention_mask"],
-                out["retrieved_doc_embeds"],
-            )
-
-            retrieved_doc_embeds = tf.cast(retrieved_doc_embeds, tf.float32)
-
-            # compute doc_scores
-            doc_scores = tf.squeeze(
-                tf.matmul(tf.expand_dims(question_hidden_states, axis=[1]), retrieved_doc_embeds, transpose_b=True),
-                axis=[1],
-            )
-
-            outputs = model(
-                input_ids=None,
-                context_input_ids=context_input_ids,
-                context_attention_mask=context_attention_mask,
-                doc_scores=doc_scores,
-                decoder_input_ids=decoder_input_ids,
-                decoder_attention_mask=decoder_attention_mask,
-                n_docs=n_docs,
-            )
-
-            # logits
-            self.assertEqual(
-                outputs.logits.shape,
-                (n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size),
-            )
-            # generator encoder last hidden states
-            self.assertEqual(
-                outputs.generator_enc_last_hidden_state.shape,
-                (n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size),
-            )
-            # doc scores
-            self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], n_docs))
-
-    def check_model_with_mismatch_n_docs_value(
-        self,
-        config,
-        input_ids,
-        attention_mask,
-        decoder_input_ids,
-        decoder_attention_mask,
-        retriever_n_docs,
-        generator_n_docs,
-        **kwargs,
-    ):
-        self.assertIsNotNone(config.question_encoder)
-        self.assertIsNotNone(config.generator)
-
-        retriever = self.get_retriever(config)
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            self.assertTrue(model.config.is_encoder_decoder)
-
-            question_hidden_states = model.question_encoder(input_ids, attention_mask=attention_mask)[0]
-
-            out = retriever(
-                input_ids,
-                question_hidden_states.numpy(),
-                prefix=config.generator.prefix,
-                return_tensors="tf",
-                n_docs=retriever_n_docs,
-            )
-
-            context_input_ids, context_attention_mask, retrieved_doc_embeds = (
-                out["context_input_ids"],
-                out["context_attention_mask"],
-                out["retrieved_doc_embeds"],
-            )
-
-            retrieved_doc_embeds = tf.cast(retrieved_doc_embeds, tf.float32)
-
-            # compute doc_scores
-            doc_scores = tf.squeeze(
-                tf.matmul(tf.expand_dims(question_hidden_states, axis=[1]), retrieved_doc_embeds, transpose_b=True),
-                axis=[1],
-            )
-
-            self.assertRaises(
-                AssertionError,
-                model.__call__,
-                input_ids=None,
-                context_input_ids=context_input_ids,
-                context_attention_mask=context_attention_mask,
-                doc_scores=doc_scores,
-                decoder_input_ids=decoder_input_ids,
-                decoder_attention_mask=decoder_attention_mask,
-                n_docs=generator_n_docs,
-            )
-
-    def check_model_with_encoder_outputs(
-        self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs
-    ):
-        self.assertIsNotNone(config.question_encoder)
-        self.assertIsNotNone(config.generator)
-
-        for model_class in self.all_model_classes:
-            model = model_class(config, retriever=self.get_retriever(config))
-
-            self.assertTrue(model.config.is_encoder_decoder)
-
-            outputs = model(
-                input_ids=input_ids,
-                attention_mask=attention_mask,
-                decoder_input_ids=decoder_input_ids,
-                decoder_attention_mask=decoder_attention_mask,
-            )
-
-            encoder_outputs = TFBaseModelOutput(outputs.generator_enc_last_hidden_state)
-
-            # run only generator
-            outputs = model(
-                input_ids=None,
-                encoder_outputs=encoder_outputs,
-                doc_scores=outputs.doc_scores,
-                decoder_input_ids=decoder_input_ids,
-                decoder_attention_mask=decoder_attention_mask,
-            )
-
-            # logits
-            self.assertEqual(
-                outputs.logits.shape,
-                (self.n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size),
-            )
-            # generator encoder last hidden states
-            self.assertEqual(
-                outputs.generator_enc_last_hidden_state.shape,
-                (self.n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size),
-            )
-            # doc scores
-            self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], self.n_docs))
-
-    def test_model_with_retriever(self):
-        inputs_dict = self.config_and_inputs
-        self.check_model_with_retriever(**inputs_dict)
-
-    def test_model_without_retriever(self):
-        inputs_dict = self.config_and_inputs
-        self.check_model_without_retriever(**inputs_dict)
-
-    @slow
-    def test_model_generate_from_context_input_ids(self):
-        inputs_dict = self.config_and_inputs
-        self.check_model_generate_from_context_input_ids(**inputs_dict)
-
-    def test_model_with_encoder_outputs(self):
-        inputs_dict = self.config_and_inputs
-        self.check_model_with_encoder_outputs(**inputs_dict)
-
-    @slow
-    def test_model_generate(self):
-        inputs_dict = self.config_and_inputs
-        self.check_model_generate(**inputs_dict)
-
-    def test_model_with_custom_n_docs(self):
-        inputs_dict = self.config_and_inputs
-        inputs_dict["n_docs"] = 1
-        self.check_model_custom_n_docs(**inputs_dict)
-
-    def test_model_with_mismatch_n_docs_value(self):
-        inputs_dict = self.config_and_inputs
-        inputs_dict["retriever_n_docs"] = 3
-        inputs_dict["generator_n_docs"] = 2
-        self.check_model_with_mismatch_n_docs_value(**inputs_dict)
-
-
-@require_tf
-@require_retrieval
-class TFRagDPRBartTest(TFRagTestMixin, unittest.TestCase):
-    @cached_property
-    def config_and_inputs(self):
-        question_encoder_tester = TFDPRModelTester(self)
-        dpr_config_and_inputs = question_encoder_tester.prepare_config_and_inputs()
-        generator_tester = TFBartModelTester(self)
-        bart_config_and_inputs = generator_tester.prepare_config_and_inputs_for_common()
-
-        (question_encoder_config, input_ids, _, input_mask, _, _, _) = dpr_config_and_inputs
-        (generator_config, bart_inputs_dict) = bart_config_and_inputs
-        decoder_input_ids, decoder_attention_mask = bart_inputs_dict["input_ids"], bart_inputs_dict["attention_mask"]
-
-        config = RagConfig.from_question_encoder_generator_configs(
-            question_encoder_config,
-            generator_config,
-            n_docs=self.n_docs,
-            retrieval_vector_size=self.retrieval_vector_size,
-            max_combined_length=self.max_combined_length,
-        )
-
-        return {
-            "config": config,
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "decoder_input_ids": decoder_input_ids,
-            "decoder_attention_mask": decoder_attention_mask,
-        }
-
-
-@require_tf
-@require_retrieval
-@require_sentencepiece
-@require_tokenizers
-class TFRagModelIntegrationTests(unittest.TestCase):
-    @cached_property
-    def token_model(self):
-        return TFRagTokenForGeneration.from_pretrained_question_encoder_generator(
-            "facebook/dpr-question_encoder-single-nq-base", "facebook/bart-large-cnn"
-        )
-
-    @cached_property
-    def sequence_model(self):
-        return TFRagSequenceForGeneration.from_pretrained_question_encoder_generator(
-            "facebook/dpr-question_encoder-single-nq-base", "facebook/bart-large-cnn"
-        )
-
-    def token_model_nq_checkpoint(self, retriever):
-        return TFRagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)
-
-    def get_rag_config(self):
-        question_encoder_config = AutoConfig.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
-        generator_config = AutoConfig.from_pretrained("facebook/bart-large-cnn")
-        return RagConfig.from_question_encoder_generator_configs(
-            question_encoder_config,
-            generator_config,
-            bos_token_id=0,
-            decoder_start_token_id=2,
-            eos_token_id=2,
-            is_encoder_decoder=True,
-            pad_token_id=1,
-            vocab_size=50264,
-            title_sep=" / ",
-            doc_sep=" // ",
-            n_docs=5,
-            max_combined_length=300,
-            dataset="wiki_dpr",
-            dataset_split="train",
-            index_name="exact",
-            index_path=None,
-            use_dummy_dataset=True,
-            retrieval_vector_size=768,
-            retrieval_batch_size=8,
-            dataset_revision="b24a417",
-        )
-
-    @slow
-    def test_rag_sequence_inference(self):
-        rag_config = self.get_rag_config()
-        rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
-        rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
-            "facebook/dpr-question_encoder-single-nq-base"
-        )
-        rag_retriever = RagRetriever(
-            rag_config,
-            question_encoder_tokenizer=rag_question_encoder_tokenizer,
-            generator_tokenizer=rag_decoder_tokenizer,
-        )
-
-        rag_sequence = self.sequence_model
-        rag_sequence.set_retriever(rag_retriever)
-
-        input_ids = rag_question_encoder_tokenizer(
-            "who sings does he love me with reba", return_tensors="tf"
-        ).input_ids
-        decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids
-
-        output = rag_sequence(
-            input_ids,
-            labels=decoder_input_ids,
-        )
-
-        expected_shape = tf.TensorShape([5, 5, 50264])
-        self.assertEqual(output.logits.shape, expected_shape)
-
-        expected_doc_scores = tf.convert_to_tensor([[75.0286, 74.4998, 74.0804, 74.0306, 73.9504]])
-        expected_loss = tf.convert_to_tensor([36.7368])
-
-        tf.debugging.assert_near(output.loss, expected_loss, atol=1e-3)
-        tf.debugging.assert_near(output.doc_scores, expected_doc_scores, atol=1e-3)
-
-    @slow
-    def test_rag_token_inference(self):
-        rag_config = self.get_rag_config()
-        rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
-        rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
-            "facebook/dpr-question_encoder-single-nq-base"
-        )
-        rag_retriever = RagRetriever(
-            rag_config,
-            question_encoder_tokenizer=rag_question_encoder_tokenizer,
-            generator_tokenizer=rag_decoder_tokenizer,
-        )
-
-        rag_token = self.token_model
-        rag_token.set_retriever(rag_retriever)
-
-        input_ids = rag_question_encoder_tokenizer(
-            "who sings does he love me with reba", return_tensors="tf"
-        ).input_ids
-        decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids
-
-        output = rag_token(
-            input_ids,
-            labels=decoder_input_ids,
-        )
-
-        expected_shape = tf.TensorShape([5, 5, 50264])
-        self.assertEqual(output.logits.shape, expected_shape)
-
-        expected_doc_scores = tf.convert_to_tensor([[75.0286, 74.4998, 74.0804, 74.0306, 73.9504]])
-        expected_loss = tf.convert_to_tensor([36.3557])
-
-        tf.debugging.assert_near(output.loss, expected_loss, atol=1e-3)
-        tf.debugging.assert_near(output.doc_scores, expected_doc_scores, atol=1e-3)
-
-    @slow
-    def test_rag_token_inference_nq_checkpoint(self):
-        rag_config = self.get_rag_config()
-        rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
-        rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
-            "facebook/dpr-question_encoder-single-nq-base"
-        )
-        rag_retriever = RagRetriever(
-            rag_config,
-            question_encoder_tokenizer=rag_question_encoder_tokenizer,
-            generator_tokenizer=rag_decoder_tokenizer,
-        )
-
-        rag_token = self.token_model_nq_checkpoint(retriever=rag_retriever)
-
-        # check that outputs after saving and loading are equal
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            rag_token.save_pretrained(tmpdirname)
-            rag_token = TFRagTokenForGeneration.from_pretrained(tmpdirname, retriever=rag_retriever)
-
-        input_ids = rag_question_encoder_tokenizer(
-            "who sings does he love me with reba", return_tensors="tf"
-        ).input_ids
-        decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids
-
-        output = rag_token(
-            input_ids,
-            labels=decoder_input_ids,
-        )
-
-        expected_shape = tf.TensorShape([5, 5, 50265])
-        self.assertEqual(output.logits.shape, expected_shape)
-
-        expected_doc_scores = tf.convert_to_tensor([[62.9402, 62.7107, 62.2382, 62.1194, 61.8578]])
-        expected_loss = tf.convert_to_tensor([32.521812])
-
-        tf.debugging.assert_near(output.loss, expected_loss, atol=1e-3)
-        tf.debugging.assert_near(output.doc_scores, expected_doc_scores, atol=1e-3)
-
-    @slow
-    def test_rag_token_inference_save_pretrained(self):
-        rag_config = self.get_rag_config()
-        rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
-        rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
-            "facebook/dpr-question_encoder-single-nq-base"
-        )
-        rag_retriever = RagRetriever(
-            rag_config,
-            question_encoder_tokenizer=rag_question_encoder_tokenizer,
-            generator_tokenizer=rag_decoder_tokenizer,
-        )
-
-        rag_token = self.token_model
-        rag_token.set_retriever(rag_retriever)
-
-        input_ids = rag_question_encoder_tokenizer(
-            "who sings does he love me with reba", return_tensors="tf"
-        ).input_ids
-        decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids
-
-        # model must run once to be functional before loading/saving works
-        rag_token(
-            input_ids,
-            labels=decoder_input_ids,
-        )
-
-        # check that outputs after saving and loading are equal
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            rag_token.save_pretrained(tmpdirname)
-            rag_token = TFRagTokenForGeneration.from_pretrained(tmpdirname, retriever=rag_retriever)
-
-        output = rag_token(
-            input_ids,
-            labels=decoder_input_ids,
-        )
-
-        expected_shape = tf.TensorShape([5, 5, 50264])
-        self.assertEqual(output.logits.shape, expected_shape)
-
-        expected_doc_scores = tf.convert_to_tensor([[75.0286, 74.4998, 74.0804, 74.0306, 73.9504]])
-        expected_loss = tf.convert_to_tensor([36.3557])
-
-        tf.debugging.assert_near(output.loss, expected_loss, atol=1e-3)
-        tf.debugging.assert_near(output.doc_scores, expected_doc_scores, atol=1e-3)
-
-    @slow
-    def test_init_and_from_pretrained(self):
-        rag_config = self.get_rag_config()
-        rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
-        rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
-            "facebook/dpr-question_encoder-single-nq-base"
-        )
-        rag_retriever = RagRetriever(
-            rag_config,
-            question_encoder_tokenizer=rag_question_encoder_tokenizer,
-            generator_tokenizer=rag_decoder_tokenizer,
-        )
-
-        rag_config = RagConfig.from_pretrained("facebook/rag-sequence-base")
-        rag = TFRagTokenForGeneration(rag_config, retriever=rag_retriever)
-
-        input_ids = rag_question_encoder_tokenizer(
-            "who sings does he love me with reba", return_tensors="tf"
-        ).input_ids
-        decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids
-
-        rag(
-            input_ids,
-            decoder_input_ids=decoder_input_ids,
-        )
-
-        # this should not give any warnings
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            rag.save_pretrained(tmpdirname)
-            rag = TFRagTokenForGeneration.from_pretrained(tmpdirname, retriever=rag_retriever)
-
-    @property
-    def test_data_questions(self):
-        return [
-            "who got the first nobel prize in physics",
-            "when is the next deadpool movie being released",
-            "which mode is used for short wave broadcast service",
-            "who is the owner of reading football club",
-            "when is the next scandal episode coming out",
-            "when is the last time the philadelphia won the superbowl",
-            "what is the most current adobe flash player version",
-            "how many episodes are there in dragon ball z",
-        ]
-
-    @slow
-    def test_rag_token_greedy_search(self):
-        tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
-        retriever = RagRetriever.from_pretrained(
-            "facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True, dataset_revision="b24a417"
-        )
-        rag_token = TFRagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)
-
-        # check first two questions
-        input_dict = tokenizer(
-            self.test_data_questions[:2],
-            return_tensors="tf",
-            padding=True,
-            truncation=True,
-        )
-
-        input_ids = input_dict.input_ids
-        attention_mask = input_dict.attention_mask
-
-        # make sure only 1 beam is used
-        rag_token.config.num_beams = 1
-
-        output_ids = rag_token.generate(
-            input_ids,
-            attention_mask=attention_mask,
-        )
-
-        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-
-        EXPECTED_OUTPUTS = [
-            " albert einstein",
-            " september 22, 2017",
-        ]
-        self.assertListEqual(outputs, EXPECTED_OUTPUTS)
-
-    @slow
-    def test_rag_token_generate_batch(self):
-        # NOTE: gold labels comes from num_beam=4, so this is effectively beam-search test
-        tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
-        retriever = RagRetriever.from_pretrained(
-            "facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True, dataset_revision="b24a417"
-        )
-        rag_token = TFRagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)
-
-        input_dict = tokenizer(
-            self.test_data_questions,
-            return_tensors="tf",
-            padding=True,
-            truncation=True,
-        )
-
-        input_ids = input_dict.input_ids
-        attention_mask = input_dict.attention_mask
-
-        EXPECTED_OUTPUTS = [
-            " albert einstein",
-            " september 22, 2017",
-            " amplitude modulation",
-            " stefan persson",
-            " april 20, 2018",
-            " the 1970s",
-            " 7.1. 2",
-            " 13",
-        ]
-
-        # Split into 2 batches of 4 examples to avoid GPU OOM.
-        output_ids = rag_token.generate(
-            input_ids[:4],
-            attention_mask=attention_mask[:4],
-        )
-        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-        self.assertListEqual(outputs, EXPECTED_OUTPUTS[:4])
-
-        output_ids = rag_token.generate(
-            input_ids[4:],
-            attention_mask=attention_mask[4:],
-        )
-        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-        self.assertListEqual(outputs, EXPECTED_OUTPUTS[4:])
-
-    @slow
-    def test_rag_sequence_generate_batch(self):
-        tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
-        retriever = RagRetriever.from_pretrained(
-            "facebook/rag-sequence-nq",
-            index_name="exact",
-            use_dummy_dataset=True,
-            dataset_revision="b24a417",
-        )
-        rag_sequence = TFRagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)
-
-        input_dict = tokenizer(
-            self.test_data_questions,
-            return_tensors="tf",
-            padding=True,
-            truncation=True,
-        )
-
-        input_ids = input_dict.input_ids
-        attention_mask = input_dict.attention_mask
-
-        output_ids = rag_sequence.generate(
-            input_ids,
-            attention_mask=attention_mask,
-        )
-
-        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-
-        EXPECTED_OUTPUTS = [
-            " albert einstein",
-            " june 22, 2018",
-            " amplitude modulation",
-            " tim besley ( chairman )",
-            " june 20, 2018",
-            " 1980",
-            " 7.0",
-            " 8",
-        ]
-        self.assertListEqual(outputs, EXPECTED_OUTPUTS)
-
-    @slow
-    def test_rag_sequence_generate_batch_from_context_input_ids(self):
-        tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
-        retriever = RagRetriever.from_pretrained(
-            "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True, dataset_revision="b24a417"
-        )
-        rag_sequence = TFRagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)
-        input_dict = tokenizer(
-            self.test_data_questions,
-            return_tensors="tf",
-            padding=True,
-            truncation=True,
-        )
-
-        input_ids = input_dict.input_ids
-
-        question_hidden_states = rag_sequence.question_encoder(input_ids)[0]
-        docs_dict = retriever(input_ids.numpy(), question_hidden_states.numpy(), return_tensors="tf")
-        doc_scores = tf.squeeze(
-            tf.matmul(
-                tf.expand_dims(question_hidden_states, axis=[1]), docs_dict["retrieved_doc_embeds"], transpose_b=True
-            ),
-            axis=[1],
-        )
-        output_ids = rag_sequence.generate(
-            context_input_ids=docs_dict["context_input_ids"],
-            context_attention_mask=docs_dict["context_attention_mask"],
-            doc_scores=doc_scores,
-            do_deduplication=True,
-        )
-
-        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-
-        EXPECTED_OUTPUTS = [
-            " albert einstein",
-            " june 22, 2018",
-            " amplitude modulation",
-            " tim besley ( chairman )",
-            " june 20, 2018",
-            " 1980",
-            " 7.0",
-            " 8",
-        ]
-        self.assertListEqual(outputs, EXPECTED_OUTPUTS)
-
-
-@require_tf
-@require_retrieval
-class TFRagModelSaveLoadTests(unittest.TestCase):
-    def get_rag_config(self):
-        question_encoder_config = AutoConfig.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
-        generator_config = AutoConfig.from_pretrained("facebook/bart-large-cnn")
-        return RagConfig.from_question_encoder_generator_configs(
-            question_encoder_config,
-            generator_config,
-            bos_token_id=0,
-            decoder_start_token_id=2,
-            eos_token_id=2,
-            is_encoder_decoder=True,
-            pad_token_id=1,
-            vocab_size=50264,
-            title_sep=" / ",
-            doc_sep=" // ",
-            n_docs=5,
-            max_combined_length=300,
-            dataset="wiki_dpr",
-            dataset_split="train",
-            index_name="exact",
-            index_path=None,
-            use_dummy_dataset=True,
-            retrieval_vector_size=768,
-            retrieval_batch_size=8,
-            dataset_revision="b24a417",
-        )
-
-    @slow
-    def test_rag_sequence_from_pretrained(self):
-        load_weight_prefix = "tf_rag_model_1"
-
-        rag_config = self.get_rag_config()
-        rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
-        rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
-            "facebook/dpr-question_encoder-single-nq-base"
-        )
-        rag_retriever = RagRetriever(
-            rag_config,
-            question_encoder_tokenizer=rag_question_encoder_tokenizer,
-            generator_tokenizer=rag_decoder_tokenizer,
-        )
-
-        input_ids = rag_question_encoder_tokenizer(
-            "who sings does he love me with reba", return_tensors="tf"
-        ).input_ids
-        decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids
-
-        with tempfile.TemporaryDirectory() as tmp_dirname:
-            rag_sequence = TFRagSequenceForGeneration.from_pretrained_question_encoder_generator(
-                "facebook/dpr-question_encoder-single-nq-base",
-                "facebook/bart-large-cnn",
-                retriever=rag_retriever,
-                config=rag_config,
-            )
-            rag_sequence.build_in_name_scope()
-            # check that the from pretrained methods work
-            rag_sequence.save_pretrained(tmp_dirname)
-            rag_sequence.from_pretrained(tmp_dirname, retriever=rag_retriever)
-
-            output = rag_sequence(input_ids, labels=decoder_input_ids)
-
-            loss_pretrained = output.loss
-            del rag_sequence
-
-        question_encoder = TFAutoModel.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
-        generator = TFAutoModelForSeq2SeqLM.from_pretrained(
-            "facebook/bart-large-cnn", load_weight_prefix=load_weight_prefix, name="generator"
-        )
-
-        rag_sequence = TFRagSequenceForGeneration(
-            config=rag_config, question_encoder=question_encoder, generator=generator, retriever=rag_retriever
-        )
-
-        output = rag_sequence(input_ids, labels=decoder_input_ids)
-
-        loss_init = output.loss
-
-        self.assertAlmostEqual(loss_pretrained, loss_init, places=4)
-
-    @slow
-    def test_rag_token_from_pretrained(self):
-        load_weight_prefix = "tf_rag_model_1"
-
-        rag_config = self.get_rag_config()
-        rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
-        rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
-            "facebook/dpr-question_encoder-single-nq-base"
-        )
-        rag_retriever = RagRetriever(
-            rag_config,
-            question_encoder_tokenizer=rag_question_encoder_tokenizer,
-            generator_tokenizer=rag_decoder_tokenizer,
-        )
-
-        input_ids = rag_question_encoder_tokenizer(
-            "who sings does he love me with reba", return_tensors="tf"
-        ).input_ids
-        decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids
-
-        with tempfile.TemporaryDirectory() as tmp_dirname:
-            rag_token = TFRagTokenForGeneration.from_pretrained_question_encoder_generator(
-                "facebook/dpr-question_encoder-single-nq-base",
-                "facebook/bart-large-cnn",
-                retriever=rag_retriever,
-                config=rag_config,
-            )
-            rag_token.build_in_name_scope()
-            # check that the from pretrained methods work
-            rag_token.save_pretrained(tmp_dirname)
-            rag_token.from_pretrained(tmp_dirname, retriever=rag_retriever)
-
-            output = rag_token(input_ids, labels=decoder_input_ids)
-
-            loss_pretrained = output.loss
-            del rag_token
-
-        question_encoder = TFAutoModel.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
-        generator = TFAutoModelForSeq2SeqLM.from_pretrained(
-            "facebook/bart-large-cnn", load_weight_prefix=load_weight_prefix, name="generator"
-        )
-        rag_token = TFRagTokenForGeneration(
-            config=rag_config, question_encoder=question_encoder, generator=generator, retriever=rag_retriever
-        )
-
-        output = rag_token(input_ids, labels=decoder_input_ids)
-
-        loss_init = output.loss
-
-        self.assertAlmostEqual(loss_pretrained, loss_init, places=4)
diff --git a/tests/models/regnet/test_modeling_flax_regnet.py b/tests/models/regnet/test_modeling_flax_regnet.py
deleted file mode 100644
index f7158efdfe..0000000000
--- a/tests/models/regnet/test_modeling_flax_regnet.py
+++ /dev/null
@@ -1,236 +0,0 @@
-# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-import unittest
-
-from transformers import RegNetConfig, is_flax_available
-from transformers.testing_utils import require_flax, slow
-from transformers.utils import cached_property, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor
-
-
-if is_flax_available():
-    import jax
-    import jax.numpy as jnp
-
-    from transformers.models.regnet.modeling_flax_regnet import FlaxRegNetForImageClassification, FlaxRegNetModel
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import AutoImageProcessor
-
-
-class FlaxRegNetModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=3,
-        image_size=32,
-        num_channels=3,
-        embeddings_size=10,
-        hidden_sizes=[10, 20, 30, 40],
-        depths=[1, 1, 2, 1],
-        is_training=True,
-        use_labels=True,
-        hidden_act="relu",
-        num_labels=3,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.num_channels = num_channels
-        self.embeddings_size = embeddings_size
-        self.hidden_sizes = hidden_sizes
-        self.depths = depths
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.hidden_act = hidden_act
-        self.num_labels = num_labels
-        self.scope = scope
-        self.num_stages = len(hidden_sizes)
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        config = self.get_config()
-
-        return config, pixel_values
-
-    def get_config(self):
-        return RegNetConfig(
-            num_channels=self.num_channels,
-            embeddings_size=self.embeddings_size,
-            hidden_sizes=self.hidden_sizes,
-            depths=self.depths,
-            hidden_act=self.hidden_act,
-            num_labels=self.num_labels,
-            image_size=self.image_size,
-        )
-
-    def create_and_check_model(self, config, pixel_values):
-        model = FlaxRegNetModel(config=config)
-        result = model(pixel_values)
-
-        # Output shape (b, c, h, w)
-        self.parent.assertEqual(
-            result.last_hidden_state.shape,
-            (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32),
-        )
-
-    def create_and_check_for_image_classification(self, config, pixel_values):
-        config.num_labels = self.num_labels
-        model = FlaxRegNetForImageClassification(config=config)
-        result = model(pixel_values)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_flax
-class FlaxResNetModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (FlaxRegNetModel, FlaxRegNetForImageClassification) if is_flax_available() else ()
-
-    is_encoder_decoder = False
-    test_head_masking = False
-    has_attentions = False
-
-    def setUp(self) -> None:
-        self.model_tester = FlaxRegNetModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=RegNetConfig, has_text_modality=False)
-
-    def test_config(self):
-        self.create_and_test_config_common_properties()
-        self.config_tester.create_and_test_config_to_json_string()
-        self.config_tester.create_and_test_config_to_json_file()
-        self.config_tester.create_and_test_config_from_and_save_pretrained()
-        self.config_tester.create_and_test_config_with_num_labels()
-        self.config_tester.check_config_can_be_init_without_params()
-        self.config_tester.check_config_arguments_init()
-
-    def create_and_test_config_common_properties(self):
-        return
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
-    @unittest.skip(reason="RegNet does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="RegNet does not support input and output embeddings")
-    def test_model_common_attributes(self):
-        pass
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.__call__)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
-
-            expected_num_stages = self.model_tester.num_stages
-            self.assertEqual(len(hidden_states), expected_num_stages + 1)
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-    def test_jit_compilation(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                model = model_class(config)
-
-                @jax.jit
-                def model_jitted(pixel_values, **kwargs):
-                    return model(pixel_values=pixel_values, **kwargs)
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = model_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = model_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_flax
-class FlaxRegNetModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return AutoImageProcessor.from_pretrained("facebook/regnet-y-040") if is_vision_available() else None
-
-    @slow
-    def test_inference_image_classification_head(self):
-        model = FlaxRegNetForImageClassification.from_pretrained("facebook/regnet-y-040")
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="np")
-
-        outputs = model(**inputs)
-
-        # verify the logits
-        expected_shape = (1, 1000)
-        self.assertEqual(outputs.logits.shape, expected_shape)
-
-        expected_slice = jnp.array([-0.4180, -1.5051, -3.4836])
-
-        self.assertTrue(jnp.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
diff --git a/tests/models/regnet/test_modeling_tf_regnet.py b/tests/models/regnet/test_modeling_tf_regnet.py
deleted file mode 100644
index df017fb416..0000000000
--- a/tests/models/regnet/test_modeling_tf_regnet.py
+++ /dev/null
@@ -1,288 +0,0 @@
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TensorFlow RegNet model."""
-
-from __future__ import annotations
-
-import inspect
-import unittest
-
-from transformers import RegNetConfig
-from transformers.testing_utils import require_tf, require_vision, slow
-from transformers.utils import cached_property, is_tf_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import TFRegNetForImageClassification, TFRegNetModel
-
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import AutoImageProcessor
-
-
-class TFRegNetModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=3,
-        image_size=32,
-        num_channels=3,
-        embeddings_size=10,
-        hidden_sizes=[10, 20, 30, 40],
-        depths=[1, 1, 2, 1],
-        is_training=True,
-        use_labels=True,
-        hidden_act="relu",
-        num_labels=3,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.num_channels = num_channels
-        self.embeddings_size = embeddings_size
-        self.hidden_sizes = hidden_sizes
-        self.depths = depths
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.hidden_act = hidden_act
-        self.num_labels = num_labels
-        self.scope = scope
-        self.num_stages = len(hidden_sizes)
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        labels = None
-        if self.use_labels:
-            labels = ids_tensor([self.batch_size], self.num_labels)
-
-        config = self.get_config()
-        return config, pixel_values, labels
-
-    def get_config(self):
-        return RegNetConfig(
-            num_channels=self.num_channels,
-            embeddings_size=self.embeddings_size,
-            hidden_sizes=self.hidden_sizes,
-            depths=self.depths,
-            hidden_act=self.hidden_act,
-            num_labels=self.num_labels,
-        )
-
-    def create_and_check_model(self, config, pixel_values, labels):
-        model = TFRegNetModel(config=config)
-        result = model(pixel_values, training=False)
-        # expected last hidden states: B, C, H // 32, W // 32
-        self.parent.assertEqual(
-            result.last_hidden_state.shape,
-            (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32),
-        )
-
-    def create_and_check_for_image_classification(self, config, pixel_values, labels):
-        config.num_labels = self.num_labels
-        model = TFRegNetForImageClassification(config)
-        result = model(pixel_values, labels=labels, training=False)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values, labels = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_tf
-class TFRegNetModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    """
-    Here we also overwrite some of the tests of test_modeling_common.py, as RegNet does not use input_ids, inputs_embeds,
-    attention_mask and seq_length.
-    """
-
-    all_model_classes = (TFRegNetModel, TFRegNetForImageClassification) if is_tf_available() else ()
-    pipeline_model_mapping = (
-        {"feature-extraction": TFRegNetModel, "image-classification": TFRegNetForImageClassification}
-        if is_tf_available()
-        else {}
-    )
-
-    test_pruning = False
-    test_onnx = False
-    test_resize_embeddings = False
-    test_head_masking = False
-    has_attentions = False
-
-    def setUp(self):
-        self.model_tester = TFRegNetModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=RegNetConfig, has_text_modality=False)
-
-    def create_and_test_config_common_properties(self):
-        return
-
-    @unittest.skip(reason="RegNet does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skipIf(
-        not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0,
-        reason="TF does not support backprop for grouped convolutions on CPU.",
-    )
-    @slow
-    def test_keras_fit(self):
-        super().test_keras_fit()
-
-    @unittest.skip(reason="RegNet does not support input and output embeddings")
-    def test_model_common_attributes(self):
-        pass
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
-
-            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
-
-            expected_num_stages = self.model_tester.num_stages
-            self.assertEqual(len(hidden_states), expected_num_stages + 1)
-
-            # RegNet's feature maps are of shape (batch_size, num_channels, height, width)
-            self.assertListEqual(
-                list(hidden_states[0].shape[-2:]),
-                [self.model_tester.image_size // 2, self.model_tester.image_size // 2],
-            )
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        layers_type = ["basic", "bottleneck"]
-        for model_class in self.all_model_classes:
-            for layer_type in layers_type:
-                config.layer_type = layer_type
-                inputs_dict["output_hidden_states"] = True
-                check_hidden_states_output(inputs_dict, config, model_class)
-
-                # check that output_hidden_states also work using config
-                del inputs_dict["output_hidden_states"]
-                config.output_hidden_states = True
-
-                check_hidden_states_output(inputs_dict, config, model_class)
-
-    # Since RegNet does not have any attention we need to rewrite this test.
-    def test_model_outputs_equivalence(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
-            tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs)
-            dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple()
-
-            def recursive_check(tuple_object, dict_object):
-                if isinstance(tuple_object, (list, tuple)):
-                    for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
-                        recursive_check(tuple_iterable_value, dict_iterable_value)
-                elif tuple_object is None:
-                    return
-                else:
-                    self.assertTrue(
-                        all(tf.equal(tuple_object, dict_object)),
-                        msg=(
-                            "Tuple and dict output are not equal. Difference:"
-                            f" {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}"
-                        ),
-                    )
-
-                recursive_check(tuple_output, dict_output)
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
-            check_equivalence(model, tuple_inputs, dict_inputs)
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            check_equivalence(model, tuple_inputs, dict_inputs)
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
-            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
-
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "facebook/regnet-y-040"
-        model = TFRegNetModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_tf
-@require_vision
-class RegNetModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return AutoImageProcessor.from_pretrained("facebook/regnet-y-040") if is_vision_available() else None
-
-    @slow
-    def test_inference_image_classification_head(self):
-        model = TFRegNetForImageClassification.from_pretrained("facebook/regnet-y-040")
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="tf")
-
-        # forward pass
-        outputs = model(**inputs, training=False)
-
-        # verify the logits
-        expected_shape = tf.TensorShape((1, 1000))
-        self.assertEqual(outputs.logits.shape, expected_shape)
-
-        expected_slice = tf.constant([-0.4180, -1.5051, -3.4836])
-
-        tf.debugging.assert_near(outputs.logits[0, :3], expected_slice, atol=1e-4)
diff --git a/tests/models/rembert/test_modeling_tf_rembert.py b/tests/models/rembert/test_modeling_tf_rembert.py
deleted file mode 100644
index b07d84d607..0000000000
--- a/tests/models/rembert/test_modeling_tf_rembert.py
+++ /dev/null
@@ -1,727 +0,0 @@
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import RemBertConfig, is_tf_available
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import (
-        TFRemBertForCausalLM,
-        TFRemBertForMaskedLM,
-        TFRemBertForMultipleChoice,
-        TFRemBertForQuestionAnswering,
-        TFRemBertForSequenceClassification,
-        TFRemBertForTokenClassification,
-        TFRemBertModel,
-    )
-
-
-class TFRemBertModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        input_embedding_size=18,
-        output_embedding_size=43,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_labels=3,
-        num_choices=4,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.is_training = True
-        self.use_input_mask = True
-        self.use_token_type_ids = True
-        self.use_labels = True
-        self.vocab_size = 99
-        self.hidden_size = 32
-        self.input_embedding_size = input_embedding_size
-        self.output_embedding_size = output_embedding_size
-        self.num_hidden_layers = 2
-        self.num_attention_heads = 4
-        self.intermediate_size = 37
-        self.hidden_act = "gelu"
-        self.hidden_dropout_prob = 0.1
-        self.attention_probs_dropout_prob = 0.1
-        self.max_position_embeddings = 512
-        self.type_vocab_size = 16
-        self.type_sequence_label_size = 2
-        self.initializer_range = 0.02
-        self.num_labels = 3
-        self.num_choices = 4
-        self.scope = None
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = RemBertConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            input_embedding_size=self.input_embedding_size,
-            output_embedding_size=self.output_embedding_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            initializer_range=self.initializer_range,
-            return_dict=True,
-        )
-
-        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def prepare_config_and_inputs_for_decoder(self):
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = self.prepare_config_and_inputs()
-
-        config.is_decoder = True
-        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
-        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-        return (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-    def create_and_check_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFRemBertModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_causal_lm_base_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.is_decoder = True
-
-        model = TFRemBertModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_model_as_decoder(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-
-        model = TFRemBertModel(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-            "encoder_hidden_states": encoder_hidden_states,
-            "encoder_attention_mask": encoder_attention_mask,
-        }
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
-
-        # Also check the case where encoder outputs are not passed
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_causal_lm_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.is_decoder = True
-        model = TFRemBertForCausalLM(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        prediction_scores = model(inputs)["logits"]
-        self.parent.assertListEqual(
-            list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size]
-        )
-
-    def create_and_check_causal_lm_model_as_decoder(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-
-        model = TFRemBertForCausalLM(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-            "encoder_hidden_states": encoder_hidden_states,
-            "encoder_attention_mask": encoder_attention_mask,
-        }
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
-
-        prediction_scores = result["logits"]
-        self.parent.assertListEqual(
-            list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size]
-        )
-
-    def create_and_check_causal_lm_model_past(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.is_decoder = True
-
-        model = TFRemBertForCausalLM(config=config)
-
-        # first forward pass
-        outputs = model(input_ids, use_cache=True)
-        outputs_use_cache_conf = model(input_ids)
-        outputs_no_past = model(input_ids, use_cache=False)
-
-        self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
-        self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
-
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-        # append to next input_ids and attn_mask
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-
-        output_from_no_past = model(next_input_ids, output_hidden_states=True).hidden_states[0]
-        output_from_past = model(
-            next_tokens, past_key_values=past_key_values, output_hidden_states=True
-        ).hidden_states[0]
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
-        output_from_past_slice = output_from_past[:, 0, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)
-
-    def create_and_check_causal_lm_model_past_with_attn_mask(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.is_decoder = True
-
-        model = TFRemBertForCausalLM(config=config)
-
-        # create attention mask
-        half_seq_length = self.seq_length // 2
-        attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32)
-        attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32)
-        attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1)
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=attn_mask, use_cache=True)
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-        past_key_values = outputs.past_key_values
-
-        # change a random masked slice from input_ids
-        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1
-        random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size)
-        vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change)
-        condition = tf.transpose(
-            tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size))
-        )
-        input_ids = tf.where(condition, random_other_next_tokens, input_ids)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        attn_mask = tf.concat(
-            [attn_mask, tf.ones((attn_mask.shape[0], 1), dtype=tf.int32)],
-            axis=1,
-        )
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=attn_mask,
-            output_hidden_states=True,
-        ).hidden_states[0]
-        output_from_past = model(
-            next_tokens, past_key_values=past_key_values, attention_mask=attn_mask, output_hidden_states=True
-        ).hidden_states[0]
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
-        output_from_past_slice = output_from_past[:, 0, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)
-
-    def create_and_check_causal_lm_model_past_large_inputs(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.is_decoder = True
-
-        model = TFRemBertForCausalLM(config=config)
-
-        input_ids = input_ids[:1, :]
-        input_mask = input_mask[:1, :]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=input_mask, use_cache=True)
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=next_attention_mask,
-            output_hidden_states=True,
-        ).hidden_states[0]
-        output_from_past = model(
-            next_tokens,
-            attention_mask=next_attention_mask,
-            past_key_values=past_key_values,
-            output_hidden_states=True,
-        ).hidden_states[0]
-
-        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-    def create_and_check_decoder_model_past_large_inputs(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-
-        model = TFRemBertForCausalLM(config=config)
-
-        input_ids = input_ids[:1, :]
-        input_mask = input_mask[:1, :]
-        encoder_hidden_states = encoder_hidden_states[:1, :, :]
-        encoder_attention_mask = encoder_attention_mask[:1, :]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(
-            input_ids,
-            attention_mask=input_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            use_cache=True,
-        )
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=next_attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            output_hidden_states=True,
-        ).hidden_states[0]
-        output_from_past = model(
-            next_tokens,
-            attention_mask=next_attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            past_key_values=past_key_values,
-            output_hidden_states=True,
-        ).hidden_states[0]
-
-        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-    def create_and_check_for_masked_lm(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFRemBertForMaskedLM(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_for_sequence_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFRemBertForSequenceClassification(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_for_multiple_choice(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_choices = self.num_choices
-        model = TFRemBertForMultipleChoice(config=config)
-        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
-        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
-        inputs = {
-            "input_ids": multiple_choice_inputs_ids,
-            "attention_mask": multiple_choice_input_mask,
-            "token_type_ids": multiple_choice_token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-    def create_and_check_for_token_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFRemBertForTokenClassification(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def create_and_check_for_question_answering(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFRemBertForQuestionAnswering(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-
-        result = model(inputs)
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class TFRemBertModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFRemBertModel,
-            TFRemBertForCausalLM,
-            TFRemBertForMaskedLM,
-            TFRemBertForQuestionAnswering,
-            TFRemBertForSequenceClassification,
-            TFRemBertForTokenClassification,
-            TFRemBertForMultipleChoice,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFRemBertModel,
-            "fill-mask": TFRemBertForMaskedLM,
-            "question-answering": TFRemBertForQuestionAnswering,
-            "text-classification": TFRemBertForSequenceClassification,
-            "text-generation": TFRemBertForCausalLM,
-            "token-classification": TFRemBertForTokenClassification,
-            "zero-shot": TFRemBertForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFRemBertModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=RemBertConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        """Test the base model"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_causal_lm_base_model(self):
-        """Test the base model of the causal LM model
-
-        is_decoder=True, no cross_attention, no encoder outputs
-        """
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_base_model(*config_and_inputs)
-
-    def test_model_as_decoder(self):
-        """Test the base model as a decoder (of an encoder-decoder architecture)
-
-        is_decoder=True + cross_attention + pass encoder outputs
-        """
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_causal_lm(self):
-        """Test the causal LM model"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model(*config_and_inputs)
-
-    def test_causal_lm_model_as_decoder(self):
-        """Test the causal LM model as a decoder"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_causal_lm_model_as_decoder(*config_and_inputs)
-
-    def test_causal_lm_model_past(self):
-        """Test causal LM model with `past_key_values`"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model_past(*config_and_inputs)
-
-    def test_causal_lm_model_past_with_attn_mask(self):
-        """Test the causal LM model with `past_key_values` and `attention_mask`"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model_past_with_attn_mask(*config_and_inputs)
-
-    def test_causal_lm_model_past_with_large_inputs(self):
-        """Test the causal LM model with `past_key_values` and a longer decoder sequence length"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model_past_large_inputs(*config_and_inputs)
-
-    def test_decoder_model_past_with_large_inputs(self):
-        """Similar to `test_causal_lm_model_past_with_large_inputs` but with cross-attention"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model = TFRemBertModel.from_pretrained("google/rembert")
-        self.assertIsNotNone(model)
-
-
-@require_tf
-class TFRemBertModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_model(self):
-        model = TFRemBertModel.from_pretrained("google/rembert")
-
-        input_ids = tf.constant([[312, 56498, 313, 2125, 313]])
-        segment_ids = tf.constant([[0, 0, 0, 1, 1]])
-        output = model(input_ids, token_type_ids=segment_ids, output_hidden_states=True)
-
-        hidden_size = 1152
-
-        expected_shape = [1, 5, hidden_size]
-        self.assertEqual(output["last_hidden_state"].shape, expected_shape)
-
-        expected_implementation = tf.constant(
-            [
-                [
-                    [0.0754, -0.2022, 0.1904],
-                    [-0.3354, -0.3692, -0.4791],
-                    [-0.2314, -0.6729, -0.0749],
-                    [-0.0396, -0.3105, -0.4234],
-                    [-0.1571, -0.0525, 0.5353],
-                ]
-            ]
-        )
-        tf.debugging.assert_near(output["last_hidden_state"][:, :, :3], expected_implementation, atol=1e-4)
-
-        # Running on the original tf implementation gives slightly different results here.
-        # Not clear why this variations is present
-        # TODO: Find reason for discrepancy
-        # expected_original_implementation = [[
-        #     [0.07630594074726105, -0.20146065950393677, 0.19107051193714142],
-        #     [-0.3405614495277405, -0.36971670389175415, -0.4808273911476135],
-        #     [-0.22587086260318756, -0.6656315922737122, -0.07844287157058716],
-        #     [-0.04145475849509239, -0.3077218234539032, -0.42316967248916626],
-        #     [-0.15887849032878876, -0.054529931396245956, 0.5356100797653198]
-        # ]]
diff --git a/tests/models/resnet/test_modeling_flax_resnet.py b/tests/models/resnet/test_modeling_flax_resnet.py
deleted file mode 100644
index 7399405f00..0000000000
--- a/tests/models/resnet/test_modeling_flax_resnet.py
+++ /dev/null
@@ -1,228 +0,0 @@
-# Copyright 2023 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-import unittest
-
-from transformers import ResNetConfig, is_flax_available
-from transformers.testing_utils import require_flax, slow
-from transformers.utils import cached_property, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor
-
-
-if is_flax_available():
-    import jax
-    import jax.numpy as jnp
-
-    from transformers.models.resnet.modeling_flax_resnet import FlaxResNetForImageClassification, FlaxResNetModel
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import AutoImageProcessor
-
-
-class FlaxResNetModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=3,
-        image_size=32,
-        num_channels=3,
-        embeddings_size=10,
-        hidden_sizes=[10, 20, 30, 40],
-        depths=[1, 1, 2, 1],
-        is_training=True,
-        use_labels=True,
-        hidden_act="relu",
-        num_labels=3,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.num_channels = num_channels
-        self.embeddings_size = embeddings_size
-        self.hidden_sizes = hidden_sizes
-        self.depths = depths
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.hidden_act = hidden_act
-        self.num_labels = num_labels
-        self.scope = scope
-        self.num_stages = len(hidden_sizes)
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        config = self.get_config()
-
-        return config, pixel_values
-
-    def get_config(self):
-        return ResNetConfig(
-            num_channels=self.num_channels,
-            embeddings_size=self.embeddings_size,
-            hidden_sizes=self.hidden_sizes,
-            depths=self.depths,
-            hidden_act=self.hidden_act,
-            num_labels=self.num_labels,
-            image_size=self.image_size,
-        )
-
-    def create_and_check_model(self, config, pixel_values):
-        model = FlaxResNetModel(config=config)
-        result = model(pixel_values)
-
-        # Output shape (b, c, h, w)
-        self.parent.assertEqual(
-            result.last_hidden_state.shape,
-            (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32),
-        )
-
-    def create_and_check_for_image_classification(self, config, pixel_values):
-        config.num_labels = self.num_labels
-        model = FlaxResNetForImageClassification(config=config)
-        result = model(pixel_values)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_flax
-class FlaxResNetModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (FlaxResNetModel, FlaxResNetForImageClassification) if is_flax_available() else ()
-
-    is_encoder_decoder = False
-    test_head_masking = False
-    has_attentions = False
-
-    def setUp(self) -> None:
-        self.model_tester = FlaxResNetModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=ResNetConfig, has_text_modality=False)
-
-    def test_config(self):
-        self.create_and_test_config_common_properties()
-        self.config_tester.create_and_test_config_to_json_string()
-        self.config_tester.create_and_test_config_to_json_file()
-        self.config_tester.create_and_test_config_from_and_save_pretrained()
-        self.config_tester.create_and_test_config_with_num_labels()
-        self.config_tester.check_config_can_be_init_without_params()
-        self.config_tester.check_config_arguments_init()
-
-    def create_and_test_config_common_properties(self):
-        return
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
-    @unittest.skip(reason="ResNet does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="ResNet does not support input and output embeddings")
-    def test_model_common_attributes(self):
-        pass
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.__call__)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
-
-            expected_num_stages = self.model_tester.num_stages
-            self.assertEqual(len(hidden_states), expected_num_stages + 1)
-
-    @unittest.skip(reason="ResNet does not use feedforward chunking")
-    def test_feed_forward_chunking(self):
-        pass
-
-    def test_jit_compilation(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                model = model_class(config)
-
-                @jax.jit
-                def model_jitted(pixel_values, **kwargs):
-                    return model(pixel_values=pixel_values, **kwargs)
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = model_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = model_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_flax
-class FlaxResNetModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return AutoImageProcessor.from_pretrained("microsoft/resnet-50") if is_vision_available() else None
-
-    @slow
-    def test_inference_image_classification_head(self):
-        model = FlaxResNetForImageClassification.from_pretrained("microsoft/resnet-50")
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="np")
-
-        outputs = model(**inputs)
-
-        # verify the logits
-        expected_shape = (1, 1000)
-        self.assertEqual(outputs.logits.shape, expected_shape)
-
-        expected_slice = jnp.array([-11.1069, -9.7877, -8.3777])
-
-        self.assertTrue(jnp.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
diff --git a/tests/models/resnet/test_modeling_tf_resnet.py b/tests/models/resnet/test_modeling_tf_resnet.py
deleted file mode 100644
index a64a3479ed..0000000000
--- a/tests/models/resnet/test_modeling_tf_resnet.py
+++ /dev/null
@@ -1,249 +0,0 @@
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the Tensorflow ResNet model."""
-
-from __future__ import annotations
-
-import inspect
-import unittest
-
-import numpy as np
-
-from transformers import ResNetConfig
-from transformers.testing_utils import require_tf, require_vision, slow
-from transformers.utils import cached_property, is_tf_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import TFResNetForImageClassification, TFResNetModel
-
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import AutoImageProcessor
-
-
-class TFResNetModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=3,
-        image_size=32,
-        num_channels=3,
-        embeddings_size=10,
-        hidden_sizes=[10, 20, 30, 40],
-        depths=[1, 1, 2, 1],
-        is_training=True,
-        use_labels=True,
-        hidden_act="relu",
-        num_labels=3,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.num_channels = num_channels
-        self.embeddings_size = embeddings_size
-        self.hidden_sizes = hidden_sizes
-        self.depths = depths
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.hidden_act = hidden_act
-        self.num_labels = num_labels
-        self.scope = scope
-        self.num_stages = len(hidden_sizes)
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        labels = None
-        if self.use_labels:
-            labels = ids_tensor([self.batch_size], self.num_labels)
-
-        config = self.get_config()
-
-        return config, pixel_values, labels
-
-    def get_config(self):
-        return ResNetConfig(
-            num_channels=self.num_channels,
-            embeddings_size=self.embeddings_size,
-            hidden_sizes=self.hidden_sizes,
-            depths=self.depths,
-            hidden_act=self.hidden_act,
-            num_labels=self.num_labels,
-            image_size=self.image_size,
-        )
-
-    def create_and_check_model(self, config, pixel_values, labels):
-        model = TFResNetModel(config=config)
-        result = model(pixel_values)
-        # expected last hidden states: B, C, H // 32, W // 32
-        self.parent.assertEqual(
-            result.last_hidden_state.shape,
-            (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32),
-        )
-
-    def create_and_check_for_image_classification(self, config, pixel_values, labels):
-        config.num_labels = self.num_labels
-        model = TFResNetForImageClassification(config)
-        result = model(pixel_values, labels=labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values, labels = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_tf
-class TFResNetModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    """
-    Here we also overwrite some of the tests of test_modeling_common.py, as ResNet does not use input_ids, inputs_embeds,
-    attention_mask and seq_length.
-    """
-
-    all_model_classes = (TFResNetModel, TFResNetForImageClassification) if is_tf_available() else ()
-    pipeline_model_mapping = (
-        {"feature-extraction": TFResNetModel, "image-classification": TFResNetForImageClassification}
-        if is_tf_available()
-        else {}
-    )
-
-    test_pruning = False
-    test_resize_embeddings = False
-    test_head_masking = False
-    test_onnx = False
-    has_attentions = False
-
-    def setUp(self):
-        self.model_tester = TFResNetModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=ResNetConfig, has_text_modality=False)
-
-    def test_config(self):
-        self.create_and_test_config_common_properties()
-        self.config_tester.create_and_test_config_to_json_string()
-        self.config_tester.create_and_test_config_to_json_file()
-        self.config_tester.create_and_test_config_from_and_save_pretrained()
-        self.config_tester.create_and_test_config_with_num_labels()
-        self.config_tester.check_config_can_be_init_without_params()
-        self.config_tester.check_config_arguments_init()
-
-    def create_and_test_config_common_properties(self):
-        return
-
-    @unittest.skip(reason="ResNet does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="ResNet does not support input and output embeddings")
-    def test_model_common_attributes(self):
-        pass
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
-
-            expected_num_stages = self.model_tester.num_stages
-            self.assertEqual(len(hidden_states), expected_num_stages + 1)
-
-            # ResNet's feature maps are of shape (batch_size, num_channels, height, width)
-            self.assertListEqual(
-                list(hidden_states[0].shape[-2:]),
-                [self.model_tester.image_size // 4, self.model_tester.image_size // 4],
-            )
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        layers_type = ["basic", "bottleneck"]
-        for model_class in self.all_model_classes:
-            for layer_type in layers_type:
-                config.layer_type = layer_type
-                inputs_dict["output_hidden_states"] = True
-                check_hidden_states_output(inputs_dict, config, model_class)
-
-                # check that output_hidden_states also work using config
-                del inputs_dict["output_hidden_states"]
-                config.output_hidden_states = True
-
-                check_hidden_states_output(inputs_dict, config, model_class)
-
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "microsoft/resnet-50"
-        model = TFResNetModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_tf
-@require_vision
-class TFResNetModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return AutoImageProcessor.from_pretrained("microsoft/resnet-50") if is_vision_available() else None
-
-    @slow
-    def test_inference_image_classification_head(self):
-        model = TFResNetForImageClassification.from_pretrained("microsoft/resnet-50")
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="tf")
-
-        # forward pass
-        outputs = model(**inputs)
-
-        # verify the logits
-        expected_shape = tf.TensorShape((1, 1000))
-        self.assertEqual(outputs.logits.shape, expected_shape)
-
-        expected_slice = tf.constant([-11.1069, -9.7877, -8.3777])
-
-        self.assertTrue(np.allclose(outputs.logits[0, :3].numpy(), expected_slice, atol=1e-4))
diff --git a/tests/models/roberta/test_modeling_flax_roberta.py b/tests/models/roberta/test_modeling_flax_roberta.py
deleted file mode 100644
index b9a877d2bd..0000000000
--- a/tests/models/roberta/test_modeling_flax_roberta.py
+++ /dev/null
@@ -1,159 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-
-from transformers import RobertaConfig, is_flax_available
-from transformers.testing_utils import require_flax, slow
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-
-
-if is_flax_available():
-    from transformers.models.roberta.modeling_flax_roberta import (
-        FlaxRobertaForCausalLM,
-        FlaxRobertaForMaskedLM,
-        FlaxRobertaForMultipleChoice,
-        FlaxRobertaForQuestionAnswering,
-        FlaxRobertaForSequenceClassification,
-        FlaxRobertaForTokenClassification,
-        FlaxRobertaModel,
-    )
-
-
-class FlaxRobertaModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_attention_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_choices=4,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_attention_mask = use_attention_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.num_choices = num_choices
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        attention_mask = None
-        if self.use_attention_mask:
-            attention_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        config = RobertaConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-        )
-
-        return config, input_ids, token_type_ids, attention_mask
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, token_type_ids, attention_mask = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask}
-        return config, inputs_dict
-
-    def prepare_config_and_inputs_for_decoder(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, token_type_ids, attention_mask = config_and_inputs
-
-        config.is_decoder = True
-        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
-        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-        return (
-            config,
-            input_ids,
-            token_type_ids,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-
-@require_flax
-class FlaxRobertaModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    test_head_masking = True
-
-    all_model_classes = (
-        (
-            FlaxRobertaModel,
-            FlaxRobertaForCausalLM,
-            FlaxRobertaForMaskedLM,
-            FlaxRobertaForSequenceClassification,
-            FlaxRobertaForTokenClassification,
-            FlaxRobertaForMultipleChoice,
-            FlaxRobertaForQuestionAnswering,
-        )
-        if is_flax_available()
-        else ()
-    )
-
-    def setUp(self):
-        self.model_tester = FlaxRobertaModelTester(self)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("FacebookAI/roberta-base", from_pt=True)
-            outputs = model(np.ones((1, 1)))
-            self.assertIsNotNone(outputs)
diff --git a/tests/models/roberta/test_modeling_tf_roberta.py b/tests/models/roberta/test_modeling_tf_roberta.py
deleted file mode 100644
index d2dbc30928..0000000000
--- a/tests/models/roberta/test_modeling_tf_roberta.py
+++ /dev/null
@@ -1,700 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import RobertaConfig, is_tf_available
-from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import numpy
-    import tensorflow as tf
-
-    from transformers.models.roberta.modeling_tf_roberta import (
-        TFRobertaForCausalLM,
-        TFRobertaForMaskedLM,
-        TFRobertaForMultipleChoice,
-        TFRobertaForQuestionAnswering,
-        TFRobertaForSequenceClassification,
-        TFRobertaForTokenClassification,
-        TFRobertaModel,
-    )
-
-
-class TFRobertaModelTester:
-    def __init__(
-        self,
-        parent,
-    ):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.is_training = True
-        self.use_input_mask = True
-        self.use_token_type_ids = True
-        self.use_labels = True
-        self.vocab_size = 99
-        self.hidden_size = 32
-        self.num_hidden_layers = 2
-        self.num_attention_heads = 4
-        self.intermediate_size = 37
-        self.hidden_act = "gelu"
-        self.hidden_dropout_prob = 0.1
-        self.attention_probs_dropout_prob = 0.1
-        self.max_position_embeddings = 512
-        self.type_vocab_size = 16
-        self.type_sequence_label_size = 2
-        self.initializer_range = 0.02
-        self.num_labels = 3
-        self.num_choices = 4
-        self.scope = None
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = RobertaConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            initializer_range=self.initializer_range,
-        )
-
-        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def prepare_config_and_inputs_for_decoder(self):
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = self.prepare_config_and_inputs()
-
-        config.is_decoder = True
-        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
-        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-        return (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-    def create_and_check_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFRobertaModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_causal_lm_base_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.is_decoder = True
-
-        model = TFRobertaModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_model_as_decoder(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-
-        model = TFRobertaModel(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-            "encoder_hidden_states": encoder_hidden_states,
-            "encoder_attention_mask": encoder_attention_mask,
-        }
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
-
-        # Also check the case where encoder outputs are not passed
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_causal_lm_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.is_decoder = True
-
-        model = TFRobertaForCausalLM(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        prediction_scores = model(inputs)["logits"]
-        self.parent.assertListEqual(
-            list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size]
-        )
-
-    def create_and_check_causal_lm_model_as_decoder(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-
-        model = TFRobertaForCausalLM(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-            "encoder_hidden_states": encoder_hidden_states,
-            "encoder_attention_mask": encoder_attention_mask,
-        }
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
-
-        prediction_scores = result["logits"]
-        self.parent.assertListEqual(
-            list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size]
-        )
-
-    def create_and_check_causal_lm_model_past(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.is_decoder = True
-
-        model = TFRobertaForCausalLM(config=config)
-
-        # special to `RobertaEmbeddings` in `Roberta`:
-        #   - its `padding_idx` and its effect on `position_ids`
-        #     (TFRobertaEmbeddings.create_position_ids_from_input_ids)
-        #   - `1` here is `TFRobertaEmbeddings.padding_idx`
-        input_ids = tf.where(input_ids == 1, 2, input_ids)
-
-        # first forward pass
-        outputs = model(input_ids, use_cache=True)
-        outputs_use_cache_conf = model(input_ids)
-        outputs_no_past = model(input_ids, use_cache=False)
-
-        self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
-        self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
-
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-        # append to next input_ids and attn_mask
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-
-        output_from_no_past = model(next_input_ids, output_hidden_states=True).hidden_states[0]
-        output_from_past = model(
-            next_tokens, past_key_values=past_key_values, output_hidden_states=True
-        ).hidden_states[0]
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
-        output_from_past_slice = output_from_past[:, 0, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)
-
-    def create_and_check_causal_lm_model_past_with_attn_mask(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.is_decoder = True
-
-        model = TFRobertaForCausalLM(config=config)
-
-        # special to `RobertaEmbeddings` in `Roberta`:
-        #   - its `padding_idx` and its effect on `position_ids`
-        #     (TFRobertaEmbeddings.create_position_ids_from_input_ids)
-        #   - `1` here is `TFRobertaEmbeddings.padding_idx`
-        # avoid `padding_idx` in the past
-        input_ids = tf.where(input_ids == 1, 2, input_ids)
-
-        # create attention mask
-        half_seq_length = self.seq_length // 2
-        attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32)
-        attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32)
-        attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1)
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=attn_mask, use_cache=True)
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-        past_key_values = outputs.past_key_values
-
-        # change a random masked slice from input_ids
-        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1
-        random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size)
-        vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change)
-        condition = tf.transpose(
-            tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size))
-        )
-        input_ids = tf.where(condition, random_other_next_tokens, input_ids)
-        # avoid `padding_idx` in the past
-        input_ids = tf.where(input_ids == 1, 2, input_ids)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        attn_mask = tf.concat(
-            [attn_mask, tf.ones((attn_mask.shape[0], 1), dtype=tf.int32)],
-            axis=1,
-        )
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=attn_mask,
-            output_hidden_states=True,
-        ).hidden_states[0]
-        output_from_past = model(
-            next_tokens, past_key_values=past_key_values, attention_mask=attn_mask, output_hidden_states=True
-        ).hidden_states[0]
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
-        output_from_past_slice = output_from_past[:, 0, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)
-
-    def create_and_check_causal_lm_model_past_large_inputs(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.is_decoder = True
-
-        model = TFRobertaForCausalLM(config=config)
-
-        # special to `RobertaEmbeddings` in `Roberta`:
-        #   - its `padding_idx` and its effect on `position_ids`
-        #     (TFRobertaEmbeddings.create_position_ids_from_input_ids)
-        #   - `1` here is `TFRobertaEmbeddings.padding_idx`
-        # avoid `padding_idx` in the past
-        input_ids = tf.where(input_ids == 1, 2, input_ids)
-
-        input_ids = input_ids[:1, :]
-        input_mask = input_mask[:1, :]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=input_mask, use_cache=True)
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=next_attention_mask,
-            output_hidden_states=True,
-        ).hidden_states[0]
-        output_from_past = model(
-            next_tokens,
-            attention_mask=next_attention_mask,
-            past_key_values=past_key_values,
-            output_hidden_states=True,
-        ).hidden_states[0]
-
-        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-    def create_and_check_decoder_model_past_large_inputs(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-
-        model = TFRobertaForCausalLM(config=config)
-
-        # special to `RobertaEmbeddings` in `Roberta`:
-        #   - its `padding_idx` and its effect on `position_ids`
-        #     (TFRobertaEmbeddings.create_position_ids_from_input_ids)
-        #   - `1` here is `TFRobertaEmbeddings.padding_idx`
-        # avoid `padding_idx` in the past
-        input_ids = tf.where(input_ids == 1, 2, input_ids)
-
-        input_ids = input_ids[:1, :]
-        input_mask = input_mask[:1, :]
-        encoder_hidden_states = encoder_hidden_states[:1, :, :]
-        encoder_attention_mask = encoder_attention_mask[:1, :]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(
-            input_ids,
-            attention_mask=input_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            use_cache=True,
-        )
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=next_attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            output_hidden_states=True,
-        ).hidden_states[0]
-        output_from_past = model(
-            next_tokens,
-            attention_mask=next_attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            past_key_values=past_key_values,
-            output_hidden_states=True,
-        ).hidden_states[0]
-
-        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-    def create_and_check_for_masked_lm(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFRobertaForMaskedLM(config=config)
-        result = model([input_ids, input_mask, token_type_ids])
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_for_token_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFRobertaForTokenClassification(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def create_and_check_for_question_answering(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFRobertaForQuestionAnswering(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def create_and_check_for_multiple_choice(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_choices = self.num_choices
-        model = TFRobertaForMultipleChoice(config=config)
-        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
-        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
-        inputs = {
-            "input_ids": multiple_choice_inputs_ids,
-            "attention_mask": multiple_choice_input_mask,
-            "token_type_ids": multiple_choice_token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class TFRobertaModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFRobertaModel,
-            TFRobertaForCausalLM,
-            TFRobertaForMaskedLM,
-            TFRobertaForSequenceClassification,
-            TFRobertaForTokenClassification,
-            TFRobertaForQuestionAnswering,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFRobertaModel,
-            "fill-mask": TFRobertaForMaskedLM,
-            "question-answering": TFRobertaForQuestionAnswering,
-            "text-classification": TFRobertaForSequenceClassification,
-            "text-generation": TFRobertaForCausalLM,
-            "token-classification": TFRobertaForTokenClassification,
-            "zero-shot": TFRobertaForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFRobertaModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        """Test the base model"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_causal_lm_base_model(self):
-        """Test the base model of the causal LM model
-
-        is_decoder=True, no cross_attention, no encoder outputs
-        """
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_base_model(*config_and_inputs)
-
-    def test_model_as_decoder(self):
-        """Test the base model as a decoder (of an encoder-decoder architecture)
-
-        is_decoder=True + cross_attention + pass encoder outputs
-        """
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_causal_lm(self):
-        """Test the causal LM model"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model(*config_and_inputs)
-
-    def test_causal_lm_model_as_decoder(self):
-        """Test the causal LM model as a decoder"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_causal_lm_model_as_decoder(*config_and_inputs)
-
-    def test_causal_lm_model_past(self):
-        """Test causal LM model with `past_key_values`"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model_past(*config_and_inputs)
-
-    def test_causal_lm_model_past_with_attn_mask(self):
-        """Test the causal LM model with `past_key_values` and `attention_mask`"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model_past_with_attn_mask(*config_and_inputs)
-
-    def test_causal_lm_model_past_with_large_inputs(self):
-        """Test the causal LM model with `past_key_values` and a longer decoder sequence length"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model_past_large_inputs(*config_and_inputs)
-
-    def test_decoder_model_past_with_large_inputs(self):
-        """Similar to `test_causal_lm_model_past_with_large_inputs` but with cross-attention"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "FacebookAI/roberta-base"
-        model = TFRobertaModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-@require_tf
-@require_sentencepiece
-@require_tokenizers
-class TFRobertaModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_masked_lm(self):
-        model = TFRobertaForMaskedLM.from_pretrained("FacebookAI/roberta-base")
-
-        input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        output = model(input_ids)[0]
-        expected_shape = [1, 11, 50265]
-        self.assertEqual(list(output.numpy().shape), expected_shape)
-        # compare the actual values for a slice.
-        expected_slice = tf.constant(
-            [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]]
-        )
-        self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4))
-
-    @slow
-    def test_inference_no_head(self):
-        model = TFRobertaModel.from_pretrained("FacebookAI/roberta-base")
-
-        input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        output = model(input_ids)[0]
-        # compare the actual values for a slice.
-        expected_slice = tf.constant(
-            [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0540, -0.0175], [0.0548, 0.0799, 0.1687]]]
-        )
-        self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4))
-
-    @slow
-    def test_inference_classification_head(self):
-        model = TFRobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-large-mnli")
-
-        input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        output = model(input_ids)[0]
-        expected_shape = [1, 3]
-        self.assertEqual(list(output.numpy().shape), expected_shape)
-        expected_tensor = tf.constant([[-0.9469, 0.3913, 0.5118]])
-        self.assertTrue(numpy.allclose(output.numpy(), expected_tensor.numpy(), atol=1e-4))
diff --git a/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py b/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py
deleted file mode 100644
index d464e28640..0000000000
--- a/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py
+++ /dev/null
@@ -1,192 +0,0 @@
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-
-from transformers import RobertaPreLayerNormConfig, is_flax_available
-from transformers.testing_utils import require_flax, slow
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-
-
-if is_flax_available():
-    import jax.numpy as jnp
-
-    from transformers.models.roberta_prelayernorm.modeling_flax_roberta_prelayernorm import (
-        FlaxRobertaPreLayerNormForCausalLM,
-        FlaxRobertaPreLayerNormForMaskedLM,
-        FlaxRobertaPreLayerNormForMultipleChoice,
-        FlaxRobertaPreLayerNormForQuestionAnswering,
-        FlaxRobertaPreLayerNormForSequenceClassification,
-        FlaxRobertaPreLayerNormForTokenClassification,
-        FlaxRobertaPreLayerNormModel,
-    )
-
-
-# Copied from tests.models.roberta.test_modeling_flax_roberta.FlaxRobertaModelTester with Roberta->RobertaPreLayerNorm
-class FlaxRobertaPreLayerNormModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_attention_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_choices=4,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_attention_mask = use_attention_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.num_choices = num_choices
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        attention_mask = None
-        if self.use_attention_mask:
-            attention_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        config = RobertaPreLayerNormConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-        )
-
-        return config, input_ids, token_type_ids, attention_mask
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, token_type_ids, attention_mask = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask}
-        return config, inputs_dict
-
-    def prepare_config_and_inputs_for_decoder(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, token_type_ids, attention_mask = config_and_inputs
-
-        config.is_decoder = True
-        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
-        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-        return (
-            config,
-            input_ids,
-            token_type_ids,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-
-@require_flax
-# Copied from tests.models.roberta.test_modeling_flax_roberta.FlaxRobertaModelTest with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,FacebookAI/roberta-base->andreasmadsen/efficient_mlm_m0.40
-class FlaxRobertaPreLayerNormModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    test_head_masking = True
-
-    all_model_classes = (
-        (
-            FlaxRobertaPreLayerNormModel,
-            FlaxRobertaPreLayerNormForCausalLM,
-            FlaxRobertaPreLayerNormForMaskedLM,
-            FlaxRobertaPreLayerNormForSequenceClassification,
-            FlaxRobertaPreLayerNormForTokenClassification,
-            FlaxRobertaPreLayerNormForMultipleChoice,
-            FlaxRobertaPreLayerNormForQuestionAnswering,
-        )
-        if is_flax_available()
-        else ()
-    )
-
-    def setUp(self):
-        self.model_tester = FlaxRobertaPreLayerNormModelTester(self)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("andreasmadsen/efficient_mlm_m0.40", from_pt=True)
-            outputs = model(np.ones((1, 1)))
-            self.assertIsNotNone(outputs)
-
-
-@require_flax
-class TFRobertaPreLayerNormModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_masked_lm(self):
-        model = FlaxRobertaPreLayerNormForMaskedLM.from_pretrained("andreasmadsen/efficient_mlm_m0.40", from_pt=True)
-
-        input_ids = np.array([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]], dtype=jnp.int32)
-        output = model(input_ids)[0]
-        expected_shape = [1, 11, 50265]
-        self.assertEqual(list(output.shape), expected_shape)
-        # compare the actual values for a slice.
-        EXPECTED_SLICE = np.array(
-            [[[40.4880, 18.0199, -5.2367], [-1.8877, -4.0885, 10.7085], [-2.2613, -5.6110, 7.2665]]], dtype=np.float32
-        )
-        self.assertTrue(np.allclose(output[:, :3, :3], EXPECTED_SLICE, atol=1e-4))
-
-    @slow
-    def test_inference_no_head(self):
-        model = FlaxRobertaPreLayerNormModel.from_pretrained("andreasmadsen/efficient_mlm_m0.40", from_pt=True)
-
-        input_ids = np.array([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]], dtype=jnp.int32)
-        output = model(input_ids)[0]
-        # compare the actual values for a slice.
-        EXPECTED_SLICE = np.array(
-            [[[0.0208, -0.0356, 0.0237], [-0.1569, -0.0411, -0.2626], [0.1879, 0.0125, -0.0089]]], dtype=np.float32
-        )
-        self.assertTrue(np.allclose(output[:, :3, :3], EXPECTED_SLICE, atol=1e-4))
diff --git a/tests/models/roberta_prelayernorm/test_modeling_tf_roberta_prelayernorm.py b/tests/models/roberta_prelayernorm/test_modeling_tf_roberta_prelayernorm.py
deleted file mode 100644
index 835a6d3e3a..0000000000
--- a/tests/models/roberta_prelayernorm/test_modeling_tf_roberta_prelayernorm.py
+++ /dev/null
@@ -1,691 +0,0 @@
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import RobertaPreLayerNormConfig, is_tf_available
-from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import numpy
-    import tensorflow as tf
-
-    from transformers.models.roberta_prelayernorm.modeling_tf_roberta_prelayernorm import (
-        TFRobertaPreLayerNormForCausalLM,
-        TFRobertaPreLayerNormForMaskedLM,
-        TFRobertaPreLayerNormForMultipleChoice,
-        TFRobertaPreLayerNormForQuestionAnswering,
-        TFRobertaPreLayerNormForSequenceClassification,
-        TFRobertaPreLayerNormForTokenClassification,
-        TFRobertaPreLayerNormModel,
-    )
-
-
-# Copied from tests.models.roberta.test_modeling_tf_roberta.TFRobertaModelTester with Roberta->RobertaPreLayerNorm
-class TFRobertaPreLayerNormModelTester:
-    def __init__(
-        self,
-        parent,
-    ):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.is_training = True
-        self.use_input_mask = True
-        self.use_token_type_ids = True
-        self.use_labels = True
-        self.vocab_size = 99
-        self.hidden_size = 32
-        self.num_hidden_layers = 2
-        self.num_attention_heads = 4
-        self.intermediate_size = 37
-        self.hidden_act = "gelu"
-        self.hidden_dropout_prob = 0.1
-        self.attention_probs_dropout_prob = 0.1
-        self.max_position_embeddings = 512
-        self.type_vocab_size = 16
-        self.type_sequence_label_size = 2
-        self.initializer_range = 0.02
-        self.num_labels = 3
-        self.num_choices = 4
-        self.scope = None
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = RobertaPreLayerNormConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            initializer_range=self.initializer_range,
-        )
-
-        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def prepare_config_and_inputs_for_decoder(self):
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = self.prepare_config_and_inputs()
-
-        config.is_decoder = True
-        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
-        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-        return (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-    def create_and_check_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFRobertaPreLayerNormModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_causal_lm_base_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.is_decoder = True
-
-        model = TFRobertaPreLayerNormModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_model_as_decoder(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-
-        model = TFRobertaPreLayerNormModel(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-            "encoder_hidden_states": encoder_hidden_states,
-            "encoder_attention_mask": encoder_attention_mask,
-        }
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
-
-        # Also check the case where encoder outputs are not passed
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_causal_lm_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.is_decoder = True
-
-        model = TFRobertaPreLayerNormForCausalLM(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        prediction_scores = model(inputs)["logits"]
-        self.parent.assertListEqual(
-            list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size]
-        )
-
-    def create_and_check_causal_lm_model_as_decoder(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-
-        model = TFRobertaPreLayerNormForCausalLM(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-            "encoder_hidden_states": encoder_hidden_states,
-            "encoder_attention_mask": encoder_attention_mask,
-        }
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
-
-        prediction_scores = result["logits"]
-        self.parent.assertListEqual(
-            list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size]
-        )
-
-    def create_and_check_causal_lm_model_past(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.is_decoder = True
-
-        model = TFRobertaPreLayerNormForCausalLM(config=config)
-
-        # special to `RobertaPreLayerNormEmbeddings` in `RobertaPreLayerNorm`:
-        #   - its `padding_idx` and its effect on `position_ids`
-        #     (TFRobertaPreLayerNormEmbeddings.create_position_ids_from_input_ids)
-        #   - `1` here is `TFRobertaPreLayerNormEmbeddings.padding_idx`
-        input_ids = tf.where(input_ids == 1, 2, input_ids)
-
-        # first forward pass
-        outputs = model(input_ids, use_cache=True)
-        outputs_use_cache_conf = model(input_ids)
-        outputs_no_past = model(input_ids, use_cache=False)
-
-        self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
-        self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
-
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-        # append to next input_ids and attn_mask
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-
-        output_from_no_past = model(next_input_ids, output_hidden_states=True).hidden_states[0]
-        output_from_past = model(
-            next_tokens, past_key_values=past_key_values, output_hidden_states=True
-        ).hidden_states[0]
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
-        output_from_past_slice = output_from_past[:, 0, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)
-
-    def create_and_check_causal_lm_model_past_with_attn_mask(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.is_decoder = True
-
-        model = TFRobertaPreLayerNormForCausalLM(config=config)
-
-        # special to `RobertaPreLayerNormEmbeddings` in `RobertaPreLayerNorm`:
-        #   - its `padding_idx` and its effect on `position_ids`
-        #     (TFRobertaPreLayerNormEmbeddings.create_position_ids_from_input_ids)
-        #   - `1` here is `TFRobertaPreLayerNormEmbeddings.padding_idx`
-        # avoid `padding_idx` in the past
-        input_ids = tf.where(input_ids == 1, 2, input_ids)
-
-        # create attention mask
-        half_seq_length = self.seq_length // 2
-        attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32)
-        attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32)
-        attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1)
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=attn_mask, use_cache=True)
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-        past_key_values = outputs.past_key_values
-
-        # change a random masked slice from input_ids
-        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1
-        random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size)
-        vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change)
-        condition = tf.transpose(
-            tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size))
-        )
-        input_ids = tf.where(condition, random_other_next_tokens, input_ids)
-        # avoid `padding_idx` in the past
-        input_ids = tf.where(input_ids == 1, 2, input_ids)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        attn_mask = tf.concat(
-            [attn_mask, tf.ones((attn_mask.shape[0], 1), dtype=tf.int32)],
-            axis=1,
-        )
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=attn_mask,
-            output_hidden_states=True,
-        ).hidden_states[0]
-        output_from_past = model(
-            next_tokens, past_key_values=past_key_values, attention_mask=attn_mask, output_hidden_states=True
-        ).hidden_states[0]
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
-        output_from_past_slice = output_from_past[:, 0, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)
-
-    def create_and_check_causal_lm_model_past_large_inputs(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.is_decoder = True
-
-        model = TFRobertaPreLayerNormForCausalLM(config=config)
-
-        # special to `RobertaPreLayerNormEmbeddings` in `RobertaPreLayerNorm`:
-        #   - its `padding_idx` and its effect on `position_ids`
-        #     (TFRobertaPreLayerNormEmbeddings.create_position_ids_from_input_ids)
-        #   - `1` here is `TFRobertaPreLayerNormEmbeddings.padding_idx`
-        # avoid `padding_idx` in the past
-        input_ids = tf.where(input_ids == 1, 2, input_ids)
-
-        input_ids = input_ids[:1, :]
-        input_mask = input_mask[:1, :]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=input_mask, use_cache=True)
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=next_attention_mask,
-            output_hidden_states=True,
-        ).hidden_states[0]
-        output_from_past = model(
-            next_tokens,
-            attention_mask=next_attention_mask,
-            past_key_values=past_key_values,
-            output_hidden_states=True,
-        ).hidden_states[0]
-
-        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-    def create_and_check_decoder_model_past_large_inputs(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-
-        model = TFRobertaPreLayerNormForCausalLM(config=config)
-
-        # special to `RobertaPreLayerNormEmbeddings` in `RobertaPreLayerNorm`:
-        #   - its `padding_idx` and its effect on `position_ids`
-        #     (TFRobertaPreLayerNormEmbeddings.create_position_ids_from_input_ids)
-        #   - `1` here is `TFRobertaPreLayerNormEmbeddings.padding_idx`
-        # avoid `padding_idx` in the past
-        input_ids = tf.where(input_ids == 1, 2, input_ids)
-
-        input_ids = input_ids[:1, :]
-        input_mask = input_mask[:1, :]
-        encoder_hidden_states = encoder_hidden_states[:1, :, :]
-        encoder_attention_mask = encoder_attention_mask[:1, :]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(
-            input_ids,
-            attention_mask=input_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            use_cache=True,
-        )
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=next_attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            output_hidden_states=True,
-        ).hidden_states[0]
-        output_from_past = model(
-            next_tokens,
-            attention_mask=next_attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            past_key_values=past_key_values,
-            output_hidden_states=True,
-        ).hidden_states[0]
-
-        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-    def create_and_check_for_masked_lm(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFRobertaPreLayerNormForMaskedLM(config=config)
-        result = model([input_ids, input_mask, token_type_ids])
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_for_token_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFRobertaPreLayerNormForTokenClassification(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def create_and_check_for_question_answering(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFRobertaPreLayerNormForQuestionAnswering(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def create_and_check_for_multiple_choice(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_choices = self.num_choices
-        model = TFRobertaPreLayerNormForMultipleChoice(config=config)
-        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
-        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
-        inputs = {
-            "input_ids": multiple_choice_inputs_ids,
-            "attention_mask": multiple_choice_input_mask,
-            "token_type_ids": multiple_choice_token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tf
-# Copied from tests.models.roberta.test_modeling_tf_roberta.TFRobertaModelTest with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,FacebookAI/roberta-base->andreasmadsen/efficient_mlm_m0.15
-class TFRobertaPreLayerNormModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFRobertaPreLayerNormModel,
-            TFRobertaPreLayerNormForCausalLM,
-            TFRobertaPreLayerNormForMaskedLM,
-            TFRobertaPreLayerNormForSequenceClassification,
-            TFRobertaPreLayerNormForTokenClassification,
-            TFRobertaPreLayerNormForQuestionAnswering,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFRobertaPreLayerNormModel,
-            "fill-mask": TFRobertaPreLayerNormForMaskedLM,
-            "question-answering": TFRobertaPreLayerNormForQuestionAnswering,
-            "text-classification": TFRobertaPreLayerNormForSequenceClassification,
-            "text-generation": TFRobertaPreLayerNormForCausalLM,
-            "token-classification": TFRobertaPreLayerNormForTokenClassification,
-            "zero-shot": TFRobertaPreLayerNormForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFRobertaPreLayerNormModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=RobertaPreLayerNormConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        """Test the base model"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_causal_lm_base_model(self):
-        """Test the base model of the causal LM model
-
-        is_decoder=True, no cross_attention, no encoder outputs
-        """
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_base_model(*config_and_inputs)
-
-    def test_model_as_decoder(self):
-        """Test the base model as a decoder (of an encoder-decoder architecture)
-
-        is_decoder=True + cross_attention + pass encoder outputs
-        """
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_causal_lm(self):
-        """Test the causal LM model"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model(*config_and_inputs)
-
-    def test_causal_lm_model_as_decoder(self):
-        """Test the causal LM model as a decoder"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_causal_lm_model_as_decoder(*config_and_inputs)
-
-    def test_causal_lm_model_past(self):
-        """Test causal LM model with `past_key_values`"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model_past(*config_and_inputs)
-
-    def test_causal_lm_model_past_with_attn_mask(self):
-        """Test the causal LM model with `past_key_values` and `attention_mask`"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model_past_with_attn_mask(*config_and_inputs)
-
-    def test_causal_lm_model_past_with_large_inputs(self):
-        """Test the causal LM model with `past_key_values` and a longer decoder sequence length"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model_past_large_inputs(*config_and_inputs)
-
-    def test_decoder_model_past_with_large_inputs(self):
-        """Similar to `test_causal_lm_model_past_with_large_inputs` but with cross-attention"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "andreasmadsen/efficient_mlm_m0.15"
-        model = TFRobertaPreLayerNormModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-@require_tf
-@require_sentencepiece
-@require_tokenizers
-class TFRobertaPreLayerNormModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_masked_lm(self):
-        model = TFRobertaPreLayerNormForMaskedLM.from_pretrained("andreasmadsen/efficient_mlm_m0.40")
-
-        input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        output = model(input_ids)[0]
-        expected_shape = [1, 11, 50265]
-        self.assertEqual(list(output.numpy().shape), expected_shape)
-        # compare the actual values for a slice.
-        EXPECTED_SLICE = tf.constant(
-            [[[40.4880, 18.0199, -5.2367], [-1.8877, -4.0885, 10.7085], [-2.2613, -5.6110, 7.2665]]]
-        )
-        self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), EXPECTED_SLICE.numpy(), atol=1e-4))
-
-    @slow
-    def test_inference_no_head(self):
-        model = TFRobertaPreLayerNormModel.from_pretrained("andreasmadsen/efficient_mlm_m0.40")
-
-        input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        output = model(input_ids)[0]
-        # compare the actual values for a slice.
-        EXPECTED_SLICE = tf.constant(
-            [[[0.0208, -0.0356, 0.0237], [-0.1569, -0.0411, -0.2626], [0.1879, 0.0125, -0.0089]]]
-        )
-        self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), EXPECTED_SLICE.numpy(), atol=1e-4))
diff --git a/tests/models/roformer/test_modeling_flax_roformer.py b/tests/models/roformer/test_modeling_flax_roformer.py
deleted file mode 100644
index 856ed90606..0000000000
--- a/tests/models/roformer/test_modeling_flax_roformer.py
+++ /dev/null
@@ -1,163 +0,0 @@
-# Copyright 2021 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-
-from transformers import RoFormerConfig, is_flax_available
-from transformers.testing_utils import require_flax, slow
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask
-
-
-if is_flax_available():
-    import jax.numpy as jnp
-
-    from transformers.models.roformer.modeling_flax_roformer import (
-        FlaxRoFormerForMaskedLM,
-        FlaxRoFormerForMultipleChoice,
-        FlaxRoFormerForQuestionAnswering,
-        FlaxRoFormerForSequenceClassification,
-        FlaxRoFormerForTokenClassification,
-        FlaxRoFormerModel,
-    )
-
-
-class FlaxRoFormerModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_attention_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_choices=4,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_attention_mask = use_attention_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.num_choices = num_choices
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        attention_mask = None
-        if self.use_attention_mask:
-            attention_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        config = RoFormerConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-        )
-
-        return config, input_ids, token_type_ids, attention_mask
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, token_type_ids, attention_mask = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask}
-        return config, inputs_dict
-
-
-@require_flax
-class FlaxRoFormerModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    test_head_masking = True
-
-    all_model_classes = (
-        (
-            FlaxRoFormerModel,
-            FlaxRoFormerForMaskedLM,
-            FlaxRoFormerForSequenceClassification,
-            FlaxRoFormerForTokenClassification,
-            FlaxRoFormerForMultipleChoice,
-            FlaxRoFormerForQuestionAnswering,
-        )
-        if is_flax_available()
-        else ()
-    )
-
-    def setUp(self):
-        self.model_tester = FlaxRoFormerModelTester(self)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("junnyu/roformer_chinese_small", from_pt=True)
-            outputs = model(np.ones((1, 1)))
-            self.assertIsNotNone(outputs)
-
-
-@require_flax
-class FlaxRoFormerModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_masked_lm(self):
-        model = FlaxRoFormerForMaskedLM.from_pretrained("junnyu/roformer_chinese_base")
-        input_ids = jnp.array([[0, 1, 2, 3, 4, 5]])
-        output = model(input_ids)[0]
-
-        vocab_size = 50000
-
-        expected_shape = (1, 6, vocab_size)
-        self.assertEqual(output.shape, expected_shape)
-
-        expected_slice = jnp.array(
-            [[[-0.1205, -1.0265, 0.2922], [-1.5134, 0.1974, 0.1519], [-5.0135, -3.9003, -0.8404]]]
-        )
-
-        self.assertTrue(jnp.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
diff --git a/tests/models/roformer/test_modeling_tf_roformer.py b/tests/models/roformer/test_modeling_tf_roformer.py
deleted file mode 100644
index c379672430..0000000000
--- a/tests/models/roformer/test_modeling_tf_roformer.py
+++ /dev/null
@@ -1,430 +0,0 @@
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import RoFormerConfig, is_tf_available
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import (
-        TFRoFormerForCausalLM,
-        TFRoFormerForMaskedLM,
-        TFRoFormerForMultipleChoice,
-        TFRoFormerForQuestionAnswering,
-        TFRoFormerForSequenceClassification,
-        TFRoFormerForTokenClassification,
-        TFRoFormerModel,
-    )
-    from transformers.models.roformer.modeling_tf_roformer import (
-        TFRoFormerSelfAttention,
-        TFRoFormerSinusoidalPositionalEmbedding,
-    )
-
-
-class TFRoFormerModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_labels=3,
-        num_choices=4,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.is_training = True
-        self.use_input_mask = True
-        self.use_token_type_ids = True
-        self.use_labels = True
-        self.vocab_size = 99
-        self.hidden_size = 32
-        self.num_hidden_layers = 2
-        self.num_attention_heads = 4
-        self.intermediate_size = 37
-        self.hidden_act = "gelu"
-        self.hidden_dropout_prob = 0.1
-        self.attention_probs_dropout_prob = 0.1
-        self.max_position_embeddings = 512
-        self.type_vocab_size = 16
-        self.type_sequence_label_size = 2
-        self.initializer_range = 0.02
-        self.num_labels = 3
-        self.num_choices = 4
-        self.scope = None
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = RoFormerConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            initializer_range=self.initializer_range,
-            return_dict=True,
-        )
-
-        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def create_and_check_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFRoFormerModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_lm_head(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.is_decoder = True
-        model = TFRoFormerForCausalLM(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        prediction_scores = model(inputs)["logits"]
-        self.parent.assertListEqual(
-            list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size]
-        )
-
-    def create_and_check_for_masked_lm(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFRoFormerForMaskedLM(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_for_sequence_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFRoFormerForSequenceClassification(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_for_multiple_choice(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_choices = self.num_choices
-        model = TFRoFormerForMultipleChoice(config=config)
-        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
-        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
-        inputs = {
-            "input_ids": multiple_choice_inputs_ids,
-            "attention_mask": multiple_choice_input_mask,
-            "token_type_ids": multiple_choice_token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-    def create_and_check_for_token_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFRoFormerForTokenClassification(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def create_and_check_for_question_answering(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFRoFormerForQuestionAnswering(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-
-        result = model(inputs)
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class TFRoFormerModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFRoFormerModel,
-            TFRoFormerForCausalLM,
-            TFRoFormerForMaskedLM,
-            TFRoFormerForQuestionAnswering,
-            TFRoFormerForSequenceClassification,
-            TFRoFormerForTokenClassification,
-            TFRoFormerForMultipleChoice,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFRoFormerModel,
-            "fill-mask": TFRoFormerForMaskedLM,
-            "question-answering": TFRoFormerForQuestionAnswering,
-            "text-classification": TFRoFormerForSequenceClassification,
-            "text-generation": TFRoFormerForCausalLM,
-            "token-classification": TFRoFormerForTokenClassification,
-            "zero-shot": TFRoFormerForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-
-    test_head_masking = False
-    test_onnx = False
-
-    # TODO: add `prepare_inputs_for_generation` for `TFRoFormerForCausalLM`
-    def is_pipeline_test_to_skip(
-        self,
-        pipeline_test_case_name,
-        config_class,
-        model_architecture,
-        tokenizer_name,
-        image_processor_name,
-        feature_extractor_name,
-        processor_name,
-    ):
-        if pipeline_test_case_name == "TextGenerationPipelineTests":
-            return True
-
-        return False
-
-    def setUp(self):
-        self.model_tester = TFRoFormerModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=RoFormerConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_causal_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_lm_head(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model = TFRoFormerModel.from_pretrained("junnyu/roformer_chinese_base")
-        self.assertIsNotNone(model)
-
-
-@require_tf
-class TFRoFormerModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_masked_lm(self):
-        model = TFRoFormerForMaskedLM.from_pretrained("junnyu/roformer_chinese_base")
-        input_ids = tf.constant([[0, 1, 2, 3, 4, 5]])
-        output = model(input_ids)[0]
-
-        # TODO Replace vocab size
-        vocab_size = 50000
-
-        expected_shape = [1, 6, vocab_size]
-        self.assertEqual(output.shape, expected_shape)
-
-        print(output[:, :3, :3])
-
-        # TODO Replace values below with what was printed above.
-        expected_slice = tf.constant(
-            [
-                [
-                    [-0.12053341, -1.0264901, 0.29221946],
-                    [-1.5133783, 0.197433, 0.15190607],
-                    [-5.0135403, -3.900256, -0.84038764],
-                ]
-            ]
-        )
-        tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4)
-
-
-@require_tf
-class TFRoFormerSinusoidalPositionalEmbeddingTest(unittest.TestCase):
-    tolerance = 1e-4
-
-    def test_basic(self):
-        input_ids = tf.constant([[4, 10]])
-        emb1 = TFRoFormerSinusoidalPositionalEmbedding(num_positions=6, embedding_dim=6)
-
-        emb = emb1(input_ids.shape)
-        desired_weights = tf.constant(
-            [[0.0000, 0.0000, 0.0000, 1.0000, 1.0000, 1.0000], [0.8415, 0.0464, 0.0022, 0.5403, 0.9989, 1.0000]]
-        )
-
-        tf.debugging.assert_near(emb, desired_weights, atol=self.tolerance)
-
-    def test_positional_emb_weights_against_roformer(self):
-        desired_weights = tf.constant(
-            [
-                [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
-                [0.8415, 0.8219, 0.8020, 0.7819, 0.7617],
-                [0.9093, 0.9364, 0.9581, 0.9749, 0.9870],
-            ]
-        )
-        emb1 = TFRoFormerSinusoidalPositionalEmbedding(num_positions=512, embedding_dim=512)
-        emb1([2, 16, 512])
-        weights = emb1.weight[:3, :5]
-
-        tf.debugging.assert_near(weights, desired_weights, atol=self.tolerance)
-
-
-@require_tf
-class TFRoFormerSelfAttentionRotaryPositionEmbeddingTest(unittest.TestCase):
-    tolerance = 1e-4
-
-    def test_apply_rotary_position_embeddings(self):
-        # 2,12,16,64
-        query_layer = tf.reshape(tf.range(2 * 12 * 16 * 64, dtype=tf.float32), shape=(2, 12, 16, 64)) / 100
-
-        key_layer = -tf.reshape(tf.range(2 * 12 * 16 * 64, dtype=tf.float32), shape=(2, 12, 16, 64)) / 100
-
-        embed_positions = TFRoFormerSinusoidalPositionalEmbedding(num_positions=32, embedding_dim=64)
-        sinusoidal_pos = embed_positions([2, 16, 768])[None, None, :, :]
-
-        query_layer, key_layer = TFRoFormerSelfAttention.apply_rotary_position_embeddings(
-            sinusoidal_pos, query_layer, key_layer
-        )
-
-        desired_query_layer = tf.constant(
-            [
-                [0.0000, 0.0100, 0.0200, 0.0300, 0.0400, 0.0500, 0.0600, 0.0700],
-                [-0.2012, 0.8897, 0.0263, 0.9401, 0.2074, 0.9463, 0.3481, 0.9343],
-                [-1.7057, 0.6271, -1.2145, 1.3897, -0.6303, 1.7647, -0.1173, 1.8985],
-                [-2.1731, -1.6397, -2.7358, 0.2854, -2.1840, 1.7183, -1.3018, 2.4871],
-                [0.2717, -3.6173, -2.9206, -2.1988, -3.6638, 0.3858, -2.9155, 2.2980],
-                [3.9859, -2.1580, -0.7984, -4.4904, -4.1181, -2.0252, -4.4782, 1.1253],
-            ]
-        )
-        desired_key_layer = tf.constant(
-            [
-                [0.0000, -0.0100, -0.0200, -0.0300, -0.0400, -0.0500, -0.0600, -0.0700],
-                [0.2012, -0.8897, -0.0263, -0.9401, -0.2074, -0.9463, -0.3481, -0.9343],
-                [1.7057, -0.6271, 1.2145, -1.3897, 0.6303, -1.7647, 0.1173, -1.8985],
-                [2.1731, 1.6397, 2.7358, -0.2854, 2.1840, -1.7183, 1.3018, -2.4871],
-                [-0.2717, 3.6173, 2.9206, 2.1988, 3.6638, -0.3858, 2.9155, -2.2980],
-                [-3.9859, 2.1580, 0.7984, 4.4904, 4.1181, 2.0252, 4.4782, -1.1253],
-            ]
-        )
-
-        tf.debugging.assert_near(query_layer[0, 0, :6, :8], desired_query_layer, atol=self.tolerance)
-        tf.debugging.assert_near(key_layer[0, 0, :6, :8], desired_key_layer, atol=self.tolerance)
diff --git a/tests/models/sam/test_modeling_tf_sam.py b/tests/models/sam/test_modeling_tf_sam.py
deleted file mode 100644
index 6b4cd75467..0000000000
--- a/tests/models/sam/test_modeling_tf_sam.py
+++ /dev/null
@@ -1,858 +0,0 @@
-# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TensorFlow SAM model."""
-
-from __future__ import annotations
-
-import inspect
-import unittest
-
-import numpy as np
-import requests
-
-from transformers import SamConfig, SamMaskDecoderConfig, SamPromptEncoderConfig, SamVisionConfig
-from transformers.testing_utils import require_tf, slow
-from transformers.utils import is_tf_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import SamProcessor, TFSamModel, TFSamVisionModel
-    from transformers.modeling_tf_utils import keras
-
-if is_vision_available():
-    from PIL import Image
-
-
-class TFSamVisionModelTester:
-    def __init__(
-        self,
-        parent,
-        hidden_size=36,
-        intermediate_size=72,
-        projection_dim=62,
-        output_channels=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        num_channels=3,
-        image_size=24,
-        patch_size=2,
-        hidden_act="gelu",
-        layer_norm_eps=1e-06,
-        dropout=0.0,
-        attention_dropout=0.0,
-        initializer_range=0.02,
-        initializer_factor=1.0,
-        qkv_bias=True,
-        mlp_ratio=4.0,
-        use_abs_pos=True,
-        use_rel_pos=True,
-        rel_pos_zero_init=False,
-        window_size=14,
-        global_attn_indexes=[2, 5, 8, 11],
-        num_pos_feats=16,
-        mlp_dim=None,
-        batch_size=2,
-    ):
-        self.parent = parent
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.projection_dim = projection_dim
-        self.output_channels = output_channels
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.num_channels = num_channels
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.hidden_act = hidden_act
-        self.layer_norm_eps = layer_norm_eps
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.initializer_range = initializer_range
-        self.initializer_factor = initializer_factor
-        self.qkv_bias = qkv_bias
-        self.mlp_ratio = mlp_ratio
-        self.use_abs_pos = use_abs_pos
-        self.use_rel_pos = use_rel_pos
-        self.rel_pos_zero_init = rel_pos_zero_init
-        self.window_size = window_size
-        self.global_attn_indexes = global_attn_indexes
-        self.num_pos_feats = num_pos_feats
-        self.mlp_dim = mlp_dim
-        self.batch_size = batch_size
-
-    def get_config(self):
-        return SamVisionConfig(
-            image_size=self.image_size,
-            patch_size=self.patch_size,
-            num_channels=self.num_channels,
-            hidden_size=self.hidden_size,
-            projection_dim=self.projection_dim,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            dropout=self.dropout,
-            attention_dropout=self.attention_dropout,
-            initializer_range=self.initializer_range,
-            initializer_factor=self.initializer_factor,
-            output_channels=self.output_channels,
-            qkv_bias=self.qkv_bias,
-            mlp_ratio=self.mlp_ratio,
-            use_abs_pos=self.use_abs_pos,
-            use_rel_pos=self.use_rel_pos,
-            rel_pos_zero_init=self.rel_pos_zero_init,
-            window_size=self.window_size,
-            global_attn_indexes=self.global_attn_indexes,
-            num_pos_feats=self.num_pos_feats,
-            mlp_dim=self.mlp_dim,
-        )
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-        config = self.get_config()
-
-        return config, pixel_values
-
-    def create_and_check_model(self, config, pixel_values):
-        model = TFSamVisionModel(config=config)
-        result = model(pixel_values)
-        output_size = self.image_size // self.patch_size
-        self.parent.assertEqual(
-            result.last_hidden_state.shape, (self.batch_size, self.output_channels, output_size, output_size)
-        )
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_tf
-class TFSamVisionModelTest(TFModelTesterMixin, unittest.TestCase):
-    """
-    Here we also overwrite some of the tests of test_modeling_common.py, as SAM's vision encoder does not use input_ids, inputs_embeds,
-    attention_mask and seq_length.
-    """
-
-    all_model_classes = (TFSamVisionModel,) if is_tf_available() else ()
-    test_pruning = False
-    test_resize_embeddings = False
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFSamVisionModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=SamVisionConfig, has_text_modality=False)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    @unittest.skip(reason="SAM's vision encoder does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    def test_model_common_attributes(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer))
-            x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, keras.layers.Dense))
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        expected_attention_shape = (
-            self.model_tester.batch_size * self.model_tester.num_attention_heads,
-            196,
-            196,
-        )
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = False
-            config.return_dict = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            attentions = outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-            # check that output_attentions also work using config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-            self.assertListEqual(
-                list(attentions[0].shape[-4:]),
-                list(expected_attention_shape),
-            )
-
-    @unittest.skip(reason="Hidden_states is tested in create_and_check_model tests")
-    def test_hidden_states_output(self):
-        pass
-
-
-class TFSamPromptEncoderTester:
-    def __init__(
-        self,
-        hidden_size=32,
-        input_image_size=24,
-        patch_size=2,
-        mask_input_channels=4,
-        num_point_embeddings=4,
-        hidden_act="gelu",
-    ):
-        self.hidden_size = hidden_size
-        self.input_image_size = input_image_size
-        self.patch_size = patch_size
-        self.mask_input_channels = mask_input_channels
-        self.num_point_embeddings = num_point_embeddings
-        self.hidden_act = hidden_act
-
-    def get_config(self):
-        return SamPromptEncoderConfig(
-            image_size=self.input_image_size,
-            patch_size=self.patch_size,
-            mask_input_channels=self.mask_input_channels,
-            hidden_size=self.hidden_size,
-            num_point_embeddings=self.num_point_embeddings,
-            hidden_act=self.hidden_act,
-        )
-
-    def prepare_config_and_inputs(self):
-        dummy_points = floats_tensor([self.batch_size, 3, 2])
-        config = self.get_config()
-
-        return config, dummy_points
-
-
-class TFSamMaskDecoderTester:
-    def __init__(
-        self,
-        hidden_size=32,
-        hidden_act="relu",
-        mlp_dim=64,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        attention_downsample_rate=2,
-        num_multimask_outputs=3,
-        iou_head_depth=3,
-        iou_head_hidden_dim=32,
-        layer_norm_eps=1e-6,
-    ):
-        self.hidden_size = hidden_size
-        self.hidden_act = hidden_act
-        self.mlp_dim = mlp_dim
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.attention_downsample_rate = attention_downsample_rate
-        self.num_multimask_outputs = num_multimask_outputs
-        self.iou_head_depth = iou_head_depth
-        self.iou_head_hidden_dim = iou_head_hidden_dim
-        self.layer_norm_eps = layer_norm_eps
-
-    def get_config(self):
-        return SamMaskDecoderConfig(
-            hidden_size=self.hidden_size,
-            hidden_act=self.hidden_act,
-            mlp_dim=self.mlp_dim,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            attention_downsample_rate=self.attention_downsample_rate,
-            num_multimask_outputs=self.num_multimask_outputs,
-            iou_head_depth=self.iou_head_depth,
-            iou_head_hidden_dim=self.iou_head_hidden_dim,
-            layer_norm_eps=self.layer_norm_eps,
-        )
-
-    def prepare_config_and_inputs(self):
-        config = self.get_config()
-
-        dummy_inputs = {
-            "image_embedding": floats_tensor([self.batch_size, self.hidden_size]),
-        }
-
-        return config, dummy_inputs
-
-
-class TFSamModelTester:
-    def __init__(
-        self,
-        parent,
-        hidden_size=36,
-        intermediate_size=72,
-        projection_dim=62,
-        output_channels=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        num_channels=3,
-        image_size=24,
-        patch_size=2,
-        hidden_act="gelu",
-        layer_norm_eps=1e-06,
-        dropout=0.0,
-        attention_dropout=0.0,
-        initializer_range=0.02,
-        initializer_factor=1.0,
-        qkv_bias=True,
-        mlp_ratio=4.0,
-        use_abs_pos=True,
-        use_rel_pos=True,
-        rel_pos_zero_init=False,
-        window_size=14,
-        global_attn_indexes=[2, 5, 8, 11],
-        num_pos_feats=16,
-        mlp_dim=None,
-        batch_size=2,
-    ):
-        self.parent = parent
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.output_channels = output_channels
-        self.num_channels = num_channels
-        self.hidden_size = hidden_size
-        self.projection_dim = projection_dim
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.initializer_range = initializer_range
-        self.initializer_factor = initializer_factor
-        self.hidden_act = hidden_act
-        self.layer_norm_eps = layer_norm_eps
-        self.qkv_bias = qkv_bias
-        self.mlp_ratio = mlp_ratio
-        self.use_abs_pos = use_abs_pos
-        self.use_rel_pos = use_rel_pos
-        self.rel_pos_zero_init = rel_pos_zero_init
-        self.window_size = window_size
-        self.global_attn_indexes = global_attn_indexes
-        self.num_pos_feats = num_pos_feats
-        self.mlp_dim = mlp_dim
-        self.batch_size = batch_size
-
-        # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
-        num_patches = (image_size // patch_size) ** 2
-        self.seq_length = num_patches + 1
-
-        self.prompt_encoder_tester = TFSamPromptEncoderTester()
-        self.mask_decoder_tester = TFSamMaskDecoderTester()
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-        config = self.get_config()
-
-        return config, pixel_values
-
-    def get_config(self):
-        vision_config = SamVisionConfig(
-            image_size=self.image_size,
-            patch_size=self.patch_size,
-            num_channels=self.num_channels,
-            hidden_size=self.hidden_size,
-            projection_dim=self.projection_dim,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            dropout=self.dropout,
-            attention_dropout=self.attention_dropout,
-            initializer_range=self.initializer_range,
-            initializer_factor=self.initializer_factor,
-            output_channels=self.output_channels,
-            qkv_bias=self.qkv_bias,
-            mlp_ratio=self.mlp_ratio,
-            use_abs_pos=self.use_abs_pos,
-            use_rel_pos=self.use_rel_pos,
-            rel_pos_zero_init=self.rel_pos_zero_init,
-            window_size=self.window_size,
-            global_attn_indexes=self.global_attn_indexes,
-            num_pos_feats=self.num_pos_feats,
-            mlp_dim=self.mlp_dim,
-        )
-
-        prompt_encoder_config = self.prompt_encoder_tester.get_config()
-
-        mask_decoder_config = self.mask_decoder_tester.get_config()
-
-        return SamConfig(
-            vision_config=vision_config,
-            prompt_encoder_config=prompt_encoder_config,
-            mask_decoder_config=mask_decoder_config,
-        )
-
-    def create_and_check_model(self, config, pixel_values):
-        model = TFSamModel(config=config)
-        result = model(pixel_values)
-        self.parent.assertEqual(result.iou_scores.shape, (self.batch_size, 1, 3))
-        self.parent.assertEqual(result.pred_masks.shape[:3], (self.batch_size, 1, 3))
-
-    def create_and_check_get_image_features(self, config, pixel_values):
-        model = TFSamModel(config=config)
-        result = model.get_image_embeddings(pixel_values)
-        self.parent.assertEqual(result[0].shape, (self.output_channels, 12, 12))
-
-    def create_and_check_get_image_hidden_states(self, config, pixel_values):
-        model = TFSamModel(config=config)
-        result = model.vision_encoder(
-            pixel_values,
-            output_hidden_states=True,
-            return_dict=True,
-        )
-
-        # after computing the convolutional features
-        expected_hidden_states_shape = (self.batch_size, 12, 12, 36)
-        self.parent.assertEqual(len(result[1]), self.num_hidden_layers + 1)
-        self.parent.assertEqual(result[1][0].shape, expected_hidden_states_shape)
-
-        result = model.vision_encoder(
-            pixel_values,
-            output_hidden_states=True,
-            return_dict=False,
-        )
-
-        # after computing the convolutional features
-        expected_hidden_states_shape = (self.batch_size, 12, 12, 36)
-        self.parent.assertEqual(len(result[1]), self.num_hidden_layers + 1)
-        self.parent.assertEqual(result[1][0].shape, expected_hidden_states_shape)
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_tf
-class TFSamModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    """
-    Here we also overwrite some of the tests of test_modeling_common.py, as SAM's vision encoder does not use input_ids, inputs_embeds,
-    attention_mask and seq_length.
-    """
-
-    all_model_classes = (TFSamModel,) if is_tf_available() else ()
-    pipeline_model_mapping = (
-        {"feature-extraction": TFSamModel, "mask-generation": TFSamModel} if is_tf_available() else {}
-    )
-    test_pruning = False
-    test_resize_embeddings = False
-    test_head_masking = False
-    test_onnx = False
-
-    # TODO: Fix me @Arthur: `run_batch_test` in `tests/test_pipeline_mixin.py` not working
-    def is_pipeline_test_to_skip(
-        self,
-        pipeline_test_case_name,
-        config_class,
-        model_architecture,
-        tokenizer_name,
-        image_processor_name,
-        feature_extractor_name,
-        processor_name,
-    ):
-        return True
-
-    def setUp(self):
-        self.model_tester = TFSamModelTester(self)
-        self.vision_config_tester = ConfigTester(self, config_class=SamVisionConfig, has_text_modality=False)
-        self.prompt_encoder_config_tester = ConfigTester(
-            self,
-            config_class=SamPromptEncoderConfig,
-            has_text_modality=False,
-            num_attention_heads=12,
-            num_hidden_layers=2,
-        )
-        self.mask_decoder_config_tester = ConfigTester(
-            self, config_class=SamMaskDecoderConfig, has_text_modality=False
-        )
-
-    def test_config(self):
-        self.vision_config_tester.run_common_tests()
-        self.prompt_encoder_config_tester.run_common_tests()
-        self.mask_decoder_config_tester.run_common_tests()
-
-    @unittest.skip(reason="SAM's vision encoder does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    def test_model_common_attributes(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer))
-            x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, keras.layers.Dense))
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_get_image_features(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_get_image_features(*config_and_inputs)
-
-    def test_image_hidden_states(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_get_image_hidden_states(*config_and_inputs)
-
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        expected_vision_attention_shape = (
-            self.model_tester.batch_size * self.model_tester.num_attention_heads,
-            196,
-            196,
-        )
-        expected_mask_decoder_attention_shape = (self.model_tester.batch_size, 1, 144, 32)
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = False
-            config.return_dict = True
-            model = model_class._from_config(config, attn_implementation="eager")
-            config = model.config
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            vision_attentions = outputs.vision_attentions
-            self.assertEqual(len(vision_attentions), self.model_tester.num_hidden_layers)
-
-            mask_decoder_attentions = outputs.mask_decoder_attentions
-            self.assertEqual(len(mask_decoder_attentions), self.model_tester.mask_decoder_tester.num_hidden_layers)
-
-            # check that output_attentions also work using config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            vision_attentions = outputs.vision_attentions
-            self.assertEqual(len(vision_attentions), self.model_tester.num_hidden_layers)
-
-            mask_decoder_attentions = outputs.mask_decoder_attentions
-            self.assertEqual(len(mask_decoder_attentions), self.model_tester.mask_decoder_tester.num_hidden_layers)
-
-            self.assertListEqual(
-                list(vision_attentions[0].shape[-4:]),
-                list(expected_vision_attention_shape),
-            )
-
-            self.assertListEqual(
-                list(mask_decoder_attentions[0].shape[-4:]),
-                list(expected_mask_decoder_attention_shape),
-            )
-
-    @unittest.skip(reason="Hidden_states is tested in create_and_check_model tests")
-    def test_hidden_states_output(self):
-        pass
-
-    @slow
-    def test_model_from_pretrained(self):
-        model = TFSamModel.from_pretrained("facebook/sam-vit-base")  # sam-vit-huge blows out our memory
-        self.assertIsNotNone(model)
-
-
-def prepare_image():
-    img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
-    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
-    return raw_image
-
-
-def prepare_dog_img():
-    img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/dog-sam.png"
-    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
-    return raw_image
-
-
-@require_tf
-@slow
-class TFSamModelIntegrationTest(unittest.TestCase):
-    def test_inference_mask_generation_no_point(self):
-        model = TFSamModel.from_pretrained("facebook/sam-vit-base")
-        processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
-
-        raw_image = prepare_image()
-        inputs = processor(images=raw_image, return_tensors="tf")
-
-        outputs = model(**inputs)
-        scores = tf.squeeze(outputs.iou_scores)
-        masks = outputs.pred_masks[0, 0, 0, 0, :3]
-        self.assertTrue(np.allclose(scores[-1].numpy(), np.array(0.4515), atol=2e-4))
-        self.assertTrue(np.allclose(masks.numpy(), np.array([-4.1807, -3.4949, -3.4483]), atol=1e-2))
-
-    def test_inference_mask_generation_one_point_one_bb(self):
-        model = TFSamModel.from_pretrained("facebook/sam-vit-base")
-        processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
-
-        raw_image = prepare_image()
-        input_boxes = [[[650, 900, 1000, 1250]]]
-        input_points = [[[820, 1080]]]
-
-        inputs = processor(images=raw_image, input_boxes=input_boxes, input_points=input_points, return_tensors="tf")
-
-        outputs = model(**inputs)
-        scores = tf.squeeze(outputs.iou_scores)
-        masks = outputs.pred_masks[0, 0, 0, 0, :3]
-
-        self.assertTrue(np.allclose(scores[-1], np.array(0.9566), atol=2e-4))
-        self.assertTrue(np.allclose(masks.numpy(), np.array([-12.7657, -12.3683, -12.5985]), atol=2e-2))
-
-    def test_inference_mask_generation_batched_points_batched_images(self):
-        model = TFSamModel.from_pretrained("facebook/sam-vit-base")
-        processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
-
-        raw_image = prepare_image()
-        input_points = [
-            [[[820, 1080]], [[820, 1080]], [[820, 1080]], [[820, 1080]]],
-            [[[510, 1080]], [[820, 1080]], [[820, 1080]], [[820, 1080]]],
-        ]
-
-        inputs = processor(images=[raw_image, raw_image], input_points=input_points, return_tensors="tf")
-
-        outputs = model(**inputs)
-        scores = tf.squeeze(outputs.iou_scores)
-        masks = outputs.pred_masks[0, 0, 0, 0, :3]
-
-        EXPECTED_SCORES = np.array(
-            [
-                [
-                    [0.6765, 0.9379, 0.8803],
-                    [0.6765, 0.9379, 0.8803],
-                    [0.6765, 0.9379, 0.8803],
-                    [0.6765, 0.9379, 0.8803],
-                ],
-                [
-                    [0.3317, 0.7264, 0.7646],
-                    [0.6765, 0.9379, 0.8803],
-                    [0.6765, 0.9379, 0.8803],
-                    [0.6765, 0.9379, 0.8803],
-                ],
-            ]
-        )
-        EXPECTED_MASKS = np.array([-2.8552, -2.7990, -2.9612])
-        self.assertTrue(np.allclose(scores.numpy(), EXPECTED_SCORES, atol=1e-3))
-        self.assertTrue(np.allclose(masks.numpy(), EXPECTED_MASKS, atol=3e-2))
-
-    def test_inference_mask_generation_one_point_one_bb_zero(self):
-        model = TFSamModel.from_pretrained("facebook/sam-vit-base")
-        processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
-
-        raw_image = prepare_image()
-        input_boxes = [[[620, 900, 1000, 1255]]]
-        input_points = [[[820, 1080]]]
-        labels = [[0]]
-
-        inputs = processor(
-            images=raw_image,
-            input_boxes=input_boxes,
-            input_points=input_points,
-            input_labels=labels,
-            return_tensors="tf",
-        )
-
-        outputs = model(**inputs)
-        scores = tf.squeeze(outputs.iou_scores)
-        self.assertTrue(np.allclose(scores[-1].numpy(), np.array(0.7894), atol=1e-4))
-
-    def test_inference_mask_generation_one_point(self):
-        model = TFSamModel.from_pretrained("facebook/sam-vit-base")
-        processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
-
-        raw_image = prepare_image()
-
-        input_points = [[[400, 650]]]
-        input_labels = [[1]]
-
-        inputs = processor(images=raw_image, input_points=input_points, input_labels=input_labels, return_tensors="tf")
-
-        outputs = model(**inputs)
-        scores = tf.squeeze(outputs.iou_scores)
-
-        self.assertTrue(np.allclose(scores[-1], np.array(0.9675), atol=1e-4))
-
-        # With no label
-        input_points = [[[400, 650]]]
-
-        inputs = processor(images=raw_image, input_points=input_points, return_tensors="tf")
-
-        outputs = model(**inputs)
-        scores = tf.squeeze(outputs.iou_scores)
-
-        self.assertTrue(np.allclose(scores[-1].numpy(), np.array(0.9675), atol=1e-4))
-
-    def test_inference_mask_generation_two_points(self):
-        model = TFSamModel.from_pretrained("facebook/sam-vit-base")
-        processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
-        raw_image = prepare_image()
-
-        input_points = [[[400, 650], [800, 650]]]
-        input_labels = [[1, 1]]
-
-        inputs = processor(images=raw_image, input_points=input_points, input_labels=input_labels, return_tensors="tf")
-
-        outputs = model(**inputs)
-        scores = tf.squeeze(outputs.iou_scores)
-
-        self.assertTrue(np.allclose(scores[-1].numpy(), np.array(0.9762), atol=1e-4))
-
-        # no labels
-        inputs = processor(images=raw_image, input_points=input_points, return_tensors="tf")
-
-        outputs = model(**inputs)
-        scores = tf.squeeze(outputs.iou_scores)
-
-        self.assertTrue(np.allclose(scores[-1].numpy(), np.array(0.9762), atol=1e-4))
-
-    def test_inference_mask_generation_two_points_batched(self):
-        model = TFSamModel.from_pretrained("facebook/sam-vit-base")
-        processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
-
-        raw_image = prepare_image()
-
-        input_points = [[[400, 650], [800, 650]], [[400, 650]]]
-        input_labels = [[1, 1], [1]]
-
-        inputs = processor(
-            images=[raw_image, raw_image], input_points=input_points, input_labels=input_labels, return_tensors="tf"
-        )
-
-        outputs = model(**inputs)
-        scores = tf.squeeze(outputs.iou_scores)
-
-        self.assertTrue(np.allclose(scores[0][-1].numpy(), np.array(0.9762), atol=1e-4))
-        self.assertTrue(np.allclose(scores[1][-1], np.array(0.9637), atol=1e-4))
-
-    def test_inference_mask_generation_one_box(self):
-        model = TFSamModel.from_pretrained("facebook/sam-vit-base")
-        processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
-
-        raw_image = prepare_image()
-
-        input_boxes = [[[75, 275, 1725, 850]]]
-
-        inputs = processor(images=raw_image, input_boxes=input_boxes, return_tensors="tf")
-
-        outputs = model(**inputs)
-        scores = tf.squeeze(outputs.iou_scores)
-
-        self.assertTrue(np.allclose(scores[-1].numpy(), np.array(0.7937), atol=1e-4))
-
-    def test_inference_mask_generation_batched_image_one_point(self):
-        model = TFSamModel.from_pretrained("facebook/sam-vit-base")
-        processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
-
-        raw_image = prepare_image()
-        raw_dog_image = prepare_dog_img()
-
-        input_points = [[[820, 1080]], [[220, 470]]]
-
-        inputs = processor(images=[raw_image, raw_dog_image], input_points=input_points, return_tensors="tf")
-
-        outputs = model(**inputs)
-        scores_batched = tf.squeeze(outputs.iou_scores)
-
-        input_points = [[[220, 470]]]
-
-        inputs = processor(images=raw_dog_image, input_points=input_points, return_tensors="tf")
-
-        outputs = model(**inputs)
-        scores_single = tf.squeeze(outputs.iou_scores)
-        self.assertTrue(np.allclose(scores_batched[1, :].numpy(), scores_single.numpy(), atol=1e-4))
-
-    def test_inference_mask_generation_two_points_point_batch(self):
-        model = TFSamModel.from_pretrained("facebook/sam-vit-base")
-        processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
-
-        raw_image = prepare_image()
-
-        input_points = tf.convert_to_tensor([[[400, 650]], [[220, 470]]])  # fmt: skip
-
-        input_points = tf.expand_dims(input_points, 0)
-
-        inputs = processor(raw_image, input_points=input_points, return_tensors="tf")
-
-        outputs = model(**inputs)
-
-        iou_scores = outputs.iou_scores
-        self.assertTrue(iou_scores.shape == (1, 2, 3))
-        self.assertTrue(
-            np.allclose(
-                iou_scores.numpy(),
-                np.array([[[0.9105, 0.9825, 0.9675], [0.7646, 0.7943, 0.7774]]]),
-                atol=1e-4,
-                rtol=1e-4,
-            )
-        )
-
-    def test_inference_mask_generation_three_boxes_point_batch(self):
-        model = TFSamModel.from_pretrained("facebook/sam-vit-base")
-        processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
-
-        raw_image = prepare_image()
-
-        # fmt: off
-        input_boxes = tf.convert_to_tensor([[[620, 900, 1000, 1255]], [[75, 275, 1725, 850]],  [[75, 275, 1725, 850]]])
-        EXPECTED_IOU = np.array([[[0.9773, 0.9881, 0.9522],
-         [0.5996, 0.7661, 0.7937],
-         [0.5996, 0.7661, 0.7937]]])
-        # fmt: on
-        input_boxes = tf.expand_dims(input_boxes, 0)
-
-        inputs = processor(raw_image, input_boxes=input_boxes, return_tensors="tf")
-
-        outputs = model(**inputs)
-
-        iou_scores = outputs.iou_scores
-        self.assertTrue(iou_scores.shape == (1, 3, 3))
-        self.assertTrue(np.allclose(iou_scores.numpy(), EXPECTED_IOU, atol=1e-4, rtol=1e-4))
diff --git a/tests/models/segformer/test_modeling_tf_segformer.py b/tests/models/segformer/test_modeling_tf_segformer.py
deleted file mode 100644
index 8e61e3e001..0000000000
--- a/tests/models/segformer/test_modeling_tf_segformer.py
+++ /dev/null
@@ -1,499 +0,0 @@
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TensorFlow SegFormer model."""
-
-from __future__ import annotations
-
-import inspect
-import unittest
-
-from transformers import SegformerConfig
-from transformers.file_utils import is_tf_available, is_vision_available
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import numpy as np
-    import tensorflow as tf
-
-    from transformers import TFSegformerForImageClassification, TFSegformerForSemanticSegmentation, TFSegformerModel
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import SegformerImageProcessor
-
-
-class TFSegformerConfigTester(ConfigTester):
-    def create_and_test_config_common_properties(self):
-        config = self.config_class(**self.inputs_dict)
-        self.parent.assertTrue(hasattr(config, "hidden_sizes"))
-        self.parent.assertTrue(hasattr(config, "num_attention_heads"))
-        self.parent.assertTrue(hasattr(config, "num_encoder_blocks"))
-
-
-class TFSegformerModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        image_size=64,
-        num_channels=3,
-        num_encoder_blocks=4,
-        depths=[1, 1, 1, 1],
-        sr_ratios=[8, 4, 2, 1],
-        hidden_sizes=[8, 8, 16, 16],
-        downsampling_rates=[1, 4, 8, 16],
-        num_attention_heads=[1, 1, 2, 2],
-        is_training=True,
-        use_labels=True,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        initializer_range=0.02,
-        num_labels=3,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.num_channels = num_channels
-        self.num_encoder_blocks = num_encoder_blocks
-        self.sr_ratios = sr_ratios
-        self.depths = depths
-        self.hidden_sizes = hidden_sizes
-        self.downsampling_rates = downsampling_rates
-        self.num_attention_heads = num_attention_heads
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.initializer_range = initializer_range
-        self.num_labels = num_labels
-        self.scope = scope
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        labels = None
-        if self.use_labels:
-            labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels)
-
-        config = self.get_config()
-        return config, pixel_values, labels
-
-    def get_config(self):
-        return SegformerConfig(
-            image_size=self.image_size,
-            num_channels=self.num_channels,
-            num_encoder_blocks=self.num_encoder_blocks,
-            depths=self.depths,
-            hidden_sizes=self.hidden_sizes,
-            num_attention_heads=self.num_attention_heads,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            initializer_range=self.initializer_range,
-            num_labels=self.num_labels,
-        )
-
-    def create_and_check_model(self, config, pixel_values, labels):
-        model = TFSegformerModel(config=config)
-        result = model(pixel_values, training=False)
-        expected_height = expected_width = self.image_size // (self.downsampling_rates[-1] * 2)
-        self.parent.assertEqual(
-            result.last_hidden_state.shape, (self.batch_size, self.hidden_sizes[-1], expected_height, expected_width)
-        )
-
-    def create_and_check_for_image_segmentation(self, config, pixel_values, labels):
-        config.num_labels = self.num_labels
-        model = TFSegformerForSemanticSegmentation(config)
-        result = model(pixel_values, training=False)
-        self.parent.assertEqual(
-            result.logits.shape, (self.batch_size, self.num_labels, self.image_size // 4, self.image_size // 4)
-        )
-        result = model(pixel_values, labels=labels, training=False)
-        self.parent.assertEqual(
-            result.logits.shape, (self.batch_size, self.num_labels, self.image_size // 4, self.image_size // 4)
-        )
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values, labels = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-    def prepare_config_and_inputs_for_keras_fit(self, for_segmentation: bool = False):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values, seg_labels = config_and_inputs
-        if for_segmentation:
-            inputs_dict = {"pixel_values": pixel_values, "labels": seg_labels}
-        else:
-            inputs_dict = {"pixel_values": pixel_values, "labels": tf.zeros(self.batch_size)}
-        return config, inputs_dict
-
-
-@require_tf
-class TFSegformerModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (TFSegformerModel, TFSegformerForImageClassification, TFSegformerForSemanticSegmentation)
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {"feature-extraction": TFSegformerModel, "image-classification": TFSegformerForImageClassification}
-        if is_tf_available()
-        else {}
-    )
-
-    test_head_masking = False
-    test_onnx = False
-    test_pruning = False
-    test_resize_embeddings = False
-
-    def setUp(self):
-        self.model_tester = TFSegformerModelTester(self)
-        self.config_tester = TFSegformerConfigTester(self, config_class=SegformerConfig, has_text_modality=False)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    @unittest.skip("SegFormer does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip("SegFormer does not have get_input_embeddings method and get_output_embeddings methods")
-    def test_model_common_attributes(self):
-        pass
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = False
-            config.return_dict = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.attentions
-
-            expected_num_attentions = sum(self.model_tester.depths)
-            self.assertEqual(len(attentions), expected_num_attentions)
-
-            # check that output_attentions also work using config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.attentions
-
-            self.assertEqual(len(attentions), expected_num_attentions)
-
-            # verify the first attentions (first block, first layer)
-            expected_seq_len = (self.model_tester.image_size // 4) ** 2
-            expected_reduced_seq_len = (self.model_tester.image_size // (4 * self.model_tester.sr_ratios[0])) ** 2
-            self.assertListEqual(
-                list(attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads[0], expected_seq_len, expected_reduced_seq_len],
-            )
-
-            # verify the last attentions (last block, last layer)
-            expected_seq_len = (self.model_tester.image_size // 32) ** 2
-            expected_reduced_seq_len = (self.model_tester.image_size // (32 * self.model_tester.sr_ratios[-1])) ** 2
-            self.assertListEqual(
-                list(attentions[-1].shape[-3:]),
-                [self.model_tester.num_attention_heads[-1], expected_seq_len, expected_reduced_seq_len],
-            )
-            out_len = len(outputs)
-
-            # Check attention is always last and order is fine
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            self.assertEqual(out_len + 1, len(outputs))
-
-            self_attentions = outputs.attentions
-
-            self.assertEqual(len(self_attentions), expected_num_attentions)
-            # verify the first attentions (first block, first layer)
-            expected_seq_len = (self.model_tester.image_size // 4) ** 2
-            expected_reduced_seq_len = (self.model_tester.image_size // (4 * self.model_tester.sr_ratios[0])) ** 2
-            self.assertListEqual(
-                list(self_attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads[0], expected_seq_len, expected_reduced_seq_len],
-            )
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            hidden_states = outputs.hidden_states
-
-            expected_num_layers = self.model_tester.num_encoder_blocks
-            self.assertEqual(len(hidden_states), expected_num_layers)
-
-            # verify the first hidden states (first block)
-            self.assertListEqual(
-                list(hidden_states[0].shape[-3:]),
-                [
-                    self.model_tester.hidden_sizes[0],
-                    self.model_tester.image_size // 4,
-                    self.model_tester.image_size // 4,
-                ],
-            )
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-    def test_model_outputs_equivalence(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
-            tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs)
-            dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple()
-
-            def recursive_check(tuple_object, dict_object):
-                if isinstance(tuple_object, (list, tuple)):
-                    for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
-                        recursive_check(tuple_iterable_value, dict_iterable_value)
-                elif tuple_object is None:
-                    return
-                else:
-                    self.assertTrue(
-                        all(tf.equal(tuple_object, dict_object)),
-                        msg=(
-                            "Tuple and dict output are not equal. Difference:"
-                            f" {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}"
-                        ),
-                    )
-
-                recursive_check(tuple_output, dict_output)
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
-            check_equivalence(model, tuple_inputs, dict_inputs)
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
-            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
-
-            if self.has_attentions:
-                tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
-                dict_inputs = self._prepare_for_class(inputs_dict, model_class)
-                check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
-
-            # todo: incorporate label support for semantic segmentation in `test_modeling_tf_common.py`.
-
-    @unittest.skipIf(
-        not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0,
-        reason="TF does not support backprop for grouped convolutions on CPU.",
-    )
-    def test_dataset_conversion(self):
-        super().test_dataset_conversion()
-
-    def check_keras_fit_results(self, val_loss1, val_loss2, atol=2e-1, rtol=2e-1):
-        self.assertTrue(np.allclose(val_loss1, val_loss2, atol=atol, rtol=rtol))
-
-    @unittest.skipIf(
-        not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0,
-        reason="TF does not support backprop for grouped convolutions on CPU.",
-    )
-    @slow
-    def test_keras_fit(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            # Since `TFSegformerModel` cannot operate with the default `fit()` method.
-            if model_class.__name__ != "TFSegformerModel":
-                model = model_class(config)
-                if getattr(model, "hf_compute_loss", None):
-                    super().test_keras_fit()
-
-    def test_loss_computation(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        def apply(model):
-            for_segmentation = True if model_class.__name__ == "TFSegformerForSemanticSegmentation" else False
-            # The number of elements in the loss should be the same as the number of elements in the label
-            _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit(
-                for_segmentation=for_segmentation
-            )
-            added_label = prepared_for_class[sorted(prepared_for_class.keys() - inputs_dict.keys(), reverse=True)[0]]
-            loss_size = tf.size(added_label)
-
-            # Test that model correctly compute the loss with kwargs
-            possible_input_names = {"input_ids", "pixel_values", "input_features"}
-            input_name = possible_input_names.intersection(set(prepared_for_class)).pop()
-            model_input = prepared_for_class.pop(input_name)
-
-            loss = model(model_input, **prepared_for_class)[0]
-
-            if model_class.__name__ == "TFSegformerForSemanticSegmentation":
-                # Semantic segmentation loss is computed similarly as
-                # https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_tf_utils.py#L210.
-                self.assertEqual(loss.shape, (1,))
-            else:
-                self.assertEqual(loss.shape, [loss_size])
-
-            # Test that model correctly compute the loss with a dict
-            _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit(
-                for_segmentation=for_segmentation
-            )
-            loss = model(**prepared_for_class)[0]
-
-            if model_class.__name__ == "TFSegformerForSemanticSegmentation":
-                self.assertEqual(loss.shape, (1,))
-            else:
-                self.assertEqual(loss.shape, [loss_size])
-
-            # Test that model correctly compute the loss with a tuple
-            label_keys = prepared_for_class.keys() - inputs_dict.keys()
-            signature = inspect.signature(model.call).parameters
-            signature_names = list(signature.keys())
-
-            # Create a dictionary holding the location of the tensors in the tuple
-            tuple_index_mapping = {0: input_name}
-            for label_key in label_keys:
-                label_key_index = signature_names.index(label_key)
-                tuple_index_mapping[label_key_index] = label_key
-            sorted_tuple_index_mapping = sorted(tuple_index_mapping.items())
-            # Initialize a list with their default values, update the values and convert to a tuple
-            list_input = []
-
-            for name in signature_names:
-                if name != "kwargs":
-                    list_input.append(signature[name].default)
-
-            for index, value in sorted_tuple_index_mapping:
-                list_input[index] = prepared_for_class[value]
-
-            tuple_input = tuple(list_input)
-
-            # Send to model
-            loss = model(tuple_input[:-1])[0]
-            if model_class.__name__ == "TFSegformerForSemanticSegmentation":
-                self.assertEqual(loss.shape, (1,))
-            else:
-                self.assertEqual(loss.shape, [loss_size])
-
-        for model_class in self.all_model_classes:
-            # Since `TFSegformerModel` won't have labels against which we
-            # could compute loss.
-            if model_class.__name__ != "TFSegformerModel":
-                model = model_class(config)
-                apply(model)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "nvidia/segformer-b0-finetuned-ade-512-512"
-        model = TFSegformerModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_tf
-class TFSegformerModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_image_segmentation_ade(self):
-        # only resize + normalize
-        image_processor = SegformerImageProcessor(
-            image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False
-        )
-        model = TFSegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
-
-        image = prepare_img()
-        encoded_inputs = image_processor(images=image, return_tensors="tf")
-        pixel_values = encoded_inputs.pixel_values
-
-        outputs = model(pixel_values, training=False)
-
-        expected_shape = tf.TensorShape((1, model.config.num_labels, 128, 128))
-        self.assertEqual(outputs.logits.shape, expected_shape)
-
-        expected_slice = tf.constant(
-            [
-                [[-4.6310, -5.5232, -6.2356], [-5.1921, -6.1444, -6.5996], [-5.4424, -6.2790, -6.7574]],
-                [[-12.1391, -13.3122, -13.9554], [-12.8732, -13.9352, -14.3563], [-12.9438, -13.8226, -14.2513]],
-                [[-12.5134, -13.4686, -14.4915], [-12.8669, -14.4343, -14.7758], [-13.2523, -14.5819, -15.0694]],
-            ]
-        )
-        tf.debugging.assert_near(outputs.logits[0, :3, :3, :3], expected_slice, atol=1e-4)
-
-    @slow
-    def test_inference_image_segmentation_city(self):
-        # only resize + normalize
-        image_processor = SegformerImageProcessor(
-            image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False
-        )
-        model = TFSegformerForSemanticSegmentation.from_pretrained(
-            "nvidia/segformer-b1-finetuned-cityscapes-1024-1024"
-        )
-
-        image = prepare_img()
-        encoded_inputs = image_processor(images=image, return_tensors="tf")
-        pixel_values = encoded_inputs.pixel_values
-
-        outputs = model(pixel_values, training=False)
-
-        expected_shape = tf.TensorShape((1, model.config.num_labels, 128, 128))
-        self.assertEqual(outputs.logits.shape, expected_shape)
-
-        expected_slice = tf.constant(
-            [
-                [[-13.5748, -13.9111, -12.6500], [-14.3500, -15.3683, -14.2328], [-14.7532, -16.0424, -15.6087]],
-                [[-17.1651, -15.8725, -12.9653], [-17.2580, -17.3718, -14.8223], [-16.6058, -16.8783, -16.7452]],
-                [[-3.6456, -3.0209, -1.4203], [-3.0797, -3.1959, -2.0000], [-1.8757, -1.9217, -1.6997]],
-            ]
-        )
-        tf.debugging.assert_near(outputs.logits[0, :3, :3, :3], expected_slice, atol=1e-1)
diff --git a/tests/models/speech_encoder_decoder/test_modeling_flax_speech_encoder_decoder.py b/tests/models/speech_encoder_decoder/test_modeling_flax_speech_encoder_decoder.py
deleted file mode 100644
index 649ac3e292..0000000000
--- a/tests/models/speech_encoder_decoder/test_modeling_flax_speech_encoder_decoder.py
+++ /dev/null
@@ -1,621 +0,0 @@
-# Copyright 2022 HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import tempfile
-import unittest
-
-import numpy as np
-
-from transformers import is_flax_available, is_torch_available
-from transformers.testing_utils import require_flax, slow
-
-from ...test_modeling_flax_common import floats_tensor, ids_tensor, random_attention_mask
-from ..bart.test_modeling_flax_bart import FlaxBartStandaloneDecoderModelTester
-from ..bert.test_modeling_flax_bert import FlaxBertModelTester
-from ..gpt2.test_modeling_flax_gpt2 import FlaxGPT2ModelTester
-from ..wav2vec2.test_modeling_flax_wav2vec2 import FlaxWav2Vec2ModelTester
-
-
-if is_flax_available():
-    import jax
-    import jax.numpy as jnp
-    from flax.training.common_utils import onehot
-    from flax.traverse_util import flatten_dict
-
-    from transformers import (
-        FlaxBartForCausalLM,
-        FlaxBertForCausalLM,
-        FlaxGPT2LMHeadModel,
-        FlaxSpeechEncoderDecoderModel,
-        FlaxWav2Vec2Model,
-        SpeechEncoderDecoderConfig,
-    )
-    from transformers.modeling_flax_outputs import FlaxBaseModelOutput
-
-if is_torch_available():
-    from transformers import SpeechEncoderDecoderModel
-
-
-@require_flax
-class FlaxEncoderDecoderMixin:
-    def get_encoder_decoder_model(self, config, decoder_config):
-        raise NotImplementedError
-
-    def prepare_config_and_inputs(self):
-        raise NotImplementedError
-
-    def get_pretrained_model(self):
-        raise NotImplementedError
-
-    def check_encoder_decoder_model_from_pretrained_configs(
-        self,
-        config,
-        inputs,
-        attention_mask,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        **kwargs,
-    ):
-        encoder_decoder_config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)
-        self.assertTrue(encoder_decoder_config.decoder.is_decoder)
-
-        enc_dec_model = FlaxSpeechEncoderDecoderModel(encoder_decoder_config)
-
-        self.assertTrue(enc_dec_model.config.is_encoder_decoder)
-        self.assertFalse(enc_dec_model.config.tie_word_embeddings)
-
-        outputs_encoder_decoder = enc_dec_model(
-            inputs=inputs,
-            attention_mask=attention_mask,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-        )
-
-        self.assertEqual(
-            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
-        )
-
-    def check_encoder_decoder_model(
-        self,
-        config,
-        inputs,
-        attention_mask,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        **kwargs,
-    ):
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        enc_dec_model = SpeechEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
-        self.assertTrue(enc_dec_model.config.decoder.is_decoder)
-        self.assertTrue(enc_dec_model.config.decoder.add_cross_attention)
-        self.assertTrue(enc_dec_model.config.is_encoder_decoder)
-
-        outputs_encoder_decoder = enc_dec_model(
-            inputs=inputs,
-            attention_mask=attention_mask,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-        )
-
-        self.assertEqual(
-            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
-        )
-
-        encoder_outputs = FlaxBaseModelOutput(last_hidden_state=outputs_encoder_decoder.encoder_hidden_states[-1])
-
-        outputs_encoder_decoder = enc_dec_model(
-            attention_mask, decoder_input_ids, decoder_attention_mask, encoder_outputs=encoder_outputs
-        )
-
-        self.assertEqual(
-            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
-        )
-
-    def check_encoder_decoder_model_from_pretrained(
-        self,
-        config,
-        inputs,
-        attention_mask,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        return_dict,
-        **kwargs,
-    ):
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict}
-        enc_dec_model = FlaxSpeechEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs)
-        outputs_encoder_decoder = enc_dec_model(
-            inputs=inputs,
-            attention_mask=attention_mask,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-            output_hidden_states=True,
-            return_dict=True,
-        )
-
-        self.assertEqual(
-            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
-        )
-
-    def check_save_and_load(
-        self,
-        config,
-        inputs,
-        attention_mask,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        **kwargs,
-    ):
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model}
-        enc_dec_model = FlaxSpeechEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs)
-
-        outputs = enc_dec_model(
-            inputs=inputs,
-            attention_mask=attention_mask,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-        )
-        out_2 = np.array(outputs[0])
-        out_2[np.isnan(out_2)] = 0
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            enc_dec_model.save_pretrained(tmpdirname)
-            FlaxSpeechEncoderDecoderModel.from_pretrained(tmpdirname)
-
-            after_outputs = enc_dec_model(
-                inputs=inputs,
-                attention_mask=attention_mask,
-                decoder_input_ids=decoder_input_ids,
-                decoder_attention_mask=decoder_attention_mask,
-            )
-            out_1 = np.array(after_outputs[0])
-            out_1[np.isnan(out_1)] = 0
-            max_diff = np.amax(np.abs(out_1 - out_2))
-            self.assertLessEqual(max_diff, 4e-2)
-
-    def check_encoder_decoder_model_from_encoder_decoder_pretrained(
-        self,
-        config,
-        inputs,
-        attention_mask,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        **kwargs,
-    ):
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        # assert that loading encoder and decoder models from configs has been correctly executed
-        self.assertEqual(config.add_adapter, encoder_model.config.add_adapter)
-        self.assertEqual(decoder_config.use_cache, decoder_model.config.use_cache)
-
-        with tempfile.TemporaryDirectory() as enc_tmpdir:
-            with tempfile.TemporaryDirectory() as dec_tmpdir:
-                encoder_model.save_pretrained(enc_tmpdir)
-                decoder_model.save_pretrained(dec_tmpdir)
-                # load a model from pretrained encoder and decoder checkpoints, setting one encoder and one decoder kwarg opposite to that specified in their respective configs
-                enc_dec_model = FlaxSpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
-                    encoder_pretrained_model_name_or_path=enc_tmpdir,
-                    decoder_pretrained_model_name_or_path=dec_tmpdir,
-                    encoder_add_adapter=not config.add_adapter,
-                    decoder_use_cache=not decoder_config.use_cache,
-                )
-
-        # assert that setting encoder and decoder kwargs opposite to those in the configs has correctly been applied
-        self.assertNotEqual(config.add_adapter, enc_dec_model.config.encoder.add_adapter)
-        self.assertNotEqual(decoder_config.use_cache, enc_dec_model.config.decoder.use_cache)
-
-        outputs_encoder_decoder = enc_dec_model(
-            inputs=inputs,
-            attention_mask=attention_mask,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-            output_hidden_states=True,
-            return_dict=True,
-        )
-
-        self.assertEqual(
-            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
-        )
-
-    def check_encoder_decoder_model_output_attentions(
-        self,
-        config,
-        inputs,
-        attention_mask,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        **kwargs,
-    ):
-        # make the decoder inputs a different shape from the encoder inputs to harden the test
-        decoder_input_ids = decoder_input_ids[:, :-1]
-        decoder_attention_mask = decoder_attention_mask[:, :-1]
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model}
-        enc_dec_model = FlaxSpeechEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs)
-        outputs_encoder_decoder = enc_dec_model(
-            inputs=inputs,
-            attention_mask=attention_mask,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-            output_attentions=True,
-        )
-
-        encoder_attentions = outputs_encoder_decoder["encoder_attentions"]
-        self.assertEqual(len(encoder_attentions), config.num_hidden_layers)
-
-        seq_len = enc_dec_model._get_feat_extract_output_lengths(inputs.shape[1])
-        self.assertEqual(encoder_attentions[0].shape[-3:], (config.num_attention_heads, seq_len, seq_len))
-
-        decoder_attentions = outputs_encoder_decoder["decoder_attentions"]
-        num_decoder_layers = (
-            decoder_config.num_decoder_layers
-            if hasattr(decoder_config, "num_decoder_layers")
-            else decoder_config.num_hidden_layers
-        )
-        self.assertEqual(len(decoder_attentions), num_decoder_layers)
-
-        self.assertEqual(
-            decoder_attentions[0].shape[-3:],
-            (decoder_config.num_attention_heads, decoder_input_ids.shape[-1], decoder_input_ids.shape[-1]),
-        )
-
-        cross_attentions = outputs_encoder_decoder["cross_attentions"]
-        self.assertEqual(len(cross_attentions), num_decoder_layers)
-
-        cross_attention_input_seq_len = decoder_input_ids.shape[-1]
-
-        self.assertEqual(
-            cross_attentions[0].shape[-3:],
-            (decoder_config.num_attention_heads, cross_attention_input_seq_len, seq_len),
-        )
-
-    def check_encoder_decoder_model_generate(self, inputs, config, decoder_config, **kwargs):
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model}
-        enc_dec_model = FlaxSpeechEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs)
-
-        pad_token_id = enc_dec_model.config.decoder.pad_token_id
-        eos_token_id = enc_dec_model.config.decoder.eos_token_id
-        decoder_start_token_id = enc_dec_model.config.decoder.decoder_start_token_id
-
-        # Copied from generation.utils (GPT2 doesn't have `pad_token_id`)
-        if pad_token_id is None and eos_token_id is not None:
-            pad_token_id = eos_token_id
-        if decoder_start_token_id is None:
-            decoder_start_token_id = enc_dec_model.config.decoder.bos_token_id
-
-        # Bert does not have a bos token id, so use pad_token_id instead
-        # Copied from `test_modeling_encoder_decoder.py`
-        if decoder_start_token_id is None:
-            decoder_start_token_id = pad_token_id
-
-        generated_output = enc_dec_model.generate(
-            inputs,
-            pad_token_id=pad_token_id,
-            eos_token_id=eos_token_id,
-            decoder_start_token_id=decoder_start_token_id,
-        )
-        generated_sequences = generated_output.sequences
-        self.assertEqual(generated_sequences.shape, (inputs.shape[0],) + (decoder_config.max_length,))
-
-    def check_freeze_feature_encoder(
-        self,
-        config,
-        inputs,
-        attention_mask,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        **kwargs,
-    ):
-        encoder_decoder_config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)
-        enc_dec_model = FlaxSpeechEncoderDecoderModel(encoder_decoder_config)
-        params = enc_dec_model.params
-
-        def cross_entropy(logits, labels):
-            return -jnp.sum(labels * jax.nn.log_softmax(logits, axis=-1), axis=-1)
-
-        # define a dummy loss function for computing the loss over a forward pass
-        def compute_loss(
-            params,
-            inputs,
-            attention_mask,
-            decoder_input_ids,
-            freeze_feature_encoder: bool = False,
-        ):
-            outputs_enc_dec = enc_dec_model(
-                inputs=inputs,
-                attention_mask=attention_mask,
-                decoder_input_ids=decoder_input_ids,
-                freeze_feature_encoder=freeze_feature_encoder,
-                params=params,
-            )
-            logits = outputs_enc_dec.logits
-            vocab_size = logits.shape[-1]
-            loss = cross_entropy(logits, onehot(labels=decoder_input_ids, num_classes=vocab_size)).sum()
-            return (loss, logits)
-
-        # transform the loss function to get the gradients
-        grad_fn = jax.value_and_grad(compute_loss, has_aux=True)
-
-        # compute the loss, logits, and gradients for the unfrozen model
-        (loss, logits), grads = grad_fn(
-            params, inputs, attention_mask, decoder_input_ids, freeze_feature_encoder=False
-        )
-
-        # compare to the loss, logits and gradients for the frozen model
-        (loss_frozen, logits_frozen), grads_frozen = grad_fn(
-            params, inputs, attention_mask, decoder_input_ids, freeze_feature_encoder=True
-        )
-
-        # ensure that the logits and losses remain precisely equal
-        self.assertTrue((logits == logits_frozen).all())
-        self.assertEqual(loss, loss_frozen)
-
-        grads = flatten_dict(grads)
-        grads_frozen = flatten_dict(grads_frozen)
-
-        # ensure that the dicts of gradients contain the same keys
-        self.assertEqual(grads.keys(), grads_frozen.keys())
-
-        # ensure that the gradients of the feature extractor layers are precisely zero when frozen and contain non-zero entries when unfrozen
-        feature_extractor_grads = tuple(grads[k] for k in grads if "feature_extractor" in k)
-        feature_extractor_grads_frozen = tuple(grads_frozen[k] for k in grads_frozen if "feature_extractor" in k)
-
-        for feature_extractor_grad, feature_extractor_grad_frozen in zip(
-            feature_extractor_grads, feature_extractor_grads_frozen
-        ):
-            self.assertTrue((feature_extractor_grad_frozen == 0.0).all())
-            self.assertTrue((feature_extractor_grad > 0.0).any())
-
-        # ensure that the gradients of all unfrozen layers remain precisely equal, i.e. all layers excluding the frozen 'feature_extractor'
-        grads = tuple(grads[k] for k in grads if "feature_extractor" not in k)
-        grads_frozen = tuple(grads_frozen[k] for k in grads_frozen if "feature_extractor" not in k)
-
-        for grad, grad_frozen in zip(grads, grads_frozen):
-            self.assertTrue((grad == grad_frozen).all())
-
-    def test_encoder_decoder_model_from_pretrained_configs(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_from_pretrained_configs(**input_ids_dict)
-
-    def test_encoder_decoder_model_from_pretrained(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_from_pretrained(**input_ids_dict, return_dict=False)
-
-    def test_encoder_decoder_model_from_pretrained_return_dict(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_from_pretrained(**input_ids_dict, return_dict=True)
-
-    def test_save_and_load_from_pretrained(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_save_and_load(**input_ids_dict)
-
-    def test_encoder_decoder_model_from_encoder_decoder_pretrained(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_from_encoder_decoder_pretrained(**input_ids_dict)
-
-    def test_encoder_decoder_model_output_attentions(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_output_attentions(**input_ids_dict)
-
-    def test_freeze_feature_encoder(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_freeze_feature_encoder(**input_ids_dict)
-
-    def test_encoder_decoder_model_generate(self):
-        input_ids_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_generate(**input_ids_dict)
-
-    def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float):
-        diff = np.abs(a - b).max()
-        self.assertLessEqual(diff, tol, f"Difference between torch and flax is {diff} (>= {tol}).")
-
-    @slow
-    def test_real_model_save_load_from_pretrained(self):
-        model_2 = self.get_pretrained_model()
-        inputs = ids_tensor([13, 5], model_2.config.encoder.vocab_size)
-        decoder_input_ids = ids_tensor([13, 1], model_2.config.decoder.vocab_size)
-        attention_mask = ids_tensor([13, 5], vocab_size=2)
-
-        outputs = model_2(
-            inputs=inputs,
-            decoder_input_ids=decoder_input_ids,
-            attention_mask=attention_mask,
-        )
-        out_2 = np.array(outputs[0])
-        out_2[np.isnan(out_2)] = 0
-
-        with tempfile.TemporaryDirectory() as tmp_dirname:
-            model_2.save_pretrained(tmp_dirname)
-            model_1 = FlaxSpeechEncoderDecoderModel.from_pretrained(tmp_dirname)
-
-            after_outputs = model_1(
-                inputs=inputs,
-                decoder_input_ids=decoder_input_ids,
-                attention_mask=attention_mask,
-            )
-            out_1 = np.array(after_outputs[0])
-            out_1[np.isnan(out_1)] = 0
-            max_diff = np.amax(np.abs(out_1 - out_2))
-            self.assertLessEqual(max_diff, 4e-2)
-
-
-@require_flax
-class FlaxWav2Vec2GPT2ModelTest(FlaxEncoderDecoderMixin, unittest.TestCase):
-    def get_pretrained_model_and_inputs(self):
-        model = FlaxSpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
-            "facebook/wav2vec2-large-lv60", "openai-community/gpt2-medium"
-        )
-        batch_size = 13
-        input_values = floats_tensor([batch_size, 512], scale=1.0)
-        attention_mask = random_attention_mask([batch_size, 512])
-        decoder_input_ids = ids_tensor([batch_size, 4], model.config.decoder.vocab_size)
-        decoder_attention_mask = random_attention_mask([batch_size, 4])
-        inputs = {
-            "inputs": input_values,
-            "attention_mask": attention_mask,
-            "decoder_input_ids": decoder_input_ids,
-            "decoder_attention_mask": decoder_attention_mask,
-        }
-
-        return model, inputs
-
-    def get_encoder_decoder_model(self, config, decoder_config):
-        encoder_model = FlaxWav2Vec2Model(config)
-        decoder_model = FlaxGPT2LMHeadModel(decoder_config)
-        return encoder_model, decoder_model
-
-    def prepare_config_and_inputs(self):
-        model_tester_encoder = FlaxWav2Vec2ModelTester(self, batch_size=13)
-        model_tester_decoder = FlaxGPT2ModelTester(self, batch_size=13)
-        encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs()
-        decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs_for_decoder()
-        (config, inputs, attention_mask) = encoder_config_and_inputs
-        (
-            decoder_config,
-            decoder_input_ids,
-            decoder_attention_mask,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        ) = decoder_config_and_inputs
-
-        # make sure that cross attention layers are added
-        decoder_config.add_cross_attention = True
-        return {
-            "config": config,
-            "inputs": inputs,
-            "attention_mask": attention_mask,
-            "decoder_config": decoder_config,
-            "decoder_input_ids": decoder_input_ids,
-            "decoder_attention_mask": decoder_attention_mask,
-            "encoder_hidden_states": encoder_hidden_states,
-        }
-
-
-@require_flax
-class FlaxWav2Vec2BartModelTest(FlaxEncoderDecoderMixin, unittest.TestCase):
-    def get_pretrained_model_and_inputs(self):
-        model = FlaxSpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
-            "facebook/wav2vec2-large-lv60", "bart-large"
-        )
-        batch_size = 13
-        input_values = floats_tensor([batch_size, 512], scale=1.0)
-        attention_mask = random_attention_mask([batch_size, 512])
-        decoder_input_ids = ids_tensor([batch_size, 4], model.config.decoder.vocab_size)
-        decoder_attention_mask = random_attention_mask([batch_size, 4])
-        inputs = {
-            "inputs": input_values,
-            "attention_mask": attention_mask,
-            "decoder_input_ids": decoder_input_ids,
-            "decoder_attention_mask": decoder_attention_mask,
-        }
-
-        return model, inputs
-
-    def get_encoder_decoder_model(self, config, decoder_config):
-        encoder_model = FlaxWav2Vec2Model(config)
-        decoder_model = FlaxBartForCausalLM(decoder_config)
-        return encoder_model, decoder_model
-
-    def prepare_config_and_inputs(self):
-        model_tester_encoder = FlaxWav2Vec2ModelTester(self, batch_size=13)
-        model_tester_decoder = FlaxBartStandaloneDecoderModelTester(self, batch_size=13)
-        encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs()
-        decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs_for_decoder()
-        (config, inputs, attention_mask) = encoder_config_and_inputs
-        (
-            decoder_config,
-            decoder_input_ids,
-            decoder_attention_mask,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        ) = decoder_config_and_inputs
-
-        # make sure that cross attention layers are added
-        decoder_config.add_cross_attention = True
-        return {
-            "config": config,
-            "inputs": inputs,
-            "attention_mask": attention_mask,
-            "decoder_config": decoder_config,
-            "decoder_input_ids": decoder_input_ids,
-            "decoder_attention_mask": decoder_attention_mask,
-            "encoder_hidden_states": encoder_hidden_states,
-        }
-
-
-@require_flax
-class FlaxWav2Vec2BertModelTest(FlaxEncoderDecoderMixin, unittest.TestCase):
-    def get_pretrained_model_and_inputs(self):
-        model = FlaxSpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
-            "facebook/wav2vec2-large-lv60", "google-bert/bert-large-uncased"
-        )
-        batch_size = 13
-        input_values = floats_tensor([batch_size, 512], model.config.encoder.vocab_size)
-        attention_mask = random_attention_mask([batch_size, 512])
-        decoder_input_ids = ids_tensor([batch_size, 4], model.config.decoder.vocab_size)
-        decoder_attention_mask = random_attention_mask([batch_size, 4])
-        inputs = {
-            "inputs": input_values,
-            "attention_mask": attention_mask,
-            "decoder_input_ids": decoder_input_ids,
-            "decoder_attention_mask": decoder_attention_mask,
-        }
-
-        return model, inputs
-
-    def get_encoder_decoder_model(self, config, decoder_config):
-        encoder_model = FlaxWav2Vec2Model(config)
-        decoder_model = FlaxBertForCausalLM(decoder_config)
-        return encoder_model, decoder_model
-
-    def prepare_config_and_inputs(self):
-        model_tester_encoder = FlaxWav2Vec2ModelTester(self, batch_size=13)
-        model_tester_decoder = FlaxBertModelTester(self, batch_size=13)
-        encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs()
-        decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs_for_decoder()
-        (config, inputs, attention_mask) = encoder_config_and_inputs
-        (
-            decoder_config,
-            decoder_input_ids,
-            decoder_attention_mask,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        ) = decoder_config_and_inputs
-
-        # make sure that cross attention layers are added
-        decoder_config.add_cross_attention = True
-        return {
-            "config": config,
-            "inputs": inputs,
-            "attention_mask": attention_mask,
-            "decoder_config": decoder_config,
-            "decoder_input_ids": decoder_input_ids,
-            "decoder_attention_mask": decoder_attention_mask,
-            "encoder_hidden_states": encoder_hidden_states,
-        }
diff --git a/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py b/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py
deleted file mode 100644
index 613081a82e..0000000000
--- a/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py
+++ /dev/null
@@ -1,490 +0,0 @@
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TensorFlow Speech2Text model."""
-
-from __future__ import annotations
-
-import inspect
-import unittest
-
-from transformers import Speech2TextConfig
-from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
-from transformers.utils import cached_property, is_tf_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import Speech2TextProcessor, TFSpeech2TextForConditionalGeneration, TFSpeech2TextModel
-
-
-def prepare_speech_to_text_inputs_dict(
-    config,
-    input_features,
-    decoder_input_ids,
-    attention_mask=None,
-    decoder_attention_mask=None,
-    head_mask=None,
-    decoder_head_mask=None,
-    cross_attn_head_mask=None,
-):
-    if attention_mask is None:
-        attention_mask = tf.math.not_equal(input_features, 0)
-    if decoder_attention_mask is None:
-        decoder_attention_mask = tf.math.not_equal(decoder_input_ids, config.pad_token_id)
-    if head_mask is None:
-        head_mask = tf.ones((config.encoder_layers, config.encoder_attention_heads))
-    if decoder_head_mask is None:
-        decoder_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads))
-    if cross_attn_head_mask is None:
-        cross_attn_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads))
-    return {
-        "input_features": input_features,
-        "decoder_input_ids": decoder_input_ids,
-        "attention_mask": attention_mask,
-        "decoder_attention_mask": attention_mask,
-        "head_mask": head_mask,
-        "decoder_head_mask": decoder_head_mask,
-        "cross_attn_head_mask": cross_attn_head_mask,
-    }
-
-
-@require_tf
-class TFSpeech2TextModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_labels=False,
-        vocab_size=99,
-        hidden_size=16,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=4,
-        num_conv_layers=2,
-        conv_kernel_sizes=(5, 5),
-        conv_channels=32,
-        input_feat_per_channel=24,
-        input_channels=1,
-        hidden_act="relu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=20,
-        max_source_positions=20,
-        max_target_positions=20,
-        eos_token_id=2,
-        pad_token_id=1,
-        bos_token_id=0,
-        scale_embedding=False,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.num_conv_layers = num_conv_layers
-        self.conv_kernel_sizes = conv_kernel_sizes
-        self.conv_channels = conv_channels
-        self.input_feat_per_channel = input_feat_per_channel
-        self.input_channels = input_channels
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.max_source_positions = max_source_positions
-        self.max_target_positions = max_target_positions
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.scale_embedding = scale_embedding
-
-    def prepare_config_and_inputs(self):
-        input_features = floats_tensor(
-            [self.batch_size, self.seq_length, self.input_feat_per_channel], self.vocab_size
-        )
-        attention_mask = tf.ones([self.batch_size, self.seq_length], dtype=tf.int64)
-        decoder_input_ids = tf.math.maximum(ids_tensor([self.batch_size, self.seq_length], self.vocab_size), 2)
-
-        config = self.get_config()
-        inputs_dict = prepare_speech_to_text_inputs_dict(
-            config,
-            input_features=input_features,
-            decoder_input_ids=decoder_input_ids,
-            attention_mask=attention_mask,
-        )
-        return config, inputs_dict
-
-    def get_config(self):
-        return Speech2TextConfig(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            encoder_layers=self.num_hidden_layers,
-            decoder_layers=self.num_hidden_layers,
-            encoder_attention_heads=self.num_attention_heads,
-            decoder_attention_heads=self.num_attention_heads,
-            encoder_ffn_dim=self.intermediate_size,
-            decoder_ffn_dim=self.intermediate_size,
-            num_conv_layers=self.num_conv_layers,
-            conv_kernel_sizes=self.conv_kernel_sizes,
-            conv_channels=self.conv_channels,
-            input_feat_per_channel=self.input_feat_per_channel,
-            input_channels=self.input_channels,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            max_source_positions=self.max_source_positions,
-            max_target_positions=self.max_target_positions,
-            eos_token_id=self.eos_token_id,
-            bos_token_id=self.bos_token_id,
-            pad_token_id=self.pad_token_id,
-            scale_embedding=self.scale_embedding,
-        )
-
-    def prepare_config_and_inputs_for_common(self):
-        config, inputs_dict = self.prepare_config_and_inputs()
-        return config, inputs_dict
-
-    def get_subsampled_output_lengths(self, input_lengths):
-        """
-        Computes the output length of the convolutional layers
-        """
-
-        for _ in range(self.num_conv_layers):
-            input_lengths = (input_lengths - 1) // 2 + 1
-
-        return input_lengths
-
-    def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict):
-        model = TFSpeech2TextModel(config=config).get_decoder()
-        input_ids = inputs_dict["decoder_input_ids"]
-        attention_mask = inputs_dict["decoder_attention_mask"]
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
-
-        _, past_key_values = outputs.to_tuple()
-
-        # create hypothetical multiple next token and extent to next_input_ids
-        next_tokens = tf.math.maximum(ids_tensor((self.batch_size, 3), config.vocab_size), 2)
-        next_attn_mask = ids_tensor((self.batch_size, 3), 2, dtype=tf.int64)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([attention_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"]
-        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[
-            "last_hidden_state"
-        ]
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, atol=1e-2)
-
-
-@require_tf
-class TFSpeech2TextModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (TFSpeech2TextModel, TFSpeech2TextForConditionalGeneration) if is_tf_available() else ()
-    all_generative_model_classes = (TFSpeech2TextForConditionalGeneration,) if is_tf_available() else ()
-    pipeline_model_mapping = {"feature-extraction": TFSpeech2TextModel} if is_tf_available() else {}
-    is_encoder_decoder = True
-    test_pruning = False
-    test_missing_keys = False
-    test_onnx = False
-
-    input_name = "input_ids"
-
-    def setUp(self):
-        self.model_tester = TFSpeech2TextModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=Speech2TextConfig)
-        self.maxDiff = 3000
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_decoder_model_past_with_large_inputs(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
-
-    # not implemented currently
-    def test_inputs_embeds(self):
-        pass
-
-    # training is not supported yet
-    def test_training(self):
-        pass
-
-    def test_training_gradient_checkpointing(self):
-        pass
-
-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant(self):
-        pass
-
-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant_false(self):
-        pass
-
-    def test_generate_fp16(self):
-        pass
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
-
-            expected_num_layers = getattr(
-                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-            )
-            self.assertEqual(len(hidden_states), expected_num_layers)
-
-            if hasattr(self.model_tester, "encoder_seq_length"):
-                seq_length = self.model_tester.encoder_seq_length
-            else:
-                seq_length = self.model_tester.seq_length
-
-            subsampled_seq_length = model._get_feat_extract_output_lengths(seq_length)
-
-            self.assertListEqual(
-                list(hidden_states[0].shape[-2:]),
-                [subsampled_seq_length, self.model_tester.hidden_size],
-            )
-
-            if config.is_encoder_decoder:
-                hidden_states = outputs.decoder_hidden_states
-
-                self.assertIsInstance(hidden_states, (list, tuple))
-                self.assertEqual(len(hidden_states), expected_num_layers)
-                seq_len = getattr(self.model_tester, "seq_length", None)
-                decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
-
-                self.assertListEqual(
-                    list(hidden_states[0].shape[-2:]),
-                    [decoder_seq_length, self.model_tester.hidden_size],
-                )
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        seq_len = getattr(self.model_tester, "seq_length", None)
-        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
-        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
-        decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length)
-        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = False
-            config.return_dict = True
-            model = model_class._from_config(config, attn_implementation="eager")
-            config = model.config
-
-            subsampled_encoder_seq_length = model._get_feat_extract_output_lengths(encoder_seq_length)
-            subsampled_encoder_key_length = model._get_feat_extract_output_lengths(encoder_key_length)
-
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-            # check that output_attentions also work using config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            model = model_class(config)
-
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-            self.assertListEqual(
-                list(attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, subsampled_encoder_seq_length, subsampled_encoder_key_length],
-            )
-            out_len = len(outputs)
-
-            correct_outlen = 5
-
-            # loss is at first position
-            if "labels" in inputs_dict:
-                correct_outlen += 1  # loss is added to beginning
-            if "past_key_values" in outputs:
-                correct_outlen += 1  # past_key_values have been returned
-
-            self.assertEqual(out_len, correct_outlen)
-
-            # decoder attentions
-            decoder_attentions = outputs.decoder_attentions
-            self.assertIsInstance(decoder_attentions, (list, tuple))
-            self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(decoder_attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
-            )
-
-            # cross attentions
-            cross_attentions = outputs.cross_attentions
-            self.assertIsInstance(cross_attentions, (list, tuple))
-            self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(cross_attentions[0].shape[-3:]),
-                [
-                    self.model_tester.num_attention_heads,
-                    decoder_seq_length,
-                    subsampled_encoder_key_length,
-                ],
-            )
-
-            # Check attention is always last and order is fine
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = True
-            model = model_class(config)
-
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            added_hidden_states = 2
-            self.assertEqual(out_len + added_hidden_states, len(outputs))
-
-            self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-
-            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(self_attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, subsampled_encoder_seq_length, subsampled_encoder_key_length],
-            )
-
-    def test_resize_token_embeddings(self):
-        # Overwritten method from parent; see `test_resize_embeddings_untied`
-        pass
-
-    def test_resize_tokens_embeddings(self):
-        # see `test_resize_embeddings_untied`
-        pass
-
-    def test_resize_embeddings_untied(self):
-        # TODO: copy test from PT. Not working at the moment because the test relies on `model.resize_token_embeddings`,
-        # whose TF implementation assumes the use of `TFWrappedEmbeddings`. But with a `TFWrappedEmbeddings` we can't
-        # load the weights from PT (also, it induces TF1 behavior, so we might want to rework how
-        # `model.resize_token_embeddings` operates).
-        pass
-
-    def test_generate_without_input_ids(self):
-        pass
-
-    # overwritten from parent -- the input is `input_features`, not `input_ids`
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = [
-                "input_features",
-                "attention_mask",
-                "decoder_input_ids",
-                "decoder_attention_mask",
-            ]
-            self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
-
-
-@require_tf
-@require_sentencepiece
-@require_tokenizers
-@slow
-class TFSpeech2TextModelIntegrationTests(unittest.TestCase):
-    @cached_property
-    def default_processor(self):
-        return Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
-
-    def _load_datasamples(self, num_samples):
-        from datasets import load_dataset
-
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
-
-        return [x["array"] for x in speech_samples]
-
-    def test_generation_librispeech(self):
-        model = TFSpeech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
-        processor = self.default_processor
-
-        input_speech = self._load_datasamples(1)
-
-        input_features = processor(input_speech, return_tensors="tf").input_features
-
-        generated_ids = model.generate(input_features)
-        generated_transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)
-
-        EXPECTED_TRANSCRIPTIONS = [
-            "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel"
-        ]
-        self.assertListEqual(generated_transcript, EXPECTED_TRANSCRIPTIONS)
-
-    def test_generation_librispeech_batched(self):
-        model = TFSpeech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
-        processor = self.default_processor
-
-        input_speech = self._load_datasamples(4)
-
-        inputs = processor(input_speech, return_tensors="tf", padding=True)
-        generated_ids = model.generate(inputs.input_features, attention_mask=inputs.attention_mask)
-        generated_transcripts = processor.batch_decode(generated_ids, skip_special_tokens=True)
-
-        EXPECTED_TRANSCRIPTIONS = [
-            "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel",
-            "nor is mister cultar's manner less interesting than his matter",
-            "he tells us that at this festive season of the year with christmas and roast beef looming before us"
-            " similes drawn from eating and its results occur most readily to the mind",
-            "he has grave doubts whether sir frederick leyton's work is really greek after all and can discover in it"
-            " but little of rocky ithaca",
-        ]
-        self.assertListEqual(generated_transcripts, EXPECTED_TRANSCRIPTIONS)
diff --git a/tests/models/swiftformer/test_modeling_tf_swiftformer.py b/tests/models/swiftformer/test_modeling_tf_swiftformer.py
deleted file mode 100644
index 161e5db753..0000000000
--- a/tests/models/swiftformer/test_modeling_tf_swiftformer.py
+++ /dev/null
@@ -1,270 +0,0 @@
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TensorFlow SwiftFormer model."""
-
-import inspect
-import unittest
-
-from transformers import SwiftFormerConfig
-from transformers.testing_utils import (
-    require_tf,
-    require_vision,
-    slow,
-)
-from transformers.utils import cached_property, is_tf_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import TFSwiftFormerForImageClassification, TFSwiftFormerModel
-    from transformers.modeling_tf_utils import keras
-
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import ViTImageProcessor
-
-
-class TFSwiftFormerModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=1,
-        num_channels=3,
-        is_training=True,
-        use_labels=True,
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        image_size=224,
-        num_labels=2,
-        layer_depths=[3, 3, 6, 4],
-        embed_dims=[48, 56, 112, 220],
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.num_channels = num_channels
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.num_labels = num_labels
-        self.image_size = image_size
-        self.layer_depths = layer_depths
-        self.embed_dims = embed_dims
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        labels = None
-        if self.use_labels:
-            labels = ids_tensor([self.batch_size], self.num_labels)
-
-        config = self.get_config()
-
-        return config, pixel_values, labels
-
-    def get_config(self):
-        return SwiftFormerConfig(
-            depths=self.layer_depths,
-            embed_dims=self.embed_dims,
-            mlp_ratio=4,
-            downsamples=[True, True, True, True],
-            hidden_act="gelu",
-            num_labels=self.num_labels,
-            down_patch_size=3,
-            down_stride=2,
-            down_pad=1,
-            drop_rate=0.0,
-            drop_path_rate=0.0,
-            use_layer_scale=True,
-            layer_scale_init_value=1e-5,
-        )
-
-    def create_and_check_model(self, config, pixel_values, labels):
-        model = TFSwiftFormerModel(config=config)
-        result = model(pixel_values)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.embed_dims[-1], 7, 7))
-
-    def create_and_check_for_image_classification(self, config, pixel_values, labels):
-        config.num_labels = self.num_labels
-        model = TFSwiftFormerForImageClassification(config)
-        result = model(pixel_values, labels=labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-        model = TFSwiftFormerForImageClassification(config)
-
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-        result = model(pixel_values)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def prepare_config_and_inputs_for_common(self):
-        (config, pixel_values, labels) = self.prepare_config_and_inputs()
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_tf
-class TFSwiftFormerModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    """
-    Here we also overwrite some of the tests of test_modeling_common.py, as SwiftFormer does not use input_ids, inputs_embeds,
-    attention_mask and seq_length.
-    """
-
-    all_model_classes = (TFSwiftFormerModel, TFSwiftFormerForImageClassification) if is_tf_available() else ()
-
-    pipeline_model_mapping = (
-        {"feature-extraction": TFSwiftFormerModel, "image-classification": TFSwiftFormerForImageClassification}
-        if is_tf_available()
-        else {}
-    )
-
-    fx_compatible = False
-    test_pruning = False
-    test_resize_embeddings = False
-    test_head_masking = False
-    has_attentions = False
-    test_onnx = False
-    from_pretrained_id = "MBZUAI/swiftformer-xs"
-
-    def setUp(self):
-        self.model_tester = TFSwiftFormerModelTester(self)
-        self.config_tester = ConfigTester(
-            self,
-            config_class=SwiftFormerConfig,
-            has_text_modality=False,
-            hidden_size=37,
-            num_attention_heads=12,
-            num_hidden_layers=12,
-        )
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    @unittest.skip(reason="TFSwiftFormer does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    def test_model_common_attributes(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, keras.layers.Dense))
-
-    # Copied from transformers.tests.models.deit.test_modeling_tf_deit.py
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model = TFSwiftFormerModel.from_pretrained(self.from_pretrained_id)
-        self.assertIsNotNone(model)
-
-    @unittest.skip(reason="TFSwiftFormer does not output attentions")
-    def test_attention_outputs(self):
-        pass
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            hidden_states = outputs.hidden_states
-
-            expected_num_stages = 8
-            self.assertEqual(len(hidden_states), expected_num_stages)
-
-            # SwiftFormer's feature maps are of shape (batch_size, embed_dims, height, width)
-            # with the width and height being successively divided by 2, after every 2 blocks
-            for i in range(len(hidden_states)):
-                self.assertEqual(
-                    hidden_states[i].shape,
-                    tf.TensorShape(
-                        [
-                            self.model_tester.batch_size,
-                            self.model_tester.embed_dims[i // 2],
-                            (self.model_tester.image_size // 4) // 2 ** (i // 2),
-                            (self.model_tester.image_size // 4) // 2 ** (i // 2),
-                        ]
-                    ),
-                )
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_tf
-@require_vision
-class TFSwiftFormerModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return ViTImageProcessor.from_pretrained("MBZUAI/swiftformer-xs") if is_vision_available() else None
-
-    @slow
-    def test_inference_image_classification_head(self):
-        model = TFSwiftFormerForImageClassification.from_pretrained("MBZUAI/swiftformer-xs")
-
-        feature_extractor = self.default_feature_extractor
-        image = prepare_img()
-        inputs = feature_extractor(images=image, return_tensors="tf")
-
-        # forward pass
-        outputs = model(**inputs)
-
-        # verify the logits
-        expected_shape = tf.TensorShape((1, 1000))
-        self.assertEqual(outputs.logits.shape, expected_shape)
-
-        expected_slice = tf.constant([[-2.1703e00, 2.1107e00, -2.0811e00]])
-        tf.debugging.assert_near(outputs.logits[0, :3], expected_slice, atol=1e-4)
diff --git a/tests/models/swin/test_modeling_tf_swin.py b/tests/models/swin/test_modeling_tf_swin.py
deleted file mode 100644
index b72369480d..0000000000
--- a/tests/models/swin/test_modeling_tf_swin.py
+++ /dev/null
@@ -1,405 +0,0 @@
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TF 2.0 Swin model."""
-
-from __future__ import annotations
-
-import inspect
-import unittest
-
-import numpy as np
-
-from transformers import SwinConfig
-from transformers.testing_utils import require_tf, require_vision, slow, to_2tuple
-from transformers.utils import cached_property, is_tf_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers.modeling_tf_utils import keras
-    from transformers.models.swin.modeling_tf_swin import (
-        TFSwinForImageClassification,
-        TFSwinForMaskedImageModeling,
-        TFSwinModel,
-    )
-
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import AutoImageProcessor
-
-
-class TFSwinModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        image_size=32,
-        patch_size=2,
-        num_channels=3,
-        embed_dim=16,
-        depths=[1, 2, 1],
-        num_heads=[2, 2, 4],
-        window_size=2,
-        mlp_ratio=2.0,
-        qkv_bias=True,
-        hidden_dropout_prob=0.0,
-        attention_probs_dropout_prob=0.0,
-        drop_path_rate=0.1,
-        hidden_act="gelu",
-        use_absolute_embeddings=False,
-        patch_norm=True,
-        initializer_range=0.02,
-        layer_norm_eps=1e-5,
-        is_training=True,
-        scope=None,
-        use_labels=True,
-        type_sequence_label_size=10,
-        encoder_stride=8,
-    ) -> None:
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.embed_dim = embed_dim
-        self.depths = depths
-        self.num_heads = num_heads
-        self.window_size = window_size
-        self.mlp_ratio = mlp_ratio
-        self.qkv_bias = qkv_bias
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.drop_path_rate = drop_path_rate
-        self.hidden_act = hidden_act
-        self.use_absolute_embeddings = use_absolute_embeddings
-        self.patch_norm = patch_norm
-        self.layer_norm_eps = layer_norm_eps
-        self.initializer_range = initializer_range
-        self.is_training = is_training
-        self.scope = scope
-        self.use_labels = use_labels
-        self.type_sequence_label_size = type_sequence_label_size
-        self.encoder_stride = encoder_stride
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        labels = None
-        if self.use_labels:
-            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-
-        config = self.get_config()
-
-        return config, pixel_values, labels
-
-    def get_config(self):
-        return SwinConfig(
-            image_size=self.image_size,
-            patch_size=self.patch_size,
-            num_channels=self.num_channels,
-            embed_dim=self.embed_dim,
-            depths=self.depths,
-            num_heads=self.num_heads,
-            window_size=self.window_size,
-            mlp_ratio=self.mlp_ratio,
-            qkv_bias=self.qkv_bias,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            drop_path_rate=self.drop_path_rate,
-            hidden_act=self.hidden_act,
-            use_absolute_embeddings=self.use_absolute_embeddings,
-            path_norm=self.patch_norm,
-            layer_norm_eps=self.layer_norm_eps,
-            initializer_range=self.initializer_range,
-            encoder_stride=self.encoder_stride,
-        )
-
-    def create_and_check_model(self, config, pixel_values, labels):
-        model = TFSwinModel(config=config)
-        result = model(pixel_values)
-
-        expected_seq_len = ((config.image_size // config.patch_size) ** 2) // (4 ** (len(config.depths) - 1))
-        expected_dim = int(config.embed_dim * 2 ** (len(config.depths) - 1))
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, expected_seq_len, expected_dim))
-
-    def create_and_check_for_masked_image_modeling(self, config, pixel_values, labels):
-        model = TFSwinForMaskedImageModeling(config=config)
-        result = model(pixel_values)
-        self.parent.assertEqual(
-            result.logits.shape, (self.batch_size, self.num_channels, self.image_size, self.image_size)
-        )
-
-        # test greyscale images
-        config.num_channels = 1
-        model = TFSwinForMaskedImageModeling(config)
-
-        pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
-        result = model(pixel_values)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, 1, self.image_size, self.image_size))
-
-    def create_and_check_for_image_classification(self, config, pixel_values, labels):
-        config.num_labels = self.type_sequence_label_size
-        model = TFSwinForImageClassification(config)
-        result = model(pixel_values, labels=labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-        # test greyscale images
-        config.num_channels = 1
-        model = TFSwinForImageClassification(config)
-        pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
-        result = model(pixel_values)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values, labels = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_tf
-class TFSwinModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFSwinModel,
-            TFSwinForImageClassification,
-            TFSwinForMaskedImageModeling,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {"feature-extraction": TFSwinModel, "image-classification": TFSwinForImageClassification}
-        if is_tf_available()
-        else {}
-    )
-
-    test_pruning = False
-    test_resize_embeddings = False
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFSwinModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=SwinConfig, embed_dim=37)
-
-    def test_config(self):
-        self.create_and_test_config_common_properties()
-        self.config_tester.create_and_test_config_to_json_string()
-        self.config_tester.create_and_test_config_to_json_file()
-        self.config_tester.create_and_test_config_from_and_save_pretrained()
-        self.config_tester.create_and_test_config_with_num_labels()
-        self.config_tester.check_config_can_be_init_without_params()
-        self.config_tester.check_config_arguments_init()
-
-    def create_and_test_config_common_properties(self):
-        return
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_masked_image_modeling(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs)
-
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
-    @unittest.skip(reason="Swin does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    def test_model_common_attributes(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), keras.layers.Layer)
-            x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, keras.layers.Dense))
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = False
-            config.return_dict = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.attentions
-            expected_num_attentions = len(self.model_tester.depths)
-            self.assertEqual(len(attentions), expected_num_attentions)
-
-            # check that output_attentions also work using config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            window_size_squared = config.window_size**2
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.attentions
-            self.assertEqual(len(attentions), expected_num_attentions)
-
-            self.assertListEqual(
-                list(attentions[0].shape[-3:]),
-                [self.model_tester.num_heads[0], window_size_squared, window_size_squared],
-            )
-            out_len = len(outputs)
-
-            # Check attention is always last and order is fine
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            if hasattr(self.model_tester, "num_hidden_states_types"):
-                added_hidden_states = self.model_tester.num_hidden_states_types
-            else:
-                # also another +1 for reshaped_hidden_states
-                added_hidden_states = 2
-            self.assertEqual(out_len + added_hidden_states, len(outputs))
-
-            self_attentions = outputs.attentions
-
-            self.assertEqual(len(self_attentions), expected_num_attentions)
-
-            self.assertListEqual(
-                list(self_attentions[0].shape[-3:]),
-                [self.model_tester.num_heads[0], window_size_squared, window_size_squared],
-            )
-
-    def check_hidden_states_output(self, inputs_dict, config, model_class, image_size):
-        model = model_class(config)
-        outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-        hidden_states = outputs.hidden_states
-
-        expected_num_layers = getattr(
-            self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1
-        )
-        self.assertEqual(len(hidden_states), expected_num_layers)
-
-        # Swin has a different seq_length
-        patch_size = to_2tuple(config.patch_size)
-
-        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
-
-        self.assertListEqual(
-            list(hidden_states[0].shape[-2:]),
-            [num_patches, self.model_tester.embed_dim],
-        )
-
-        reshaped_hidden_states = outputs.reshaped_hidden_states
-        self.assertEqual(len(reshaped_hidden_states), expected_num_layers)
-
-        batch_size, num_channels, height, width = reshaped_hidden_states[0].shape
-
-        reshaped_hidden_states = tf.reshape(reshaped_hidden_states[0], (batch_size, num_channels, height * width))
-        reshaped_hidden_states = tf.transpose(reshaped_hidden_states, (0, 2, 1))
-
-        self.assertListEqual(
-            list(reshaped_hidden_states.shape[-2:]),
-            [num_patches, self.model_tester.embed_dim],
-        )
-
-    def test_hidden_states_output(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        image_size = to_2tuple(self.model_tester.image_size)
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            self.check_hidden_states_output(inputs_dict, config, model_class, image_size)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            self.check_hidden_states_output(inputs_dict, config, model_class, image_size)
-
-    def test_inputs_requiring_padding(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.patch_size = 3
-
-        image_size = to_2tuple(self.model_tester.image_size)
-        patch_size = to_2tuple(config.patch_size)
-
-        padded_height = image_size[0] + patch_size[0] - (image_size[0] % patch_size[0])
-        padded_width = image_size[1] + patch_size[1] - (image_size[1] % patch_size[1])
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width))
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-            self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width))
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "microsoft/swin-tiny-patch4-window7-224"
-        model = TFSwinModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-@require_vision
-@require_tf
-class TFSwinModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return (
-            AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
-            if is_vision_available()
-            else None
-        )
-
-    @slow
-    def test_inference_image_classification_head(self):
-        model = TFSwinForImageClassification.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
-        image_processor = self.default_image_processor
-
-        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-        inputs = image_processor(images=image, return_tensors="tf")
-
-        # forward pass
-        outputs = model(inputs)
-
-        # verify the logits
-        expected_shape = tf.TensorShape((1, 1000))
-        self.assertEqual(outputs.logits.shape, expected_shape)
-        expected_slice = tf.constant([-0.0948, -0.6454, -0.0921])
-        self.assertTrue(np.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
diff --git a/tests/models/t5/test_modeling_flax_t5.py b/tests/models/t5/test_modeling_flax_t5.py
deleted file mode 100644
index d0eb9e0f50..0000000000
--- a/tests/models/t5/test_modeling_flax_t5.py
+++ /dev/null
@@ -1,922 +0,0 @@
-# Copyright 2021 Google T5 Authors and HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import tempfile
-import unittest
-
-import numpy as np
-
-from transformers import is_flax_available
-from transformers.testing_utils import require_flax, require_sentencepiece, require_tokenizers, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor
-
-
-if is_flax_available():
-    import os
-
-    # The slow tests are often failing with OOM error on GPU
-    # This makes JAX allocate exactly what is needed on demand, and deallocate memory that is no longer needed
-    # but will be slower as stated here https://jax.readthedocs.io/en/latest/gpu_memory_allocation.html
-    os.environ["XLA_PYTHON_CLIENT_ALLOCATOR"] = "platform"
-
-    import jax
-    import jax.numpy as jnp
-    import optax
-    from flax.core.frozen_dict import unfreeze
-    from flax.training.common_utils import onehot
-    from flax.traverse_util import flatten_dict
-
-    from transformers import FLAX_MODEL_MAPPING, ByT5Tokenizer, T5Config, T5Tokenizer
-    from transformers.models.t5.modeling_flax_t5 import (
-        FlaxT5EncoderModel,
-        FlaxT5ForConditionalGeneration,
-        FlaxT5Model,
-        shift_tokens_right,
-    )
-
-
-class FlaxT5ModelTester:
-    def __init__(
-        self,
-        parent,
-        vocab_size=99,
-        batch_size=13,
-        encoder_seq_length=7,
-        decoder_seq_length=9,
-        # For common tests
-        is_training=True,
-        use_attention_mask=True,
-        use_labels=True,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        d_ff=37,
-        relative_attention_num_buckets=8,
-        dropout_rate=0.1,
-        initializer_factor=0.002,
-        eos_token_id=1,
-        pad_token_id=0,
-        decoder_start_token_id=0,
-        scope=None,
-        decoder_layers=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.encoder_seq_length = encoder_seq_length
-        self.decoder_seq_length = decoder_seq_length
-        # For common tests
-        self.seq_length = self.decoder_seq_length
-        self.is_training = is_training
-        self.use_attention_mask = use_attention_mask
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.d_ff = d_ff
-        self.relative_attention_num_buckets = relative_attention_num_buckets
-        self.dropout_rate = dropout_rate
-        self.initializer_factor = initializer_factor
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.decoder_start_token_id = decoder_start_token_id
-        self.scope = None
-        self.decoder_layers = decoder_layers
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
-        decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
-
-        attention_mask = None
-        decoder_attention_mask = None
-        if self.use_attention_mask:
-            attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
-            decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
-
-        config = T5Config(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            d_ff=self.d_ff,
-            d_kv=self.hidden_size // self.num_attention_heads,
-            num_layers=self.num_hidden_layers,
-            num_decoder_layers=self.decoder_layers,
-            num_heads=self.num_attention_heads,
-            relative_attention_num_buckets=self.relative_attention_num_buckets,
-            dropout_rate=self.dropout_rate,
-            initializer_factor=self.initializer_factor,
-            eos_token_id=self.eos_token_id,
-            bos_token_id=self.pad_token_id,
-            pad_token_id=self.pad_token_id,
-            decoder_start_token_id=self.decoder_start_token_id,
-        )
-
-        return (
-            config,
-            input_ids,
-            decoder_input_ids,
-            attention_mask,
-            decoder_attention_mask,
-        )
-
-    def create_and_check_model(
-        self,
-        config,
-        input_ids,
-        decoder_input_ids,
-        attention_mask,
-        decoder_attention_mask,
-    ):
-        model = FlaxT5Model(config=config)
-        result = model(
-            input_ids=input_ids,
-            decoder_input_ids=decoder_input_ids,
-            attention_mask=attention_mask,
-            decoder_attention_mask=decoder_attention_mask,
-        )
-        result = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
-        decoder_output = result.last_hidden_state
-        encoder_output = result.encoder_last_hidden_state
-
-        self.parent.assertEqual(encoder_output.shape, (self.batch_size, self.encoder_seq_length, self.hidden_size))
-        self.parent.assertEqual(decoder_output.shape, (self.batch_size, self.decoder_seq_length, self.hidden_size))
-
-    def check_use_cache_forward_with_attn_mask(
-        self,
-        model_class_name,
-        config,
-        input_ids,
-        decoder_input_ids,
-        attention_mask,
-        decoder_attention_mask,
-    ):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        encoder_outputs = model.encode(input_ids)
-
-        # prevent fully zero'd out attention mask
-        decoder_attention_mask = jnp.ones_like(decoder_attention_mask)
-
-        decoder_attention_mask_cache = jnp.concatenate(
-            [
-                decoder_attention_mask,
-                jnp.zeros((decoder_attention_mask.shape[0], max_decoder_length - decoder_attention_mask.shape[1])),
-            ],
-            axis=-1,
-        )
-
-        past_key_values = model.init_cache(decoder_input_ids.shape[0], max_decoder_length, encoder_outputs)
-
-        outputs_cache = model.decode(
-            decoder_input_ids[:, :-1],
-            encoder_outputs,
-            decoder_attention_mask=decoder_attention_mask_cache,
-            past_key_values=past_key_values,
-        )
-        outputs_cache_next = model.decode(
-            decoder_input_ids[:, -1:],
-            encoder_outputs,
-            past_key_values=outputs_cache.past_key_values,
-            decoder_attention_mask=decoder_attention_mask_cache,
-        )
-
-        outputs = model.decode(decoder_input_ids, encoder_outputs, decoder_attention_mask=decoder_attention_mask)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            decoder_input_ids,
-            attention_mask,
-            decoder_attention_mask,
-        ) = config_and_inputs
-
-        inputs_dict = {
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-            "decoder_input_ids": decoder_input_ids,
-            "decoder_attention_mask": decoder_attention_mask,
-        }
-        return config, inputs_dict
-
-
-@require_flax
-class FlaxT5ModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (FlaxT5Model, FlaxT5ForConditionalGeneration) if is_flax_available() else ()
-    is_encoder_decoder = True
-
-    def setUp(self):
-        self.model_tester = FlaxT5ModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_model_v1_1(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        # check that gated gelu feed forward and different word embeddings work
-        config = config_and_inputs[0]
-        config.tie_word_embeddings = False
-        config.feed_forward_proj = "gated-gelu"
-        self.model_tester.create_and_check_model(config, *config_and_inputs[1:])
-
-    def test_use_cache_forward_with_attn_mask(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        for model_class in self.all_model_classes:
-            self.model_tester.check_use_cache_forward_with_attn_mask(model_class, *config_and_inputs)
-
-    def test_encode(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                model = model_class(config)
-
-                @jax.jit
-                def encode_jitted(input_ids, attention_mask=None, **kwargs):
-                    return model.encode(input_ids=input_ids, attention_mask=attention_mask)
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = encode_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = encode_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    def test_decode(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                model = model_class(config)
-                encoder_outputs = model.encode(inputs_dict["input_ids"], inputs_dict["attention_mask"])
-
-                prepared_inputs_dict = {
-                    "decoder_input_ids": inputs_dict["decoder_input_ids"],
-                    "decoder_attention_mask": inputs_dict["decoder_attention_mask"],
-                    "encoder_outputs": encoder_outputs,
-                }
-
-                @jax.jit
-                def decode_jitted(decoder_input_ids, decoder_attention_mask, encoder_outputs):
-                    return model.decode(
-                        decoder_input_ids=decoder_input_ids,
-                        decoder_attention_mask=decoder_attention_mask,
-                        encoder_outputs=encoder_outputs,
-                    )
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = decode_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = decode_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    def test_shift_right(self):
-        decoder_start_token_id = 0
-        pad_token_id = 1
-        labels = np.arange(2, 102).reshape(5, 20)
-        labels[:2, 15:] = -100
-
-        decoder_input_ids = shift_tokens_right(labels, pad_token_id, decoder_start_token_id)
-        np_decoder_input_ids = np.array(decoder_input_ids)
-
-        padded_slice = np_decoder_input_ids[:2, (15 + 1) :]
-        self.assertTrue((padded_slice == 1).all())
-
-        not_padded_slice = np_decoder_input_ids[2:, 1:]
-        rolled_labels = np.roll(labels[2:], 1)[:, 1:]
-        self.assertTrue((not_padded_slice == rolled_labels).all())
-        self.assertTrue((np_decoder_input_ids[:, 0] == 0).all())
-
-    # overwrite since special base model prefix is used
-    def test_save_load_from_base(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = FLAX_MODEL_MAPPING[config.__class__]
-
-        for model_class in self.all_model_classes:
-            if model_class == base_class:
-                continue
-
-            model = base_class(config)
-            base_params = flatten_dict(unfreeze(model.params))
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                head_model = model_class.from_pretrained(tmpdirname)
-
-                base_param_from_head = flatten_dict(unfreeze(head_model.params))
-
-                for key in base_param_from_head.keys():
-                    max_diff = (base_params[key] - base_param_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-    # overwrite since special base model prefix is used
-    def test_save_load_to_base(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = FLAX_MODEL_MAPPING[config.__class__]
-
-        for model_class in self.all_model_classes:
-            if model_class == base_class:
-                continue
-
-            model = model_class(config)
-            base_params_from_head = flatten_dict(unfreeze(model.params))
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                base_model = base_class.from_pretrained(tmpdirname)
-
-                base_params = flatten_dict(unfreeze(base_model.params))
-
-                for key in base_params_from_head.keys():
-                    max_diff = (base_params[key] - base_params_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-
-class FlaxT5EncoderOnlyModelTester:
-    def __init__(
-        self,
-        parent,
-        vocab_size=99,
-        batch_size=13,
-        encoder_seq_length=7,
-        # For common tests
-        is_training=True,
-        use_attention_mask=True,
-        use_labels=True,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        d_ff=37,
-        relative_attention_num_buckets=8,
-        dropout_rate=0.1,
-        initializer_factor=0.002,
-        eos_token_id=1,
-        pad_token_id=0,
-        decoder_start_token_id=0,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.encoder_seq_length = encoder_seq_length
-        # For common tests
-        self.seq_length = self.encoder_seq_length
-        self.is_training = is_training
-        self.use_attention_mask = use_attention_mask
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.d_ff = d_ff
-        self.relative_attention_num_buckets = relative_attention_num_buckets
-        self.dropout_rate = dropout_rate
-        self.initializer_factor = initializer_factor
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.decoder_start_token_id = decoder_start_token_id
-        self.scope = None
-        self.decoder_layers = 0
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
-
-        attention_mask = None
-        if self.use_attention_mask:
-            attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
-
-        config = T5Config(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            d_ff=self.d_ff,
-            d_kv=self.hidden_size // self.num_attention_heads,
-            num_layers=self.num_hidden_layers,
-            num_decoder_layers=self.decoder_layers,
-            num_heads=self.num_attention_heads,
-            relative_attention_num_buckets=self.relative_attention_num_buckets,
-            dropout_rate=self.dropout_rate,
-            initializer_factor=self.initializer_factor,
-            eos_token_id=self.eos_token_id,
-            bos_token_id=self.pad_token_id,
-            pad_token_id=self.pad_token_id,
-            decoder_start_token_id=self.decoder_start_token_id,
-            is_encoder_decoder=False,
-        )
-
-        return (
-            config,
-            input_ids,
-            attention_mask,
-        )
-
-    def create_and_check_model(
-        self,
-        config,
-        input_ids,
-        attention_mask,
-    ):
-        model = FlaxT5EncoderModel(config=config)
-        result = model(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-        )
-        result = model(input_ids=input_ids)
-        encoder_output = result.last_hidden_state
-
-        self.parent.assertEqual(encoder_output.shape, (self.batch_size, self.encoder_seq_length, self.hidden_size))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            attention_mask,
-        ) = config_and_inputs
-
-        inputs_dict = {
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-        }
-        return config, inputs_dict
-
-
-@require_flax
-class FlaxT5EncoderOnlyModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (FlaxT5EncoderModel,) if is_flax_available() else ()
-    is_encoder_decoder = False
-
-    def setUp(self):
-        self.model_tester = FlaxT5EncoderOnlyModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_model_v1_1(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        # check that gated gelu feed forward and different word embeddings work
-        config = config_and_inputs[0]
-        config.tie_word_embeddings = False
-        config.feed_forward_proj = "gated-gelu"
-        self.model_tester.create_and_check_model(config, *config_and_inputs[1:])
-
-    def test_encode(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                model = model_class(config)
-
-                @jax.jit
-                def encode_jitted(input_ids, attention_mask=None, **kwargs):
-                    return model(input_ids=input_ids, attention_mask=attention_mask)
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = encode_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = encode_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    # overwrite since special base model prefix is used
-    def test_save_load_from_base(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = FLAX_MODEL_MAPPING[config.__class__]
-
-        for model_class in self.all_model_classes:
-            if model_class == base_class:
-                continue
-
-            model = base_class(config)
-            base_params = flatten_dict(unfreeze(model.params))
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                head_model = model_class.from_pretrained(tmpdirname)
-
-                base_param_from_head = flatten_dict(unfreeze(head_model.params))
-
-                for key in base_param_from_head.keys():
-                    max_diff = (base_params[key] - base_param_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-    # overwrite since special base model prefix is used
-    def test_save_load_to_base(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = FLAX_MODEL_MAPPING[config.__class__]
-
-        for model_class in self.all_model_classes:
-            if model_class == base_class:
-                continue
-
-            model = model_class(config)
-            base_params_from_head = flatten_dict(unfreeze(model.params))
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                base_model = base_class.from_pretrained(tmpdirname)
-
-                base_params = flatten_dict(unfreeze(base_model.params))
-
-                for key in base_params_from_head.keys():
-                    max_diff = (base_params[key] - base_params_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-
-@require_sentencepiece
-@require_tokenizers
-@require_flax
-class FlaxT5ModelIntegrationTests(unittest.TestCase):
-    @slow
-    def test_small_integration_test(self):
-        """
-        For comparison run:
-        >>> import t5  # pip install t5==0.7.1
-        >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary
-
-        >>> path_to_mtf_small_t5_checkpoint = '<fill_in>'
-        >>> path_to_mtf_small_spm_model_path = '<fill_in>'
-        >>> t5_model = t5.models.MtfModel(model_dir=path_to_mtf_small_t5_checkpoint, batch_size=1, tpu=None)
-        >>> vocab = SentencePieceVocabulary(path_to_mtf_small_spm_model_path, extra_ids=100)
-        >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab)
-        """
-
-        model = FlaxT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
-        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
-
-        input_ids = tokenizer("Hello there", return_tensors="np").input_ids
-        labels = tokenizer("Hi I am", return_tensors="np").input_ids
-
-        decoder_input_ids = shift_tokens_right(labels, model.config.pad_token_id, model.config.decoder_start_token_id)
-
-        logits = model(input_ids, decoder_input_ids=decoder_input_ids).logits
-
-        loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])).mean()
-        mtf_score = -(labels.shape[-1] * loss.item())
-
-        EXPECTED_SCORE = -19.0845
-        self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 1e-4)
-
-    @slow
-    def test_small_v1_1_integration_test(self):
-        """
-        For comparison run:
-        >>> import t5  # pip install t5==0.7.1
-        >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary
-
-        >>> path_to_mtf_small_t5_v1_1_checkpoint = '<fill_in>'
-        >>> path_to_mtf_small_spm_model_path = '<fill_in>'
-        >>> t5_model = t5.models.MtfModel(model_dir=path_to_mtf_small_t5_v1_1_checkpoint, batch_size=1, tpu=None)
-        >>> vocab = SentencePieceVocabulary(path_to_mtf_small_spm_model_path, extra_ids=100)
-        >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab)
-        """
-
-        model = FlaxT5ForConditionalGeneration.from_pretrained("google/t5-v1_1-small")
-        tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-small")
-
-        input_ids = tokenizer("Hello there", return_tensors="np").input_ids
-        labels = tokenizer("Hi I am", return_tensors="np").input_ids
-
-        decoder_input_ids = shift_tokens_right(labels, model.config.pad_token_id, model.config.decoder_start_token_id)
-
-        logits = model(input_ids, decoder_input_ids=decoder_input_ids).logits
-        loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])).mean()
-
-        mtf_score = -(labels.shape[-1] * loss.item())
-
-        EXPECTED_SCORE = -59.0293
-        self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 1e-4)
-
-    @slow
-    def test_small_byt5_integration_test(self):
-        """
-        For comparison run:
-        >>> import t5  # pip install t5==0.9.1
-
-        >>> path_to_byt5_small_checkpoint = '<fill_in>'
-        >>> t5_model = t5.models.MtfModel(model_dir=path_to_tf_checkpoint, batch_size=1, tpu=None)
-        >>> vocab = t5.data.ByteVocabulary()
-        >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab)
-        """
-
-        model = FlaxT5ForConditionalGeneration.from_pretrained("google/byt5-small")
-        tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
-
-        input_ids = tokenizer("Hello there", return_tensors="np").input_ids
-        labels = tokenizer("Hi I am", return_tensors="np").input_ids
-
-        decoder_input_ids = shift_tokens_right(labels, model.config.pad_token_id, model.config.decoder_start_token_id)
-
-        logits = model(input_ids, decoder_input_ids=decoder_input_ids).logits
-        loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])).mean()
-
-        mtf_score = -(labels.shape[-1] * loss.item())
-
-        EXPECTED_SCORE = -60.7397
-        self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 1e-4)
-
-    @slow
-    def test_small_generation(self):
-        model = FlaxT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
-        model.config.max_length = 8
-        model.config.num_beams = 1
-        model.config.do_sample = False
-        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
-
-        input_ids = tokenizer("summarize: Hello there", return_tensors="np").input_ids
-
-        sequences = model.generate(input_ids).sequences
-
-        output_str = tokenizer.batch_decode(sequences, skip_special_tokens=True)[0]
-        self.assertTrue(output_str == "Hello there!")
-
-    @slow
-    def test_small_generation_bfloat16(self):
-        model = FlaxT5ForConditionalGeneration.from_pretrained("google-t5/t5-small", dtype=jnp.bfloat16)
-        model.config.max_length = 8
-        model.config.num_beams = 1
-        model.config.do_sample = False
-        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
-
-        input_ids = tokenizer("summarize: Hello there", return_tensors="np").input_ids
-
-        sequences = model.generate(input_ids).sequences
-
-        output_str = tokenizer.batch_decode(sequences, skip_special_tokens=True)[0]
-        self.assertTrue(output_str == "Hello there!")
-
-    @slow
-    def test_summarization(self):
-        model = FlaxT5ForConditionalGeneration.from_pretrained("google-t5/t5-base")
-        tok = T5Tokenizer.from_pretrained("google-t5/t5-base")
-
-        FRANCE_ARTICLE = (  # @noqa
-            "Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwings"
-            " Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane."
-            ' Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation."'
-            ' He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s'
-            " comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video"
-            " showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French"
-            " Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a"
-            " phone at the wreckage site. The two publications described the supposed video, but did not post it on"
-            " their websites. The publications said that they watched the video, which was found by a source close to"
-            " the investigation. \"One can hear cries of 'My God' in several languages,\" Paris Match reported."
-            ' "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the'
-            " cockpit door with a heavy object.  Towards the end, after a heavy shake, stronger than the others, the"
-            ' screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt,'
-            " editor-in-chief of Bild online. An official with France's accident investigation agency, the BEA, said"
-            " the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman"
-            " in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the"
-            ' reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said,'
-            ' but that they "hadn\'t been exploited yet." Menichini said he believed the cell phones would need to be'
-            " sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by"
-            " specialized technicians working hand-in-hand with investigators. But none of the cell phones found so"
-            " far have been sent to the institute, Menichini said. Asked whether staff involved in the search could"
-            ' have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin'
-            ' Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match'
-            ' are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered'
-            ' cell phones from the crash site after Bild and Paris Match published their reports. "That is something'
-            " we did not know before. ... Overall we can say many things of the investigation weren't revealed by the"
-            ' investigation at the beginning," he said. What was mental state of Germanwings co-pilot? German airline'
-            " Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the"
-            " controls of Germanwings Flight 9525, which he's accused of deliberately crashing last week in the"
-            ' French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of'
-            ' severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school'
-            " discovered in an internal investigation, Lufthansa said, included medical documents he submitted in"
-            " connection with resuming his flight training. The announcement indicates that Lufthansa, the parent"
-            " company of Germanwings, knew of Lubitz's battle with depression, allowed him to continue training and"
-            " ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100%"
-            ' fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was'
-            " sharing the information and documents -- including training and medical records -- with public"
-            " prosecutors. Spohr traveled to the crash site Wednesday, where recovery teams have been working for the"
-            " past week to recover human remains and plane debris scattered across a steep mountainside. He saw the"
-            " crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash"
-            " site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late"
-            " Tuesday that no visible human remains were left at the site but recovery teams would keep searching."
-            " French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all"
-            " the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested."
-            " In the meantime, the recovery of the victims' personal belongings will start Wednesday, Menichini said."
-            " Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew"
-            " on board. Check out the latest from our correspondents . The details about Lubitz's correspondence with"
-            " the flight school during his training were among several developments as investigators continued to"
-            " delve into what caused the crash and Lubitz's possible motive for downing the jet. A Lufthansa"
-            " spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his"
-            ' examinations and "held all the licenses required." Earlier, a spokesman for the prosecutor\'s office in'
-            " Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at"
-            " some point before his aviation career and underwent psychotherapy before he got his pilot's license."
-            " Kumpa emphasized there's no evidence suggesting Lubitz was suicidal or acting aggressively before the"
-            " crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to"
-            " lose his pilot's license, a European government official briefed on the investigation told CNN on"
-            ' Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being'
-            " considered. Another source, a law enforcement official briefed on the investigation, also told CNN that"
-            " authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would"
-            " not be allowed to fly because of his medical problems. Lubitz's girlfriend told investigators he had"
-            " seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded"
-            " he had psychological issues, the European government official said. But no matter what details emerge"
-            " about his previous mental health struggles, there's more to the story, said Brian Russell, a forensic"
-            ' psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact'
-            " that maybe they weren't going to keep doing their job and they're upset about that and so they're"
-            ' suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to'
-            " also take that rage and turn it outward on 149 other people who had nothing to do with the person's"
-            ' problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight'
-            " 9525? CNN's Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura"
-            " Smith-Spark wrote from London. CNN's Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine"
-            " Amiel and Anna-Maja Rappard contributed to this report."
-        )
-        SHORTER_ARTICLE = (
-            "(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on"
-            " Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The"
-            " formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based."
-            " The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its"
-            ' jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East'
-            ' Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the'
-            " situation in Palestinian territories, paving the way for possible war crimes investigations against"
-            " Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and"
-            " the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the"
-            " body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday's ceremony, said it was a"
-            ' move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the'
-            ' world is also a step closer to ending a long era of impunity and injustice," he said, according to an'
-            ' ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge'
-            " Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the"
-            ' Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine'
-            " acquires all the rights as well as responsibilities that come with being a State Party to the Statute."
-            ' These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights'
-            ' Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should'
-            " immediately end their pressure, and countries that support universal acceptance of the court's treaty"
-            ' should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the'
-            " group. \"What's objectionable is the attempts to undermine international justice, not Palestine's"
-            ' decision to join a treaty to which over 100 countries around the world are members." In January, when'
-            " the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an"
-            ' outrage, saying the court was overstepping its boundaries. The United States also said it "strongly"'
-            " disagreed with the court's decision. \"As we have said repeatedly, we do not believe that Palestine is a"
-            ' state and therefore we do not believe that it is eligible to join the ICC," the State Department said in'
-            ' a statement. It urged the warring sides to resolve their differences through direct negotiations. "We'
-            ' will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace,"'
-            " it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the"
-            ' territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the'
-            " court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou"
-            ' Bensouda said her office would "conduct its analysis in full independence and impartiality." The war'
-            " between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry"
-            " will include alleged war crimes committed since June. The International Criminal Court was set up in"
-            " 2002 to prosecute genocide, crimes against humanity and war crimes. CNN's Vasco Cotovio, Kareem Khadder"
-            " and Faith Karimi contributed to this report."
-        )
-        IRAN_ARTICLE = (
-            "(CNN)The United States and its negotiating partners reached a very strong framework agreement with Iran"
-            " in Lausanne, Switzerland, on Thursday that limits Iran's nuclear program in such a way as to effectively"
-            " block it from building a nuclear weapon. Expect pushback anyway, if the recent past is any harbinger."
-            " Just last month, in an attempt to head off such an agreement, House Speaker John Boehner invited Israeli"
-            " Prime Minister Benjamin Netanyahu to preemptively blast it before Congress, and 47 senators sent a"
-            " letter to the Iranian leadership warning them away from a deal. The debate that has already begun since"
-            " the announcement of the new framework will likely result in more heat than light. It will not be helped"
-            " by the gathering swirl of dubious assumptions and doubtful assertions. Let us address some of these: ."
-            " The most misleading assertion, despite universal rejection by experts, is that the negotiations'"
-            " objective at the outset was the total elimination of any nuclear program in Iran. That is the position"
-            " of Netanyahu and his acolytes in the U.S. Congress. But that is not and never was the objective. If it"
-            " had been, there would have been no Iranian team at the negotiating table. Rather, the objective has"
-            " always been to structure an agreement or series of agreements so that Iran could not covertly develop a"
-            " nuclear arsenal before the United States and its allies could respond. The new framework has exceeded"
-            " expectations in achieving that goal. It would reduce Iran's low-enriched uranium stockpile, cut by"
-            " two-thirds its number of installed centrifuges and implement a rigorous inspection regime. Another"
-            " dubious assumption of opponents is that the Iranian nuclear program is a covert weapons program. Despite"
-            " sharp accusations by some in the United States and its allies, Iran denies having such a program, and"
-            " U.S. intelligence contends that Iran has not yet made the decision to build a nuclear weapon. Iran's"
-            " continued cooperation with International Atomic Energy Agency inspections is further evidence on this"
-            " point, and we'll know even more about Iran's program in the coming months and years because of the deal."
-            " In fact, the inspections provisions that are part of this agreement are designed to protect against any"
-            " covert action by the Iranians. What's more, the rhetoric of some members of Congress has implied that"
-            " the negotiations have been between only the United States and Iran (i.e., the 47 senators' letter"
-            " warning that a deal might be killed by Congress or a future president). This of course is not the case."
-            " The talks were between Iran and the five permanent members of the U.N. Security Council (United States,"
-            " United Kingdom, France, China and Russia) plus Germany, dubbed the P5+1. While the United States has"
-            " played a leading role in the effort, it negotiated the terms alongside its partners. If the agreement"
-            " reached by the P5+1 is rejected by Congress, it could result in an unraveling of the sanctions on Iran"
-            " and threaten NATO cohesion in other areas. Another questionable assertion is that this agreement"
-            " contains a sunset clause, after which Iran will be free to do as it pleases. Again, this is not the"
-            " case. Some of the restrictions on Iran's nuclear activities, such as uranium enrichment, will be eased"
-            " or eliminated over time, as long as 15 years. But most importantly, the framework agreement includes"
-            " Iran's ratification of the Additional Protocol, which allows IAEA inspectors expanded access to nuclear"
-            " sites both declared and nondeclared. This provision will be permanent. It does not sunset. Thus, going"
-            " forward, if Iran decides to enrich uranium to weapons-grade levels, monitors will be able to detect such"
-            " a move in a matter of days and alert the U.N. Security Council. Many in Congress have said that the"
-            ' agreement should be a formal treaty requiring the Senate to "advise and consent." But the issue is not'
-            " suited for a treaty. Treaties impose equivalent obligations on all signatories. For example, the New"
-            " START treaty limits Russia and the United States to 1,550 deployed strategic warheads. But any agreement"
-            " with Iran will not be so balanced.  The restrictions and obligations in the final framework agreement"
-            " will be imposed almost exclusively on Iran. The P5+1 are obligated only to ease and eventually remove"
-            " most but not all economic sanctions, which were imposed as leverage to gain this final deal. Finally"
-            " some insist that any agreement must address Iranian missile programs, human rights violations or support"
-            " for Hamas or Hezbollah.  As important as these issues are, and they must indeed be addressed, they are"
-            " unrelated to the most important aim of a nuclear deal: preventing a nuclear Iran.  To include them in"
-            " the negotiations would be a poison pill. This agreement should be judged on its merits and on how it"
-            " affects the security of our negotiating partners and allies, including Israel. Those judgments should be"
-            " fact-based, not based on questionable assertions or dubious assumptions."
-        )
-        ARTICLE_SUBWAY = (
-            "New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A"
-            " year later, she got married again in Westchester County, but to a different man and without divorcing"
-            " her first husband.  Only 18 days after that marriage, she got hitched yet again. Then, Barrientos"
-            ' declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married'
-            " once more, this time in the Bronx. In an application for a marriage license, she stated it was her"
-            ' "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false'
-            ' instrument for filing in the first degree," referring to her false statements on the 2010 marriage'
-            " license application, according to court documents. Prosecutors said the marriages were part of an"
-            " immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to"
-            " her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was"
-            " arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New"
-            " York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total,"
-            " Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.  All"
-            " occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be"
-            " married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors"
-            " said the immigration scam involved some of her husbands, who filed for permanent residence status"
-            " shortly after the marriages.  Any divorces happened only after such filings were approved. It was"
-            " unclear whether any of the men will be prosecuted. The case was referred to the Bronx District"
-            " Attorney's Office by Immigration and Customs Enforcement and the Department of Homeland Security's"
-            ' Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt,'
-            " Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his"
-            " native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces"
-            " up to four years in prison.  Her next court appearance is scheduled for May 18."
-        )
-
-        expected_summaries = [
-            'prosecutor: "so far no videos were used in the crash investigation" two magazines claim to have found a'
-            " cell phone video of the final seconds . \"one can hear cries of 'My God' in several languages,\" one"
-            " magazine says . all 150 on board were killed in the crash .",
-            "the formal accession was marked by a ceremony at The Hague, in the Netherlands . the ICC opened a"
-            " preliminary examination into the situation in the occupied Palestinian territory . as members of the"
-            " court, Palestinians may be subject to counter-charges as well .",
-            "the u.s. and its negotiating partners reached a very strong framework agreement with Iran . aaron miller:"
-            " the debate that has already begun since the announcement of the new framework will likely result in more"
-            " heat than light . he says the new framework would reduce Iran's low-enriched uranium stockpile and cut"
-            " centrifuges . miller: if it had been, there would have been no Iranian team at the table .",
-            "prosecutors say the marriages were part of an immigration scam . if convicted, barrientos faces two"
-            ' criminal counts of "offering a false instrument for filing in the first degree" she has been married 10'
-            " times, with nine of her marriages occurring between 1999 and 2002 .",
-        ]
-
-        dct = tok(
-            ["summarize: " + x for x in [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY]],
-            padding="max_length",
-            truncation=True,
-            return_tensors="np",
-        )
-        self.assertEqual(512, dct["input_ids"].shape[1])
-
-        hypotheses_batch = model.generate(
-            **dct,
-            num_beams=4,
-            length_penalty=2.0,
-            max_length=142,
-            min_length=56,
-            do_sample=False,
-            early_stopping=True,
-        ).sequences
-
-        decoded = tok.batch_decode(hypotheses_batch, skip_special_tokens=True, clean_up_tokenization_spaces=False)
-        self.assertListEqual(
-            expected_summaries,
-            decoded,
-        )
diff --git a/tests/models/t5/test_modeling_tf_t5.py b/tests/models/t5/test_modeling_tf_t5.py
deleted file mode 100644
index 8dd06d6e25..0000000000
--- a/tests/models/t5/test_modeling_tf_t5.py
+++ /dev/null
@@ -1,1030 +0,0 @@
-# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import T5Config, is_tf_available
-from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
-from transformers.utils import cached_property
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import ByT5Tokenizer, T5Tokenizer, TFT5EncoderModel, TFT5ForConditionalGeneration, TFT5Model
-
-
-class TFT5ModelTester:
-    def __init__(
-        self,
-        parent,
-    ):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.is_training = True
-        self.use_input_mask = True
-        self.use_labels = True
-        self.vocab_size = 99
-        self.n_positions = 14
-        self.hidden_size = 32
-        self.num_hidden_layers = 2
-        self.num_attention_heads = 4
-        self.d_ff = 37
-        self.relative_attention_num_buckets = 8
-        self.dropout_rate = 0.1
-        self.initializer_factor = 0.002
-        self.eos_token_id = 1
-        self.pad_token_id = 0
-        self.scope = None
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_labels = None
-        if self.use_labels:
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        config = T5Config(
-            vocab_size=self.vocab_size,
-            n_positions=self.n_positions,
-            d_model=self.hidden_size,
-            d_ff=self.d_ff,
-            d_kv=self.hidden_size // self.num_attention_heads,
-            num_layers=self.num_hidden_layers,
-            num_heads=self.num_attention_heads,
-            relative_attention_num_buckets=self.relative_attention_num_buckets,
-            dropout_rate=self.dropout_rate,
-            initializer_factor=self.initializer_factor,
-            eos_token_id=self.eos_token_id,
-            bos_token_id=self.pad_token_id,
-            pad_token_id=self.pad_token_id,
-            decoder_start_token_id=self.pad_token_id,
-        )
-
-        return (config, input_ids, input_mask, token_labels)
-
-    def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels):
-        model = TFT5Model(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "decoder_input_ids": input_ids,
-            "decoder_attention_mask": input_mask,
-        }
-        result = model(inputs)
-
-        result = model(input_ids, decoder_attention_mask=input_mask, decoder_input_ids=input_ids)
-        decoder_output = result.last_hidden_state
-        decoder_past = result.past_key_values
-        encoder_output = result.encoder_last_hidden_state
-        self.parent.assertListEqual(list(encoder_output.shape), [self.batch_size, self.seq_length, self.hidden_size])
-        self.parent.assertListEqual(list(decoder_output.shape), [self.batch_size, self.seq_length, self.hidden_size])
-        # There should be `num_layers` key value embeddings stored in decoder_past[1]
-        self.parent.assertEqual(len(decoder_past), config.num_layers)
-        # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past[1] tuple
-        self.parent.assertEqual(len(decoder_past[0]), 4)
-
-    def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels):
-        model = TFT5ForConditionalGeneration(config=config)
-        inputs_dict = {
-            "input_ids": input_ids,
-            "decoder_input_ids": input_ids,
-            "decoder_attention_mask": input_mask,
-        }
-
-        result = model(inputs_dict)
-
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_t5_decoder_model_past(self, config, input_ids, decoder_input_ids, attention_mask):
-        model = TFT5Model(config=config).get_decoder()
-
-        input_ids = input_ids[:1, :]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(input_ids, use_cache=True)
-
-        outputs_use_cache_conf = model(input_ids)
-        outputs_no_past = model(input_ids, use_cache=False)
-
-        self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
-        self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-
-        output_from_no_past = model(next_input_ids)[0]
-        output_from_past = model(next_tokens, past_key_values=outputs.past_key_values)[0]
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
-        output_from_past_slice = output_from_past[:, 0, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-    def create_and_check_t5_decoder_model_attention_mask_past(
-        self, config, input_ids, decoder_input_ids, attention_mask
-    ):
-        model = TFT5Model(config=config).get_decoder()
-
-        # create attention mask
-        half_seq_length = self.seq_length // 2
-        attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32)
-        attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32)
-        attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1)
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=attn_mask, use_cache=True)
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-        # change a random masked slice from input_ids
-        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1
-        random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size)
-        vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change)
-        condition = tf.transpose(
-            tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size))
-        )
-        input_ids = tf.where(condition, random_other_next_tokens, input_ids)
-
-        # append to next input_ids and attn_mask
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        attn_mask = tf.concat(
-            [attn_mask, tf.ones((attn_mask.shape[0], 1), dtype=tf.int32)],
-            axis=1,
-        )
-
-        # get two different outputs
-        output_from_no_past = model(next_input_ids, attention_mask=attn_mask)[0]
-        output_from_past = model(next_tokens, past_key_values=outputs.past_key_values, attention_mask=attn_mask)[0]
-
-        # select random slice
-        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).numpy().item()
-        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
-        output_from_past_slice = output_from_past[:, 0, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-    def create_and_check_t5_decoder_model_past_large_inputs(
-        self, config, input_ids, decoder_input_ids, attention_mask
-    ):
-        model = TFT5Model(config=config).get_decoder()
-
-        input_ids = input_ids[:1, :]
-        attention_mask = attention_mask[:1, :]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([attention_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)[0]
-        output_from_past = model(
-            next_tokens, attention_mask=next_attention_mask, past_key_values=outputs.past_key_values
-        )[0]
-
-        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (config, input_ids, input_mask, token_labels) = config_and_inputs
-        inputs_dict = {
-            "input_ids": input_ids,
-            "decoder_input_ids": input_ids,
-            "decoder_attention_mask": input_mask,
-        }
-        return config, inputs_dict
-
-
-@require_tf
-class TFT5ModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    is_encoder_decoder = True
-    all_model_classes = (TFT5Model, TFT5ForConditionalGeneration) if is_tf_available() else ()
-    all_generative_model_classes = (TFT5ForConditionalGeneration,) if is_tf_available() else ()
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFT5Model,
-            "summarization": TFT5ForConditionalGeneration,
-            "text2text-generation": TFT5ForConditionalGeneration,
-            "translation": TFT5ForConditionalGeneration,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFT5ModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_t5_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_t5_model(*config_and_inputs)
-
-    def test_t5_model_v1_1(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        config = config_and_inputs[0]
-        config.tie_word_embeddings = False
-        config.feed_forward_proj = "gated-gelu"
-        self.model_tester.create_and_check_t5_model(config, *config_and_inputs[1:])
-
-    def test_with_lm_head(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs)
-
-    def test_t5_decoder_model_past(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_t5_decoder_model_past(*config_and_inputs)
-
-    def test_t5_decoder_model_past_with_attn_mask(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_t5_decoder_model_attention_mask_past(*config_and_inputs)
-
-    def test_t5_decoder_model_past_large_inputs(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-
-        # `create_and_check_t5_decoder_model_past_large_inputs` has special inputs:
-        #     (config, input_ids, decoder_input_ids, attention_mask)
-        # and we have to prepare it correctly here.
-        config, input_ids, input_mask, token_labels = config_and_inputs
-        config_and_inputs = (config, input_ids, None, input_mask)
-
-        self.model_tester.create_and_check_t5_decoder_model_past_large_inputs(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model = TFT5Model.from_pretrained("google-t5/t5-small")
-        self.assertIsNotNone(model)
-
-    def test_generate_with_headmasking(self):
-        # TODO: Fix head-masking according to PyTorch T5 model
-        pass
-
-    # This test is run in `TFT5EncoderOnlyModelTest`, where the main layer has the same inputs as the model
-    @unittest.skip(reason="The inputs of the Main Layer are different.")
-    def test_keras_save_load(self):
-        pass
-
-
-class TFT5EncoderOnlyModelTester:
-    def __init__(
-        self,
-        parent,
-        vocab_size=99,
-        batch_size=13,
-        encoder_seq_length=7,
-        # For common tests
-        use_attention_mask=True,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        d_ff=37,
-        relative_attention_num_buckets=8,
-        is_training=False,
-        dropout_rate=0.1,
-        initializer_factor=0.002,
-        is_encoder_decoder=False,
-        eos_token_id=1,
-        pad_token_id=0,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.encoder_seq_length = encoder_seq_length
-        # For common tests
-        self.seq_length = self.encoder_seq_length
-        self.use_attention_mask = use_attention_mask
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.d_ff = d_ff
-        self.relative_attention_num_buckets = relative_attention_num_buckets
-        self.dropout_rate = dropout_rate
-        self.initializer_factor = initializer_factor
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.is_encoder_decoder = is_encoder_decoder
-        self.scope = None
-        self.is_training = is_training
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
-
-        attention_mask = None
-        if self.use_attention_mask:
-            attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
-
-        config = T5Config(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            d_ff=self.d_ff,
-            d_kv=self.hidden_size // self.num_attention_heads,
-            num_layers=self.num_hidden_layers,
-            num_heads=self.num_attention_heads,
-            relative_attention_num_buckets=self.relative_attention_num_buckets,
-            dropout_rate=self.dropout_rate,
-            initializer_factor=self.initializer_factor,
-            eos_token_id=self.eos_token_id,
-            bos_token_id=self.pad_token_id,
-            pad_token_id=self.pad_token_id,
-            is_encoder_decoder=self.is_encoder_decoder,
-        )
-
-        return (
-            config,
-            input_ids,
-            attention_mask,
-        )
-
-    def create_and_check_model(
-        self,
-        config,
-        input_ids,
-        attention_mask,
-    ):
-        model = TFT5EncoderModel(config=config)
-        result = model(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-        )
-        result = model(input_ids=input_ids)
-        encoder_output = result.last_hidden_state
-
-        self.parent.assertEqual(encoder_output.shape, (self.batch_size, self.encoder_seq_length, self.hidden_size))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            attention_mask,
-        ) = config_and_inputs
-
-        inputs_dict = {
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-        }
-        return config, inputs_dict
-
-
-class TFT5EncoderOnlyModelTest(TFModelTesterMixin, unittest.TestCase):
-    is_encoder_decoder = False
-    all_model_classes = (TFT5EncoderModel,) if is_tf_available() else ()
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFT5EncoderOnlyModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    # is not able to be part of a pipeline
-    def test_train_pipeline_custom_model(self):
-        pass
-
-
-@require_tf
-@require_sentencepiece
-@require_tokenizers
-class TFT5GenerationIntegrationTests(unittest.TestCase):
-    @slow
-    def test_greedy_xla_generate_simple(self):
-        model = TFT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
-        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
-
-        # two examples with different lengths to confirm that attention masks are operational in XLA
-        sentences = [
-            "Translate English to German: Today is a beautiful day.",
-            "Translate English to German: I have four cats, three dogs, two birds, and a horse.",
-        ]
-        input_ids = tokenizer(sentences, return_tensors="tf", padding=True).input_ids
-
-        xla_generate = tf.function(model.generate, jit_compile=True)
-
-        output_ids = model.generate(input_ids)
-        output_ids_xla = xla_generate(input_ids)
-
-        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-        output_strings_xla = tokenizer.batch_decode(output_ids_xla, skip_special_tokens=True)
-
-        expected_output_string = [
-            "Heute ist ein schöner Tag.",
-            "Ich habe vier Katzen, drei Hunde, zwei Vögel und ein Pferd.",
-        ]
-
-        self.assertListEqual(expected_output_string, output_strings)
-        self.assertListEqual(expected_output_string, output_strings_xla)
-
-    @slow
-    def test_t5_greedy_generate(self):
-        model = TFT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
-        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
-
-        sentences = ["Yesterday, my name was", "Today is a beautiful day and"]
-        input_ids = tokenizer(sentences, return_tensors="tf", padding=True).input_ids
-
-        generation_kwargs = {
-            "bad_words_ids": [tokenizer("my").input_ids, tokenizer("ein schöner").input_ids],
-            "no_repeat_ngram_size": 3,
-            "do_sample": False,
-            "repetition_penalty": 2.2,
-        }
-
-        output_ids = model.generate(input_ids, **generation_kwargs)
-
-        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-
-        expected_output_string = ["Yesterday, my name was", "Heute ist ein schöne Tag und"]
-
-        self.assertListEqual(expected_output_string, output_strings)
-
-    @slow
-    def test_sample_xla_generate_simple(self):
-        # NOTE: due to the small numerical differences that are natural when we compile to XLA, sampling the same
-        # output out of the same seed is far from guaranteed. We can, however, confirm that the results are sensible
-        # and that we can seed both versions.
-
-        # forces the generation to happen on CPU, to avoid GPU-related quirks
-        with tf.device(":/CPU:0"):
-            model = TFT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
-            tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
-
-            sentence = "Translate English to German: I have two bananas"
-            input_ids = tokenizer(sentence, return_tensors="tf", padding=True).input_ids
-            expected_output_string = ["Ich habe zwei Bananen"]
-            expected_output_string_xla = ["Ich habe 2 Bananen"]
-
-            # seed set -> deterministic sampling sequence -> deterministic generation
-            output_ids = model.generate(input_ids, do_sample=True, seed=[42, 0])
-            output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-            self.assertListEqual(expected_output_string, output_strings)
-
-            xla_generate = tf.function(model.generate, jit_compile=True)
-            # seed set -> deterministic sampling sequence -> deterministic generation
-            output_ids_xla = xla_generate(input_ids, do_sample=True, seed=[42, 0])
-            output_strings_xla = tokenizer.batch_decode(output_ids_xla, skip_special_tokens=True)
-            self.assertListEqual(expected_output_string_xla, output_strings_xla)
-
-    @slow
-    def test_t5_sample_generate(self):
-        model = TFT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
-        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
-
-        sentences = ["I really love my", "Translate English to German: the transformers are truly amazing"]
-        input_ids = tokenizer(sentences, return_tensors="tf", padding=True).input_ids
-
-        generation_kwargs = {
-            "do_sample": True,
-            "bad_words_ids": [tokenizer("my").input_ids, tokenizer("ein schöner").input_ids],
-            "no_repeat_ngram_size": 3,
-            "repetition_penalty": 2.2,
-            "temperature": 0.8,
-            "top_k": 500,
-            "top_p": 0.9,
-            "seed": [20, 0],  # seed set -> deterministic sampling sequence -> deterministic generation
-        }
-
-        # forces the generation to happen on CPU, to avoid GPU-related quirks
-        with tf.device(":/CPU:0"):
-            output_ids = model.generate(input_ids, **generation_kwargs)
-
-        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-
-        expected_output_string = ["- I really love my way of this.", "die Transformatoren sind wirklich erstaunlich"]
-
-        self.assertListEqual(expected_output_string, output_strings)
-
-    # TODO (ydshieh): undo skip once a fix is done on TF side.
-    @unittest.skip("Skip for now as TF 2.13 breaks it on GPU")
-    @slow
-    def test_beam_search_xla_generate_simple(self):
-        model = TFT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
-        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
-
-        # tests XLA with task specific arguments
-        task_specific_config = getattr(model.config, "task_specific_params", {})
-        translation_config = task_specific_config.get("translation_en_to_fr", {})
-        model.config.update(translation_config)
-
-        # two examples with different lengths to confirm that attention masks are operational in XLA
-        sentences = [
-            model.config.prefix + "Today is a beautiful day.",
-            model.config.prefix + "I have four cats, three dogs, two birds, and a horse.",
-        ]
-        input_ids = tokenizer(sentences, return_tensors="tf", padding=True).input_ids
-
-        xla_generate = tf.function(model.generate, jit_compile=True)
-
-        output_ids = model.generate(input_ids, num_beams=2)
-        output_ids_xla = xla_generate(input_ids, num_beams=2)
-
-        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-        output_strings_xla = tokenizer.batch_decode(output_ids_xla, skip_special_tokens=True)
-
-        expected_output_string = [
-            "Aujourd'hui est une belle journée.",
-            "J'ai quatre chats, trois chiens, deux oiseaux et un cheval.",
-        ]
-
-        self.assertListEqual(expected_output_string, output_strings)
-        self.assertListEqual(expected_output_string, output_strings_xla)
-
-    @slow
-    def test_beam_search_generate(self):
-        model = TFT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
-        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
-
-        sentences = ["I really love my", "Translate English to German: the transformers are truly amazing"]
-        input_ids = tokenizer(sentences, return_tensors="tf", padding=True).input_ids
-
-        generation_kwargs = {
-            "bad_words_ids": [tokenizer("my").input_ids, tokenizer("ein schöner").input_ids],
-            "no_repeat_ngram_size": 3,
-            "do_sample": False,
-            "repetition_penalty": 2.2,
-            "num_beams": 4,
-        }
-
-        output_ids = model.generate(input_ids, **generation_kwargs)
-
-        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-
-        expected_output_string = ["Ich liebe es so sehr!", "die Transformatoren sind wirklich erstaunlich"]
-        self.assertListEqual(expected_output_string, output_strings)
-
-
-@require_tf
-@require_sentencepiece
-@require_tokenizers
-class TFT5ModelIntegrationTests(unittest.TestCase):
-    @cached_property
-    def model(self):
-        return TFT5ForConditionalGeneration.from_pretrained("google-t5/t5-base")
-
-    @slow
-    def test_small_integration_test(self):
-        """
-        For comparison run:
-        >>> import t5  # pip install t5==0.7.1
-        >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary
-
-        >>> path_to_mtf_small_t5_checkpoint = '<fill_in>'
-        >>> path_to_mtf_small_spm_model_path = '<fill_in>'
-        >>> t5_model = t5.models.MtfModel(model_dir=path_to_mtf_small_t5_checkpoint, batch_size=1, tpu=None)
-        >>> vocab = SentencePieceVocabulary(path_to_mtf_small_spm_model_path, extra_ids=100)
-        >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab)
-        """
-
-        model = TFT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
-        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
-
-        input_ids = tokenizer("Hello there", return_tensors="tf").input_ids
-        labels = tokenizer("Hi I am", return_tensors="tf").input_ids
-
-        loss = model(input_ids, labels=labels).loss
-        mtf_score = -tf.math.reduce_mean(loss).numpy()
-
-        EXPECTED_SCORE = -4.771147
-        self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 1e-4)
-
-    @slow
-    def test_small_v1_1_integration_test(self):
-        """
-        For comparison run:
-        >>> import t5  # pip install t5==0.7.1
-        >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary
-
-        >>> path_to_mtf_small_t5_v1.1_checkpoint = '<fill_in>'
-        >>> path_to_mtf_small_spm_model_path = '<fill_in>'
-        >>> t5_model = t5.models.MtfModel(model_dir=path_to_mtf_small_t5_v1.1_checkpoint, batch_size=1, tpu=None)
-        >>> vocab = SentencePieceVocabulary(path_to_mtf_small_spm_model_path, extra_ids=100)
-        >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab)
-        """
-
-        model = TFT5ForConditionalGeneration.from_pretrained("google/t5-v1_1-small")
-        tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-small")
-
-        input_ids = tokenizer("Hello there", return_tensors="tf").input_ids
-        labels = tokenizer("Hi I am", return_tensors="tf").input_ids
-
-        loss = model(input_ids, labels=labels).loss
-        mtf_score = -tf.math.reduce_mean(loss).numpy()
-
-        EXPECTED_SCORE = -14.757326
-        self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 1e-4)
-
-    @slow
-    def test_small_byt5_integration_test(self):
-        """
-        For comparison run:
-        >>> import t5  # pip install t5==0.9.1
-
-        >>> path_to_byt5_small_checkpoint = '<fill_in>'
-        >>> t5_model = t5.models.MtfModel(model_dir=path_to_tf_checkpoint, batch_size=1, tpu=None)
-        >>> vocab = t5.data.ByteVocabulary()
-        >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab)
-        """
-
-        model = TFT5ForConditionalGeneration.from_pretrained("google/byt5-small")
-        tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
-
-        input_ids = tokenizer("Hello there", return_tensors="tf").input_ids
-        labels = tokenizer("Hi I am", return_tensors="tf").input_ids
-
-        loss = model(input_ids, labels=labels).loss
-        mtf_score = -tf.math.reduce_mean(loss).numpy()
-
-        EXPECTED_SCORE = -7.592465
-        self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 1e-4)
-
-    @slow
-    def test_summarization(self):
-        model = self.model
-        tok = T5Tokenizer.from_pretrained("google-t5/t5-base")
-
-        FRANCE_ARTICLE = (  # @noqa
-            "Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwings"
-            " Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane."
-            ' Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation."'
-            ' He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s'
-            " comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video"
-            " showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French"
-            " Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a"
-            " phone at the wreckage site. The two publications described the supposed video, but did not post it on"
-            " their websites. The publications said that they watched the video, which was found by a source close to"
-            " the investigation. \"One can hear cries of 'My God' in several languages,\" Paris Match reported."
-            ' "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the'
-            " cockpit door with a heavy object.  Towards the end, after a heavy shake, stronger than the others, the"
-            ' screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt,'
-            " editor-in-chief of Bild online. An official with France's accident investigation agency, the BEA, said"
-            " the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman"
-            " in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the"
-            ' reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said,'
-            ' but that they "hadn\'t been exploited yet." Menichini said he believed the cell phones would need to be'
-            " sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by"
-            " specialized technicians working hand-in-hand with investigators. But none of the cell phones found so"
-            " far have been sent to the institute, Menichini said. Asked whether staff involved in the search could"
-            ' have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin'
-            ' Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match'
-            ' are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered'
-            ' cell phones from the crash site after Bild and Paris Match published their reports. "That is something'
-            " we did not know before. ... Overall we can say many things of the investigation weren't revealed by the"
-            ' investigation at the beginning," he said. What was mental state of Germanwings co-pilot? German airline'
-            " Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the"
-            " controls of Germanwings Flight 9525, which he's accused of deliberately crashing last week in the"
-            ' French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of'
-            ' severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school'
-            " discovered in an internal investigation, Lufthansa said, included medical documents he submitted in"
-            " connection with resuming his flight training. The announcement indicates that Lufthansa, the parent"
-            " company of Germanwings, knew of Lubitz's battle with depression, allowed him to continue training and"
-            " ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100%"
-            ' fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was'
-            " sharing the information and documents -- including training and medical records -- with public"
-            " prosecutors. Spohr traveled to the crash site Wednesday, where recovery teams have been working for the"
-            " past week to recover human remains and plane debris scattered across a steep mountainside. He saw the"
-            " crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash"
-            " site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late"
-            " Tuesday that no visible human remains were left at the site but recovery teams would keep searching."
-            " French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all"
-            " the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested."
-            " In the meantime, the recovery of the victims' personal belongings will start Wednesday, Menichini said."
-            " Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew"
-            " on board. Check out the latest from our correspondents . The details about Lubitz's correspondence with"
-            " the flight school during his training were among several developments as investigators continued to"
-            " delve into what caused the crash and Lubitz's possible motive for downing the jet. A Lufthansa"
-            " spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his"
-            ' examinations and "held all the licenses required." Earlier, a spokesman for the prosecutor\'s office in'
-            " Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at"
-            " some point before his aviation career and underwent psychotherapy before he got his pilot's license."
-            " Kumpa emphasized there's no evidence suggesting Lubitz was suicidal or acting aggressively before the"
-            " crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to"
-            " lose his pilot's license, a European government official briefed on the investigation told CNN on"
-            ' Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being'
-            " considered. Another source, a law enforcement official briefed on the investigation, also told CNN that"
-            " authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would"
-            " not be allowed to fly because of his medical problems. Lubitz's girlfriend told investigators he had"
-            " seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded"
-            " he had psychological issues, the European government official said. But no matter what details emerge"
-            " about his previous mental health struggles, there's more to the story, said Brian Russell, a forensic"
-            ' psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact'
-            " that maybe they weren't going to keep doing their job and they're upset about that and so they're"
-            ' suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to'
-            " also take that rage and turn it outward on 149 other people who had nothing to do with the person's"
-            ' problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight'
-            " 9525? CNN's Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura"
-            " Smith-Spark wrote from London. CNN's Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine"
-            " Amiel and Anna-Maja Rappard contributed to this report."
-        )
-
-        SHORTER_ARTICLE = (
-            "(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on"
-            " Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The"
-            " formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based."
-            " The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its"
-            ' jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East'
-            ' Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the'
-            " situation in Palestinian territories, paving the way for possible war crimes investigations against"
-            " Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and"
-            " the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the"
-            " body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday's ceremony, said it was a"
-            ' move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the'
-            ' world is also a step closer to ending a long era of impunity and injustice," he said, according to an'
-            ' ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge'
-            " Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the"
-            ' Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine'
-            " acquires all the rights as well as responsibilities that come with being a State Party to the Statute."
-            ' These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights'
-            ' Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should'
-            " immediately end their pressure, and countries that support universal acceptance of the court's treaty"
-            ' should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the'
-            " group. \"What's objectionable is the attempts to undermine international justice, not Palestine's"
-            ' decision to join a treaty to which over 100 countries around the world are members." In January, when'
-            " the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an"
-            ' outrage, saying the court was overstepping its boundaries. The United States also said it "strongly"'
-            " disagreed with the court's decision. \"As we have said repeatedly, we do not believe that Palestine is a"
-            ' state and therefore we do not believe that it is eligible to join the ICC," the State Department said in'
-            ' a statement. It urged the warring sides to resolve their differences through direct negotiations. "We'
-            ' will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace,"'
-            " it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the"
-            ' territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the'
-            " court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou"
-            ' Bensouda said her office would "conduct its analysis in full independence and impartiality." The war'
-            " between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry"
-            " will include alleged war crimes committed since June. The International Criminal Court was set up in"
-            " 2002 to prosecute genocide, crimes against humanity and war crimes. CNN's Vasco Cotovio, Kareem Khadder"
-            " and Faith Karimi contributed to this report."
-        )
-
-        IRAN_ARTICLE = (
-            "(CNN)The United States and its negotiating partners reached a very strong framework agreement with Iran"
-            " in Lausanne, Switzerland, on Thursday that limits Iran's nuclear program in such a way as to effectively"
-            " block it from building a nuclear weapon. Expect pushback anyway, if the recent past is any harbinger."
-            " Just last month, in an attempt to head off such an agreement, House Speaker John Boehner invited Israeli"
-            " Prime Minister Benjamin Netanyahu to preemptively blast it before Congress, and 47 senators sent a"
-            " letter to the Iranian leadership warning them away from a deal. The debate that has already begun since"
-            " the announcement of the new framework will likely result in more heat than light. It will not be helped"
-            " by the gathering swirl of dubious assumptions and doubtful assertions. Let us address some of these: ."
-            " The most misleading assertion, despite universal rejection by experts, is that the negotiations'"
-            " objective at the outset was the total elimination of any nuclear program in Iran. That is the position"
-            " of Netanyahu and his acolytes in the U.S. Congress. But that is not and never was the objective. If it"
-            " had been, there would have been no Iranian team at the negotiating table. Rather, the objective has"
-            " always been to structure an agreement or series of agreements so that Iran could not covertly develop a"
-            " nuclear arsenal before the United States and its allies could respond. The new framework has exceeded"
-            " expectations in achieving that goal. It would reduce Iran's low-enriched uranium stockpile, cut by"
-            " two-thirds its number of installed centrifuges and implement a rigorous inspection regime. Another"
-            " dubious assumption of opponents is that the Iranian nuclear program is a covert weapons program. Despite"
-            " sharp accusations by some in the United States and its allies, Iran denies having such a program, and"
-            " U.S. intelligence contends that Iran has not yet made the decision to build a nuclear weapon. Iran's"
-            " continued cooperation with International Atomic Energy Agency inspections is further evidence on this"
-            " point, and we'll know even more about Iran's program in the coming months and years because of the deal."
-            " In fact, the inspections provisions that are part of this agreement are designed to protect against any"
-            " covert action by the Iranians. What's more, the rhetoric of some members of Congress has implied that"
-            " the negotiations have been between only the United States and Iran (i.e., the 47 senators' letter"
-            " warning that a deal might be killed by Congress or a future president). This of course is not the case."
-            " The talks were between Iran and the five permanent members of the U.N. Security Council (United States,"
-            " United Kingdom, France, China and Russia) plus Germany, dubbed the P5+1. While the United States has"
-            " played a leading role in the effort, it negotiated the terms alongside its partners. If the agreement"
-            " reached by the P5+1 is rejected by Congress, it could result in an unraveling of the sanctions on Iran"
-            " and threaten NATO cohesion in other areas. Another questionable assertion is that this agreement"
-            " contains a sunset clause, after which Iran will be free to do as it pleases. Again, this is not the"
-            " case. Some of the restrictions on Iran's nuclear activities, such as uranium enrichment, will be eased"
-            " or eliminated over time, as long as 15 years. But most importantly, the framework agreement includes"
-            " Iran's ratification of the Additional Protocol, which allows IAEA inspectors expanded access to nuclear"
-            " sites both declared and nondeclared. This provision will be permanent. It does not sunset. Thus, going"
-            " forward, if Iran decides to enrich uranium to weapons-grade levels, monitors will be able to detect such"
-            " a move in a matter of days and alert the U.N. Security Council. Many in Congress have said that the"
-            ' agreement should be a formal treaty requiring the Senate to "advise and consent." But the issue is not'
-            " suited for a treaty. Treaties impose equivalent obligations on all signatories. For example, the New"
-            " START treaty limits Russia and the United States to 1,550 deployed strategic warheads. But any agreement"
-            " with Iran will not be so balanced.  The restrictions and obligations in the final framework agreement"
-            " will be imposed almost exclusively on Iran. The P5+1 are obligated only to ease and eventually remove"
-            " most but not all economic sanctions, which were imposed as leverage to gain this final deal. Finally"
-            " some insist that any agreement must address Iranian missile programs, human rights violations or support"
-            " for Hamas or Hezbollah.  As important as these issues are, and they must indeed be addressed, they are"
-            " unrelated to the most important aim of a nuclear deal: preventing a nuclear Iran.  To include them in"
-            " the negotiations would be a poison pill. This agreement should be judged on its merits and on how it"
-            " affects the security of our negotiating partners and allies, including Israel. Those judgments should be"
-            " fact-based, not based on questionable assertions or dubious assumptions."
-        )
-
-        ARTICLE_SUBWAY = (
-            "New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A"
-            " year later, she got married again in Westchester County, but to a different man and without divorcing"
-            " her first husband.  Only 18 days after that marriage, she got hitched yet again. Then, Barrientos"
-            ' declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married'
-            " once more, this time in the Bronx. In an application for a marriage license, she stated it was her"
-            ' "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false'
-            ' instrument for filing in the first degree," referring to her false statements on the 2010 marriage'
-            " license application, according to court documents. Prosecutors said the marriages were part of an"
-            " immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to"
-            " her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was"
-            " arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New"
-            " York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total,"
-            " Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.  All"
-            " occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be"
-            " married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors"
-            " said the immigration scam involved some of her husbands, who filed for permanent residence status"
-            " shortly after the marriages.  Any divorces happened only after such filings were approved. It was"
-            " unclear whether any of the men will be prosecuted. The case was referred to the Bronx District"
-            " Attorney's Office by Immigration and Customs Enforcement and the Department of Homeland Security's"
-            ' Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt,'
-            " Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his"
-            " native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces"
-            " up to four years in prison.  Her next court appearance is scheduled for May 18."
-        )
-
-        expected_summaries = [
-            'prosecutor: "so far no videos were used in the crash investigation" two magazines claim to have found a'
-            " cell phone video of the final seconds . \"one can hear cries of 'My God' in several languages,\" one"
-            " magazine says .",
-            "the formal accession was marked by a ceremony at The Hague, in the Netherlands . the ICC opened a"
-            " preliminary examination into the situation in the occupied Palestinian territory . as members of the"
-            " court, Palestinians may be subject to counter-charges as well .",
-            "the u.s. and its negotiating partners reached a very strong framework agreement with Iran . aaron miller:"
-            " the debate that has already begun since the announcement of the new framework will likely result in more"
-            " heat than light . the deal would reduce Iran's low-enriched uranium stockpile, cut centrifuges and"
-            " implement a rigorous inspection regime .",
-            "prosecutors say the marriages were part of an immigration scam . if convicted, barrientos faces two"
-            ' criminal counts of "offering a false instrument for filing in the first degree" she has been married 10'
-            " times, with nine of her marriages occurring between 1999 and 2002 .",
-        ]
-
-        task_specific_config = getattr(model.config, "task_specific_params", {})
-        summarization_config = task_specific_config.get("summarization", {})
-        model.config.update(summarization_config)
-
-        dct = tok(
-            [model.config.prefix + x for x in [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY]],
-            max_length=512,
-            padding="max_length",
-            truncation=True,
-            return_tensors="tf",
-        )
-        self.assertEqual(512, dct["input_ids"].shape[1])
-
-        hypotheses_batch = model.generate(
-            input_ids=dct["input_ids"],
-            attention_mask=dct["attention_mask"],
-            num_beams=4,
-            length_penalty=2.0,
-            max_length=142,
-            min_length=56,
-            no_repeat_ngram_size=3,
-            do_sample=False,
-            early_stopping=True,
-        )
-
-        decoded = [
-            tok.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in hypotheses_batch
-        ]
-
-        self.assertListEqual(
-            expected_summaries,
-            decoded,
-        )
-
-    @slow
-    def test_translation_en_to_de(self):
-        tok = T5Tokenizer.from_pretrained("google-t5/t5-base")
-        model = self.model
-
-        task_specific_config = getattr(model.config, "task_specific_params", {})
-        translation_config = task_specific_config.get("translation_en_to_de", {})
-        self.model.config.update(translation_config)
-
-        original_input = '"Luigi often said to me that he never wanted the brothers to end up in court", she wrote.'
-        expected_translation = (
-            '"Luigi sagte mir oft, dass er nie wollte, dass die Brüder am Gericht sitzen", schrieb sie.'
-        )
-
-        input_ids = tok.encode(model.config.prefix + original_input, return_tensors="tf")
-
-        output = model.generate(
-            input_ids=input_ids,
-            num_beams=4,
-            length_penalty=2.0,
-            max_length=50,
-            no_repeat_ngram_size=3,
-            do_sample=False,
-            early_stopping=True,
-        )
-        translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
-
-        self.assertEqual(translation, expected_translation)
-
-    @slow
-    def test_translation_en_to_fr(self):
-        model = self.model
-        tok = T5Tokenizer.from_pretrained("google-t5/t5-base")
-
-        task_specific_config = getattr(model.config, "task_specific_params", {})
-        translation_config = task_specific_config.get("translation_en_to_fr", {})
-        model.config.update(translation_config)
-
-        en_text = (
-            ' This image section from an infrared recording by the Spitzer telescope shows a "family portrait" of'
-            " countless generations of stars: the oldest stars are seen as blue dots. "
-        )
-
-        new_truncated_translation = (
-            "Cette section d'images provenant de l'enregistrement infrarouge effectué par le télescope Spitzer montre "
-            "un "
-            "« portrait familial » de générations innombrables d’étoiles : les plus anciennes sont observées "
-            "sous forme "
-            "de points bleus."
-        )
-
-        input_ids = tok(model.config.prefix + en_text, return_tensors="tf").input_ids
-
-        output = model.generate(
-            input_ids=input_ids,
-            num_beams=4,
-            length_penalty=2.0,
-            max_length=100,
-            no_repeat_ngram_size=3,
-            do_sample=False,
-            early_stopping=True,
-        )
-        translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
-
-        self.assertEqual(translation, new_truncated_translation)
-
-    @slow
-    def test_translation_en_to_ro(self):
-        model = self.model
-        tok = T5Tokenizer.from_pretrained("google-t5/t5-base")
-
-        task_specific_config = getattr(model.config, "task_specific_params", {})
-        translation_config = task_specific_config.get("translation_en_to_ro", {})
-        model.config.update(translation_config)
-
-        original_input = "Taco Bell said it plans to add 2,000 locations in the US by 2022."
-        expected_translation = "Taco Bell a declarat că intenţionează să adauge 2 000 de locaţii în SUA până în 2022."
-
-        input_ids = tok.encode(model.config.prefix + original_input, return_tensors="tf")
-
-        output = model.generate(
-            input_ids=input_ids,
-            num_beams=4,
-            length_penalty=2.0,
-            max_length=50,
-            no_repeat_ngram_size=3,
-            do_sample=False,
-            early_stopping=True,
-        )
-        translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
-
-        self.assertEqual(translation, expected_translation)
diff --git a/tests/models/tapas/test_modeling_tf_tapas.py b/tests/models/tapas/test_modeling_tf_tapas.py
deleted file mode 100644
index ab00f70673..0000000000
--- a/tests/models/tapas/test_modeling_tf_tapas.py
+++ /dev/null
@@ -1,1072 +0,0 @@
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import copy
-import unittest
-
-import numpy as np
-import pandas as pd
-
-from transformers import (
-    TF_MODEL_FOR_CAUSAL_LM_MAPPING,
-    TF_MODEL_FOR_MASKED_LM_MAPPING,
-    TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
-    TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
-    TF_MODEL_FOR_PRETRAINING_MAPPING,
-    TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
-    TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
-    TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
-    TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
-    TapasConfig,
-    TapasTokenizer,
-    is_tf_available,
-)
-from transformers.models.auto import get_values
-from transformers.testing_utils import require_tensorflow_probability, require_tf, slow
-from transformers.utils import cached_property
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import (
-        TFTapasForMaskedLM,
-        TFTapasForQuestionAnswering,
-        TFTapasForSequenceClassification,
-        TFTapasModel,
-    )
-    from transformers.models.tapas.modeling_tf_tapas import (
-        IndexMap,
-        ProductIndexMap,
-        flatten,
-        gather,
-        range_index_map,
-        reduce_max,
-        reduce_mean,
-        reduce_sum,
-    )
-
-
-class TFTapasModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        initializer_range=0.02,
-        max_position_embeddings=512,
-        type_vocab_sizes=[3, 256, 256, 2, 256, 256, 10],
-        type_sequence_label_size=2,
-        positive_weight=10.0,
-        num_aggregation_labels=4,
-        num_labels=2,
-        aggregation_loss_importance=0.8,
-        use_answer_as_supervision=True,
-        answer_loss_importance=0.001,
-        use_normalized_answer_loss=False,
-        huber_loss_delta=25.0,
-        temperature=1.0,
-        agg_temperature=1.0,
-        use_gumbel_for_cells=False,
-        use_gumbel_for_agg=False,
-        average_approximation_function="ratio",
-        cell_selection_preference=0.5,
-        answer_loss_cutoff=100,
-        max_num_rows=64,
-        max_num_columns=32,
-        average_logits_per_cell=True,
-        select_one_column=True,
-        allow_empty_column_selection=False,
-        init_cell_selection_weights_to_zero=True,
-        reset_position_index_per_cell=True,
-        disable_per_token_loss=False,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.initializer_range = initializer_range
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_sizes = type_vocab_sizes
-        self.type_sequence_label_size = type_sequence_label_size
-        self.positive_weight = positive_weight
-        self.num_aggregation_labels = num_aggregation_labels
-        self.num_labels = num_labels
-        self.aggregation_loss_importance = aggregation_loss_importance
-        self.use_answer_as_supervision = use_answer_as_supervision
-        self.answer_loss_importance = answer_loss_importance
-        self.use_normalized_answer_loss = use_normalized_answer_loss
-        self.huber_loss_delta = huber_loss_delta
-        self.temperature = temperature
-        self.agg_temperature = agg_temperature
-        self.use_gumbel_for_cells = use_gumbel_for_cells
-        self.use_gumbel_for_agg = use_gumbel_for_agg
-        self.average_approximation_function = average_approximation_function
-        self.cell_selection_preference = cell_selection_preference
-        self.answer_loss_cutoff = answer_loss_cutoff
-        self.max_num_rows = max_num_rows
-        self.max_num_columns = max_num_columns
-        self.average_logits_per_cell = average_logits_per_cell
-        self.select_one_column = select_one_column
-        self.allow_empty_column_selection = allow_empty_column_selection
-        self.init_cell_selection_weights_to_zero = init_cell_selection_weights_to_zero
-        self.reset_position_index_per_cell = reset_position_index_per_cell
-        self.disable_per_token_loss = disable_per_token_loss
-        self.scope = scope
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = []
-        for type_vocab_size in self.type_vocab_sizes:
-            token_type_ids.append(ids_tensor(shape=[self.batch_size, self.seq_length], vocab_size=type_vocab_size))
-        token_type_ids = tf.stack(token_type_ids, axis=2)
-
-        sequence_labels = None
-        token_labels = None
-        labels = None
-        numeric_values = None
-        numeric_values_scale = None
-        float_answer = None
-        aggregation_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            labels = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-            numeric_values = ids_tensor([self.batch_size, self.seq_length], vocab_size=2, dtype=tf.float32)
-            numeric_values_scale = ids_tensor([self.batch_size, self.seq_length], vocab_size=2, dtype=tf.float32)
-            float_answer = ids_tensor([self.batch_size], vocab_size=2, dtype=tf.float32)
-            aggregation_labels = ids_tensor([self.batch_size], self.num_aggregation_labels)
-
-        config = self.get_config()
-
-        return (
-            config,
-            input_ids,
-            input_mask,
-            token_type_ids,
-            sequence_labels,
-            token_labels,
-            labels,
-            numeric_values,
-            numeric_values_scale,
-            float_answer,
-            aggregation_labels,
-        )
-
-    def get_config(self):
-        return TapasConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_sizes=self.type_vocab_sizes,
-            initializer_range=self.initializer_range,
-            positive_weight=self.positive_weight,
-            num_aggregation_labels=self.num_aggregation_labels,
-            num_labels=self.num_labels,
-            aggregation_loss_importance=self.aggregation_loss_importance,
-            use_answer_as_supervision=self.use_answer_as_supervision,
-            answer_loss_importance=self.answer_loss_importance,
-            use_normalized_answer_loss=self.use_normalized_answer_loss,
-            huber_loss_delta=self.huber_loss_delta,
-            temperature=self.temperature,
-            agg_temperature=self.agg_temperature,
-            use_gumbel_for_cells=self.use_gumbel_for_cells,
-            use_gumbel_for_agg=self.use_gumbel_for_agg,
-            average_approximation_function=self.average_approximation_function,
-            cell_selection_preference=self.cell_selection_preference,
-            answer_loss_cutoff=self.answer_loss_cutoff,
-            max_num_rows=self.max_num_rows,
-            max_num_columns=self.max_num_columns,
-            average_logits_per_cell=self.average_logits_per_cell,
-            select_one_column=self.select_one_column,
-            allow_empty_column_selection=self.allow_empty_column_selection,
-            init_cell_selection_weights_to_zero=self.init_cell_selection_weights_to_zero,
-            reset_position_index_per_cell=self.reset_position_index_per_cell,
-            disable_per_token_loss=self.disable_per_token_loss,
-        )
-
-    def create_and_check_model(
-        self,
-        config,
-        input_ids,
-        input_mask,
-        token_type_ids,
-        sequence_labels,
-        token_labels,
-        labels,
-        numeric_values,
-        numeric_values_scale,
-        float_answer,
-        aggregation_labels,
-    ):
-        model = TFTapasModel(config=config)
-
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        result = model(inputs)
-        inputs.pop("attention_mask")
-        result = model(inputs)
-        inputs.pop("token_type_ids")
-        result = model(inputs)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
-    def create_and_check_for_masked_lm(
-        self,
-        config,
-        input_ids,
-        input_mask,
-        token_type_ids,
-        sequence_labels,
-        token_labels,
-        labels,
-        numeric_values,
-        numeric_values_scale,
-        float_answer,
-        aggregation_labels,
-    ):
-        model = TFTapasForMaskedLM(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-            "labels": token_labels,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_for_sequence_classification(
-        self,
-        config,
-        input_ids,
-        input_mask,
-        token_type_ids,
-        sequence_labels,
-        token_labels,
-        labels,
-        numeric_values,
-        numeric_values_scale,
-        float_answer,
-        aggregation_labels,
-    ):
-        config.num_labels = self.num_labels
-        model = TFTapasForSequenceClassification(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "labels": sequence_labels,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_for_question_answering(
-        self,
-        config,
-        input_ids,
-        input_mask,
-        token_type_ids,
-        sequence_labels,
-        token_labels,
-        labels,
-        numeric_values,
-        numeric_values_scale,
-        float_answer,
-        aggregation_labels,
-    ):
-        # inference: without aggregation head (SQA). Model only returns logits
-        sqa_config = copy.copy(config)
-        sqa_config.num_aggregation_labels = 0
-        sqa_config.use_answer_as_supervision = False
-        model = TFTapasForQuestionAnswering(config=sqa_config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length))
-
-        # inference: with aggregation head (WTQ, WikiSQL-supervised). Model returns logits and aggregation logits
-        model = TFTapasForQuestionAnswering(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.logits_aggregation.shape, (self.batch_size, self.num_aggregation_labels))
-
-        # training: can happen in 3 main ways
-        # case 1: conversational (SQA)
-        model = TFTapasForQuestionAnswering(config=sqa_config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-            "labels": labels,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.loss.shape, (1,))
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length))
-
-        # case 2: weak supervision for aggregation (WTQ)
-        model = TFTapasForQuestionAnswering(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-            "labels": labels,
-            "numeric_values": numeric_values,
-            "numeric_values_scale": numeric_values_scale,
-            "float_answer": float_answer,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.loss.shape, (1,))
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.logits_aggregation.shape, (self.batch_size, self.num_aggregation_labels))
-
-        # case 3: strong supervision for aggregation (WikiSQL-supervised)
-        wikisql_config = copy.copy(config)
-        wikisql_config.use_answer_as_supervision = False
-        model = TFTapasForQuestionAnswering(config=wikisql_config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-            "labels": labels,
-            "aggregation_labels": aggregation_labels,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.loss.shape, (1,))
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.logits_aggregation.shape, (self.batch_size, self.num_aggregation_labels))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            input_mask,
-            token_type_ids,
-            sequence_labels,
-            token_labels,
-            labels,
-            numeric_values,
-            numeric_values_scale,
-            float_answer,
-            aggregation_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tensorflow_probability
-@require_tf
-class TFTapasModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFTapasModel,
-            TFTapasForMaskedLM,
-            TFTapasForSequenceClassification,
-            TFTapasForQuestionAnswering,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFTapasModel,
-            "fill-mask": TFTapasForMaskedLM,
-            "text-classification": TFTapasForSequenceClassification,
-            "zero-shot": TFTapasForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_head_masking = False
-    test_onnx = False
-
-    # TODO: Fix the failed tests
-    def is_pipeline_test_to_skip(
-        self,
-        pipeline_test_case_name,
-        config_class,
-        model_architecture,
-        tokenizer_name,
-        image_processor_name,
-        feature_extractor_name,
-        processor_name,
-    ):
-        return True
-
-    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> dict:
-        inputs_dict = copy.deepcopy(inputs_dict)
-
-        if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
-            inputs_dict = {
-                k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1))
-                if isinstance(v, tf.Tensor) and v.ndim > 0
-                else v
-                for k, v in inputs_dict.items()
-            }
-
-        if return_labels:
-            if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
-                inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32)
-            elif model_class in get_values(TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING):
-                inputs_dict["labels"] = tf.zeros(
-                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32
-                )
-                inputs_dict["aggregation_labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-                inputs_dict["numeric_values"] = tf.zeros(
-                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.float32
-                )
-                inputs_dict["numeric_values_scale"] = tf.zeros(
-                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.float32
-                )
-                inputs_dict["float_answer"] = tf.zeros(self.model_tester.batch_size, dtype=tf.float32)
-            elif model_class in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
-                inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-            elif model_class in get_values(TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING):
-                inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-            elif model_class in [
-                *get_values(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
-                *get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING),
-                *get_values(TF_MODEL_FOR_MASKED_LM_MAPPING),
-                *get_values(TF_MODEL_FOR_PRETRAINING_MAPPING),
-                *get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
-            ]:
-                inputs_dict["labels"] = tf.zeros(
-                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32
-                )
-        return inputs_dict
-
-    def setUp(self):
-        self.model_tester = TFTapasModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=TapasConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
-    @unittest.skip(reason="The default test gets NaN losses with the test-generated inputs")
-    def test_dataset_conversion(self):
-        pass
-
-    @unittest.skip(reason="The default test gets NaN losses with the test-generated inputs")
-    def test_keras_fit(self):
-        pass
-
-    @unittest.skip(reason="The default test gets NaN losses with the test-generated inputs")
-    def test_loss_computation(self):
-        pass
-
-
-def prepare_tapas_single_inputs_for_inference():
-    # Here we prepare a single table-question pair to test TAPAS inference on:
-    data = {
-        "Footballer": ["Lionel Messi", "Cristiano Ronaldo"],
-        "Age": ["33", "35"],
-    }
-    queries = "Which footballer is 33 years old?"
-    table = pd.DataFrame.from_dict(data)
-
-    return table, queries
-
-
-def prepare_tapas_batch_inputs_for_inference():
-    # Here we prepare a batch of 2 table-question pairs to test TAPAS inference on:
-    data = {
-        "Footballer": ["Lionel Messi", "Cristiano Ronaldo"],
-        "Age": ["33", "35"],
-        "Number of goals": ["712", "750"],
-    }
-    queries = ["Which footballer is 33 years old?", "How many goals does Ronaldo have?"]
-    table = pd.DataFrame.from_dict(data)
-
-    return table, queries
-
-
-def prepare_tapas_batch_inputs_for_training():
-    # Here we prepare a DIFFERENT batch of 2 table-question pairs to test TAPAS training on:
-    data = {
-        "Footballer": ["Lionel Messi", "Cristiano Ronaldo"],
-        "Age": ["33", "35"],
-        "Number of goals": ["712", "750"],
-    }
-    queries = ["Which footballer is 33 years old?", "What's the total number of goals?"]
-    table = pd.DataFrame.from_dict(data)
-
-    answer_coordinates = [[(0, 0)], [(0, 2), (1, 2)]]
-    answer_text = [["Lionel Messi"], ["1462"]]
-    float_answer = [float("NaN"), float("1462")]
-
-    return table, queries, answer_coordinates, answer_text, float_answer
-
-
-@require_tensorflow_probability
-@require_tf
-class TFTapasModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_tokenizer(self):
-        return TapasTokenizer.from_pretrained("google/tapas-base-finetuned-wtq")
-
-    @slow
-    def test_inference_no_head(self):
-        # ideally we want to test this with the weights of tapas_inter_masklm_base_reset,
-        # but since it's not straightforward to do this with the TF 1 implementation, we test it with
-        # the weights of the WTQ base model (i.e. tapas_wtq_wikisql_sqa_inter_masklm_base_reset)
-        model = TFTapasModel.from_pretrained("google/tapas-base-finetuned-wtq")
-        tokenizer = self.default_tokenizer
-        table, queries = prepare_tapas_single_inputs_for_inference()
-        inputs = tokenizer(table=table, queries=queries, return_tensors="tf")
-        outputs = model(**inputs)
-
-        # test the sequence output
-        expected_slice = tf.constant(
-            [
-                [
-                    [-0.141581565, -0.599805772, 0.747186482],
-                    [-0.143664181, -0.602008104, 0.749218345],
-                    [-0.15169853, -0.603363097, 0.741370678],
-                ]
-            ]
-        )
-        tf.debugging.assert_near(outputs.last_hidden_state[:, :3, :3], expected_slice, atol=0.0005)
-
-        # test the pooled output
-        expected_slice = tf.constant([[0.987518311, -0.970520139, -0.994303405]])
-
-        tf.debugging.assert_near(outputs.pooler_output[:, :3], expected_slice, atol=0.0005)
-
-    @unittest.skip(reason="Model not available yet")
-    def test_inference_masked_lm(self):
-        pass
-
-    # TapasForQuestionAnswering has 3 possible ways of being fine-tuned:
-    # - conversational set-up (SQA)
-    # - weak supervision for aggregation (WTQ, WikiSQL)
-    # - strong supervision for aggregation (WikiSQL-supervised)
-    # We test all of them:
-    @slow
-    def test_inference_question_answering_head_conversational(self):
-        # note that google/tapas-base-finetuned-sqa should correspond to tapas_sqa_inter_masklm_base_reset
-        model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-sqa")
-        tokenizer = self.default_tokenizer
-        table, queries = prepare_tapas_single_inputs_for_inference()
-        inputs = tokenizer(table=table, queries=queries, return_tensors="tf")
-        outputs = model(**inputs)
-
-        # test the logits
-        logits = outputs.logits
-        expected_shape = tf.TensorShape([1, 21])
-        tf.debugging.assert_equal(logits.shape, expected_shape)
-
-        expected_slice = tf.constant(
-            [
-                [
-                    -9997.274,
-                    -9997.274,
-                    -9997.274,
-                    -9997.274,
-                    -9997.274,
-                    -9997.274,
-                    -9997.274,
-                    -9997.274,
-                    -9997.274,
-                    -16.262585,
-                    -10004.089,
-                    15.435196,
-                    15.435196,
-                    15.435196,
-                    -9990.443,
-                    -16.327433,
-                    -16.327433,
-                    -16.327433,
-                    -16.327433,
-                    -16.327433,
-                    -10004.84,
-                ]
-            ]
-        )
-
-        tf.debugging.assert_near(logits, expected_slice, atol=0.015)
-
-    @slow
-    def test_inference_question_answering_head_conversational_absolute_embeddings(self):
-        # note that google/tapas-small-finetuned-sqa should correspond to tapas_sqa_inter_masklm_small_reset
-        # however here we test the version with absolute position embeddings
-        model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-small-finetuned-sqa")
-        tokenizer = self.default_tokenizer
-        table, queries = prepare_tapas_single_inputs_for_inference()
-        inputs = tokenizer(table=table, queries=queries, return_tensors="tf")
-        outputs = model(**inputs)
-
-        # test the logits
-        logits = outputs.logits
-        expected_shape = tf.TensorShape([1, 21])
-        tf.debugging.assert_equal(logits.shape, expected_shape)
-
-        expected_slice = tf.constant(
-            [
-                [
-                    -10000.041,
-                    -10000.041,
-                    -10000.041,
-                    -10000.041,
-                    -10000.041,
-                    -10000.041,
-                    -10000.041,
-                    -10000.041,
-                    -10000.041,
-                    -18.369339,
-                    -10014.692,
-                    17.730324,
-                    17.730324,
-                    17.730324,
-                    -9984.974,
-                    -18.322773,
-                    -18.322773,
-                    -18.322773,
-                    -18.322773,
-                    -18.322773,
-                    -10007.267,
-                ]
-            ]
-        )
-
-        tf.debugging.assert_near(logits, expected_slice, atol=0.01)
-
-    @slow
-    def test_inference_question_answering_head_weak_supervision(self):
-        # note that google/tapas-base-finetuned-wtq should correspond to tapas_wtq_wikisql_sqa_inter_masklm_base_reset
-        model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wtq")
-
-        tokenizer = self.default_tokenizer
-        # let's test on a batch
-        table, queries = prepare_tapas_batch_inputs_for_inference()
-        inputs = tokenizer(table=table, queries=queries, padding="longest", return_tensors="tf")
-        outputs = model(**inputs)
-
-        # test the logits
-        logits = outputs.logits
-        expected_shape = tf.TensorShape([2, 28])
-        tf.debugging.assert_equal(logits.shape, expected_shape)
-
-        expected_slice = tf.constant(
-            [
-                [-160.375504, -160.375504, -160.375504, -10072.3965, -10070.9414, -10094.9736],
-                [-9861.6123, -9861.6123, -9861.6123, -9861.6123, -9891.01172, 146.600677],
-            ]
-        )
-
-        tf.debugging.assert_near(logits[:, -6:], expected_slice, atol=0.4)
-
-        # test the aggregation logits
-        logits_aggregation = outputs.logits_aggregation
-        expected_shape = tf.TensorShape([2, 4])
-        tf.debugging.assert_equal(logits_aggregation.shape, expected_shape)
-        expected_tensor = tf.constant(
-            [[18.8545208, -9.76614857, -6.3128891, -2.93525243], [-4.05782509, 40.0351, -5.35329962, 23.3978653]]
-        )
-        tf.debugging.assert_near(logits_aggregation, expected_tensor, atol=0.001)
-
-        # test the predicted answer coordinates and aggregation indices
-        EXPECTED_PREDICTED_ANSWER_COORDINATES = [[(0, 0)], [(1, 2)]]
-        EXPECTED_PREDICTED_AGGREGATION_INDICES = [0, 1]
-
-        predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
-            inputs, outputs.logits, outputs.logits_aggregation
-        )
-        tf.debugging.assert_equal(EXPECTED_PREDICTED_ANSWER_COORDINATES, predicted_answer_coordinates)
-        tf.debugging.assert_equal(EXPECTED_PREDICTED_AGGREGATION_INDICES, predicted_aggregation_indices)
-
-    @slow
-    def test_training_question_answering_head_weak_supervision(self):
-        # note that google/tapas-base-finetuned-wtq should correspond to tapas_wtq_wikisql_sqa_inter_masklm_base_reset
-        model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wtq")
-        tokenizer = self.default_tokenizer
-        # let's test on a batch
-        table, queries, answer_coordinates, answer_text, float_answer = prepare_tapas_batch_inputs_for_training()
-        inputs = tokenizer(
-            table=table,
-            queries=queries,
-            answer_coordinates=answer_coordinates,
-            answer_text=answer_text,
-            padding="longest",
-            return_tensors="tf",
-        )
-        # the answer should be prepared by the user
-        float_answer = tf.constant(float_answer, dtype=tf.float32)
-        outputs = model(
-            input_ids=inputs["input_ids"],
-            attention_mask=inputs["attention_mask"],
-            token_type_ids=inputs["token_type_ids"],
-            labels=inputs["labels"],
-            numeric_values=inputs["numeric_values"],
-            numeric_values_scale=inputs["numeric_values_scale"],
-            float_answer=float_answer,
-        )
-
-        # test the loss
-        loss = outputs.loss
-        expected_loss = tf.constant(3.3527612686157227e-08)
-        tf.debugging.assert_near(loss, expected_loss, atol=1e-6)
-
-        # test the logits on the first example
-        logits = outputs.logits
-        expected_shape = tf.TensorShape([2, 29])
-        tf.debugging.assert_equal(logits.shape, expected_shape)
-        expected_slice = tf.constant(
-            [
-                -160.0156,
-                -160.0156,
-                -160.0156,
-                -160.0156,
-                -160.0156,
-                -10072.2266,
-                -10070.8896,
-                -10092.6006,
-                -10092.6006,
-            ]
-        )
-        tf.debugging.assert_near(logits[0, -9:], expected_slice, atol=1e-6)
-
-        # test the aggregation logits on the second example
-        logits_aggregation = outputs.logits_aggregation
-        expected_shape = tf.TensorShape([2, 4])
-        tf.debugging.assert_equal(logits_aggregation.shape, expected_shape)
-        expected_tensor = tf.constant([-4.0538, 40.0304, -5.3554, 23.3965])
-        tf.debugging.assert_near(logits_aggregation[1, -4:], expected_tensor, atol=1e-4)
-
-    @slow
-    def test_inference_question_answering_head_strong_supervision(self):
-        # note that google/tapas-base-finetuned-wikisql-supervised should correspond to tapas_wikisql_sqa_inter_masklm_base_reset
-        model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wikisql-supervised")
-        tokenizer = self.default_tokenizer
-
-        table, queries = prepare_tapas_single_inputs_for_inference()
-        inputs = tokenizer(table=table, queries=queries, return_tensors="tf")
-        outputs = model(**inputs)
-
-        # test the logits
-        logits = outputs.logits
-        expected_shape = tf.TensorShape([1, 21])
-        tf.debugging.assert_equal(logits.shape, expected_shape)
-        expected_slice = tf.constant(
-            [
-                [
-                    -10011.1084,
-                    -10011.1084,
-                    -10011.1084,
-                    -10011.1084,
-                    -10011.1084,
-                    -10011.1084,
-                    -10011.1084,
-                    -10011.1084,
-                    -10011.1084,
-                    -18.6185989,
-                    -10008.7969,
-                    17.6355762,
-                    17.6355762,
-                    17.6355762,
-                    -10002.4404,
-                    -18.7111301,
-                    -18.7111301,
-                    -18.7111301,
-                    -18.7111301,
-                    -18.7111301,
-                    -10007.0977,
-                ]
-            ]
-        )
-        tf.debugging.assert_near(logits, expected_slice, atol=0.02)
-
-        # test the aggregation logits
-        logits_aggregation = outputs.logits_aggregation
-        expected_shape = tf.TensorShape([1, 4])
-        tf.debugging.assert_equal(logits_aggregation.shape, expected_shape)
-        expected_tensor = tf.constant([[16.5659733, -3.06624889, -2.34152961, -0.970244825]])
-        tf.debugging.assert_near(logits_aggregation, expected_tensor, atol=0.003)
-
-    @slow
-    def test_inference_classification_head(self):
-        # note that google/tapas-base-finetuned-tabfact should correspond to tapas_tabfact_inter_masklm_base_reset
-        model = TFTapasForSequenceClassification.from_pretrained("google/tapas-base-finetuned-tabfact")
-        tokenizer = self.default_tokenizer
-
-        table, queries = prepare_tapas_single_inputs_for_inference()
-        inputs = tokenizer(table=table, queries=queries, return_tensors="tf")
-        outputs = model(**inputs)
-
-        # test the classification logits
-        logits = outputs.logits
-        expected_shape = tf.TensorShape([1, 2])
-        tf.debugging.assert_equal(logits.shape, expected_shape)
-        expected_slice = tf.constant([[0.795137286, 9.5572]])
-        tf.debugging.assert_near(logits, expected_slice, atol=0.05)
-
-
-# Below: tests for Tapas utilities which are defined in modeling_tf_tapas.py.
-# These are based on segmented_tensor_test.py of the original implementation.
-# URL: https://github.com/google-research/tapas/blob/master/tapas/models/segmented_tensor_test.py
-@require_tensorflow_probability
-class TFTapasUtilsTest(unittest.TestCase):
-    def _prepare_tables(self):
-        """Prepares two tables, both with three distinct rows.
-        The first table has two columns:
-        1.0, 2.0 | 3.0
-        2.0, 0.0 | 1.0
-        1.0, 3.0 | 4.0
-        The second table has three columns:
-        1.0 | 2.0 | 3.0
-        2.0 | 0.0 | 1.0
-        1.0 | 3.0 | 4.0
-        Returns:
-        SegmentedTensors with the tables.
-        """
-        values = tf.constant(
-            [
-                [[1.0, 2.0, 3.0], [2.0, 0.0, 1.0], [1.0, 3.0, 4.0]],
-                [[1.0, 2.0, 3.0], [2.0, 0.0, 1.0], [1.0, 3.0, 4.0]],
-            ]
-        )
-        row_index = IndexMap(
-            indices=[
-                [[0, 0, 0], [1, 1, 1], [2, 2, 2]],
-                [[0, 0, 0], [1, 1, 1], [2, 2, 2]],
-            ],
-            num_segments=3,
-            batch_dims=1,
-        )
-        col_index = IndexMap(
-            indices=[
-                [[0, 0, 1], [0, 0, 1], [0, 0, 1]],
-                [[0, 1, 2], [0, 1, 2], [0, 1, 2]],
-            ],
-            num_segments=3,
-            batch_dims=1,
-        )
-        return values, row_index, col_index
-
-    def test_product_index(self):
-        _, row_index, col_index = self._prepare_tables()
-        cell_index = ProductIndexMap(row_index, col_index)
-        row_index_proj = cell_index.project_outer(cell_index)
-        col_index_proj = cell_index.project_inner(cell_index)
-
-        ind = cell_index.indices
-        self.assertEqual(cell_index.num_segments, 9)
-
-        # Projections should give back the original indices.
-        # we use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual
-        np.testing.assert_array_equal(row_index.indices.numpy(), row_index_proj.indices.numpy())
-        self.assertEqual(row_index.num_segments, row_index_proj.num_segments)
-        self.assertEqual(row_index.batch_dims, row_index_proj.batch_dims)
-        # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual
-        np.testing.assert_array_equal(col_index.indices.numpy(), col_index_proj.indices.numpy())
-        self.assertEqual(col_index.batch_dims, col_index_proj.batch_dims)
-
-        # The first and second "column" are identified in the first table.
-        for i in range(3):
-            self.assertEqual(ind[0, i, 0], ind[0, i, 1])
-            self.assertNotEqual(ind[0, i, 0], ind[0, i, 2])
-
-        # All rows are distinct in the first table.
-        for i, i_2 in zip(range(3), range(3)):
-            for j, j_2 in zip(range(3), range(3)):
-                if i != i_2 and j != j_2:
-                    self.assertNotEqual(ind[0, i, j], ind[0, i_2, j_2])
-
-        # All cells are distinct in the second table.
-        for i, i_2 in zip(range(3), range(3)):
-            for j, j_2 in zip(range(3), range(3)):
-                if i != i_2 or j != j_2:
-                    self.assertNotEqual(ind[1, i, j], ind[1, i_2, j_2])
-
-    def test_flatten(self):
-        _, row_index, col_index = self._prepare_tables()
-        row_index_flat = flatten(row_index)
-        col_index_flat = flatten(col_index)
-
-        shape = [3, 4, 5]
-        batched_index = IndexMap(indices=tf.zeros(shape, dtype=tf.int32), num_segments=1, batch_dims=3)
-        batched_index_flat = flatten(batched_index)
-
-        # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual
-        np.testing.assert_array_equal(
-            row_index_flat.indices.numpy(), [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5]
-        )
-        np.testing.assert_array_equal(
-            col_index_flat.indices.numpy(), [0, 0, 1, 0, 0, 1, 0, 0, 1, 3, 4, 5, 3, 4, 5, 3, 4, 5]
-        )
-        self.assertEqual(batched_index_flat.num_segments.numpy(), np.prod(shape))
-        np.testing.assert_array_equal(batched_index_flat.indices.numpy(), range(np.prod(shape)))
-
-    def test_range_index_map(self):
-        batch_shape = [3, 4]
-        num_segments = 5
-        index = range_index_map(batch_shape, num_segments)
-
-        self.assertEqual(num_segments, index.num_segments)
-        self.assertEqual(2, index.batch_dims)
-        indices = index.indices
-        # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual
-        np.testing.assert_array_equal(list(indices.shape), [3, 4, 5])
-        for i in range(batch_shape[0]):
-            for j in range(batch_shape[1]):
-                # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual
-                np.testing.assert_array_equal(indices[i, j, :].numpy(), range(num_segments))
-
-    def test_reduce_sum(self):
-        values, row_index, col_index = self._prepare_tables()
-        cell_index = ProductIndexMap(row_index, col_index)
-        row_sum, _ = reduce_sum(values, row_index)
-        col_sum, _ = reduce_sum(values, col_index)
-        cell_sum, _ = reduce_sum(values, cell_index)
-
-        # We use np.testing.assert_allclose rather than Tensorflow's assertAllClose
-        np.testing.assert_allclose(row_sum.numpy(), [[6.0, 3.0, 8.0], [6.0, 3.0, 8.0]])
-        np.testing.assert_allclose(col_sum.numpy(), [[9.0, 8.0, 0.0], [4.0, 5.0, 8.0]])
-        np.testing.assert_allclose(
-            cell_sum.numpy(),
-            [[3.0, 3.0, 0.0, 2.0, 1.0, 0.0, 4.0, 4.0, 0.0], [1.0, 2.0, 3.0, 2.0, 0.0, 1.0, 1.0, 3.0, 4.0]],
-        )
-
-    def test_reduce_mean(self):
-        values, row_index, col_index = self._prepare_tables()
-        cell_index = ProductIndexMap(row_index, col_index)
-        row_mean, _ = reduce_mean(values, row_index)
-        col_mean, _ = reduce_mean(values, col_index)
-        cell_mean, _ = reduce_mean(values, cell_index)
-
-        # We use np.testing.assert_allclose rather than Tensorflow's assertAllClose
-        np.testing.assert_allclose(
-            row_mean.numpy(), [[6.0 / 3.0, 3.0 / 3.0, 8.0 / 3.0], [6.0 / 3.0, 3.0 / 3.0, 8.0 / 3.0]]
-        )
-        np.testing.assert_allclose(col_mean.numpy(), [[9.0 / 6.0, 8.0 / 3.0, 0.0], [4.0 / 3.0, 5.0 / 3.0, 8.0 / 3.0]])
-        np.testing.assert_allclose(
-            cell_mean.numpy(),
-            [
-                [3.0 / 2.0, 3.0, 0.0, 2.0 / 2.0, 1.0, 0.0, 4.0 / 2.0, 4.0, 0.0],
-                [1.0, 2.0, 3.0, 2.0, 0.0, 1.0, 1.0, 3.0, 4.0],
-            ],
-        )
-
-    def test_reduce_max(self):
-        values = tf.convert_to_tensor([2.0, 1.0, 0.0, 3.0])
-        index = IndexMap(indices=tf.convert_to_tensor([0, 1, 0, 1]), num_segments=2)
-        maximum, _ = reduce_max(values, index)
-
-        # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual
-        np.testing.assert_array_equal(maximum.numpy(), [2, 3])
-
-    def test_reduce_sum_vectorized(self):
-        values = tf.convert_to_tensor([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0]])
-        index = IndexMap(indices=tf.convert_to_tensor([0, 0, 1]), num_segments=2, batch_dims=0)
-        sums, new_index = reduce_sum(values, index)
-
-        # We use np.testing.assert_allclose rather than Tensorflow's assertAllClose
-        np.testing.assert_allclose(sums.numpy(), [[3.0, 5.0, 7.0], [3.0, 4.0, 5.0]])
-        # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual
-        np.testing.assert_array_equal(new_index.indices.numpy(), [0, 1])
-        np.testing.assert_array_equal(new_index.num_segments.numpy(), 2)
-        np.testing.assert_array_equal(new_index.batch_dims, 0)
-
-    def test_gather(self):
-        values, row_index, col_index = self._prepare_tables()
-        cell_index = ProductIndexMap(row_index, col_index)
-
-        # Compute sums and then gather. The result should have the same shape as
-        # the original table and each element should contain the sum the values in
-        # its cell.
-        sums, _ = reduce_sum(values, cell_index)
-        cell_sum = gather(sums, cell_index)
-        assert cell_sum.shape == values.shape
-
-        # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual
-        np.testing.assert_allclose(
-            cell_sum.numpy(),
-            [[[3.0, 3.0, 3.0], [2.0, 2.0, 1.0], [4.0, 4.0, 4.0]], [[1.0, 2.0, 3.0], [2.0, 0.0, 1.0], [1.0, 3.0, 4.0]]],
-        )
-
-    def test_gather_vectorized(self):
-        values = tf.constant([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
-        index = IndexMap(indices=tf.convert_to_tensor([[0, 1], [1, 0]]), num_segments=2, batch_dims=1)
-        result = gather(values, index)
-
-        # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual
-        np.testing.assert_array_equal(result.numpy(), [[[1, 2], [3, 4]], [[7, 8], [5, 6]]])
diff --git a/tests/models/vision_encoder_decoder/test_modeling_flax_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_flax_vision_encoder_decoder.py
deleted file mode 100644
index 3c70f5a81e..0000000000
--- a/tests/models/vision_encoder_decoder/test_modeling_flax_vision_encoder_decoder.py
+++ /dev/null
@@ -1,410 +0,0 @@
-# Copyright 2021 HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import tempfile
-import unittest
-
-import numpy as np
-
-from transformers import is_flax_available, is_vision_available
-from transformers.testing_utils import require_flax, require_vision, slow
-
-from ...test_modeling_flax_common import floats_tensor, ids_tensor
-from ..gpt2.test_modeling_flax_gpt2 import FlaxGPT2ModelTester
-from ..vit.test_modeling_flax_vit import FlaxViTModelTester
-
-
-if is_flax_available():
-    from transformers import (
-        AutoTokenizer,
-        FlaxGPT2LMHeadModel,
-        FlaxVisionEncoderDecoderModel,
-        FlaxViTModel,
-        VisionEncoderDecoderConfig,
-    )
-
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import ViTImageProcessor
-
-
-@require_flax
-class FlaxEncoderDecoderMixin:
-    def get_encoder_decoder_model(self, config, decoder_config):
-        raise NotImplementedError
-
-    def prepare_config_and_inputs(self):
-        raise NotImplementedError
-
-    def get_pretrained_model(self):
-        raise NotImplementedError
-
-    def check_encoder_decoder_model_from_pretrained_configs(
-        self,
-        config,
-        pixel_values,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        **kwargs,
-    ):
-        encoder_decoder_config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)
-        self.assertTrue(encoder_decoder_config.decoder.is_decoder)
-
-        enc_dec_model = FlaxVisionEncoderDecoderModel(encoder_decoder_config)
-
-        self.assertTrue(enc_dec_model.config.is_encoder_decoder)
-
-        outputs_encoder_decoder = enc_dec_model(
-            pixel_values=pixel_values,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-        )
-
-        self.assertEqual(
-            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
-        )
-        self.assertEqual(outputs_encoder_decoder["encoder_last_hidden_state"].shape[0], pixel_values.shape[0])
-        self.assertEqual(outputs_encoder_decoder["encoder_last_hidden_state"].shape[-1], config.hidden_size)
-
-    def check_encoder_decoder_model_from_pretrained(
-        self,
-        config,
-        pixel_values,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        return_dict,
-        **kwargs,
-    ):
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict}
-        enc_dec_model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs)
-        outputs_encoder_decoder = enc_dec_model(
-            pixel_values=pixel_values,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-            return_dict=True,
-        )
-
-        self.assertEqual(
-            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
-        )
-        self.assertEqual(outputs_encoder_decoder["encoder_last_hidden_state"].shape[0], pixel_values.shape[0])
-        self.assertEqual(outputs_encoder_decoder["encoder_last_hidden_state"].shape[-1], config.hidden_size)
-
-    def check_save_and_load(
-        self,
-        config,
-        pixel_values,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        **kwargs,
-    ):
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model}
-        enc_dec_model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs)
-
-        outputs = enc_dec_model(
-            pixel_values=pixel_values,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-        )
-        out_2 = np.array(outputs[0])
-        out_2[np.isnan(out_2)] = 0
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            enc_dec_model.save_pretrained(tmpdirname)
-            FlaxVisionEncoderDecoderModel.from_pretrained(tmpdirname)
-
-            after_outputs = enc_dec_model(
-                pixel_values=pixel_values,
-                decoder_input_ids=decoder_input_ids,
-                decoder_attention_mask=decoder_attention_mask,
-            )
-            out_1 = np.array(after_outputs[0])
-            out_1[np.isnan(out_1)] = 0
-            max_diff = np.amax(np.abs(out_1 - out_2))
-            self.assertLessEqual(max_diff, 1e-5)
-
-    def check_encoder_decoder_model_output_attentions(
-        self,
-        config,
-        pixel_values,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        **kwargs,
-    ):
-        # make the decoder inputs a different shape from the encoder inputs to harden the test
-        decoder_input_ids = decoder_input_ids[:, :-1]
-        decoder_attention_mask = decoder_attention_mask[:, :-1]
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model}
-        enc_dec_model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs)
-        outputs_encoder_decoder = enc_dec_model(
-            pixel_values=pixel_values,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-            output_attentions=True,
-        )
-
-        encoder_attentions = outputs_encoder_decoder["encoder_attentions"]
-        self.assertEqual(len(encoder_attentions), config.num_hidden_layers)
-
-        self.assertEqual(encoder_attentions[0].shape[-3:-2], (config.num_attention_heads,))
-
-        decoder_attentions = outputs_encoder_decoder["decoder_attentions"]
-        num_decoder_layers = (
-            decoder_config.num_decoder_layers
-            if hasattr(decoder_config, "num_decoder_layers")
-            else decoder_config.num_hidden_layers
-        )
-        self.assertEqual(len(decoder_attentions), num_decoder_layers)
-
-        self.assertEqual(
-            decoder_attentions[0].shape[-3:],
-            (decoder_config.num_attention_heads, decoder_input_ids.shape[-1], decoder_input_ids.shape[-1]),
-        )
-
-        cross_attentions = outputs_encoder_decoder["cross_attentions"]
-        self.assertEqual(len(cross_attentions), num_decoder_layers)
-
-        cross_attention_input_seq_len = decoder_input_ids.shape[-1] * (
-            1 + (decoder_config.ngram if hasattr(decoder_config, "ngram") else 0)
-        )
-        self.assertEqual(
-            cross_attentions[0].shape[-3:-1],
-            (decoder_config.num_attention_heads, cross_attention_input_seq_len),
-        )
-
-    def check_encoder_decoder_model_generate(self, pixel_values, config, decoder_config, **kwargs):
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model}
-        enc_dec_model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs)
-
-        pad_token_id = enc_dec_model.config.decoder.pad_token_id
-        eos_token_id = enc_dec_model.config.decoder.eos_token_id
-        decoder_start_token_id = enc_dec_model.config.decoder.decoder_start_token_id
-
-        # Copied from generation.utils (GPT2 doesn't have `pad_token_id`)
-        if pad_token_id is None and eos_token_id is not None:
-            pad_token_id = eos_token_id
-        if decoder_start_token_id is None:
-            decoder_start_token_id = enc_dec_model.config.decoder.bos_token_id
-
-        # Bert does not have a bos token id, so use pad_token_id instead
-        # Copied from `test_modeling_encoder_decoder.py`
-        if decoder_start_token_id is None:
-            decoder_start_token_id = pad_token_id
-
-        generated_output = enc_dec_model.generate(
-            pixel_values,
-            pad_token_id=pad_token_id,
-            eos_token_id=eos_token_id,
-            decoder_start_token_id=decoder_start_token_id,
-        )
-        generated_sequences = generated_output.sequences
-        self.assertEqual(generated_sequences.shape, (pixel_values.shape[0],) + (decoder_config.max_length,))
-
-    def test_encoder_decoder_model_from_pretrained_configs(self):
-        config_inputs_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_from_pretrained_configs(**config_inputs_dict)
-
-    def test_encoder_decoder_model_from_pretrained(self):
-        config_inputs_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_from_pretrained(**config_inputs_dict, return_dict=False)
-
-    def test_encoder_decoder_model_from_pretrained_return_dict(self):
-        config_inputs_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_from_pretrained(**config_inputs_dict, return_dict=True)
-
-    def test_save_and_load_from_pretrained(self):
-        config_inputs_dict = self.prepare_config_and_inputs()
-        self.check_save_and_load(**config_inputs_dict)
-
-    def test_encoder_decoder_model_output_attentions(self):
-        config_inputs_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_output_attentions(**config_inputs_dict)
-
-    def test_encoder_decoder_model_generate(self):
-        config_inputs_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_generate(**config_inputs_dict)
-
-    def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float):
-        diff = np.abs(a - b).max()
-        self.assertLessEqual(diff, tol, f"Difference between torch and flax is {diff} (>= {tol}).")
-
-    @slow
-    def test_real_model_save_load_from_pretrained(self):
-        model_2 = self.get_pretrained_model()
-        pixel_values = floats_tensor(
-            [
-                13,
-                model_2.config.encoder.num_channels,
-                model_2.config.encoder.image_size,
-                model_2.config.encoder.image_size,
-            ]
-        )
-        decoder_input_ids = ids_tensor([13, 1], model_2.config.decoder.vocab_size)
-
-        outputs = model_2(
-            pixel_values=pixel_values,
-            decoder_input_ids=decoder_input_ids,
-        )
-        out_2 = np.array(outputs[0])
-        out_2[np.isnan(out_2)] = 0
-
-        with tempfile.TemporaryDirectory() as tmp_dirname:
-            model_2.save_pretrained(tmp_dirname)
-            model_1 = FlaxVisionEncoderDecoderModel.from_pretrained(tmp_dirname)
-
-            after_outputs = model_1(
-                pixel_values=pixel_values,
-                decoder_input_ids=decoder_input_ids,
-            )
-            out_1 = np.array(after_outputs[0])
-            out_1[np.isnan(out_1)] = 0
-            max_diff = np.amax(np.abs(out_1 - out_2))
-            self.assertLessEqual(max_diff, 1e-5)
-
-
-@require_flax
-class FlaxViT2GPT2EncoderDecoderModelTest(FlaxEncoderDecoderMixin, unittest.TestCase):
-    def get_encoder_decoder_model(self, config, decoder_config):
-        encoder_model = FlaxViTModel(config)
-        decoder_model = FlaxGPT2LMHeadModel(decoder_config)
-        return encoder_model, decoder_model
-
-    def prepare_config_and_inputs(self):
-        model_tester_encoder = FlaxViTModelTester(self, batch_size=13)
-        model_tester_decoder = FlaxGPT2ModelTester(self, batch_size=13)
-        encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs()
-        decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs_for_decoder()
-        (config, pixel_values) = encoder_config_and_inputs
-        (
-            decoder_config,
-            decoder_input_ids,
-            decoder_attention_mask,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        ) = decoder_config_and_inputs
-
-        # make sure that cross attention layers are added
-        decoder_config.add_cross_attention = True
-        return {
-            "config": config,
-            "pixel_values": pixel_values,
-            "decoder_config": decoder_config,
-            "decoder_input_ids": decoder_input_ids,
-            "decoder_attention_mask": decoder_attention_mask,
-            "encoder_hidden_states": encoder_hidden_states,  # This is not used in the tests.
-        }
-
-    def get_pretrained_model(self):
-        return FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
-            "google/vit-base-patch16-224-in21k", "openai-community/gpt2"
-        )
-
-
-@require_flax
-class FlaxVisionEncoderDecoderModelTest(unittest.TestCase):
-    def get_from_encoderdecoder_pretrained_model(self):
-        return FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
-            "google/vit-base-patch16-224-in21k", "openai-community/gpt2"
-        )
-
-    def _check_configuration_tie(self, model):
-        module = model.module.bind(model.params)
-
-        assert id(module.decoder.config) == id(model.config.decoder)
-        assert id(module.encoder.config) == id(model.config.encoder)
-
-    @slow
-    def test_configuration_tie(self):
-        model = self.get_from_encoderdecoder_pretrained_model()
-        self._check_configuration_tie(model)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_vision
-@require_flax
-class FlaxViT2GPT2ModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_coco_en(self):
-        loc = "ydshieh/vit-gpt2-coco-en"
-
-        image_processor = ViTImageProcessor.from_pretrained(loc)
-        tokenizer = AutoTokenizer.from_pretrained(loc)
-        model = FlaxVisionEncoderDecoderModel.from_pretrained(loc)
-
-        img = prepare_img()
-        pixel_values = image_processor(images=img, return_tensors="np").pixel_values
-
-        decoder_input_ids = np.array([[model.config.decoder_start_token_id]])
-        logits = model(pixel_values, decoder_input_ids)[0]
-        logits = np.array(logits)
-
-        # verify the logits
-        expected_shape = (1, 1, model.config.decoder.vocab_size)
-        self.assertEqual(logits.shape, expected_shape)
-
-        EXPECTED_LOGIT_SLICE = np.array(
-            [
-                -38.705837,
-                -30.639936,
-                -31.41905,
-                -39.01204,
-                -38.38698,
-                -34.887215,
-                -33.29087,
-                -35.684475,
-                -38.50852,
-                -36.124676,
-            ]
-        )
-        max_diff = np.amax(np.abs(logits[0, 0, :10] - EXPECTED_LOGIT_SLICE))
-        self.assertLessEqual(max_diff, 1e-4)
-
-        def generate_step(pixel_values):
-            outputs = model.generate(pixel_values, max_length=16, num_beams=4)
-            output_ids = outputs.sequences
-            preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-            preds = [pred.strip() for pred in preds]
-
-            return preds, outputs.scores
-
-        preds, scores = generate_step(pixel_values)
-
-        EXPECTED_SCORES = np.array([-0.59563464])
-        scores = np.array(scores)
-        max_diff = np.amax(np.abs(scores - EXPECTED_SCORES))
-        self.assertLessEqual(max_diff, 1e-4)
-
-        # should produce
-        # ["a cat laying on top of a couch next to another cat"]
-        self.assertEqual(preds, ["a cat laying on top of a couch next to another cat"])
diff --git a/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py
deleted file mode 100644
index 0810789ce0..0000000000
--- a/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py
+++ /dev/null
@@ -1,648 +0,0 @@
-# Copyright 2022 HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TensorFlow VisionEncoderDecoder model."""
-
-from __future__ import annotations
-
-import os
-import tempfile
-import unittest
-
-import numpy as np
-
-from transformers import is_tf_available, is_vision_available
-from transformers.testing_utils import (
-    require_tf,
-    require_vision,
-    slow,
-)
-
-from ...test_modeling_tf_common import floats_tensor, ids_tensor
-from ..gpt2.test_modeling_tf_gpt2 import TFGPT2ModelTester
-from ..vit.test_modeling_tf_vit import TFViTModelTester
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import (
-        AutoConfig,
-        AutoImageProcessor,
-        AutoTokenizer,
-        TFAutoModel,
-        TFAutoModelForCausalLM,
-        TFGPT2LMHeadModel,
-        TFVisionEncoderDecoderModel,
-        TFViTModel,
-        VisionEncoderDecoderConfig,
-    )
-    from transformers.modeling_tf_outputs import TFBaseModelOutput
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import ViTImageProcessor
-
-
-@require_tf
-class TFVisionEncoderDecoderMixin:
-    def get_encoder_decoder_model(self, config, decoder_config):
-        raise NotImplementedError
-
-    def prepare_config_and_inputs(self):
-        raise NotImplementedError
-
-    def get_pretrained_model(self):
-        raise NotImplementedError
-
-    def check_encoder_decoder_model_from_pretrained_configs(
-        self,
-        config,
-        pixel_values,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        **kwargs,
-    ):
-        encoder_decoder_config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)
-        self.assertTrue(encoder_decoder_config.decoder.is_decoder)
-
-        enc_dec_model = TFVisionEncoderDecoderModel(encoder_decoder_config)
-
-        self.assertTrue(enc_dec_model.config.is_encoder_decoder)
-
-        outputs_encoder_decoder = enc_dec_model(
-            pixel_values=pixel_values,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-            kwargs=kwargs,
-        )
-
-        self.assertEqual(
-            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
-        )
-        self.assertEqual(outputs_encoder_decoder["encoder_last_hidden_state"].shape[0], pixel_values.shape[0])
-        self.assertEqual(outputs_encoder_decoder["encoder_last_hidden_state"].shape[-1], config.hidden_size)
-
-    def check_encoder_decoder_model(
-        self,
-        config,
-        pixel_values,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        **kwargs,
-    ):
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        enc_dec_model = TFVisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
-        self.assertTrue(enc_dec_model.config.decoder.is_decoder)
-        self.assertTrue(enc_dec_model.config.decoder.add_cross_attention)
-        self.assertTrue(enc_dec_model.config.is_encoder_decoder)
-
-        outputs_encoder_decoder = enc_dec_model(
-            pixel_values=pixel_values,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-            kwargs=kwargs,
-        )
-        self.assertEqual(
-            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
-        )
-        self.assertEqual(outputs_encoder_decoder["encoder_last_hidden_state"].shape[0], pixel_values.shape[0])
-        self.assertEqual(outputs_encoder_decoder["encoder_last_hidden_state"].shape[-1], config.hidden_size)
-
-        encoder_outputs = TFBaseModelOutput(last_hidden_state=encoder_hidden_states)
-        outputs_encoder_decoder = enc_dec_model(
-            pixel_values=None,
-            encoder_outputs=encoder_outputs,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-            kwargs=kwargs,
-        )
-
-        self.assertEqual(
-            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
-        )
-        self.assertEqual(outputs_encoder_decoder["encoder_last_hidden_state"].shape[0], pixel_values.shape[0])
-        self.assertEqual(outputs_encoder_decoder["encoder_last_hidden_state"].shape[-1], config.hidden_size)
-
-    def check_encoder_decoder_model_from_pretrained(
-        self,
-        config,
-        pixel_values,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        return_dict,
-        **kwargs,
-    ):
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict}
-        enc_dec_model = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs)
-        outputs_encoder_decoder = enc_dec_model(
-            pixel_values=pixel_values,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-            return_dict=True,
-            kwargs=kwargs,
-        )
-
-        self.assertEqual(
-            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
-        )
-        self.assertEqual(outputs_encoder_decoder["encoder_last_hidden_state"].shape[0], pixel_values.shape[0])
-        self.assertEqual(outputs_encoder_decoder["encoder_last_hidden_state"].shape[-1], config.hidden_size)
-
-    def check_save_and_load(
-        self,
-        config,
-        pixel_values,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        **kwargs,
-    ):
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        enc_dec_model = TFVisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
-
-        outputs = enc_dec_model(
-            pixel_values=pixel_values,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-            kwargs=kwargs,
-        )
-        out_2 = np.array(outputs[0])
-        out_2[np.isnan(out_2)] = 0
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            enc_dec_model.save_pretrained(tmpdirname)
-            enc_dec_model = TFVisionEncoderDecoderModel.from_pretrained(tmpdirname)
-
-            after_outputs = enc_dec_model(
-                pixel_values=pixel_values,
-                decoder_input_ids=decoder_input_ids,
-                decoder_attention_mask=decoder_attention_mask,
-                kwargs=kwargs,
-            )
-            out_1 = np.array(after_outputs[0])
-            out_1[np.isnan(out_1)] = 0
-            max_diff = np.amax(np.abs(out_1 - out_2))
-            self.assertLessEqual(max_diff, 1e-5)
-
-    def check_encoder_decoder_model_labels(
-        self,
-        config,
-        pixel_values,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        labels,
-        **kwargs,
-    ):
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        enc_dec_model = TFVisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
-
-        outputs_encoder_decoder = enc_dec_model(
-            pixel_values=pixel_values,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-            labels=labels,
-            kwargs=kwargs,
-        )
-
-        # Make sure `loss` exist
-        self.assertIn("loss", outputs_encoder_decoder)
-
-        batch_size, seq_len = decoder_input_ids.shape
-        expected_shape = (batch_size, seq_len, decoder_config.vocab_size)
-        self.assertEqual(outputs_encoder_decoder["logits"].shape, expected_shape)
-        self.assertEqual(outputs_encoder_decoder["encoder_last_hidden_state"].shape[0], pixel_values.shape[0])
-        self.assertEqual(outputs_encoder_decoder["encoder_last_hidden_state"].shape[-1], config.hidden_size)
-
-    def check_encoder_decoder_model_output_attentions(
-        self,
-        config,
-        pixel_values,
-        encoder_hidden_states,
-        decoder_config,
-        decoder_input_ids,
-        decoder_attention_mask,
-        **kwargs,
-    ):
-        # make the decoder inputs a different shape from the encoder inputs to harden the test
-        decoder_input_ids = decoder_input_ids[:, :-1]
-        decoder_attention_mask = decoder_attention_mask[:, :-1]
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        enc_dec_model = TFVisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
-        outputs_encoder_decoder = enc_dec_model(
-            pixel_values=pixel_values,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-            output_attentions=True,
-            kwargs=kwargs,
-        )
-
-        encoder_attentions = outputs_encoder_decoder["encoder_attentions"]
-        self.assertEqual(len(encoder_attentions), config.num_hidden_layers)
-
-        self.assertEqual(encoder_attentions[0].shape[-3:-2], (config.num_attention_heads,))
-
-        decoder_attentions = outputs_encoder_decoder["decoder_attentions"]
-        num_decoder_layers = (
-            decoder_config.num_decoder_layers
-            if hasattr(decoder_config, "num_decoder_layers")
-            else decoder_config.num_hidden_layers
-        )
-        self.assertEqual(len(decoder_attentions), num_decoder_layers)
-
-        self.assertEqual(
-            decoder_attentions[0].shape[-3:],
-            (decoder_config.num_attention_heads, decoder_input_ids.shape[-1], decoder_input_ids.shape[-1]),
-        )
-
-        cross_attentions = outputs_encoder_decoder["cross_attentions"]
-        self.assertEqual(len(cross_attentions), num_decoder_layers)
-
-        cross_attention_input_seq_len = decoder_input_ids.shape[-1] * (
-            1 + (decoder_config.ngram if hasattr(decoder_config, "ngram") else 0)
-        )
-        self.assertEqual(
-            cross_attentions[0].shape[-3:-1],
-            (decoder_config.num_attention_heads, cross_attention_input_seq_len),
-        )
-
-    def check_encoder_decoder_model_generate(self, pixel_values, config, decoder_config, **kwargs):
-        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
-        enc_dec_model = TFVisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
-
-        # Generate until max length
-        if hasattr(enc_dec_model.config, "eos_token_id"):
-            enc_dec_model.config.eos_token_id = None
-        if hasattr(enc_dec_model.config, "decoder") and hasattr(enc_dec_model.config.decoder, "eos_token_id"):
-            enc_dec_model.config.decoder.eos_token_id = None
-        if hasattr(enc_dec_model.generation_config, "eos_token_id"):
-            enc_dec_model.generation_config.eos_token_id = None
-
-        # Bert does not have a bos token id, so use pad_token_id instead
-        generated_output = enc_dec_model.generate(
-            pixel_values, decoder_start_token_id=enc_dec_model.config.decoder.pad_token_id
-        )
-        self.assertEqual(
-            tuple(generated_output.shape.as_list()), (pixel_values.shape[0],) + (decoder_config.max_length,)
-        )
-
-    def test_encoder_decoder_model(self):
-        config_inputs_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model(**config_inputs_dict)
-
-    def test_encoder_decoder_model_from_pretrained_configs(self):
-        config_inputs_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_from_pretrained_configs(**config_inputs_dict)
-
-    def test_encoder_decoder_model_from_pretrained(self):
-        config_inputs_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_from_pretrained(**config_inputs_dict, return_dict=False)
-
-    def test_encoder_decoder_model_from_pretrained_return_dict(self):
-        config_inputs_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_from_pretrained(**config_inputs_dict, return_dict=True)
-
-    def test_save_and_load_from_pretrained(self):
-        config_inputs_dict = self.prepare_config_and_inputs()
-        self.check_save_and_load(**config_inputs_dict)
-
-    def test_encoder_decoder_model_labels(self):
-        config_inputs_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_labels(**config_inputs_dict)
-
-    def test_encoder_decoder_model_output_attentions(self):
-        config_inputs_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_output_attentions(**config_inputs_dict)
-
-    def test_encoder_decoder_model_generate(self):
-        config_inputs_dict = self.prepare_config_and_inputs()
-        self.check_encoder_decoder_model_generate(**config_inputs_dict)
-
-    def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float):
-        diff = np.abs(a - b).max()
-        self.assertLessEqual(diff, tol, f"Difference between torch and tf is {diff} (>= {tol}).")
-
-    @slow
-    def test_real_model_save_load_from_pretrained(self):
-        model_2 = self.get_pretrained_model()
-        pixel_values = floats_tensor(
-            [
-                13,
-                model_2.config.encoder.num_channels,
-                model_2.config.encoder.image_size,
-                model_2.config.encoder.image_size,
-            ]
-        )
-        decoder_input_ids = ids_tensor([13, 1], model_2.config.decoder.vocab_size)
-
-        outputs = model_2(
-            pixel_values=pixel_values,
-            decoder_input_ids=decoder_input_ids,
-        )
-        out_2 = np.array(outputs[0])
-        out_2[np.isnan(out_2)] = 0
-
-        with tempfile.TemporaryDirectory() as tmp_dirname:
-            model_2.save_pretrained(tmp_dirname)
-            model_1 = TFVisionEncoderDecoderModel.from_pretrained(tmp_dirname)
-
-            after_outputs = model_1(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids)
-            out_1 = np.array(after_outputs[0])
-            out_1[np.isnan(out_1)] = 0
-            max_diff = np.amax(np.abs(out_1 - out_2))
-            self.assertLessEqual(max_diff, 1e-5)
-
-
-@require_tf
-class TFViT2GPT2EncoderDecoderModelTest(TFVisionEncoderDecoderMixin, unittest.TestCase):
-    def get_pretrained_model(self):
-        return TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
-            "google/vit-base-patch16-224-in21k", "openai-community/gpt2"
-        )
-
-    def get_encoder_decoder_model(self, config, decoder_config):
-        encoder_model = TFViTModel(config, name="encoder")
-        decoder_model = TFGPT2LMHeadModel(decoder_config, name="decoder")
-        return encoder_model, decoder_model
-
-    def prepare_config_and_inputs(self):
-        model_tester_encoder = TFViTModelTester(self, batch_size=13)
-        model_tester_decoder = TFGPT2ModelTester(self)
-        encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs()
-        decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs_for_decoder()
-        (config, pixel_values, labels) = encoder_config_and_inputs
-        (
-            decoder_config,
-            decoder_input_ids,
-            decoder_attention_mask,
-            decoder_head_mask,
-            decoder_token_type_ids,
-            decoder_sequence_labels,
-            decoder_token_labels,
-            decoder_choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        ) = decoder_config_and_inputs
-
-        # make sure that cross attention layers are added
-        decoder_config.add_cross_attention = True
-        # disable cache for now
-        decoder_config.use_cache = False
-        return {
-            "config": config,
-            "pixel_values": pixel_values,
-            "decoder_config": decoder_config,
-            "decoder_input_ids": decoder_input_ids,
-            "decoder_attention_mask": decoder_attention_mask,
-            "decoder_token_labels": decoder_token_labels,
-            "encoder_hidden_states": encoder_hidden_states,  # This is not used in the tests.
-            "labels": decoder_token_labels,
-        }
-
-
-@require_tf
-class TFVisionEncoderDecoderModelTest(unittest.TestCase):
-    def get_from_encoderdecoder_pretrained_model(self):
-        return TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
-            "google/vit-base-patch16-224-in21k", "openai-community/gpt2"
-        )
-
-    def get_decoder_config(self):
-        config = AutoConfig.from_pretrained("openai-community/gpt2")
-        config.is_decoder = True
-        config.add_cross_attention = True
-        return config
-
-    def get_encoderdecoder_model(self):
-        return TFVisionEncoderDecoderModel.from_pretrained("ydshieh/vit-gpt2-coco-en")
-
-    def get_encoder_decoder_models(self):
-        encoder_model = TFViTModel.from_pretrained("google/vit-base-patch16-224-in21k", name="encoder")
-        decoder_model = TFGPT2LMHeadModel.from_pretrained(
-            "openai-community/gpt2", config=self.get_decoder_config(), name="decoder"
-        )
-        return {"encoder": encoder_model, "decoder": decoder_model}
-
-    def _check_configuration_tie(self, model):
-        assert id(model.decoder.config) == id(model.config.decoder)
-        assert id(model.encoder.config) == id(model.config.encoder)
-
-    @slow
-    def test_configuration_tie(self):
-        model = self.get_from_encoderdecoder_pretrained_model()
-        self._check_configuration_tie(model)
-
-        model = TFVisionEncoderDecoderModel(**self.get_encoder_decoder_models())
-        self._check_configuration_tie(model)
-
-        model = self.get_encoderdecoder_model()
-        self._check_configuration_tie(model)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_tf
-class TFVisionEncoderDecoderModelSaveLoadTests(unittest.TestCase):
-    def get_encoder_decoder_config(self):
-        encoder_config = AutoConfig.from_pretrained("google/vit-base-patch16-224-in21k")
-        decoder_config = AutoConfig.from_pretrained("openai-community/gpt2", is_decoder=True, add_cross_attention=True)
-        return VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config)
-
-    def get_encoder_decoder_config_small(self):
-        encoder_config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-vit")
-        decoder_config = AutoConfig.from_pretrained(
-            "hf-internal-testing/tiny-random-gpt2", is_decoder=True, add_cross_attention=True
-        )
-        return VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config)
-
-    def test_encoder_decoder_save_load_from_encoder_decoder(self):
-        config = self.get_encoder_decoder_config_small()
-
-        # create two random ViT/GPT2 models for vit-gpt2 & initialize weights (+cross_attention weights)
-        encoder = TFViTModel(config.encoder)
-        encoder.build_in_name_scope()
-        decoder = TFGPT2LMHeadModel(config.decoder)
-        decoder.build_in_name_scope()
-
-        encoder_decoder_orig = TFVisionEncoderDecoderModel(encoder=encoder, decoder=decoder)
-
-        pixel_values = floats_tensor(
-            [
-                13,
-                encoder.config.num_channels,
-                encoder.config.image_size,
-                encoder.config.image_size,
-            ]
-        )
-        decoder_input_ids = ids_tensor([13, 1], decoder.config.vocab_size)
-
-        logits_orig = encoder_decoder_orig(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits
-
-        with tempfile.TemporaryDirectory() as tmp_dirname:
-            encoder_path = os.path.join(tmp_dirname, "encoder")
-            decoder_path = os.path.join(tmp_dirname, "decoder")
-
-            encoder.save_pretrained(encoder_path)
-            decoder.save_pretrained(decoder_path)
-
-            encoder_decoder = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(encoder_path, decoder_path)
-
-        logits_1 = encoder_decoder(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits
-
-        self.assertTrue(logits_orig.numpy().sum() - logits_1.numpy().sum() < 1e-3)
-
-        max_diff = np.max(np.abs(logits_1.numpy() - logits_orig.numpy()))
-        self.assertAlmostEqual(max_diff, 0.0, places=4)
-
-        with tempfile.TemporaryDirectory() as tmp_dirname:
-            encoder_decoder.save_pretrained(tmp_dirname)
-            encoder_decoder = TFVisionEncoderDecoderModel.from_pretrained(tmp_dirname)
-
-        logits_2 = encoder_decoder(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits
-
-        max_diff = np.max(np.abs(logits_2.numpy() - logits_orig.numpy()))
-        self.assertAlmostEqual(max_diff, 0.0, places=4)
-
-    @require_vision
-    @slow
-    def test_encoder_decoder_from_pretrained(self):
-        load_weight_prefix = TFVisionEncoderDecoderModel.load_weight_prefix
-
-        config = self.get_encoder_decoder_config()
-        image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
-        decoder_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
-
-        img = prepare_img()
-        pixel_values = image_processor(images=img, return_tensors="tf").pixel_values
-        decoder_input_ids = decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids
-
-        with tempfile.TemporaryDirectory() as tmp_dirname:
-            # Since most of HF's models don't have pretrained cross-attention layers, they are randomly
-            # initialized even if we create models using `from_pretrained` method.
-            # For the tests, the decoder need to be a model with pretrained cross-attention layers.
-            # So we create pretrained models (without `load_weight_prefix`), save them, and later,
-            # we load them using `from_pretrained`.
-            # (we don't need to do this for encoder, but let's make the code more similar between encoder/decoder)
-            encoder = TFAutoModel.from_pretrained("google/vit-base-patch16-224-in21k", name="encoder")
-            # It's necessary to specify `add_cross_attention=True` here.
-            decoder = TFAutoModelForCausalLM.from_pretrained(
-                "openai-community/gpt2", is_decoder=True, add_cross_attention=True, name="decoder"
-            )
-            pretrained_encoder_dir = os.path.join(tmp_dirname, "pretrained_encoder")
-            pretrained_decoder_dir = os.path.join(tmp_dirname, "pretrained_decoder")
-            encoder.save_pretrained(pretrained_encoder_dir)
-            decoder.save_pretrained(pretrained_decoder_dir)
-            del encoder
-            del decoder
-
-            enc_dec_model = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
-                pretrained_encoder_dir,
-                pretrained_decoder_dir,
-            )
-            enc_dec_model.build_in_name_scope()
-            # check that the from pretrained methods work
-            enc_dec_model.save_pretrained(tmp_dirname)
-            enc_dec_model = TFVisionEncoderDecoderModel.from_pretrained(tmp_dirname)
-
-            output = enc_dec_model(pixel_values, decoder_input_ids=decoder_input_ids, labels=decoder_input_ids)
-
-            loss_pretrained = output.loss
-            del enc_dec_model
-
-            # Create the model using `__init__` with loaded ``pretrained`` encoder / decoder
-            encoder = TFAutoModel.from_pretrained(
-                pretrained_encoder_dir, load_weight_prefix=load_weight_prefix, name="encoder"
-            )
-            decoder = TFAutoModelForCausalLM.from_pretrained(
-                pretrained_decoder_dir, load_weight_prefix=load_weight_prefix, name="decoder"
-            )
-            enc_dec_model = TFVisionEncoderDecoderModel(config=config, encoder=encoder, decoder=decoder)
-
-        output = enc_dec_model(pixel_values, decoder_input_ids=decoder_input_ids, labels=decoder_input_ids)
-
-        loss_init = output.loss
-
-        max_diff = np.max(np.abs(loss_pretrained - loss_init))
-        expected_diff = 0.0
-
-        self.assertAlmostEqual(max_diff, expected_diff, places=4)
-
-
-@require_vision
-@require_tf
-class TFViT2GPT2ModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_coco_en(self):
-        loc = "ydshieh/vit-gpt2-coco-en"
-
-        image_processor = ViTImageProcessor.from_pretrained(loc)
-        tokenizer = AutoTokenizer.from_pretrained(loc)
-        model = TFVisionEncoderDecoderModel.from_pretrained(loc)
-
-        # We will verify our results on an image of cute cats
-        img = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-        pixel_values = image_processor(images=img, return_tensors="tf").pixel_values
-
-        decoder_input_ids = tf.constant([[model.config.decoder_start_token_id]])
-
-        logits = model(pixel_values, decoder_input_ids)[0].numpy()
-
-        # verify the logits
-        expected_shape = (1, 1, model.config.decoder.vocab_size)
-        self.assertEqual(logits.shape, expected_shape)
-
-        EXPECTED_LOGIT_SLICE = np.array(
-            [
-                -38.705807,
-                -30.639929,
-                -31.41903,
-                -39.012012,
-                -38.38696,
-                -34.887207,
-                -33.290855,
-                -35.68447,
-                -38.508484,
-                -36.124645,
-            ]
-        )
-        max_diff = np.amax(np.abs(logits[0, 0, :10] - EXPECTED_LOGIT_SLICE))
-        self.assertLessEqual(max_diff, 1e-4)
-
-        def generate_step(pixel_values):
-            outputs = model.generate(pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True)
-            output_ids = outputs.sequences
-            preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-            preds = [pred.strip() for pred in preds]
-
-            return preds
-
-        preds = generate_step(pixel_values)
-
-        # should produce
-        # ["a cat laying on top of a couch next to another cat"]
-        self.assertEqual(preds, ["a cat laying on top of a couch next to another cat"])
diff --git a/tests/models/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py b/tests/models/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py
deleted file mode 100644
index cc0a0fa212..0000000000
--- a/tests/models/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py
+++ /dev/null
@@ -1,297 +0,0 @@
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch VisionTextDualEncoder model."""
-
-import collections
-import tempfile
-import unittest
-
-import numpy as np
-
-from transformers.testing_utils import require_flax, require_torch, require_vision, slow
-from transformers.utils import is_flax_available, is_vision_available
-
-from ...test_modeling_flax_common import floats_tensor, ids_tensor, random_attention_mask
-from ..bert.test_modeling_flax_bert import FlaxBertModelTester
-from ..clip.test_modeling_flax_clip import FlaxCLIPVisionModelTester
-from ..vit.test_modeling_flax_vit import FlaxViTModelTester
-
-
-if is_flax_available():
-    from transformers import (
-        FlaxBertModel,
-        FlaxCLIPVisionModel,
-        FlaxVisionTextDualEncoderModel,
-        FlaxViTModel,
-        VisionTextDualEncoderConfig,
-        VisionTextDualEncoderProcessor,
-    )
-
-
-if is_vision_available():
-    from PIL import Image
-
-
-# Inspired by
-# https://github.com/rwightman/pytorch-image-models/blob/b9bd960a032c75ca6b808ddeed76bee5f3ed4972/timm/models/layers/helpers.py
-# From PyTorch internals
-def to_2tuple(x):
-    if isinstance(x, collections.abc.Iterable):
-        return x
-    return (x, x)
-
-
-@require_flax
-class VisionTextDualEncoderMixin:
-    def get_vision_text_model(self, config, text_config):
-        pass
-
-    def prepare_config_and_inputs(self):
-        pass
-
-    def get_pretrained_model_and_inputs(self):
-        pass
-
-    def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float):
-        diff = np.abs(a - b).max()
-        self.assertLessEqual(diff, tol, f"Difference between torch and flax is {diff} (>= {tol}).")
-
-    def check_model_from_pretrained_configs(
-        self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs
-    ):
-        config = VisionTextDualEncoderConfig.from_vision_text_configs(vision_config, text_config)
-
-        model = FlaxVisionTextDualEncoderModel(config)
-
-        output = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask)
-
-        self.assertEqual(output["text_embeds"].shape, (input_ids.shape[0], config.projection_dim))
-        self.assertEqual(output["image_embeds"].shape, (pixel_values.shape[0], config.projection_dim))
-
-    def check_vision_text_dual_encoder_from_pretrained(
-        self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs
-    ):
-        vision_model, text_model = self.get_vision_text_model(vision_config, text_config)
-        kwargs = {"vision_model": vision_model, "text_model": text_model}
-        model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained(**kwargs)
-
-        output = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask)
-
-        self.assertEqual(output["text_embeds"].shape, (input_ids.shape[0], model.config.projection_dim))
-        self.assertEqual(output["image_embeds"].shape, (pixel_values.shape[0], model.config.projection_dim))
-
-    def check_save_load(self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs):
-        vision_model, text_model = self.get_vision_text_model(vision_config, text_config)
-        kwargs = {"vision_model": vision_model, "text_model": text_model}
-        model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained(**kwargs)
-
-        output = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask)
-        out_1 = output[0]
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            model.save_pretrained(tmpdirname)
-            model = FlaxVisionTextDualEncoderModel.from_pretrained(tmpdirname)
-
-            after_output = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask)
-            out_2 = after_output[0]
-            max_diff = np.amax(np.abs(out_2 - out_1))
-            self.assertLessEqual(max_diff, 1e-3)
-
-    def check_vision_text_output_attention(
-        self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs
-    ):
-        vision_model, text_model = self.get_vision_text_model(vision_config, text_config)
-        kwargs = {"vision_model": vision_model, "text_model": text_model}
-        model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained(**kwargs)
-
-        output = model(
-            input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, output_attentions=True
-        )
-
-        vision_attentions = output.vision_model_output.attentions
-        self.assertEqual(len(vision_attentions), vision_config.num_hidden_layers)
-
-        # in ViT, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token)
-        image_size = to_2tuple(vision_model.config.image_size)
-        patch_size = to_2tuple(vision_model.config.patch_size)
-        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
-        seq_len = num_patches + 1
-        self.assertEqual(vision_attentions[0].shape[-3:], (vision_config.num_attention_heads, seq_len, seq_len))
-
-        text_attentions = output.text_model_output.attentions
-        self.assertEqual(len(text_attentions), text_config.num_hidden_layers)
-
-        self.assertEqual(
-            text_attentions[0].shape[-3:],
-            (text_config.num_attention_heads, input_ids.shape[-1], input_ids.shape[-1]),
-        )
-
-    def test_model_from_pretrained_configs(self):
-        inputs_dict = self.prepare_config_and_inputs()
-        self.check_model_from_pretrained_configs(**inputs_dict)
-
-    def test_vision_text_dual_encoder_from_pretrained(self):
-        inputs_dict = self.prepare_config_and_inputs()
-        self.check_vision_text_dual_encoder_from_pretrained(**inputs_dict)
-
-    def test_save_load(self):
-        inputs_dict = self.prepare_config_and_inputs()
-        self.check_save_load(**inputs_dict)
-
-    def test_vision_text_output_attention(self):
-        inputs_dict = self.prepare_config_and_inputs()
-        self.check_vision_text_output_attention(**inputs_dict)
-
-    @slow
-    def test_real_model_save_load_from_pretrained(self):
-        model_2, inputs = self.get_pretrained_model_and_inputs()
-
-        outputs = model_2(**inputs)
-        out_2 = outputs[0]
-
-        with tempfile.TemporaryDirectory() as tmp_dirname:
-            model_2.save_pretrained(tmp_dirname)
-            model_1 = FlaxVisionTextDualEncoderModel.from_pretrained(tmp_dirname)
-
-            after_outputs = model_1(**inputs)
-            out_1 = after_outputs[0]
-            max_diff = np.amax(np.abs(out_1 - out_2))
-            self.assertLessEqual(max_diff, 1e-5)
-
-
-@require_flax
-class FlaxViTBertModelTest(VisionTextDualEncoderMixin, unittest.TestCase):
-    def get_pretrained_model_and_inputs(self):
-        model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained(
-            "hf-internal-testing/tiny-random-vit",
-            "hf-internal-testing/tiny-bert",
-            vision_from_pt=True,
-            text_from_pt=True,
-        )
-        batch_size = 13
-        pixel_values = floats_tensor(
-            [
-                batch_size,
-                model.config.vision_config.num_channels,
-                model.config.vision_config.image_size,
-                model.config.vision_config.image_size,
-            ]
-        )
-        input_ids = ids_tensor([batch_size, 4], model.config.text_config.vocab_size)
-        attention_mask = random_attention_mask([batch_size, 4])
-        inputs = {"pixel_values": pixel_values, "input_ids": input_ids, "attention_mask": attention_mask}
-
-        return model, inputs
-
-    def get_vision_text_model(self, vision_config, text_config):
-        vision_model = FlaxViTModel(vision_config)
-        text_model = FlaxBertModel(text_config)
-        return vision_model, text_model
-
-    def prepare_config_and_inputs(self):
-        vit_model_tester = FlaxViTModelTester(self)
-        bert_model_tester = FlaxBertModelTester(self)
-        vision_config_and_inputs = vit_model_tester.prepare_config_and_inputs()
-        text_config_and_inputs = bert_model_tester.prepare_config_and_inputs()
-
-        vision_config, pixel_values = vision_config_and_inputs
-
-        text_config, input_ids, token_type_ids, attention_mask = text_config_and_inputs
-
-        # make sure that cross attention layers are added
-        return {
-            "text_config": text_config,
-            "vision_config": vision_config,
-            "pixel_values": pixel_values,
-            "attention_mask": attention_mask,
-            "input_ids": input_ids,
-            "token_type_ids": token_type_ids,
-        }
-
-
-@require_torch
-class FlaxCLIPVisionBertModelTest(VisionTextDualEncoderMixin, unittest.TestCase):
-    def get_pretrained_model_and_inputs(self):
-        model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained(
-            "hf-internal-testing/tiny-random-clip",
-            "hf-internal-testing/tiny-bert",
-            vision_from_pt=True,
-            text_from_pt=True,
-        )
-        batch_size = 13
-        pixel_values = floats_tensor(
-            [
-                batch_size,
-                model.config.vision_config.num_channels,
-                model.config.vision_config.image_size,
-                model.config.vision_config.image_size,
-            ]
-        )
-        input_ids = ids_tensor([batch_size, 4], model.config.text_config.vocab_size)
-        attention_mask = random_attention_mask([batch_size, 4])
-        inputs = {"pixel_values": pixel_values, "input_ids": input_ids, "attention_mask": attention_mask}
-
-        return model, inputs
-
-    def get_vision_text_model(self, vision_config, text_config):
-        vision_model = FlaxCLIPVisionModel(vision_config)
-        text_model = FlaxBertModel(text_config)
-        return vision_model, text_model
-
-    def prepare_config_and_inputs(self):
-        clip_model_tester = FlaxCLIPVisionModelTester(self)
-        bert_model_tester = FlaxBertModelTester(self)
-        vision_config_and_inputs = clip_model_tester.prepare_config_and_inputs()
-        text_config_and_inputs = bert_model_tester.prepare_config_and_inputs()
-
-        vision_config, pixel_values = vision_config_and_inputs
-
-        text_config, input_ids, token_type_ids, attention_mask = text_config_and_inputs
-
-        # make sure that cross attention layers are added
-        return {
-            "text_config": text_config,
-            "vision_config": vision_config,
-            "pixel_values": pixel_values,
-            "attention_mask": attention_mask,
-            "input_ids": input_ids,
-            "token_type_ids": token_type_ids,
-        }
-
-
-@require_flax
-@require_vision
-class FlaxVisionTextDualEncoderIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference(self):
-        model = FlaxVisionTextDualEncoderModel.from_pretrained("clip-italian/clip-italian", logit_scale_init_value=1.0)
-        processor = VisionTextDualEncoderProcessor.from_pretrained("clip-italian/clip-italian")
-
-        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-        inputs = processor(
-            text=["una foto di un gatto", "una foto di un cane"], images=image, padding=True, return_tensors="np"
-        )
-
-        outputs = model(**inputs)
-
-        # verify the logits
-        self.assertEqual(outputs.logits_per_image.shape, (inputs.pixel_values.shape[0], inputs.input_ids.shape[0]))
-        self.assertEqual(
-            outputs.logits_per_text.shape,
-            (inputs.input_ids.shape[0], inputs.pixel_values.shape[0]),
-        )
-
-        expected_logits = np.array([[1.2284727, 0.3104122]])
-
-        self.assertTrue(np.allclose(outputs.logits_per_image, expected_logits, atol=1e-3))
diff --git a/tests/models/vision_text_dual_encoder/test_modeling_tf_vision_text_dual_encoder.py b/tests/models/vision_text_dual_encoder/test_modeling_tf_vision_text_dual_encoder.py
deleted file mode 100644
index b24a5dfb67..0000000000
--- a/tests/models/vision_text_dual_encoder/test_modeling_tf_vision_text_dual_encoder.py
+++ /dev/null
@@ -1,419 +0,0 @@
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch VisionTextDualEncoder model."""
-
-from __future__ import annotations
-
-import collections
-import tempfile
-import unittest
-
-import numpy as np
-
-from transformers.testing_utils import require_tf, require_vision, slow
-from transformers.utils import is_tf_available, is_vision_available
-
-from ...test_modeling_tf_common import floats_tensor, ids_tensor, random_attention_mask
-from ..bert.test_modeling_tf_bert import TFBertModelTester
-from ..clip.test_modeling_tf_clip import TFCLIPVisionModelTester
-from ..deit.test_modeling_tf_deit import TFDeiTModelTester
-from ..roberta.test_modeling_tf_roberta import TFRobertaModelTester
-from ..vit.test_modeling_tf_vit import TFViTModelTester
-
-
-if is_tf_available():
-    from transformers import (
-        TFBertModel,
-        TFCLIPVisionModel,
-        TFDeiTModel,
-        TFRobertaModel,
-        TFVisionTextDualEncoderModel,
-        TFViTModel,
-        VisionTextDualEncoderConfig,
-    )
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import VisionTextDualEncoderProcessor
-
-
-# Inspired by
-# https://github.com/rwightman/pytorch-image-models/blob/b9bd960a032c75ca6b808ddeed76bee5f3ed4972/timm/models/layers/helpers.py
-# From PyTorch internals
-def to_2tuple(x):
-    if isinstance(x, collections.abc.Iterable):
-        return x
-    return (x, x)
-
-
-@require_tf
-class TFVisionTextDualEncoderMixin:
-    def get_vision_text_model(self, config, text_config):
-        pass
-
-    def prepare_config_and_inputs(self):
-        pass
-
-    def get_pretrained_model_and_inputs(self):
-        pass
-
-    def check_model_from_pretrained_configs(
-        self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs
-    ):
-        config = VisionTextDualEncoderConfig.from_vision_text_configs(vision_config, text_config)
-
-        model = TFVisionTextDualEncoderModel(config)
-
-        output = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask)
-
-        self.assertEqual(output["text_embeds"].shape, (input_ids.shape[0], config.projection_dim))
-        self.assertEqual(output["image_embeds"].shape, (pixel_values.shape[0], config.projection_dim))
-
-    def check_vision_text_dual_encoder_model(
-        self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs
-    ):
-        vision_model, text_model = self.get_vision_text_model(vision_config, text_config)
-        model = TFVisionTextDualEncoderModel(vision_model=vision_model, text_model=text_model)
-
-        output = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask)
-
-        self.assertEqual(output["text_embeds"].shape, (input_ids.shape[0], model.config.projection_dim))
-        self.assertEqual(output["image_embeds"].shape, (pixel_values.shape[0], model.config.projection_dim))
-
-    def check_vision_text_dual_encoder_from_pretrained(
-        self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs
-    ):
-        vision_model, text_model = self.get_vision_text_model(vision_config, text_config)
-        kwargs = {"vision_model": vision_model, "text_model": text_model}
-        model = TFVisionTextDualEncoderModel.from_vision_text_pretrained(**kwargs)
-
-        output = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask)
-
-        self.assertEqual(output["text_embeds"].shape, (input_ids.shape[0], model.config.projection_dim))
-        self.assertEqual(output["image_embeds"].shape, (pixel_values.shape[0], model.config.projection_dim))
-
-    def check_save_load(self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs):
-        vision_model, text_model = self.get_vision_text_model(vision_config, text_config)
-        model = TFVisionTextDualEncoderModel(vision_model=vision_model, text_model=text_model)
-
-        output = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask)
-        out_1 = output[0].numpy()
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            model.save_pretrained(tmpdirname)
-            model = TFVisionTextDualEncoderModel.from_pretrained(tmpdirname)
-
-            after_output = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask)
-            out_2 = after_output[0].numpy()
-            max_diff = np.amax(np.abs(out_2 - out_1))
-            self.assertLessEqual(max_diff, 1e-5)
-
-    def check_vision_text_output_attention(
-        self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs
-    ):
-        vision_model, text_model = self.get_vision_text_model(vision_config, text_config)
-        model = TFVisionTextDualEncoderModel(vision_model=vision_model, text_model=text_model)
-
-        output = model(
-            input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, output_attentions=True
-        )
-
-        vision_attentions = output.vision_model_output.attentions
-        self.assertEqual(len(vision_attentions), vision_config.num_hidden_layers)
-
-        # in ViT, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token)
-        image_size = to_2tuple(vision_model.config.image_size)
-        patch_size = to_2tuple(vision_model.config.patch_size)
-        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
-        seq_len = num_patches + 1
-        self.assertEqual(vision_attentions[0].shape[-3:], (vision_config.num_attention_heads, seq_len, seq_len))
-
-        text_attentions = output.text_model_output.attentions
-        self.assertEqual(len(text_attentions), text_config.num_hidden_layers)
-
-        self.assertEqual(
-            text_attentions[0].shape[-3:],
-            (text_config.num_attention_heads, input_ids.shape[-1], input_ids.shape[-1]),
-        )
-
-    def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float):
-        diff = np.abs(a - b).max()
-        self.assertLessEqual(diff, tol, f"Difference between torch and flax is {diff} (>= {tol}).")
-
-    def test_vision_text_dual_encoder_model(self):
-        inputs_dict = self.prepare_config_and_inputs()
-        self.check_vision_text_dual_encoder_model(**inputs_dict)
-
-    def test_model_from_pretrained_configs(self):
-        inputs_dict = self.prepare_config_and_inputs()
-        self.check_model_from_pretrained_configs(**inputs_dict)
-
-    def test_vision_text_dual_encoder_from_pretrained(self):
-        inputs_dict = self.prepare_config_and_inputs()
-        self.check_vision_text_dual_encoder_from_pretrained(**inputs_dict)
-
-    def test_save_load(self):
-        inputs_dict = self.prepare_config_and_inputs()
-        self.check_save_load(**inputs_dict)
-
-    def test_vision_text_output_attention(self):
-        inputs_dict = self.prepare_config_and_inputs()
-        self.check_vision_text_output_attention(**inputs_dict)
-
-    @slow
-    def test_real_model_save_load_from_pretrained(self):
-        model_2, inputs = self.get_pretrained_model_and_inputs()
-
-        outputs = model_2(**inputs)
-        out_2 = outputs[0].numpy()
-
-        with tempfile.TemporaryDirectory() as tmp_dirname:
-            model_2.save_pretrained(tmp_dirname)
-            model_1 = TFVisionTextDualEncoderModel.from_pretrained(tmp_dirname)
-
-            after_outputs = model_1(**inputs)
-            out_1 = after_outputs[0].numpy()
-            max_diff = np.amax(np.abs(out_1 - out_2))
-            self.assertLessEqual(max_diff, 1e-5)
-
-
-@require_tf
-class TFViTBertModelTest(TFVisionTextDualEncoderMixin, unittest.TestCase):
-    def get_pretrained_model_and_inputs(self):
-        model = TFVisionTextDualEncoderModel.from_vision_text_pretrained(
-            "hf-internal-testing/tiny-random-vit", "hf-internal-testing/tiny-random-bert"
-        )
-        batch_size = 13
-        pixel_values = floats_tensor(
-            [
-                batch_size,
-                model.vision_model.config.num_channels,
-                model.vision_model.config.image_size,
-                model.vision_model.config.image_size,
-            ]
-        )
-        input_ids = ids_tensor([batch_size, 4], model.text_model.config.vocab_size)
-        attention_mask = random_attention_mask([batch_size, 4])
-        inputs = {"pixel_values": pixel_values, "input_ids": input_ids, "attention_mask": attention_mask}
-
-        return model, inputs
-
-    def get_vision_text_model(self, vision_config, text_config):
-        vision_model = TFViTModel(vision_config, name="vision_model")
-        text_model = TFBertModel(text_config, name="text_model")
-        return vision_model, text_model
-
-    def prepare_config_and_inputs(self):
-        vit_model_tester = TFViTModelTester(self)
-        bert_model_tester = TFBertModelTester(self)
-        vision_config_and_inputs = vit_model_tester.prepare_config_and_inputs()
-        text_config_and_inputs = bert_model_tester.prepare_config_and_inputs()
-
-        vision_config, pixel_values, _ = vision_config_and_inputs
-
-        (
-            text_config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = text_config_and_inputs
-
-        return {
-            "text_config": text_config,
-            "vision_config": vision_config,
-            "pixel_values": pixel_values,
-            "attention_mask": input_mask,
-            "input_ids": input_ids,
-            "text_token_type_ids": token_type_ids,
-            "text_sequence_labels": sequence_labels,
-            "text_token_labels": token_labels,
-            "text_choice_labels": choice_labels,
-        }
-
-
-@require_tf
-class TFDeiTRobertaModelTest(TFVisionTextDualEncoderMixin, unittest.TestCase):
-    def get_pretrained_model_and_inputs(self):
-        # DeiT repo doesn't have TF weights, but we don't actually use the weights at all so let's
-        # just reinitialize it.
-        model = TFVisionTextDualEncoderModel.from_vision_text_pretrained(
-            "Rocketknight1/tiny-random-deit-tf", "hf-internal-testing/tiny-random-roberta"
-        )
-        batch_size = 13
-        pixel_values = floats_tensor(
-            [
-                batch_size,
-                model.vision_model.config.num_channels,
-                model.vision_model.config.image_size,
-                model.vision_model.config.image_size,
-            ]
-        )
-        input_ids = ids_tensor([batch_size, 4], model.text_model.config.vocab_size)
-        attention_mask = random_attention_mask([batch_size, 4])
-        inputs = {"pixel_values": pixel_values, "input_ids": input_ids, "attention_mask": attention_mask}
-
-        return model, inputs
-
-    def check_vision_text_output_attention(
-        self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs
-    ):
-        vision_model, text_model = self.get_vision_text_model(vision_config, text_config)
-        model = TFVisionTextDualEncoderModel(vision_model=vision_model, text_model=text_model)
-
-        output = model(
-            input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, output_attentions=True
-        )
-
-        vision_attentions = output.vision_model_output.attentions
-        self.assertEqual(len(vision_attentions), vision_config.num_hidden_layers)
-
-        # in DEiT, the seq_len equals the number of patches + 2 (we add 2 for the [CLS] and distillation tokens)
-        image_size = to_2tuple(vision_model.config.image_size)
-        patch_size = to_2tuple(vision_model.config.patch_size)
-        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
-        seq_len = num_patches + 2
-        self.assertEqual(vision_attentions[0].shape[-3:], (vision_config.num_attention_heads, seq_len, seq_len))
-
-        text_attentions = output.text_model_output.attentions
-        self.assertEqual(len(text_attentions), text_config.num_hidden_layers)
-
-        self.assertEqual(
-            text_attentions[0].shape[-3:],
-            (text_config.num_attention_heads, input_ids.shape[-1], input_ids.shape[-1]),
-        )
-
-    def get_vision_text_model(self, vision_config, text_config):
-        vision_model = TFDeiTModel(vision_config, name="vision_model")
-        text_model = TFRobertaModel(text_config, name="text_model")
-        return vision_model, text_model
-
-    def prepare_config_and_inputs(self):
-        vit_model_tester = TFDeiTModelTester(self)
-        bert_model_tester = TFRobertaModelTester(self)
-        vision_config_and_inputs = vit_model_tester.prepare_config_and_inputs()
-        text_config_and_inputs = bert_model_tester.prepare_config_and_inputs()
-
-        vision_config, pixel_values, _ = vision_config_and_inputs
-
-        (
-            text_config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = text_config_and_inputs
-
-        return {
-            "text_config": text_config,
-            "vision_config": vision_config,
-            "pixel_values": pixel_values,
-            "attention_mask": input_mask,
-            "input_ids": input_ids,
-            "text_token_type_ids": token_type_ids,
-            "text_sequence_labels": sequence_labels,
-            "text_token_labels": token_labels,
-            "text_choice_labels": choice_labels,
-        }
-
-
-@require_tf
-class TFCLIPVisionBertModelTest(TFVisionTextDualEncoderMixin, unittest.TestCase):
-    def get_pretrained_model_and_inputs(self):
-        model = TFVisionTextDualEncoderModel.from_vision_text_pretrained(
-            "Rocketknight1/tiny-random-clip-tf", "hf-internal-testing/tiny-random-bert"
-        )
-        batch_size = 13
-        pixel_values = floats_tensor(
-            [
-                batch_size,
-                model.vision_model.config.num_channels,
-                model.vision_model.config.image_size,
-                model.vision_model.config.image_size,
-            ]
-        )
-        input_ids = ids_tensor([batch_size, 4], model.text_model.config.vocab_size)
-        attention_mask = random_attention_mask([batch_size, 4])
-        inputs = {"pixel_values": pixel_values, "input_ids": input_ids, "attention_mask": attention_mask}
-
-        return model, inputs
-
-    def get_vision_text_model(self, vision_config, text_config):
-        vision_model = TFCLIPVisionModel(vision_config, name="vision_model")
-        text_model = TFBertModel(text_config, name="text_model")
-        return vision_model, text_model
-
-    def prepare_config_and_inputs(self):
-        clip_model_tester = TFCLIPVisionModelTester(self)
-        bert_model_tester = TFBertModelTester(self)
-        vision_config_and_inputs = clip_model_tester.prepare_config_and_inputs()
-        text_config_and_inputs = bert_model_tester.prepare_config_and_inputs()
-
-        vision_config, pixel_values = vision_config_and_inputs
-
-        (
-            text_config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = text_config_and_inputs
-
-        return {
-            "text_config": text_config,
-            "vision_config": vision_config,
-            "pixel_values": pixel_values,
-            "attention_mask": input_mask,
-            "input_ids": input_ids,
-            "text_token_type_ids": token_type_ids,
-            "text_sequence_labels": sequence_labels,
-            "text_token_labels": token_labels,
-            "text_choice_labels": choice_labels,
-        }
-
-
-@require_vision
-@require_tf
-class TFVisionTextDualEncoderIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference(self):
-        model = TFVisionTextDualEncoderModel.from_pretrained(
-            "clip-italian/clip-italian", logit_scale_init_value=1.0, from_pt=True
-        )
-        processor = VisionTextDualEncoderProcessor.from_pretrained("clip-italian/clip-italian")
-
-        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-        inputs = processor(
-            text=["una foto di un gatto", "una foto di un cane"], images=image, padding=True, return_tensors="np"
-        )
-
-        outputs = model(**inputs)
-
-        # verify the logits
-        self.assertEqual(outputs.logits_per_image.shape, (inputs.pixel_values.shape[0], inputs.input_ids.shape[0]))
-        self.assertEqual(
-            outputs.logits_per_text.shape,
-            (inputs.input_ids.shape[0], inputs.pixel_values.shape[0]),
-        )
-
-        expected_logits = np.array([[1.2284727, 0.3104122]])
-
-        self.assertTrue(np.allclose(outputs.logits_per_image.numpy(), expected_logits, atol=1e-3))
diff --git a/tests/models/vit/test_modeling_flax_vit.py b/tests/models/vit/test_modeling_flax_vit.py
deleted file mode 100644
index 2e35205663..0000000000
--- a/tests/models/vit/test_modeling_flax_vit.py
+++ /dev/null
@@ -1,190 +0,0 @@
-# Copyright 2021 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-import unittest
-
-import numpy as np
-
-from transformers import ViTConfig, is_flax_available
-from transformers.testing_utils import require_flax, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor
-
-
-if is_flax_available():
-    import jax
-
-    from transformers.models.vit.modeling_flax_vit import FlaxViTForImageClassification, FlaxViTModel
-
-
-class FlaxViTModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        image_size=30,
-        patch_size=2,
-        num_channels=3,
-        is_training=True,
-        use_labels=True,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        type_sequence_label_size=10,
-        initializer_range=0.02,
-        attn_implementation="eager",
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.attn_implementation = attn_implementation
-
-        # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
-        num_patches = (image_size // patch_size) ** 2
-        self.seq_length = num_patches + 1
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        config = ViTConfig(
-            image_size=self.image_size,
-            patch_size=self.patch_size,
-            num_channels=self.num_channels,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-            attn_implementation=self.attn_implementation,
-        )
-
-        return config, pixel_values
-
-    def create_and_check_model(self, config, pixel_values):
-        model = FlaxViTModel(config=config)
-        result = model(pixel_values)
-        # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
-        image_size = (self.image_size, self.image_size)
-        patch_size = (self.patch_size, self.patch_size)
-        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
-
-    def create_and_check_for_image_classification(self, config, pixel_values):
-        config.num_labels = self.type_sequence_label_size
-        model = FlaxViTForImageClassification(config=config)
-        result = model(pixel_values)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-        # test greyscale images
-        config.num_channels = 1
-        model = FlaxViTForImageClassification(config)
-
-        pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
-        result = model(pixel_values)
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            pixel_values,
-        ) = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_flax
-class FlaxViTModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (FlaxViTModel, FlaxViTForImageClassification) if is_flax_available() else ()
-
-    def setUp(self) -> None:
-        self.model_tester = FlaxViTModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=ViTConfig, has_text_modality=False, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
-    # We need to override this test because ViT's forward signature is different than text models.
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.__call__)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    # We need to override this test because ViT expects pixel_values instead of input_ids
-    def test_jit_compilation(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                model = model_class(config)
-
-                @jax.jit
-                def model_jitted(pixel_values, **kwargs):
-                    return model(pixel_values=pixel_values, **kwargs)
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = model_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = model_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("google/vit-base-patch16-224")
-            outputs = model(np.ones((1, 3, 224, 224)))
-            self.assertIsNotNone(outputs)
diff --git a/tests/models/vit/test_modeling_tf_vit.py b/tests/models/vit/test_modeling_tf_vit.py
deleted file mode 100644
index 6fa3d93d41..0000000000
--- a/tests/models/vit/test_modeling_tf_vit.py
+++ /dev/null
@@ -1,253 +0,0 @@
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TensorFlow ViT model."""
-
-from __future__ import annotations
-
-import inspect
-import unittest
-
-from transformers import ViTConfig
-from transformers.testing_utils import require_tf, require_vision, slow
-from transformers.utils import cached_property, is_tf_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import TFViTForImageClassification, TFViTModel
-    from transformers.modeling_tf_utils import keras
-
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import ViTImageProcessor
-
-
-class TFViTModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        image_size=30,
-        patch_size=2,
-        num_channels=3,
-        is_training=True,
-        use_labels=True,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        type_sequence_label_size=10,
-        initializer_range=0.02,
-        num_labels=3,
-        scope=None,
-        attn_implementation="eager",
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.scope = scope
-        self.attn_implementation = attn_implementation
-
-        # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
-        num_patches = (image_size // patch_size) ** 2
-        self.seq_length = num_patches + 1
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        labels = None
-        if self.use_labels:
-            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-
-        config = self.get_config()
-
-        return config, pixel_values, labels
-
-    def get_config(self):
-        return ViTConfig(
-            image_size=self.image_size,
-            patch_size=self.patch_size,
-            num_channels=self.num_channels,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-            attn_implementation=self.attn_implementation,
-        )
-
-    def create_and_check_model(self, config, pixel_values, labels):
-        model = TFViTModel(config=config)
-        result = model(pixel_values, training=False)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-        # Test with an image with different size than the one specified in config.
-        image_size = self.image_size // 2
-        pixel_values = pixel_values[:, :, :image_size, :image_size]
-        result = model(pixel_values, interpolate_pos_encoding=True, training=False)
-        seq_length = (image_size // self.patch_size) ** 2 + 1
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, seq_length, self.hidden_size))
-
-    def create_and_check_for_image_classification(self, config, pixel_values, labels):
-        config.num_labels = self.type_sequence_label_size
-        model = TFViTForImageClassification(config)
-        result = model(pixel_values, labels=labels, training=False)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-        # Test with an image with different size than the one specified in config.
-        image_size = self.image_size // 2
-        pixel_values = pixel_values[:, :, :image_size, :image_size]
-        result = model(pixel_values, interpolate_pos_encoding=True, training=False)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-        # test greyscale images
-        config.num_channels = 1
-        model = TFViTForImageClassification(config)
-        pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
-        result = model(pixel_values)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values, labels = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_tf
-class TFViTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    """
-    Here we also overwrite some of the tests of test_modeling_tf_common.py, as ViT does not use input_ids, inputs_embeds,
-    attention_mask and seq_length.
-    """
-
-    all_model_classes = (TFViTModel, TFViTForImageClassification) if is_tf_available() else ()
-    pipeline_model_mapping = (
-        {"feature-extraction": TFViTModel, "image-classification": TFViTForImageClassification}
-        if is_tf_available()
-        else {}
-    )
-
-    test_resize_embeddings = False
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFViTModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=ViTConfig, has_text_modality=False, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    @unittest.skip(reason="ViT does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="ViT does not use inputs_embeds")
-    def test_graph_mode_with_inputs_embeds(self):
-        pass
-
-    def test_model_common_attributes(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer))
-            x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, keras.layers.Layer))
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model = TFViTModel.from_pretrained("google/vit-base-patch16-224")
-        self.assertIsNotNone(model)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_tf
-@require_vision
-class TFViTModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return ViTImageProcessor.from_pretrained("google/vit-base-patch16-224") if is_vision_available() else None
-
-    @slow
-    def test_inference_image_classification_head(self):
-        model = TFViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="tf")
-
-        # forward pass
-        outputs = model(**inputs)
-
-        # verify the logits
-        expected_shape = tf.TensorShape((1, 1000))
-        self.assertEqual(outputs.logits.shape, expected_shape)
-
-        expected_slice = tf.constant([-0.2744, 0.8215, -0.0836])
-
-        tf.debugging.assert_near(outputs.logits[0, :3], expected_slice, atol=1e-4)
diff --git a/tests/models/vit_mae/test_modeling_tf_vit_mae.py b/tests/models/vit_mae/test_modeling_tf_vit_mae.py
deleted file mode 100644
index 3154e13106..0000000000
--- a/tests/models/vit_mae/test_modeling_tf_vit_mae.py
+++ /dev/null
@@ -1,471 +0,0 @@
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TensorFlow ViTMAE model."""
-
-from __future__ import annotations
-
-import copy
-import inspect
-import json
-import math
-import os
-import tempfile
-import unittest
-from importlib import import_module
-
-import numpy as np
-
-from transformers import ViTMAEConfig
-from transformers.file_utils import cached_property, is_tf_available, is_vision_available
-from transformers.testing_utils import require_tf, require_vision, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import TFViTMAEForPreTraining, TFViTMAEModel
-    from transformers.modeling_tf_utils import keras
-
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import ViTImageProcessor
-
-
-class TFViTMAEModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        image_size=30,
-        patch_size=2,
-        num_channels=3,
-        is_training=True,
-        use_labels=True,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        type_sequence_label_size=10,
-        initializer_range=0.02,
-        num_labels=3,
-        mask_ratio=0.6,
-        scope=None,
-        attn_implementation="eager",
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.mask_ratio = mask_ratio
-        self.scope = scope
-        self.attn_implementation = attn_implementation
-
-        # in ViTMAE, the expected sequence length = (num_patches + 1) * (1 - config.mask_ratio), rounded above
-        # (we add 1 for the [CLS] token)
-        num_patches = (image_size // patch_size) ** 2
-        self.seq_length = int(math.ceil((1 - mask_ratio) * (num_patches + 1)))
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        labels = None
-        if self.use_labels:
-            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-
-        config = self.get_config()
-
-        return config, pixel_values, labels
-
-    def get_config(self):
-        return ViTMAEConfig(
-            image_size=self.image_size,
-            patch_size=self.patch_size,
-            num_channels=self.num_channels,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            decoder_hidden_size=self.hidden_size,
-            decoder_num_hidden_layers=self.num_hidden_layers,
-            decoder_num_attention_heads=self.num_attention_heads,
-            decoder_intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-            mask_ratio=self.mask_ratio,
-            attn_implementation=self.attn_implementation,
-        )
-
-    def create_and_check_model(self, config, pixel_values, labels):
-        model = TFViTMAEModel(config=config)
-        result = model(pixel_values, training=False)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_for_pretraining(self, config, pixel_values, labels):
-        model = TFViTMAEForPreTraining(config)
-        result = model(pixel_values, training=False)
-        # expected sequence length = num_patches
-        num_patches = (self.image_size // self.patch_size) ** 2
-        expected_num_channels = self.patch_size**2 * self.num_channels
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, num_patches, expected_num_channels))
-
-        # test greyscale images
-        config.num_channels = 1
-        model = TFViTMAEForPreTraining(config)
-
-        pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
-        result = model(pixel_values, training=False)
-        expected_num_channels = self.patch_size**2
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, num_patches, expected_num_channels))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (config, pixel_values, labels) = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_tf
-class TFViTMAEModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    """
-    Here we also overwrite some of the tests of test_modeling_common.py, as ViTMAE does not use input_ids, inputs_embeds,
-    attention_mask and seq_length.
-    """
-
-    all_model_classes = (TFViTMAEModel, TFViTMAEForPreTraining) if is_tf_available() else ()
-    pipeline_model_mapping = {"feature-extraction": TFViTMAEModel} if is_tf_available() else {}
-
-    test_pruning = False
-    test_onnx = False
-    test_resize_embeddings = False
-    test_head_masking = False
-
-    def setUp(self):
-        self.model_tester = TFViTMAEModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=ViTMAEConfig, has_text_modality=False, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    @unittest.skip(reason="ViTMAE does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    def test_model_common_attributes(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer))
-            x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, keras.layers.Layer))
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_pretraining(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_pretraining(*config_and_inputs)
-
-    # overwrite from common since TFViTMAEForPretraining has random masking, we need to fix the noise
-    # to generate masks during test
-    def test_keyword_and_dict_args(self):
-        # make the mask reproducible
-        np.random.seed(2)
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        num_patches = int((config.image_size // config.patch_size) ** 2)
-        noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches))
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            inputs = self._prepare_for_class(inputs_dict, model_class)
-
-            outputs_dict = model(inputs, noise=noise)
-
-            inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
-            outputs_keywords = model(**inputs_keywords, noise=noise)
-            output_dict = outputs_dict[0].numpy()
-            output_keywords = outputs_keywords[0].numpy()
-
-            self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6)
-
-    # overwrite from common since TFViTMAEForPretraining has random masking, we need to fix the noise
-    # to generate masks during test
-    def test_numpy_arrays_inputs(self):
-        # make the mask reproducible
-        np.random.seed(2)
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        num_patches = int((config.image_size // config.patch_size) ** 2)
-        noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches))
-
-        def prepare_numpy_arrays(inputs_dict):
-            inputs_np_dict = {}
-            for k, v in inputs_dict.items():
-                if tf.is_tensor(v):
-                    inputs_np_dict[k] = v.numpy()
-                else:
-                    inputs_np_dict[k] = np.array(k)
-
-            return inputs_np_dict
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-
-            inputs = self._prepare_for_class(inputs_dict, model_class)
-            inputs_np = prepare_numpy_arrays(inputs)
-
-            output_for_dict_input = model(inputs_np, noise=noise)
-            output_for_kw_input = model(**inputs_np, noise=noise)
-            self.assert_outputs_same(output_for_dict_input, output_for_kw_input)
-
-    # overwrite from common since TFViTMAEForPretraining has random masking, we need to fix the noise
-    # to generate masks during test
-    def test_keras_save_load(self):
-        # make mask reproducible
-        np.random.seed(2)
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        tf_main_layer_classes = {
-            module_member
-            for model_class in self.all_model_classes
-            for module in (import_module(model_class.__module__),)
-            for module_member_name in dir(module)
-            if module_member_name.endswith("MainLayer")
-            # This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`.
-            and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")]
-            for module_member in (getattr(module, module_member_name),)
-            if isinstance(module_member, type)
-            and keras.layers.Layer in module_member.__bases__
-            and getattr(module_member, "_keras_serializable", False)
-        }
-
-        num_patches = int((config.image_size // config.patch_size) ** 2)
-        noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches))
-        noise = tf.convert_to_tensor(noise)
-        inputs_dict.update({"noise": noise})
-
-        for main_layer_class in tf_main_layer_classes:
-            main_layer = main_layer_class(config)
-
-            symbolic_inputs = {
-                name: keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items()
-            }
-
-            model = keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs))
-            outputs = model(inputs_dict)
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                filepath = os.path.join(tmpdirname, "keras_model.h5")
-                model.save(filepath)
-                model = keras.models.load_model(filepath, custom_objects={main_layer_class.__name__: main_layer_class})
-                assert isinstance(model, keras.Model)
-                after_outputs = model(inputs_dict)
-                self.assert_outputs_same(after_outputs, outputs)
-
-    # overwrite from common since TFViTMAEForPretraining has random masking, we need to fix the noise
-    # to generate masks during test
-    @slow
-    def test_save_load(self):
-        # make mask reproducible
-        np.random.seed(2)
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        num_patches = int((config.image_size // config.patch_size) ** 2)
-        noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches))
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            model_input = self._prepare_for_class(inputs_dict, model_class)
-            outputs = model(model_input, noise=noise)
-
-            if model_class.__name__ == "TFViTMAEModel":
-                out_2 = outputs.last_hidden_state.numpy()
-                out_2[np.isnan(out_2)] = 0
-            else:
-                out_2 = outputs.logits.numpy()
-                out_2[np.isnan(out_2)] = 0
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname, saved_model=False)
-                model = model_class.from_pretrained(tmpdirname)
-                after_outputs = model(model_input, noise=noise)
-
-                if model_class.__name__ == "TFViTMAEModel":
-                    out_1 = after_outputs["last_hidden_state"].numpy()
-                    out_1[np.isnan(out_1)] = 0
-                else:
-                    out_1 = after_outputs["logits"].numpy()
-                    out_1[np.isnan(out_1)] = 0
-
-                max_diff = np.amax(np.abs(out_1 - out_2))
-                self.assertLessEqual(max_diff, 1e-5)
-
-    # overwrite from common since TFViTMAEForPretraining has random masking, we need to fix the noise
-    # to generate masks during test
-    def test_save_load_config(self):
-        # make mask reproducible
-        np.random.seed(2)
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        num_patches = int((config.image_size // config.patch_size) ** 2)
-        noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches))
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            model_inputs = self._prepare_for_class(inputs_dict, model_class)
-
-            outputs = model(model_inputs, noise=noise)
-            model_config = model.get_config()
-            # make sure that returned config is jsonifiable, which is required by keras
-            json.dumps(model_config)
-            new_model = model_class.from_config(model.get_config())
-            # make sure it also accepts a normal config
-            _ = model_class.from_config(model.config)
-            _ = new_model(model_inputs)  # Build model
-            new_model.set_weights(model.get_weights())
-            after_outputs = new_model(model_inputs, noise=noise)
-
-            self.assert_outputs_same(after_outputs, outputs)
-
-    @unittest.skip(
-        reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load
-    to get deterministic results."""
-    )
-    def test_determinism(self):
-        pass
-
-    @unittest.skip(reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load""")
-    def test_model_outputs_equivalence(self):
-        pass
-
-    @slow
-    def test_model_from_pretrained(self):
-        model = TFViTMAEModel.from_pretrained("google/vit-base-patch16-224")
-        self.assertIsNotNone(model)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_tf
-@require_vision
-class TFViTMAEModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return ViTImageProcessor.from_pretrained("facebook/vit-mae-base")
-
-    @slow
-    def test_inference_for_pretraining(self):
-        # make random mask reproducible across the PT and TF model
-        np.random.seed(2)
-
-        model = TFViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base")
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="tf")
-
-        # prepare a noise vector that will be also used for testing the TF model
-        # (this way we can ensure that the PT and TF models operate on the same inputs)
-        vit_mae_config = ViTMAEConfig()
-        num_patches = int((vit_mae_config.image_size // vit_mae_config.patch_size) ** 2)
-        noise = np.random.uniform(size=(1, num_patches))
-
-        # forward pass
-        outputs = model(**inputs, noise=noise)
-
-        # verify the logits
-        expected_shape = tf.convert_to_tensor([1, 196, 768])
-        self.assertEqual(outputs.logits.shape, expected_shape)
-
-        expected_slice = tf.convert_to_tensor(
-            [[-0.0548, -1.7023, -0.9325], [0.3721, -0.5670, -0.2233], [0.8235, -1.3878, -0.3524]]
-        )
-
-        tf.debugging.assert_near(outputs.logits[0, :3, :3], expected_slice, atol=1e-4)
-
-    @slow
-    def test_inference_interpolate_pos_encoding(self):
-        # ViTMAE models have an `interpolate_pos_encoding` argument in their forward method,
-        # allowing to interpolate the pre-trained position embeddings in order to use
-        # the model on higher resolutions. The DINO model by Facebook AI leverages this
-        # to visualize self-attention on higher resolution images.
-
-        # make random mask reproducible across the PT and TF model
-        np.random.seed(2)
-
-        model = TFViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base")
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, do_resize=False, return_tensors="tf")
-
-        # prepare a noise vector that will be also used for testing the TF model
-        # (this way we can ensure that the PT and TF models operate on the same inputs)
-        vit_mae_config = ViTMAEConfig()
-        num_patches = (image.height // vit_mae_config.patch_size) * (image.width // vit_mae_config.patch_size)
-        noise = np.random.uniform(size=(1, num_patches))
-
-        # forward pass
-        outputs = model(**inputs, noise=noise, interpolate_pos_encoding=True)
-
-        # verify the logits
-        expected_shape = tf.convert_to_tensor([1, 1200, 768])
-        self.assertEqual(outputs.logits.shape, expected_shape)
diff --git a/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py b/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py
deleted file mode 100644
index aa55557691..0000000000
--- a/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py
+++ /dev/null
@@ -1,633 +0,0 @@
-# Copyright 2021 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-import math
-import multiprocessing
-import traceback
-import unittest
-
-import numpy as np
-from datasets import load_dataset
-
-from transformers import Wav2Vec2Config, is_flax_available
-from transformers.testing_utils import (
-    CaptureLogger,
-    is_librosa_available,
-    is_pyctcdecode_available,
-    require_flax,
-    require_librosa,
-    require_pyctcdecode,
-    require_soundfile,
-    run_test_in_subprocess,
-    slow,
-)
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor, random_attention_mask
-
-
-if is_flax_available():
-    import jax
-    import jax.numpy as jnp
-    import optax
-    from flax.traverse_util import flatten_dict
-
-    from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor
-    from transformers.models.wav2vec2.modeling_flax_wav2vec2 import (
-        FlaxWav2Vec2ForCTC,
-        FlaxWav2Vec2ForPreTraining,
-        FlaxWav2Vec2GumbelVectorQuantizer,
-        FlaxWav2Vec2Model,
-        _compute_mask_indices,
-        _sample_negative_indices,
-    )
-
-
-if is_pyctcdecode_available():
-    import pyctcdecode.decoder
-
-    from transformers import Wav2Vec2ProcessorWithLM
-    from transformers.models.wav2vec2_with_lm import processing_wav2vec2_with_lm
-
-
-if is_librosa_available():
-    import librosa
-
-
-def _test_wav2vec2_with_lm_invalid_pool(in_queue, out_queue, timeout):
-    error = None
-    try:
-        _ = in_queue.get(timeout=timeout)
-
-        ds = load_dataset("legacy-datasets/common_voice", "es", split="test", streaming=True, trust_remote_code=True)
-        sample = next(iter(ds))
-
-        resampled_audio = librosa.resample(sample["audio"]["array"], 48_000, 16_000)
-
-        model = FlaxWav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm")
-        processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm")
-
-        input_values = processor(resampled_audio, return_tensors="np").input_values
-
-        logits = model(input_values).logits
-
-        # use a spawn pool, which should trigger a warning if different than fork
-        with CaptureLogger(pyctcdecode.decoder.logger) as cl, multiprocessing.get_context("spawn").Pool(1) as pool:
-            transcription = processor.batch_decode(np.array(logits), pool).text
-
-        unittest.TestCase().assertIn("Falling back to sequential decoding.", cl.out)
-        unittest.TestCase().assertEqual(transcription[0], "bien y qué regalo vas a abrir primero")
-
-        # force batch_decode to internally create a spawn pool, which should trigger a warning if different than fork
-        multiprocessing.set_start_method("spawn", force=True)
-        with CaptureLogger(processing_wav2vec2_with_lm.logger) as cl:
-            transcription = processor.batch_decode(np.array(logits)).text
-
-        unittest.TestCase().assertIn("Falling back to sequential decoding.", cl.out)
-        unittest.TestCase().assertEqual(transcription[0], "bien y qué regalo vas a abrir primero")
-    except Exception:
-        error = f"{traceback.format_exc()}"
-
-    results = {"error": error}
-    out_queue.put(results, timeout=timeout)
-    out_queue.join()
-
-
-class FlaxWav2Vec2ModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=1024,  # speech is longer
-        is_training=False,
-        hidden_size=24,
-        feat_extract_norm="layer",
-        feat_extract_dropout=0.0,
-        feat_extract_activation="gelu",
-        conv_dim=(32, 32, 32),
-        conv_stride=(4, 4, 4),
-        conv_kernel=(8, 8, 8),
-        conv_bias=False,
-        num_conv_pos_embeddings=16,
-        num_conv_pos_embedding_groups=2,
-        num_hidden_layers=2,
-        num_attention_heads=2,
-        hidden_dropout_prob=0.1,  # this is most likely not correctly set yet
-        intermediate_size=20,
-        layer_norm_eps=1e-5,
-        hidden_act="gelu",
-        initializer_range=0.02,
-        vocab_size=32,
-        do_stable_layer_norm=True,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.hidden_size = hidden_size
-        self.feat_extract_norm = feat_extract_norm
-        self.feat_extract_dropout = feat_extract_dropout
-        self.feat_extract_activation = feat_extract_activation
-        self.conv_dim = conv_dim
-        self.conv_stride = conv_stride
-        self.conv_kernel = conv_kernel
-        self.conv_bias = conv_bias
-        self.num_conv_pos_embeddings = num_conv_pos_embeddings
-        self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.intermediate_size = intermediate_size
-        self.layer_norm_eps = layer_norm_eps
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.vocab_size = vocab_size
-        self.do_stable_layer_norm = do_stable_layer_norm
-        self.scope = scope
-
-        output_seq_length = self.seq_length
-        for kernel, stride in zip(self.conv_kernel, self.conv_stride):
-            output_seq_length = (output_seq_length - (kernel - 1)) / stride
-        self.output_seq_length = int(math.ceil(output_seq_length))
-        self.encoder_seq_length = self.output_seq_length
-
-    def prepare_config_and_inputs(self):
-        input_values = floats_tensor([self.batch_size, self.seq_length], scale=1.0)
-        attention_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        config = Wav2Vec2Config(
-            do_stable_layer_norm=self.do_stable_layer_norm,
-            hidden_size=self.hidden_size,
-            feat_extract_norm=self.feat_extract_norm,
-            feat_extract_dropout=self.feat_extract_dropout,
-            feat_extract_activation=self.feat_extract_activation,
-            conv_dim=self.conv_dim,
-            conv_stride=self.conv_stride,
-            conv_kernel=self.conv_kernel,
-            conv_bias=self.conv_bias,
-            num_conv_pos_embeddings=self.num_conv_pos_embeddings,
-            num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            intermediate_size=self.intermediate_size,
-            layer_norm_eps=self.layer_norm_eps,
-            hidden_act=self.hidden_act,
-            initializer_range=self.initializer_range,
-            vocab_size=self.vocab_size,
-        )
-
-        return config, input_values, attention_mask
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_values, attention_mask = config_and_inputs
-        inputs_dict = {"input_values": input_values, "attention_mask": attention_mask}
-        return config, inputs_dict
-
-
-@require_flax
-class FlaxWav2Vec2ModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (FlaxWav2Vec2Model, FlaxWav2Vec2ForCTC, FlaxWav2Vec2ForPreTraining) if is_flax_available() else ()
-    )
-
-    def setUp(self):
-        self.model_tester = FlaxWav2Vec2ModelTester(self)
-
-    def test_train(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        input_values = inputs_dict["input_values"]
-        attention_mask = inputs_dict["attention_mask"]
-
-        model = FlaxWav2Vec2ForPreTraining(config)
-
-        features_shape = (
-            input_values.shape[0],
-            model._get_feat_extract_output_lengths(np.array(input_values.shape[1])),
-        )
-
-        batch_size, sequence_length = features_shape[:2]
-
-        mask_prob = 0.5
-        mask_length = 4
-        mask_time_indices = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length)
-
-        dropout_rng, gumbel_rng = jax.random.split(jax.random.PRNGKey(0))
-
-        output = model(
-            input_values,
-            attention_mask=attention_mask,
-            mask_time_indices=mask_time_indices,
-            train=True,
-            dropout_rng=dropout_rng,
-            gumbel_rng=gumbel_rng,
-        )[0]
-
-        self.assertTrue(output.shape == (batch_size, sequence_length, model.config.proj_codevector_dim))
-
-    # overwrite because of `input_values`
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.__call__)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["input_values", "attention_mask"]
-            self.assertListEqual(arg_names[:2], expected_arg_names)
-
-    # overwrite because of `input_values`
-    def test_jit_compilation(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                model = model_class(config)
-
-                @jax.jit
-                def model_jitted(input_values, attention_mask=None, **kwargs):
-                    return model(input_values=input_values, attention_mask=attention_mask, **kwargs)
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = model_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = model_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    def test_freeze_feature_encoder(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        input_values = inputs_dict["input_values"]
-        attention_mask = inputs_dict["attention_mask"]
-
-        model = FlaxWav2Vec2ForPreTraining(config)
-        params = model.params
-
-        # dummy loss function
-        def compute_loss(
-            params, input_values, attention_mask, freeze_feature_encoder: bool = False, epsilon: float = 1e-8
-        ):
-            outputs = model(
-                input_values,
-                attention_mask=attention_mask,
-                freeze_feature_encoder=freeze_feature_encoder,
-                params=params,
-            )
-            # compute cosine similarity of projected and projected_quantized states
-            cosine_sim = optax.cosine_similarity(
-                outputs.projected_states, outputs.projected_quantized_states, epsilon=epsilon
-            )
-            loss = cosine_sim.sum()
-            return loss, outputs.to_tuple()
-
-        # transform the loss function to get the gradients
-        grad_fn = jax.value_and_grad(compute_loss, has_aux=True)
-
-        # compute loss, outputs and gradients for unfrozen model
-        (loss, outputs), grads = grad_fn(params, input_values, attention_mask, freeze_feature_encoder=False)
-
-        # compare to loss, outputs and gradients for frozen model
-        (loss_frozen, outputs_frozen), grads_frozen = grad_fn(
-            params, input_values, attention_mask, freeze_feature_encoder=True
-        )
-
-        # ensure that the outputs and losses remain precisely equal
-        for output, output_frozen in zip(outputs, outputs_frozen):
-            self.assertTrue((output == output_frozen).all())
-        self.assertEqual(loss, loss_frozen)
-
-        grads = flatten_dict(grads)
-        grads_frozen = flatten_dict(grads_frozen)
-
-        # ensure that the dicts of gradients contain the same keys
-        self.assertEqual(grads.keys(), grads_frozen.keys())
-
-        # ensure that the gradients of the feature extractor layers are precisely zero when frozen and contain non-zero entries when unfrozen
-        feature_extractor_grads = tuple(grads[k] for k in grads if "feature_extractor" in k)
-        feature_extractor_grads_frozen = tuple(grads_frozen[k] for k in grads_frozen if "feature_extractor" in k)
-
-        for feature_extractor_grad, feature_extractor_grad_frozen in zip(
-            feature_extractor_grads, feature_extractor_grads_frozen
-        ):
-            self.assertTrue((feature_extractor_grad_frozen == 0.0).all())
-            self.assertTrue((feature_extractor_grad > 0.0).any())
-
-        # ensure that the gradients of all unfrozen layers remain equal, i.e. all layers excluding the frozen 'feature_extractor'
-        grads = tuple(grads[k] for k in grads if "feature_extractor" not in k)
-        grads_frozen = tuple(grads_frozen[k] for k in grads_frozen if "feature_extractor" not in k)
-
-        for grad, grad_frozen in zip(grads, grads_frozen):
-            self.assertTrue((grad == grad_frozen).all())
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("facebook/wav2vec2-large-960h-lv60-self", from_pt=True)
-            outputs = model(np.ones((1, 1024), dtype="f4"))
-            self.assertIsNotNone(outputs)
-
-
-@require_flax
-class FlaxWav2Vec2UtilsTest(unittest.TestCase):
-    def test_compute_mask_indices(self):
-        batch_size = 4
-        sequence_length = 60
-        mask_prob = 0.5
-        mask_length = 1
-
-        mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length)
-
-        self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)])
-
-    def test_compute_mask_indices_overlap(self):
-        batch_size = 4
-        sequence_length = 80
-        mask_prob = 0.5
-        mask_length = 4
-
-        mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length)
-
-        # because of overlap mask don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal
-        for batch_sum in mask.sum(axis=-1):
-            self.assertTrue(int(batch_sum) <= mask_prob * sequence_length)
-
-    def test_compute_mask_indices_attn_mask_overlap(self):
-        batch_size = 4
-        sequence_length = 80
-        mask_prob = 0.5
-        mask_length = 4
-
-        attention_mask = np.ones((batch_size, sequence_length), dtype=np.int32)
-        attention_mask[:2, sequence_length // 2 :] = 0
-
-        mask = _compute_mask_indices(
-            (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask
-        )
-
-        for batch_sum in mask.sum(axis=-1):
-            self.assertTrue(int(batch_sum) <= mask_prob * sequence_length)
-
-        self.assertTrue(mask[:2, sequence_length // 2 :].sum() == 0)
-
-    def test_compute_perplexity(self):
-        probs = np.arange(100).reshape(2, 5, 10) / 100
-
-        ppl = FlaxWav2Vec2GumbelVectorQuantizer._compute_perplexity(probs)
-        self.assertTrue(abs(ppl.item() - 141.4291) < 1e-3)
-
-        # mask half of the input
-        mask = np.ones((2,), dtype=bool)
-        mask[0] = 0
-
-        ppl = FlaxWav2Vec2GumbelVectorQuantizer._compute_perplexity(probs, mask)
-        self.assertTrue(abs(ppl.item() - 58.6757) < 1e-3)
-
-    def test_sample_negatives(self):
-        batch_size = 2
-        sequence_length = 10
-        hidden_size = 4
-        num_negatives = 3
-
-        features = (np.arange(sequence_length * hidden_size) // hidden_size).reshape(
-            sequence_length, hidden_size
-        )  # each value in vector consists of same value
-        features = np.broadcast_to(features[None, :], (batch_size, sequence_length, hidden_size))
-
-        negative_indices = _sample_negative_indices(features.shape, num_negatives)
-
-        features = features.reshape(-1, hidden_size)  # BTC => (BxT)C
-        # take negative vectors from sampled indices
-        sampled_negatives = features[negative_indices.reshape(-1)]
-        negatives = sampled_negatives.reshape(batch_size, sequence_length, num_negatives, hidden_size).transpose(
-            2, 0, 1, 3
-        )
-
-        self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size))
-
-        # make sure no negatively sampled vector is actually a positive one
-        for negative in negatives:
-            self.assertTrue(((negative - features.reshape(negative.shape)) == 0).sum() == 0.0)
-
-        # make sure that full vectors are sampled and not values of vectors
-        # => this means that `unique()` yields a single value for `hidden_size` dim
-        self.assertEqual(np.unique(negatives, axis=-1).shape, (num_negatives, batch_size, sequence_length, 1))
-
-    def test_sample_negatives_with_attn_mask(self):
-        batch_size = 2
-        sequence_length = 10
-        hidden_size = 4
-        num_negatives = 3
-
-        features = (np.arange(sequence_length * hidden_size) // hidden_size).reshape(
-            sequence_length, hidden_size
-        )  # each value in vector consists of same value
-
-        # second half of last input tensor is padded
-        attention_mask = np.ones((batch_size, sequence_length), dtype=np.int8)
-        attention_mask[-1, sequence_length // 2 :] = 0
-
-        forbidden_indices = (
-            np.arange(sequence_length // 2, sequence_length, dtype=np.int32) + (batch_size - 1) * sequence_length
-        ).tolist()
-
-        features = np.broadcast_to(features[None, :], (batch_size, sequence_length, hidden_size))
-
-        negative_indices = _sample_negative_indices(features.shape, num_negatives, attention_mask=attention_mask)
-
-        # make sure that no padding tokens are sampled
-        self.assertTrue(all(idx not in negative_indices for idx in forbidden_indices))
-
-        features = features.reshape(-1, hidden_size)  # BTC => (BxT)C
-        # take negative vectors from sampled indices
-        sampled_negatives = features[negative_indices.reshape(-1)]
-        negatives = sampled_negatives.reshape(batch_size, sequence_length, num_negatives, hidden_size).transpose(
-            2, 0, 1, 3
-        )
-
-        self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size))
-
-        # make sure no negatively sampled vector is actually a positive one
-        for negative in negatives:
-            self.assertTrue(((negative - features.reshape(negative.shape)) == 0).sum() == 0.0)
-
-        # make sure that full vectors are sampled and not just slices of vectors
-        # => this means that `unique()` yields a single value for `hidden_size` dim
-        self.assertEqual(np.unique(negatives, axis=-1).shape, (num_negatives, batch_size, sequence_length, 1))
-
-
-@require_flax
-@require_soundfile
-@slow
-class FlaxWav2Vec2ModelIntegrationTest(unittest.TestCase):
-    def _load_datasamples(self, num_samples):
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        # automatic decoding with librispeech
-        speech_samples = ds.sort("id").filter(
-            lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
-        )[:num_samples]["audio"]
-
-        return [x["array"] for x in speech_samples]
-
-    def test_inference_ctc_robust_batched(self):
-        model = FlaxWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self", from_pt=True)
-        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self", do_lower_case=True)
-
-        input_speech = self._load_datasamples(4)
-
-        inputs = processor(input_speech, return_tensors="np", padding=True)
-
-        input_values = inputs.input_values
-        attention_mask = inputs.attention_mask
-
-        logits = model(input_values, attention_mask=attention_mask).logits
-
-        predicted_ids = jnp.argmax(logits, axis=-1)
-        predicted_trans = processor.batch_decode(predicted_ids)
-
-        EXPECTED_TRANSCRIPTIONS = [
-            "a man said to the universe sir i exist",
-            "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
-            "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around"
-            " him with the thousands of spectators were trivialities not worth thinking about",
-            "his instant panic was followed by a small sharp blow high on his chest",
-        ]
-        self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
-
-    def test_inference_pretrained(self):
-        model = FlaxWav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-large-lv60", from_pt=True)
-        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
-            "facebook/wav2vec2-large-lv60", return_attention_mask=True
-        )
-        input_speech = self._load_datasamples(2)
-
-        inputs_dict = feature_extractor(input_speech, return_tensors="np", padding=True)
-
-        features_shape = (
-            inputs_dict["input_values"].shape[0],
-            model._get_feat_extract_output_lengths(np.array(inputs_dict["input_values"].shape[1])),
-        )
-
-        mask_time_indices = _compute_mask_indices(
-            features_shape,
-            model.config.mask_time_prob,
-            model.config.mask_time_length,
-            min_masks=2,
-        )
-
-        outputs = model(
-            inputs_dict.input_values,
-            attention_mask=inputs_dict.attention_mask,
-            mask_time_indices=mask_time_indices,
-        )
-
-        # compute cosine similarity
-        cosine_sim = optax.cosine_similarity(
-            outputs.projected_states, outputs.projected_quantized_states, epsilon=1e-8
-        )
-
-        # retrieve cosine sim of masked features
-        cosine_sim_masked = cosine_sim[mask_time_indices]
-
-        # ... now compare to randomly initialized model
-
-        config = Wav2Vec2Config.from_pretrained("facebook/wav2vec2-large-lv60")
-        model_rand = FlaxWav2Vec2ForPreTraining(config)
-
-        outputs_rand = model_rand(
-            inputs_dict.input_values,
-            attention_mask=inputs_dict.attention_mask,
-            mask_time_indices=mask_time_indices,
-        )
-
-        # compute cosine similarity
-        cosine_sim_rand = optax.cosine_similarity(
-            outputs_rand.projected_states, outputs_rand.projected_quantized_states
-        )
-
-        # retrieve cosine sim of masked features
-        cosine_sim_masked_rand = cosine_sim_rand[mask_time_indices]
-
-        # a pretrained wav2vec2 model has learned to predict the quantized latent states
-        # => the cosine similarity between quantized states and predicted states > 0.5
-        # a random wav2vec2 model has not learned to predict the quantized latent states
-        # => the cosine similarity between quantized states and predicted states is very likely < 0.1
-        self.assertTrue(cosine_sim_masked.mean().item() - 5 * cosine_sim_masked_rand.mean().item() > 0)
-
-    @require_pyctcdecode
-    @require_librosa
-    def test_wav2vec2_with_lm(self):
-        ds = load_dataset("legacy-datasets/common_voice", "es", split="test", streaming=True, trust_remote_code=True)
-        sample = next(iter(ds))
-
-        resampled_audio = librosa.resample(sample["audio"]["array"], 48_000, 16_000)
-
-        model = FlaxWav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm")
-        processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm")
-
-        input_values = processor(resampled_audio, return_tensors="np").input_values
-
-        logits = model(input_values).logits
-
-        transcription = processor.batch_decode(np.array(logits)).text
-
-        self.assertEqual(transcription[0], "bien y qué regalo vas a abrir primero")
-
-    @require_pyctcdecode
-    @require_librosa
-    def test_wav2vec2_with_lm_pool(self):
-        ds = load_dataset("legacy-datasets/common_voice", "es", split="test", streaming=True, trust_remote_code=True)
-        sample = next(iter(ds))
-
-        resampled_audio = librosa.resample(sample["audio"]["array"], 48_000, 16_000)
-
-        model = FlaxWav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm")
-        processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm")
-
-        input_values = processor(resampled_audio, return_tensors="np").input_values
-
-        logits = model(input_values).logits
-
-        # test user-managed pool
-        with multiprocessing.get_context("fork").Pool(2) as pool:
-            transcription = processor.batch_decode(np.array(logits), pool).text
-
-        self.assertEqual(transcription[0], "bien y qué regalo vas a abrir primero")
-
-        # user-managed pool + num_processes should trigger a warning
-        with (
-            CaptureLogger(processing_wav2vec2_with_lm.logger) as cl,
-            multiprocessing.get_context("fork").Pool(2) as pool,
-        ):
-            transcription = processor.batch_decode(np.array(logits), pool, num_processes=2).text
-
-        self.assertIn("num_process", cl.out)
-        self.assertIn("it will be ignored", cl.out)
-
-        self.assertEqual(transcription[0], "bien y qué regalo vas a abrir primero")
-
-    @require_pyctcdecode
-    @require_librosa
-    def test_wav2vec2_with_lm_invalid_pool(self):
-        run_test_in_subprocess(test_case=self, target_func=_test_wav2vec2_with_lm_invalid_pool, inputs=None)
diff --git a/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py b/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py
deleted file mode 100644
index b6585baa11..0000000000
--- a/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py
+++ /dev/null
@@ -1,817 +0,0 @@
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import copy
-import gc
-import glob
-import inspect
-import math
-import multiprocessing
-import traceback
-import unittest
-
-import numpy as np
-import pytest
-from datasets import load_dataset
-from huggingface_hub import snapshot_download
-
-from transformers import Wav2Vec2Config, is_tf_available
-from transformers.testing_utils import (
-    CaptureLogger,
-    is_flaky,
-    require_librosa,
-    require_pyctcdecode,
-    require_tf,
-    run_test_in_subprocess,
-    slow,
-)
-from transformers.utils import is_librosa_available, is_pyctcdecode_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import (
-        AutoFeatureExtractor,
-        TFWav2Vec2ForCTC,
-        TFWav2Vec2ForSequenceClassification,
-        TFWav2Vec2Model,
-        Wav2Vec2Processor,
-    )
-    from transformers.models.wav2vec2.modeling_tf_wav2vec2 import _compute_mask_indices
-
-
-if is_pyctcdecode_available():
-    import pyctcdecode.decoder
-
-    from transformers import Wav2Vec2ProcessorWithLM
-    from transformers.models.wav2vec2_with_lm import processing_wav2vec2_with_lm
-
-
-if is_librosa_available():
-    import librosa
-
-
-def _test_wav2vec2_with_lm_invalid_pool(in_queue, out_queue, timeout):
-    error = None
-    try:
-        _ = in_queue.get(timeout=timeout)
-
-        downloaded_folder = snapshot_download("patrickvonplaten/common_voice_es_sample")
-        file_path = glob.glob(downloaded_folder + "/*")[0]
-        sample = librosa.load(file_path, sr=16_000)[0]
-
-        model = TFWav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm")
-        processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm")
-
-        input_values = processor(sample, return_tensors="tf").input_values
-
-        logits = model(input_values).logits
-
-        # use a spawn pool, which should trigger a warning if different than fork
-        with CaptureLogger(pyctcdecode.decoder.logger) as cl, multiprocessing.get_context("spawn").Pool(1) as pool:
-            transcription = processor.batch_decode(logits.numpy(), pool).text
-
-        unittest.TestCase().assertIn("Falling back to sequential decoding.", cl.out)
-        unittest.TestCase().assertEqual(transcription[0], "el libro ha sido escrito por cervantes")
-
-        # force batch_decode to internally create a spawn pool, which should trigger a warning if different than fork
-        multiprocessing.set_start_method("spawn", force=True)
-        with CaptureLogger(processing_wav2vec2_with_lm.logger) as cl:
-            transcription = processor.batch_decode(logits.numpy()).text
-
-        unittest.TestCase().assertIn("Falling back to sequential decoding.", cl.out)
-        unittest.TestCase().assertEqual(transcription[0], "el libro ha sido escrito por cervantes")
-    except Exception:
-        error = f"{traceback.format_exc()}"
-
-    results = {"error": error}
-    out_queue.put(results, timeout=timeout)
-    out_queue.join()
-
-
-@require_tf
-class TFWav2Vec2ModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=3,
-        seq_length=1024,
-        is_training=False,
-        hidden_size=16,
-        feat_extract_norm="group",
-        feat_extract_dropout=0.0,
-        feat_extract_activation="gelu",
-        conv_dim=(32, 32, 32),
-        conv_stride=(4, 4, 4),
-        conv_kernel=(8, 8, 8),
-        conv_bias=False,
-        num_conv_pos_embeddings=16,
-        num_conv_pos_embedding_groups=2,
-        num_hidden_layers=2,
-        num_attention_heads=2,
-        hidden_dropout_prob=0.1,  # this is most likely not correctly set yet
-        intermediate_size=20,
-        layer_norm_eps=1e-5,
-        hidden_act="gelu",
-        initializer_range=0.02,
-        vocab_size=32,
-        do_stable_layer_norm=False,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.hidden_size = hidden_size
-        self.feat_extract_norm = feat_extract_norm
-        self.feat_extract_dropout = feat_extract_dropout
-        self.feat_extract_activation = feat_extract_activation
-        self.conv_dim = conv_dim
-        self.conv_stride = conv_stride
-        self.conv_kernel = conv_kernel
-        self.conv_bias = conv_bias
-        self.num_conv_pos_embeddings = num_conv_pos_embeddings
-        self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.intermediate_size = intermediate_size
-        self.layer_norm_eps = layer_norm_eps
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.vocab_size = vocab_size
-        self.do_stable_layer_norm = do_stable_layer_norm
-        self.scope = scope
-
-        output_seq_length = self.seq_length
-        for kernel, stride in zip(self.conv_kernel, self.conv_stride):
-            output_seq_length = (output_seq_length - (kernel - 1)) / stride
-        self.output_seq_length = int(math.ceil(output_seq_length))
-        self.encoder_seq_length = self.output_seq_length
-
-    def prepare_config_and_inputs(self):
-        input_values = tf.cast(ids_tensor([self.batch_size, self.seq_length], 32768), tf.float32) / 32768.0
-        attention_mask = tf.ones_like(input_values)
-
-        config = Wav2Vec2Config(
-            hidden_size=self.hidden_size,
-            feat_extract_norm=self.feat_extract_norm,
-            feat_extract_dropout=self.feat_extract_dropout,
-            feat_extract_activation=self.feat_extract_activation,
-            conv_dim=self.conv_dim,
-            conv_stride=self.conv_stride,
-            conv_kernel=self.conv_kernel,
-            conv_bias=self.conv_bias,
-            num_conv_pos_embeddings=self.num_conv_pos_embeddings,
-            num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            intermediate_size=self.intermediate_size,
-            layer_norm_eps=self.layer_norm_eps,
-            hidden_act=self.hidden_act,
-            initializer_range=self.initializer_range,
-            vocab_size=self.vocab_size,
-            do_stable_layer_norm=self.do_stable_layer_norm,
-        )
-
-        return config, input_values, attention_mask
-
-    def create_and_check_model(self, config, input_values, attention_mask):
-        model = TFWav2Vec2Model(config)
-        result = model(input_values, attention_mask=attention_mask)
-        self.parent.assertEqual(
-            result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size)
-        )
-
-    def create_and_check_batch_inference(self, config, input_values, *args):
-        # test does not pass for models making use of `group_norm`
-        # check: https://github.com/pytorch/fairseq/issues/3227
-        config.layerdrop = 0.0
-        model = TFWav2Vec2Model(config)
-
-        input_values = input_values[:3]
-        attention_mask = tf.ones_like(input_values)
-
-        input_lengths = tf.constant([input_values.shape[-1] // i for i in [4, 2, 1]])
-        length_mask = tf.sequence_mask(input_lengths, dtype=tf.float32)
-
-        # convert values that are over input_lengths to padding
-        input_values = input_values * length_mask
-        attention_mask = attention_mask * length_mask
-
-        batch_outputs = model(input_values, attention_mask=attention_mask, training=False).last_hidden_state
-
-        for i in range(input_values.shape[0]):
-            input_slice = input_values[i : i + 1, : input_lengths[i]]
-            output = model(input_slice, training=False).last_hidden_state
-
-            batch_output = batch_outputs[i : i + 1, : output.shape[1]]
-            self.parent.assertTrue(np.allclose(output, batch_output, atol=1e-3))
-
-    def check_ctc_loss(self, config, input_values, *args):
-        model = TFWav2Vec2ForCTC(config)
-
-        input_values = input_values[:3]
-        attention_mask = tf.ones_like(input_values)
-
-        input_lengths = tf.constant([input_values.shape[-1] // i for i in [4, 2, 1]])
-        max_length_labels = model.wav2vec2._get_feat_extract_output_lengths(input_lengths)
-        labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size)
-
-        length_mask = tf.sequence_mask(input_lengths, dtype=tf.float32)
-
-        # convert values that are over input_lengths to padding
-        input_values = input_values * length_mask
-        attention_mask = attention_mask * length_mask
-
-        model.config.ctc_loss_reduction = "sum"
-        sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss
-
-        model.config.ctc_loss_reduction = "mean"
-        mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss
-
-        self.parent.assertTrue(abs(labels.shape[0] * mean_loss - sum_loss) < 1e-2)
-
-    def check_seq_classifier_loss(self, loss, config, input_values, *args):
-        model = TFWav2Vec2ForSequenceClassification(config)
-
-        input_values = input_values[:3]
-        attention_mask = tf.ones(input_values.shape, dtype=tf.int32)
-
-        input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
-        labels = tf.random.uniform((input_values.shape[0],), maxval=len(model.config.id2label), dtype=tf.int32)
-
-        # pad input
-        for i in range(len(input_lengths)):
-            input_values[i, input_lengths[i] :] = 0.0
-            attention_mask[i, input_lengths[i] :] = 0
-        training = False
-        masked_loss = (
-            model(input_values, attention_mask=attention_mask, labels=labels, training=training).loss.numpy().item()
-        )
-        unmasked_loss = model(input_values, labels=labels, training=training).loss.numpy().item()
-
-        assert isinstance(masked_loss, float)
-        assert isinstance(unmasked_loss, float)
-        assert masked_loss != unmasked_loss
-
-    def check_training(self, config, input_values, *args):
-        model = TFWav2Vec2ForCTC(config)
-
-        # freeze feature encoder
-        model.freeze_feature_encoder()
-
-        input_values = input_values[:3]
-
-        input_lengths = tf.constant([input_values.shape[-1] // i for i in [4, 2, 1]])
-        max_length_labels = model.wav2vec2._get_feat_extract_output_lengths(input_lengths)
-        labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size)
-
-        length_mask = tf.sequence_mask(input_lengths, dtype=tf.float32)
-
-        input_values = input_values * length_mask
-
-        pad_size = max(max_length_labels) - labels.shape[1]
-        labels = tf.pad(labels, ((0, 0), (0, pad_size)), constant_values=-100)
-
-        loss = model(input_values, labels=labels, training=True).loss
-
-        self.parent.assertFalse(tf.math.is_inf(loss))
-
-    def check_labels_out_of_vocab(self, config, input_values, *args):
-        model = TFWav2Vec2ForCTC(config)
-        input_lengths = tf.constant([input_values.shape[-1] // i for i in [4, 2, 1]])
-        max_length_labels = model.wav2vec2._get_feat_extract_output_lengths(input_lengths)
-        labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size + 500)
-        with pytest.raises(ValueError):
-            model(input_values, labels=labels)
-
-    def prepare_config_and_inputs_for_common(self):
-        config, input_values, attention_mask = self.prepare_config_and_inputs()
-        inputs_dict = {"input_values": input_values, "attention_mask": attention_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class TFWav2Vec2ModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (TFWav2Vec2Model, TFWav2Vec2ForCTC, TFWav2Vec2ForSequenceClassification) if is_tf_available() else ()
-    )
-    pipeline_model_mapping = (
-        {"audio-classification": TFWav2Vec2ForSequenceClassification, "feature-extraction": TFWav2Vec2Model}
-        if is_tf_available()
-        else {}
-    )
-    test_resize_embeddings = False
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFWav2Vec2ModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=Wav2Vec2Config, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    # overwrite because input_values != input_ids
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["input_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    # overwrite because input_values != input_ids
-    def test_keyword_and_dict_args(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            inputs = self._prepare_for_class(inputs_dict, model_class)
-
-            outputs_dict = model(inputs)
-
-            inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
-            input_values = inputs_keywords.pop("input_values", None)
-            outputs_keywords = model(input_values, **inputs_keywords)
-            output_dict = outputs_dict[0].numpy()
-            output_keywords = outputs_keywords[0].numpy()
-
-            self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_hidden_states_output(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        def check_hidden_states_output(config, inputs_dict, model_class):
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-            expected_num_layers = getattr(
-                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-            )
-
-            hidden_states = outputs.hidden_states
-            self.assertEqual(config.output_attentions, False)
-            self.assertEqual(len(hidden_states), expected_num_layers)
-            self.assertListEqual(
-                list(hidden_states[0].shape[-2:]),
-                [self.model_tester.output_seq_length, self.model_tester.hidden_size],
-            )
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(config, inputs_dict, model_class)
-
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-            check_hidden_states_output(config, inputs_dict, model_class)
-
-    def test_ctc_loss_inference(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_ctc_loss(*config_and_inputs)
-
-    @is_flaky()
-    def test_labels_out_of_vocab(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_labels_out_of_vocab(*config_and_inputs)
-
-    def test_train(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_training(*config_and_inputs)
-
-    @unittest.skip(reason="Wav2Vec2 has no input embeddings")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="Wav2Vec2 has no tokens embeddings")
-    def test_resize_tokens_embeddings(self):
-        pass
-
-    @unittest.skip(reason="Wav2Vec2 has no input embeddings")
-    def test_model_common_attributes(self):
-        pass
-
-    @slow
-    def test_model_from_pretrained(self):
-        model = TFWav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
-        self.assertIsNotNone(model)
-
-    @unittest.skip(reason="Fix me! Wav2Vec2 hits OOM errors when loss is computed on full batch")
-    def test_dataset_conversion(self):
-        # TODO: (Amy) - check whether skipping CTC model resolves this issue and possible resolutions for CTC
-        pass
-
-    @unittest.skip(reason="Fix me! Wav2Vec2 hits OOM errors when loss is computed on full batch")
-    def test_keras_fit(self):
-        # TODO: (Amy) - check whether skipping CTC model resolves this issue and possible resolutions for CTC
-        pass
-
-
-@require_tf
-class TFWav2Vec2RobustModelTest(TFModelTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (TFWav2Vec2Model, TFWav2Vec2ForCTC, TFWav2Vec2ForSequenceClassification) if is_tf_available() else ()
-    )
-    test_resize_embeddings = False
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFWav2Vec2ModelTester(
-            self,
-            conv_stride=(3, 3, 3),
-            feat_extract_norm="layer",
-            do_stable_layer_norm=True,
-            scope="robust",
-        )
-        self.config_tester = ConfigTester(self, config_class=Wav2Vec2Config, hidden_size=37)
-
-    # overwrite because input_values != input_ids
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["input_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    # overwrite because input_values != input_ids
-    def test_keyword_and_dict_args(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            inputs = self._prepare_for_class(inputs_dict, model_class)
-
-            outputs_dict = model(inputs)
-
-            inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
-            input_values = inputs_keywords.pop("input_values", None)
-            outputs_keywords = model(input_values, **inputs_keywords)
-            output_dict = outputs_dict[0].numpy()
-            output_keywords = outputs_keywords[0].numpy()
-
-            self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_hidden_states_output(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        def check_hidden_states_output(config, inputs_dict, model_class):
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-            expected_num_layers = getattr(
-                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-            )
-
-            hidden_states = outputs.hidden_states
-            self.assertEqual(config.output_attentions, False)
-            self.assertEqual(len(hidden_states), expected_num_layers)
-            self.assertListEqual(
-                list(hidden_states[0].shape[-2:]),
-                [self.model_tester.output_seq_length, self.model_tester.hidden_size],
-            )
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(config, inputs_dict, model_class)
-
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-            check_hidden_states_output(config, inputs_dict, model_class)
-
-    def test_batched_inference(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_batch_inference(*config_and_inputs)
-
-    def test_ctc_loss_inference(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_ctc_loss(*config_and_inputs)
-
-    # TODO (Joao): fix me
-    @unittest.skip("Broke with TF 2.10")
-    def test_labels_out_of_vocab(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_labels_out_of_vocab(*config_and_inputs)
-
-    def test_train(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_training(*config_and_inputs)
-
-    @unittest.skip(reason="Wav2Vec2 has no input embeddings")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="Wav2Vec2 has no tokens embeddings")
-    def test_resize_tokens_embeddings(self):
-        pass
-
-    @unittest.skip(reason="Wav2Vec2 has no input embeddings")
-    def test_model_common_attributes(self):
-        pass
-
-    @slow
-    def test_model_from_pretrained(self):
-        model = TFWav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
-        self.assertIsNotNone(model)
-
-    @unittest.skip(reason="Fix me! Wav2Vec2 hits OOM errors when loss is computed on full batch")
-    def test_dataset_conversion(self):
-        # TODO: (Amy) - check whether skipping CTC model resolves this issue and possible resolutions for CTC
-        pass
-
-    @unittest.skip(reason="Fix me! Wav2Vec2 hits OOM errors when loss is computed on full batch")
-    def test_keras_fit(self):
-        # TODO: (Amy) - check whether skipping CTC model resolves this issue and possible resolutions for CTC
-        pass
-
-
-@require_tf
-class TFWav2Vec2UtilsTest(unittest.TestCase):
-    def test_compute_mask_indices(self):
-        batch_size = 4
-        sequence_length = 60
-        mask_prob = 0.5
-        mask_length = 1
-
-        mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length)
-
-        self.assertListEqual(
-            tf.reduce_sum(mask, -1).numpy().tolist(), [mask_prob * sequence_length for _ in range(batch_size)]
-        )
-
-    def test_compute_mask_indices_overlap(self):
-        batch_size = 4
-        sequence_length = 80
-        mask_prob = 0.5
-        mask_length = 4
-
-        mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length)
-
-        # because of overlap mask don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal
-        for batch_sum in tf.reduce_sum(mask, -1):
-            self.assertTrue(int(batch_sum) <= mask_prob * sequence_length)
-
-
-@require_tf
-@slow
-class TFWav2Vec2ModelIntegrationTest(unittest.TestCase):
-    def tearDown(self):
-        super().tearDown()
-        # clean-up as much as possible GPU memory occupied by PyTorch
-        gc.collect()
-
-    def _load_datasamples(self, num_samples):
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        # automatic decoding with librispeech
-        speech_samples = ds.sort("id").filter(
-            lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
-        )[:num_samples]["audio"]
-
-        return [x["array"] for x in speech_samples]
-
-    def _load_superb(self, task, num_samples):
-        ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
-
-        return ds[:num_samples]
-
-    def test_inference_ctc_normal(self):
-        model = TFWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
-        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h", do_lower_case=True)
-        input_speech = self._load_datasamples(1)
-
-        input_values = processor(input_speech, return_tensors="tf", sampling_rate=16000).input_values
-
-        logits = model(input_values).logits
-
-        predicted_ids = tf.argmax(logits, axis=-1)
-        predicted_trans = processor.batch_decode(predicted_ids)
-
-        EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"]
-        self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
-
-    def test_inference_ctc_normal_batched(self):
-        model = TFWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
-        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h", do_lower_case=True)
-
-        input_speech = self._load_datasamples(2)
-
-        input_values = processor(input_speech, return_tensors="tf", padding=True, sampling_rate=16000).input_values
-
-        logits = model(input_values).logits
-
-        predicted_ids = tf.argmax(logits, axis=-1)
-        predicted_trans = processor.batch_decode(predicted_ids)
-
-        EXPECTED_TRANSCRIPTIONS = [
-            "a man said to the universe sir i exist",
-            "sweat covered brion's body trickling into the tight lowing cloth that was the only garment he wore",
-        ]
-        self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
-
-    def test_inference_ctc_robust_batched(self):
-        model = TFWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
-        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self", do_lower_case=True)
-
-        input_speech = self._load_datasamples(4)
-
-        inputs = processor(input_speech, return_tensors="tf", padding=True, sampling_rate=16000)
-
-        input_values = inputs.input_values
-        attention_mask = inputs.attention_mask
-
-        logits = model(input_values, attention_mask=attention_mask).logits
-
-        predicted_ids = tf.argmax(logits, axis=-1)
-        predicted_trans = processor.batch_decode(predicted_ids)
-
-        EXPECTED_TRANSCRIPTIONS = [
-            "a man said to the universe sir i exist",
-            "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
-            "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around"
-            " him with the thousands of spectators were trivialities not worth thinking about",
-            "his instant panic was followed by a small sharp blow high on his chest",
-        ]
-        self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
-
-    @require_pyctcdecode
-    @require_librosa
-    def test_wav2vec2_with_lm(self):
-        downloaded_folder = snapshot_download("patrickvonplaten/common_voice_es_sample")
-        file_path = glob.glob(downloaded_folder + "/*")[0]
-        sample = librosa.load(file_path, sr=16_000)[0]
-
-        model = TFWav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm")
-        processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm")
-
-        input_values = processor(sample, return_tensors="tf").input_values
-
-        logits = model(input_values).logits
-
-        transcription = processor.batch_decode(logits.numpy()).text
-
-        self.assertEqual(transcription[0], "el libro ha sido escrito por cervantes")
-
-    @require_pyctcdecode
-    @require_librosa
-    def test_wav2vec2_with_lm_pool(self):
-        downloaded_folder = snapshot_download("patrickvonplaten/common_voice_es_sample")
-        file_path = glob.glob(downloaded_folder + "/*")[0]
-        sample = librosa.load(file_path, sr=16_000)[0]
-
-        model = TFWav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm")
-        processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm")
-
-        input_values = processor(sample, return_tensors="tf").input_values
-
-        logits = model(input_values).logits
-
-        # test user-managed pool
-        with multiprocessing.get_context("fork").Pool(2) as pool:
-            transcription = processor.batch_decode(logits.numpy(), pool).text
-
-        self.assertEqual(transcription[0], "el libro ha sido escrito por cervantes")
-
-        # user-managed pool + num_processes should trigger a warning
-        with (
-            CaptureLogger(processing_wav2vec2_with_lm.logger) as cl,
-            multiprocessing.get_context("fork").Pool(2) as pool,
-        ):
-            transcription = processor.batch_decode(logits.numpy(), pool, num_processes=2).text
-
-        self.assertIn("num_process", cl.out)
-        self.assertIn("it will be ignored", cl.out)
-
-        self.assertEqual(transcription[0], "el libro ha sido escrito por cervantes")
-
-    @require_pyctcdecode
-    @require_librosa
-    def test_wav2vec2_with_lm_invalid_pool(self):
-        run_test_in_subprocess(test_case=self, target_func=_test_wav2vec2_with_lm_invalid_pool, inputs=None)
-
-    def test_inference_keyword_spotting(self):
-        model = TFWav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-ks", from_pt=True)
-        processor = AutoFeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ks")
-        input_data = self._load_superb("ks", 4)
-        inputs = processor(input_data["speech"], return_tensors="tf", padding=True)
-        input_values = inputs.input_values
-        attention_mask = inputs.attention_mask
-        outputs = model(input_values, attention_mask)
-        predicted_logits, predicted_ids = (
-            tf.math.reduce_max(outputs.logits, axis=-1),
-            tf.argmax(outputs.logits, axis=-1),
-        )
-        expected_labels = [7, 6, 10, 9]
-        expected_logits = tf.convert_to_tensor([6.1186, 11.8961, 10.2931, 6.0898])
-        self.assertListEqual(predicted_ids.numpy().tolist(), expected_labels)
-        self.assertTrue(np.allclose(predicted_logits, expected_logits, atol=1e-2))
-
-    def test_inference_intent_classification(self):
-        model = TFWav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-ic", from_pt=True)
-        processor = AutoFeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic")
-        input_data = self._load_superb("ic", 4)
-        inputs = processor(input_data["speech"], return_tensors="tf", padding=True)
-        input_values = inputs.input_values
-        attention_mask = inputs.attention_mask
-        outputs = model(input_values, attention_mask=attention_mask)
-        predicted_logits_action, predicted_ids_action = (
-            tf.math.reduce_max(outputs.logits[:, :6], axis=-1),
-            tf.argmax(outputs.logits[:, :6], axis=-1),
-        )
-        predicted_logits_object, predicted_ids_object = (
-            tf.math.reduce_max(outputs.logits[:, 6:20], axis=-1),
-            tf.argmax(outputs.logits[:, 6:20], axis=-1),
-        )
-        predicted_logits_location, predicted_ids_location = (
-            tf.math.reduce_max(outputs.logits[:, 20:24], axis=-1),
-            tf.argmax(outputs.logits[:, 20:24], axis=-1),
-        )
-        expected_labels_action = [0, 0, 2, 3]
-        expected_logits_action = tf.convert_to_tensor([0.4568, 11.0848, 1.6621, 9.3841])
-        expected_labels_object = [3, 10, 3, 4]
-        expected_logits_object = tf.convert_to_tensor([1.5322, 10.7094, 5.2469, 22.1318])
-        expected_labels_location = [0, 0, 0, 1]
-        expected_logits_location = tf.convert_to_tensor([1.5335, 6.5096, 10.5704, 11.0569])
-
-        self.assertListEqual(predicted_ids_action.numpy().tolist(), expected_labels_action)
-        self.assertListEqual(predicted_ids_object.numpy().tolist(), expected_labels_object)
-        self.assertListEqual(predicted_ids_location.numpy().tolist(), expected_labels_location)
-
-        self.assertTrue(np.allclose(predicted_logits_action, expected_logits_action, atol=1e-2))
-        self.assertTrue(np.allclose(predicted_logits_object, expected_logits_object, atol=1e-2))
-        self.assertTrue(np.allclose(predicted_logits_location, expected_logits_location, atol=1e-2))
-
-    def test_inference_speaker_identification(self):
-        model = TFWav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-sid", from_pt=True)
-        processor = AutoFeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-sid")
-        input_data = self._load_superb("si", 4)
-        output_logits = []
-        for example in input_data["speech"]:
-            input = processor(example, return_tensors="tf", padding=True)
-            output = model(input.input_values, attention_mask=None)
-            output_logits.append(output.logits[0])
-        output_logits = tf.stack(output_logits)
-        predicted_logits, predicted_ids = tf.math.reduce_max(output_logits, axis=-1), tf.argmax(output_logits, axis=-1)
-        expected_labels = [251, 1, 1, 3]
-        expected_logits = tf.convert_to_tensor([37.5627, 71.6362, 64.2419, 31.7778])
-        self.assertListEqual(predicted_ids.numpy().tolist(), expected_labels)
-        self.assertTrue(np.allclose(predicted_logits, expected_logits, atol=1e-2))
-
-    def test_inference_emotion_recognition(self):
-        model = TFWav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-er", from_pt=True)
-        processor = AutoFeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-er")
-        input_data = self._load_superb("er", 4)
-        inputs = processor(input_data["speech"], return_tensors="tf", padding=True)
-
-        input_values = inputs.input_values
-        attention_mask = inputs.attention_mask
-        outputs = model(input_values, attention_mask=attention_mask)
-        predicted_logits, predicted_ids = (
-            tf.math.reduce_max(outputs.logits, axis=-1),
-            tf.argmax(outputs.logits, axis=-1),
-        )
-
-        expected_labels = [1, 1, 2, 2]
-        # s3prl logits for the same batch
-        expected_logits = tf.convert_to_tensor([2.1722, 3.0779, 8.0287, 6.6797])
-
-        self.assertListEqual(predicted_ids.numpy().tolist(), expected_labels)
-        self.assertTrue(np.allclose(predicted_logits, expected_logits, atol=1e-2))
diff --git a/tests/models/whisper/test_modeling_flax_whisper.py b/tests/models/whisper/test_modeling_flax_whisper.py
deleted file mode 100644
index ee583c8719..0000000000
--- a/tests/models/whisper/test_modeling_flax_whisper.py
+++ /dev/null
@@ -1,805 +0,0 @@
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import functools
-import inspect
-import tempfile
-import unittest
-
-from transformers import WhisperConfig, is_flax_available
-from transformers.testing_utils import require_flax, slow
-from transformers.utils import cached_property
-from transformers.utils.import_utils import is_datasets_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor
-
-
-if is_datasets_available():
-    import datasets
-    from datasets import load_dataset
-
-if is_flax_available():
-    import jax
-    import numpy as np
-    from flax.core.frozen_dict import unfreeze
-    from flax.traverse_util import flatten_dict
-
-    from transformers import (
-        FLAX_MODEL_MAPPING,
-        FlaxWhisperForAudioClassification,
-        FlaxWhisperForConditionalGeneration,
-        FlaxWhisperModel,
-        WhisperFeatureExtractor,
-        WhisperProcessor,
-    )
-    from transformers.models.whisper.modeling_flax_whisper import sinusoidal_embedding_init
-
-
-@require_flax
-class FlaxWhisperModelTester:
-    config_cls = WhisperConfig
-    config_updates = {}
-    hidden_act = "gelu"
-
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=60,
-        is_training=True,
-        use_labels=False,
-        vocab_size=99,
-        d_model=16,
-        decoder_attention_heads=4,
-        decoder_ffn_dim=16,
-        decoder_layers=2,
-        encoder_attention_heads=4,
-        encoder_ffn_dim=16,
-        encoder_layers=2,
-        input_channels=1,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=70,
-        max_source_positions=30,
-        max_target_positions=40,
-        bos_token_id=98,
-        eos_token_id=98,
-        pad_token_id=0,
-        num_mel_bins=80,
-        decoder_start_token_id=85,
-        num_conv_layers=1,
-        suppress_tokens=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.d_model = d_model
-        self.hidden_size = d_model
-        self.num_hidden_layers = encoder_layers
-        self.num_attention_heads = encoder_attention_heads
-        self.decoder_attention_heads = decoder_attention_heads
-        self.decoder_ffn_dim = decoder_ffn_dim
-        self.decoder_layers = decoder_layers
-        self.encoder_attention_heads = encoder_attention_heads
-        self.encoder_ffn_dim = encoder_ffn_dim
-        self.encoder_layers = encoder_layers
-        self.encoder_seq_length = seq_length // 2
-        self.decoder_seq_length = 1
-        self.input_channels = input_channels
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.num_mel_bins = num_mel_bins
-        self.max_position_embeddings = max_position_embeddings
-        self.max_source_positions = max_source_positions
-        self.max_target_positions = max_target_positions
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.decoder_start_token_id = decoder_start_token_id
-        self.num_conv_layers = num_conv_layers
-        self.suppress_tokens = suppress_tokens
-
-    def prepare_config_and_inputs_for_common(self):
-        input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length], self.vocab_size)
-
-        decoder_input_ids = np.array(self.batch_size * [[self.decoder_start_token_id]])
-
-        config = WhisperConfig(
-            vocab_size=self.vocab_size,
-            num_mel_bins=self.num_mel_bins,
-            decoder_start_token_id=self.decoder_start_token_id,
-            is_encoder_decoder=True,
-            activation_function=self.hidden_act,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_source_positions=self.max_source_positions,
-            max_target_positions=self.max_target_positions,
-            pad_token_id=self.pad_token_id,
-            bos_token_id=self.bos_token_id,
-            eos_token_id=self.eos_token_id,
-            tie_word_embeddings=True,
-            d_model=self.d_model,
-            decoder_attention_heads=self.decoder_attention_heads,
-            decoder_ffn_dim=self.decoder_ffn_dim,
-            decoder_layers=self.decoder_layers,
-            encoder_attention_heads=self.encoder_attention_heads,
-            encoder_ffn_dim=self.encoder_ffn_dim,
-            encoder_layers=self.encoder_layers,
-            suppress_tokens=self.suppress_tokens,
-        )
-        inputs_dict = prepare_whisper_inputs_dict(config, input_features, decoder_input_ids)
-        return config, inputs_dict
-
-
-def prepare_whisper_inputs_dict(
-    config,
-    input_ids,
-    decoder_input_ids,
-    attention_mask=None,
-    decoder_attention_mask=None,
-):
-    if decoder_attention_mask is None:
-        decoder_attention_mask = np.concatenate(
-            [
-                np.ones(decoder_input_ids[:, :1].shape, dtype=np.int8),
-                np.not_equal(decoder_input_ids[:, 1:], config.pad_token_id).astype(np.int8),
-            ],
-            axis=-1,
-        )
-    return {
-        "input_features": input_ids,
-        "decoder_input_ids": decoder_input_ids,
-        "decoder_attention_mask": decoder_attention_mask,
-    }
-
-
-def partialclass(cls, *args, **kwargs):
-    class NewCls(cls):
-        __init__ = functools.partialmethod(cls.__init__, *args, **kwargs)
-
-    return NewCls
-
-
-def make_partial_class(full_class, *args, **kwargs):
-    partial_class = partialclass(full_class, *args, **kwargs)
-    partial_class.__name__ = full_class.__name__
-    partial_class.__module__ = full_class.__module__
-
-    return partial_class
-
-
-@require_flax
-class FlaxWhisperModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (FlaxWhisperForConditionalGeneration, FlaxWhisperModel) if is_flax_available() else ()
-    is_encoder_decoder = True
-    test_pruning = False
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = FlaxWhisperModelTester(self)
-        _, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        self.init_shape = (1,) + inputs_dict["input_features"].shape[1:]
-
-        self.all_model_classes = (
-            make_partial_class(model_class, input_shape=self.init_shape) for model_class in self.all_model_classes
-        )
-        self.config_tester = ConfigTester(self, config_class=WhisperConfig)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    # overwrite because of `input_features`
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.__call__)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["input_features", "decoder_input_ids"]
-            self.assertListEqual(arg_names[:2], expected_arg_names)
-
-    # overwrite because of `input_features`
-    def test_jit_compilation(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                model = model_class(config)
-
-                @jax.jit
-                def model_jitted(input_features, decoder_input_ids, **kwargs):
-                    return model(input_features=input_features, decoder_input_ids=decoder_input_ids, **kwargs)
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = model_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = model_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    # overwrite because of `input_features`
-    def test_save_load_from_base(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = make_partial_class(FLAX_MODEL_MAPPING[config.__class__], input_shape=self.init_shape)
-
-        for model_class in self.all_model_classes:
-            if model_class.__name__ == base_class.__name__:
-                continue
-
-            model = base_class(config)
-            base_params = flatten_dict(unfreeze(model.params))
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                head_model = model_class.from_pretrained(tmpdirname)
-
-                base_param_from_head = flatten_dict(unfreeze(head_model.params[head_model.base_model_prefix]))
-
-                for key in base_param_from_head.keys():
-                    max_diff = (base_params[key] - base_param_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-    # overwrite because of `input_features`
-    def test_save_load_to_base(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = make_partial_class(FLAX_MODEL_MAPPING[config.__class__], input_shape=self.init_shape)
-
-        for model_class in self.all_model_classes:
-            if model_class.__name__ == base_class.__name__:
-                continue
-
-            model = model_class(config)
-            base_params_from_head = flatten_dict(unfreeze(model.params[model.base_model_prefix]))
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                base_model = base_class.from_pretrained(tmpdirname)
-
-                base_params = flatten_dict(unfreeze(base_model.params))
-
-                for key in base_params_from_head.keys():
-                    max_diff = (base_params[key] - base_params_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-    def test_encoder_sinusoidal_embed_positions(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            params = model.params
-            if model.base_model_prefix in params:
-                params = model.params[model.base_model_prefix]
-
-            embeds = params["encoder"]["embed_positions"]["embedding"]
-            sinusoids = sinusoidal_embedding_init(None, embeds.shape)
-            self.assertTrue(jax.numpy.allclose(embeds, sinusoids))
-
-
-@slow
-@require_flax
-class FlaxWhisperModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_processor(self):
-        return WhisperProcessor.from_pretrained("openai/whisper-base")
-
-    def _load_datasamples(self, num_samples):
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
-
-        return [x["array"] for x in speech_samples]
-
-    def test_tiny_logits_librispeech(self):
-        model = FlaxWhisperModel.from_pretrained("openai/whisper-tiny", from_pt=True)
-        input_speech = self._load_datasamples(1)
-        feature_extractor = WhisperFeatureExtractor()
-        input_features = feature_extractor(input_speech, return_tensors="np").input_features
-
-        logits = model(
-            input_features,
-            decoder_input_ids=np.array([[50258, 50259, 50359]]),
-            output_hidden_states=False,
-            output_attentions=False,
-            return_dict=False,
-        )
-
-        # fmt: off
-        EXPECTED_LOGITS = np.array(
-            [
-                2.9892, -6.7607, 5.7348, 3.6096, 0.2152, -5.7321, 4.8855, -1.6407,
-                0.2823, -1.5718, 10.4269, 3.4427, 0.0219, -8.0612, 3.4784, 8.4246,
-                4.0575, -2.2864, 11.1084, 0.9963, 0.9884, -8.5154, -3.5469, -9.3713,
-                0.9786, 3.5435, 7.4850, -5.2579, -1.4366, 10.4841
-            ]
-        )
-        # fmt: on
-        self.assertTrue(np.allclose(logits[0][0, 0, :30], EXPECTED_LOGITS, atol=1e-4))
-
-    def test_small_en_logits_librispeech(self):
-        model = FlaxWhisperModel.from_pretrained("openai/whisper-small.en", from_pt=True)
-        input_speech = self._load_datasamples(1)
-        feature_extractor = WhisperFeatureExtractor()
-        input_features = feature_extractor(input_speech, return_tensors="np").input_features
-
-        logits = model(
-            input_features,
-            decoder_input_ids=np.array([model.config.decoder_start_token_id]),
-            output_hidden_states=False,
-            output_attentions=False,
-            return_dict=False,
-        )
-
-        logits = logits[0] @ model.params["model"]["decoder"]["embed_tokens"]["embedding"].T
-
-        # fmt: off
-        EXPECTED_LOGITS = np.array(
-            [
-                -3.6784, -7.7211, -9.5070, -11.9286, -7.6489, -9.7026, -5.6188,
-                -8.0104, -4.6238, -5.1833, -9.0485, -3.4079, -5.4874, -2.6935,
-                -6.3479, -7.3398, -6.9558, -7.6867, -7.4748, -8.3463, -9.9781,
-                -10.8389, -10.3105, -11.7201, -9.7261, -7.1590, -5.9272, -12.4509,
-                -11.1146, -8.1918
-            ]
-        )
-        # fmt: on
-        self.assertTrue(np.allclose(logits[0, 0, :30], EXPECTED_LOGITS, atol=1e-4))
-
-    def test_large_logits_librispeech(self):
-        model = FlaxWhisperModel.from_pretrained("openai/whisper-large", from_pt=True)
-        input_speech = self._load_datasamples(1)
-        processor = WhisperProcessor.from_pretrained("openai/whisper-large")
-        processed_inputs = processor(
-            audio=input_speech, text="This part of the speech", add_special_tokens=False, return_tensors="np"
-        )
-        input_features = processed_inputs.input_features
-        decoder_input_ids = processed_inputs.labels
-
-        logits = model(
-            input_features,
-            decoder_input_ids=decoder_input_ids,
-            output_hidden_states=False,
-            output_attentions=False,
-            return_dict=False,
-        )
-
-        logits = logits[0] @ model.params["model"]["decoder"]["embed_tokens"]["embedding"].T
-
-        # fmt: off
-        EXPECTED_LOGITS = np.array(
-            [
-                2.1382, 0.9381, 4.4671, 3.5589, 2.4022, 3.8576, -0.6521, 2.5472,
-                1.8301, 1.9957, 2.3432, 1.4678, 0.5459, 2.2597, 1.5179, 2.5357,
-                1.1624, 0.6194, 1.0757, 1.8259, 2.4076, 1.6601, 2.3503, 1.3376,
-                1.9891, 1.8635, 3.8931, 5.3699, 4.4772, 3.9184
-            ]
-        )
-        # fmt: on
-        self.assertTrue(np.allclose(logits[0, 0, :30], EXPECTED_LOGITS, atol=1e-4))
-
-    def test_tiny_en_generation(self):
-        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
-        model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True)
-        model.config.decoder_start_token_id = 50257
-
-        input_speech = self._load_datasamples(1)
-        input_features = processor.feature_extractor(
-            raw_speech=input_speech, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="jax"
-        ).input_features
-
-        generated_ids = model.generate(input_features, num_beams=5, max_length=20).sequences
-        transcript = processor.tokenizer.decode(generated_ids[0])
-
-        EXPECTED_TRANSCRIPT = (
-            "<|startoftranscript|><|en|><|transcribe|><|notimestamps|> Mr. Quilter is the apostle of the middle"
-            " classes and we are glad to"
-        )
-        self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
-
-    def test_tiny_generation(self):
-        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
-        model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny", from_pt=True)
-
-        input_speech = self._load_datasamples(1)
-        input_features = processor.feature_extractor(
-            raw_speech=input_speech, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="jax"
-        ).input_features
-
-        generated_ids = model.generate(input_features, num_beams=5, max_length=20).sequences
-        transcript = processor.tokenizer.decode(generated_ids[0])
-
-        EXPECTED_TRANSCRIPT = (
-            "<|startoftranscript|><|en|><|transcribe|><|notimestamps|> Mr. Quilter is the apostle of the middle"
-            " classes and we are glad"
-        )
-        self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
-
-    def test_large_generation(self):
-        processor = WhisperProcessor.from_pretrained("openai/whisper-large")
-        model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-large", from_pt=True)
-
-        input_speech = self._load_datasamples(1)
-        input_features = processor.feature_extractor(
-            raw_speech=input_speech, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="jax"
-        ).input_features
-
-        model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="en", task="transcribe")
-
-        generated_ids = model.generate(input_features, num_beams=5, max_length=20).sequences
-        transcript = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
-
-        EXPECTED_TRANSCRIPT = " Mr. Quilter is the apostle of the middle classes and we are glad"
-        self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
-
-    def test_large_generation_multilingual(self):
-        processor = WhisperProcessor.from_pretrained("openai/whisper-large")
-        model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-large", from_pt=True)
-
-        ds = load_dataset("legacy-datasets/common_voice", "ja", split="test", streaming=True, trust_remote_code=True)
-        ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
-        input_speech = next(iter(ds))["audio"]["array"]
-        input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="np")
-
-        model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="ja", task="transcribe")
-        generated_ids = model.generate(input_features, do_sample=False, max_length=20).sequences
-        transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
-        EXPECTED_TRANSCRIPT = "木村さんに電話を貸してもらいました"
-        self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
-
-        model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="en", task="transcribe")
-        generated_ids = model.generate(
-            input_features,
-            do_sample=False,
-            max_length=20,
-        ).sequences
-        transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
-        EXPECTED_TRANSCRIPT = " Kimura-san called me."
-        self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
-
-        model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="ja", task="translate")
-        generated_ids = model.generate(input_features, do_sample=False, max_length=20).sequences
-        transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
-        EXPECTED_TRANSCRIPT = " I borrowed a phone from Kimura san"
-        self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
-
-    def test_large_batched_generation(self):
-        processor = WhisperProcessor.from_pretrained("openai/whisper-large")
-        model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-large", from_pt=True)
-
-        input_speech = self._load_datasamples(4)
-        input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="np").input_features
-        generated_ids = model.generate(input_features, max_length=20).sequences
-
-        # fmt: off
-        EXPECTED_LOGITS = np.array(
-            [
-                [50258, 50358, 50363, 2221, 13, 2326, 388, 391, 307, 264, 50244, 295, 264, 2808, 5359, 293, 321, 366, 5404, 281],
-                [50258, 50358, 50363, 6966, 307, 2221, 13, 2326, 388, 391, 311, 9060, 1570, 1880, 813, 702, 1871, 13, 50257, 50257],
-                [50258, 50358, 50363, 634, 5112, 505, 300, 412, 341, 42729, 3196, 295, 264, 1064, 11, 365, 5272, 293, 12904, 9256],
-                [50258, 50358, 50363, 634, 575, 12525, 22618, 1968, 6144, 35617, 20084, 1756, 311, 589, 307, 534, 10281, 934, 439, 11]
-            ]
-        )
-        # fmt: on
-
-        self.assertTrue(np.allclose(generated_ids, EXPECTED_LOGITS))
-
-        # fmt: off
-        EXPECTED_TRANSCRIPT = [
-            " Mr. Quilter is the apostle of the middle classes and we are glad to",
-            " Nor is Mr. Quilter's manner less interesting than his matter.",
-            " He tells us that at this festive season of the year, with Christmas and roast beef",
-            " He has grave doubts whether Sir Frederick Layton's work is really Greek after all,",
-        ]
-        # fmt: on
-
-        transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)
-        self.assertListEqual(transcript, EXPECTED_TRANSCRIPT)
-
-    def test_tiny_en_batched_generation(self):
-        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
-        model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True)
-
-        input_speech = self._load_datasamples(4)
-        input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="np").input_features
-        generated_ids = model.generate(input_features, max_length=20).sequences
-
-        # fmt: off
-        EXPECTED_LOGITS = np.array(
-            [
-                [50257, 50362, 1770, 13, 2264, 346, 353, 318, 262, 46329, 286, 262, 3504, 6097, 11, 290, 356, 389, 9675, 284],
-                [50257, 50362, 5414, 318, 1770, 13, 2264, 346, 353, 338, 5642, 1342, 3499, 621, 465, 2300, 13, 50256, 50256, 50256],
-                [50257, 50362, 679, 4952, 514, 326, 379, 428, 43856, 1622, 286, 262, 614, 11, 351, 6786, 290, 32595, 12023, 28236],
-                [50257, 50362, 679, 468, 12296, 17188, 1771, 7361, 26113, 18881, 1122, 338, 670, 318, 1107, 8312, 706, 477, 290, 460]
-            ]
-
-        )
-        # fmt: on
-
-        self.assertTrue(np.allclose(generated_ids, EXPECTED_LOGITS))
-
-        # fmt: off
-        EXPECTED_TRANSCRIPT = [
-            " Mr. Quilter is the apostle of the middle classes, and we are glad to",
-            " Nor is Mr. Quilter's manner less interesting than his matter.",
-            " He tells us that at this festive season of the year, with Christmas and roast beef looming",
-            " He has grave doubts whether Sir Frederick Layton's work is really Greek after all and can",
-        ]
-        # fmt: on
-
-        transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)
-        self.assertListEqual(transcript, EXPECTED_TRANSCRIPT)
-
-    @slow
-    def test_tiny_timestamp_generation(self):
-        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
-        model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
-
-        input_speech = np.concatenate(self._load_datasamples(4))
-        input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="jax").input_features
-
-        generate_fn = jax.jit(functools.partial(model.generate, max_length=448, return_timestamps=True))
-
-        generated_ids = generate_fn(input_features)
-
-        EXPECTED_OUTPUT = np.array([50258, 50259, 50359, 50364, 2221, 13, 2326, 388, 391, 307, 264, 50244, 295, 264, 2808, 5359, 11, 293, 321, 366, 5404, 281, 2928, 702, 14943, 13, 50692, 50692, 6966, 307, 2221, 13, 2326, 388, 391, 311, 9060, 1570, 1880, 813, 702, 1871, 13, 50926, 50926, 634, 5112, 505, 300, 412, 341, 42729, 3196, 295, 264, 1064, 11, 365, 5272, 293, 12904, 9256, 450, 10539, 51208, 51208, 949, 505, 11, 14138, 10117, 490, 3936, 293, 1080, 3542, 5160, 881, 26336, 281, 264, 1575, 13, 51552, 51552, 634, 575, 12525, 22618, 1968, 6144, 35617, 7354, 1292, 6, 589, 307, 534, 10281, 934, 439, 11, 293, 51836, 51836, 50257])  # fmt: skip
-
-        self.assertTrue(np.allclose(generated_ids, EXPECTED_OUTPUT))
-
-        EXPECTED_TRANSCRIPT = [
-            {
-                "text": (
-                    " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is"
-                    " Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season"
-                    " of the year, with Christmas and roast beef looming before us, similarly drawn from eating and"
-                    " its results occur most readily to the mind. He has grave doubts whether Sir Frederick Latins'"
-                    " work is really Greek after all, and"
-                ),
-                "offsets": [
-                    {
-                        "text": (
-                            " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel."
-                        ),
-                        "timestamp": (0.0, 6.5600000000000005),
-                    },
-                    {
-                        "text": " Nor is Mr. Quilter's manner less interesting than his matter.",
-                        "timestamp": (6.5600000000000005, 11.24),
-                    },
-                    {
-                        "text": (
-                            " He tells us that at this festive season of the year, with Christmas and roast beef"
-                            " looming"
-                        ),
-                        "timestamp": (11.24, 16.88),
-                    },
-                    {
-                        "text": (
-                            " before us, similarly drawn from eating and its results occur most readily to the mind."
-                        ),
-                        "timestamp": (16.88, 23.76),
-                    },
-                    {
-                        "text": (
-                            " He has grave doubts whether Sir Frederick Latins' work is really Greek after all, and"
-                        ),
-                        "timestamp": (23.76, 29.44),
-                    },
-                ],
-            }
-        ]
-
-        transcript = processor.batch_decode(generated_ids, skip_special_tokens=True, output_offsets=True)
-        self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
-
-
-class FlaxWhisperEncoderModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=60,
-        is_training=True,
-        use_labels=True,
-        hidden_size=16,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        input_channels=1,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=20,
-        max_source_positions=30,
-        num_mel_bins=80,
-        num_conv_layers=1,
-        suppress_tokens=None,
-        classifier_proj_size=4,
-        num_labels=2,
-        is_encoder_decoder=False,
-        is_decoder=False,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.input_channels = input_channels
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.num_mel_bins = num_mel_bins
-        self.max_position_embeddings = max_position_embeddings
-        self.max_source_positions = max_source_positions
-        self.num_conv_layers = num_conv_layers
-        self.suppress_tokens = suppress_tokens
-        self.classifier_proj_size = classifier_proj_size
-        self.num_labels = num_labels
-        self.is_encoder_decoder = is_encoder_decoder
-        self.is_decoder = is_decoder
-
-    def get_config(self):
-        return WhisperConfig(
-            d_model=self.hidden_size,
-            encoder_layers=self.num_hidden_layers,
-            decoder_layers=self.num_hidden_layers,
-            encoder_attention_heads=self.num_attention_heads,
-            decoder_attention_heads=self.num_attention_heads,
-            input_channels=self.input_channels,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            max_source_positions=self.max_source_positions,
-            decoder_ffn_dim=self.hidden_size,
-            encoder_ffn_dim=self.hidden_size,
-            suppress_tokens=self.suppress_tokens,
-            classifier_proj_size=self.classifier_proj_size,
-            num_labels=self.num_labels,
-            is_encoder_decoder=self.is_encoder_decoder,
-            is_decoder=self.is_decoder,
-        )
-
-    def prepare_whisper_encoder_inputs_dict(
-        self,
-        input_features,
-    ):
-        return {
-            "input_features": input_features,
-        }
-
-    def prepare_config_and_inputs(self):
-        input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length])
-
-        config = self.get_config()
-        inputs_dict = self.prepare_whisper_encoder_inputs_dict(
-            input_features=input_features,
-        )
-        return config, inputs_dict
-
-    def prepare_config_and_inputs_for_common(self):
-        config, inputs_dict = self.prepare_config_and_inputs()
-        return config, inputs_dict
-
-    def get_subsampled_output_lengths(self, input_lengths):
-        """
-        Computes the output length of the convolutional layers
-        """
-
-        for i in range(self.num_conv_layers):
-            input_lengths = (input_lengths - 1) // 2 + 1
-
-        return input_lengths
-
-    @property
-    def encoder_seq_length(self):
-        return self.get_subsampled_output_lengths(self.seq_length)
-
-
-@require_flax
-class WhisperEncoderModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (FlaxWhisperForAudioClassification,) if is_flax_available() else ()
-    is_encoder_decoder = False
-    fx_compatible = False
-    test_pruning = False
-    test_missing_keys = False
-
-    input_name = "input_features"
-
-    def setUp(self):
-        self.model_tester = FlaxWhisperEncoderModelTester(self)
-        _, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        self.init_shape = (1,) + inputs_dict["input_features"].shape[1:]
-
-        self.all_model_classes = (
-            make_partial_class(model_class, input_shape=self.init_shape) for model_class in self.all_model_classes
-        )
-        self.config_tester = ConfigTester(self, config_class=WhisperConfig)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    # overwrite because of `input_features`
-    def test_jit_compilation(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                model = model_class(config)
-
-                @jax.jit
-                def model_jitted(input_features, **kwargs):
-                    return model(input_features=input_features, **kwargs)
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = model_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = model_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    # overwrite because of `input_features`
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.__call__)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["input_features", "attention_mask", "output_attentions"]
-            self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
-
-    def test_inputs_embeds(self):
-        pass
-
-    # WhisperEncoder has no inputs_embeds and thus the `get_input_embeddings` fn is not implemented
-    def test_model_common_attributes(self):
-        pass
-
-    # WhisperEncoder cannot resize token embeddings since it has no tokens embeddings
-    def test_resize_tokens_embeddings(self):
-        pass
-
-    # WhisperEncoder does not have any base model
-    def test_save_load_to_base(self):
-        pass
-
-    # WhisperEncoder does not have any base model
-    def test_save_load_from_base(self):
-        pass
diff --git a/tests/models/whisper/test_modeling_tf_whisper.py b/tests/models/whisper/test_modeling_tf_whisper.py
deleted file mode 100644
index e978fe2fe1..0000000000
--- a/tests/models/whisper/test_modeling_tf_whisper.py
+++ /dev/null
@@ -1,950 +0,0 @@
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TensorFlow Whisper model."""
-
-from __future__ import annotations
-
-import inspect
-import os
-import tempfile
-import traceback
-import unittest
-
-import numpy as np
-
-from transformers import GenerationConfig, WhisperConfig, WhisperFeatureExtractor, WhisperProcessor
-from transformers.testing_utils import (
-    is_tf_available,
-    require_read_token,
-    require_tf,
-    require_tokenizers,
-    run_test_in_subprocess,
-    slow,
-)
-from transformers.utils import cached_property
-from transformers.utils.import_utils import is_datasets_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_datasets_available():
-    import datasets
-    from datasets import load_dataset
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import TFWhisperForConditionalGeneration, TFWhisperModel, set_seed
-    from transformers.models.whisper.modeling_tf_whisper import (
-        TFWhisperDecoder,
-        TFWhisperEncoder,
-        sinusoidal_embedding_init,
-    )
-
-
-def prepare_whisper_inputs_dict(
-    config,
-    input_features,
-    decoder_input_ids,
-    attention_mask=None,
-    decoder_attention_mask=None,
-):
-    if decoder_attention_mask is None:
-        decoder_attention_mask = tf.where(decoder_input_ids != config.pad_token_id, 1, 0)
-    return {
-        "input_features": input_features,
-        "decoder_input_ids": decoder_input_ids,
-        "decoder_attention_mask": decoder_attention_mask,
-    }
-
-
-@require_tf
-class TFWhisperModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=60,
-        is_training=True,
-        use_labels=False,
-        vocab_size=200,
-        hidden_size=16,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        input_channels=1,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=20,
-        max_source_positions=30,
-        max_target_positions=60,
-        bos_token_id=98,
-        eos_token_id=98,
-        pad_token_id=0,
-        num_mel_bins=80,
-        decoder_start_token_id=85,
-        num_conv_layers=1,
-        suppress_tokens=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.input_channels = input_channels
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.num_mel_bins = num_mel_bins
-        self.max_position_embeddings = max_position_embeddings
-        self.max_source_positions = max_source_positions
-        self.max_target_positions = max_target_positions
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.decoder_start_token_id = decoder_start_token_id
-        self.num_conv_layers = num_conv_layers
-        self.suppress_tokens = suppress_tokens
-
-    def prepare_config_and_inputs(self):
-        input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length], self.vocab_size)
-
-        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        config = self.get_config()
-        inputs_dict = prepare_whisper_inputs_dict(
-            config,
-            attention_mask=None,
-            input_features=input_features,
-            decoder_input_ids=decoder_input_ids,
-        )
-        return config, inputs_dict
-
-    def get_config(self):
-        return WhisperConfig(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            encoder_layers=self.num_hidden_layers,
-            decoder_layers=self.num_hidden_layers,
-            encoder_attention_heads=self.num_attention_heads,
-            decoder_attention_heads=self.num_attention_heads,
-            input_channels=self.input_channels,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            max_source_positions=self.max_source_positions,
-            max_target_positions=self.max_target_positions,
-            eos_token_id=self.eos_token_id,
-            bos_token_id=self.bos_token_id,
-            pad_token_id=self.pad_token_id,
-            decoder_ffn_dim=self.hidden_size,
-            encoder_ffn_dim=self.hidden_size,
-            decoder_start_token_id=self.decoder_start_token_id,
-            suppress_tokens=self.suppress_tokens,
-        )
-
-    def prepare_config_and_inputs_for_common(self):
-        config, inputs_dict = self.prepare_config_and_inputs()
-        return config, inputs_dict
-
-    def get_subsampled_output_lengths(self, input_lengths):
-        """
-        Computes the output length of the convolutional layers
-        """
-
-        for i in range(self.num_conv_layers):
-            input_lengths = (input_lengths - 1) // 2 + 1
-
-        return input_lengths
-
-    def create_and_check_model_forward(self, config, inputs_dict):
-        model = TFWhisperModel(config=config)
-
-        input_features = inputs_dict["input_features"]
-        decoder_input_ids = inputs_dict["decoder_input_ids"]
-
-        # first forward pass
-        last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
-
-        self.parent.assertTrue(last_hidden_state.shape, (13, 7, 16))
-
-    def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict):
-        model = TFWhisperModel(config=config).get_decoder()
-        # take a slice so we're shorter than the sequence length and can append later
-        input_ids = inputs_dict["decoder_input_ids"][:, :-10]
-        attention_mask = inputs_dict["decoder_attention_mask"][:, :-10]
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
-
-        output, past_key_values = outputs.to_tuple()
-
-        # create hypothetical multiple next token and extent to next_input_ids
-        next_token = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_tokens = tf.where(next_token <= 2, 2, next_token)
-        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([attention_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"]
-        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[
-            "last_hidden_state"
-        ]
-
-        # select random slice
-        random_slice_idx = np.random.randint(0, output_from_past.shape[-1])
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
-
-        # test that outputs are equal for slice
-        self.parent.assertTrue(np.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2))
-
-    def check_encoder_decoder_model_standalone(self, config, inputs_dict):
-        model = TFWhisperModel(config=config)
-        outputs = model(**inputs_dict)
-
-        encoder_last_hidden_state = outputs.encoder_last_hidden_state
-        last_hidden_state = outputs.last_hidden_state
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            encoder = model.get_encoder()
-            encoder.save_pretrained(tmpdirname)
-            encoder = TFWhisperEncoder.from_pretrained(tmpdirname)
-
-        encoder_last_hidden_state_2 = encoder(inputs_dict["input_features"])[0]
-
-        self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max() < 1e-3)
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            decoder = model.get_decoder()
-            decoder.save_pretrained(tmpdirname)
-            decoder = TFWhisperDecoder.from_pretrained(tmpdirname)
-
-        last_hidden_state_2 = decoder(
-            input_ids=inputs_dict["decoder_input_ids"],
-            attention_mask=inputs_dict["decoder_attention_mask"],
-            encoder_hidden_states=encoder_last_hidden_state,
-        )[0]
-
-        self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max() < 1e-3)
-
-
-@require_tf
-class TFWhisperModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (TFWhisperModel, TFWhisperForConditionalGeneration) if is_tf_available() else ()
-    all_generative_model_classes = (TFWhisperForConditionalGeneration,) if is_tf_available() else ()
-    pipeline_model_mapping = {"feature-extraction": TFWhisperModel} if is_tf_available() else {}
-    is_encoder_decoder = True
-    fx_compatible = False
-    test_pruning = False
-    test_missing_keys = False
-    test_onnx = False
-
-    input_name = "input_features"
-
-    # TODO (ydshieh): undo skip once a fix is done on TF side.
-    @unittest.skip("Skip for now as TF 2.13 breaks it on GPU")
-    def test_xla_generate_slow(self):
-        super().test_xla_generate_slow()
-
-    def setUp(self):
-        self.model_tester = TFWhisperModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=WhisperConfig)
-        self.maxDiff = 3000
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_save_load_strict(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-
-            model.build_in_name_scope()
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname, saved_model=False)
-                model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True)
-            self.assertEqual(info["missing_keys"], [])
-
-    def test_model_forward(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model_forward(*config_and_inputs)
-
-    def test_requires_grad_encoder_embed_positions(self):
-        config = self.model_tester.get_config()
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            encoder = model.get_encoder()
-            self.assertFalse(encoder.embed_positions.trainable)
-
-    def test_encoder_sinusoidal_embed_positions(self):
-        config = self.model_tester.get_config()
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            model.build_in_name_scope()
-
-            embeds = model.get_encoder().embed_positions.get_weights()[0]
-            sinusoids = sinusoidal_embedding_init(embeds.shape).numpy()
-            self.assertTrue(np.allclose(embeds, sinusoids))
-
-    def test_decoder_model_past_with_large_inputs(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
-
-    def _get_input_ids_and_config(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        input_ids = inputs_dict[self.input_name]
-
-        # cut to half length & take max batch_size 3
-        max_batch_size = 3
-        input_ids = input_ids[:max_batch_size, :, :]
-
-        # generate max 3 tokens
-        max_length = 4
-        if config.eos_token_id is not None and config.pad_token_id is None:
-            # hack to allow generate for models such as GPT2 as is done in `generate()`
-            config.pad_token_id = config.eos_token_id
-
-        return config, input_ids, None, max_length
-
-    # not implemented currently
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip("Training is not yet supported")
-    def test_training(self):
-        pass
-
-    @unittest.skip("fp16 is not yet supported for TF models")
-    def test_generate_fp16(self):
-        config, input_dict = self.model_tester.prepare_config_and_inputs()
-        config.max_target_positions = 400
-        input_features = input_dict["input_features"]
-        model = TFWhisperForConditionalGeneration(config)
-        model.generate(input_features)
-        model.generate(input_features, num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = [
-                "input_features",
-                "decoder_input_ids",
-                "decoder_attention_mask",
-            ]
-            expected_arg_names.extend(
-                ["decoder_position_ids", "head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs"]
-                if "head_mask" and "decoder_head_mask" and "cross_attn_head_mask" in arg_names
-                else ["encoder_outputs"]
-            )
-            self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
-
-            expected_num_layers = getattr(
-                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-            )
-            self.assertEqual(len(hidden_states), expected_num_layers)
-
-            if hasattr(self.model_tester, "encoder_seq_length"):
-                seq_length = self.model_tester.encoder_seq_length
-            else:
-                seq_length = self.model_tester.seq_length
-
-            subsampled_seq_length = model._get_feat_extract_output_lengths(seq_length)
-
-            self.assertListEqual(
-                list(hidden_states[0].shape[-2:]),
-                [subsampled_seq_length, self.model_tester.hidden_size],
-            )
-
-            if config.is_encoder_decoder:
-                hidden_states = outputs.decoder_hidden_states
-
-                self.assertIsInstance(hidden_states, (list, tuple))
-                self.assertEqual(len(hidden_states), expected_num_layers)
-
-                decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_length)
-
-                self.assertListEqual(
-                    list(hidden_states[0].shape[-2:]),
-                    [decoder_seq_length, self.model_tester.hidden_size],
-                )
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        seq_len = getattr(self.model_tester, "seq_length", None)
-        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
-        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
-        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
-        decoder_key_length = getattr(self.model_tester, "decoder_key_length", encoder_key_length)
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = False
-            config.return_dict = True
-            model = model_class(config)
-
-            subsampled_encoder_seq_length = model._get_feat_extract_output_lengths(encoder_seq_length)
-            subsampled_encoder_key_length = model._get_feat_extract_output_lengths(encoder_key_length)
-
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-            # check that output_attentions also work using config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            model = model_class(config)
-
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-            self.assertListEqual(
-                list(attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, subsampled_encoder_seq_length, subsampled_encoder_key_length],
-            )
-            out_len = len(outputs)
-
-            correct_outlen = 5
-
-            # loss is at first position
-            if "labels" in inputs_dict:
-                correct_outlen += 1  # loss is added to beginning
-            if "past_key_values" in outputs:
-                correct_outlen += 1  # past_key_values have been returned
-
-            self.assertEqual(out_len, correct_outlen)
-
-            # decoder attentions
-            decoder_attentions = outputs.decoder_attentions
-            self.assertIsInstance(decoder_attentions, (list, tuple))
-            self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(decoder_attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
-            )
-
-            # cross attentions
-            cross_attentions = outputs.cross_attentions
-            self.assertIsInstance(cross_attentions, (list, tuple))
-            self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(cross_attentions[0].shape[-3:]),
-                [
-                    self.model_tester.num_attention_heads,
-                    decoder_seq_length,
-                    subsampled_encoder_key_length,
-                ],
-            )
-
-            # Check attention is always last and order is fine
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            added_hidden_states = 2
-            self.assertEqual(out_len + added_hidden_states, len(outputs))
-
-            self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-
-            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(self_attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, subsampled_encoder_seq_length, subsampled_encoder_key_length],
-            )
-
-
-def _load_datasamples(num_samples):
-    ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-    # automatic decoding with librispeech
-    speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
-
-    return [x["array"] for x in speech_samples]
-
-
-def _test_large_logits_librispeech(in_queue, out_queue, timeout):
-    error = None
-    try:
-        _ = in_queue.get(timeout=timeout)
-
-        set_seed(0)
-
-        model = TFWhisperModel.from_pretrained("openai/whisper-large")
-
-        input_speech = _load_datasamples(1)
-
-        processor = WhisperProcessor.from_pretrained("openai/whisper-large")
-        processed_inputs = processor(
-            audio=input_speech, text="This part of the speech", add_special_tokens=False, return_tensors="tf"
-        )
-        input_features = processed_inputs.input_features
-        decoder_input_ids = processed_inputs.labels
-
-        logits = model(
-            input_features,
-            decoder_input_ids=decoder_input_ids,
-            output_hidden_states=False,
-            output_attentions=False,
-            use_cache=False,
-        )
-
-        logits = logits.last_hidden_state @ tf.transpose(model.model.decoder.embed_tokens.weights[0])
-
-        # fmt: off
-        EXPECTED_LOGITS = tf.convert_to_tensor(
-            [
-                2.1382, 0.9381, 4.4671, 3.5589, 2.4022, 3.8576, -0.6521, 2.5472,
-                1.8301, 1.9957, 2.3432, 1.4678, 0.5459, 2.2597, 1.5179, 2.5357,
-                1.1624, 0.6194, 1.0757, 1.8259, 2.4076, 1.6601, 2.3503, 1.3376,
-                1.9891, 1.8635, 3.8931, 5.3699, 4.4772, 3.9184
-            ]
-        )
-        # fmt: on
-
-        unittest.TestCase().assertTrue(np.allclose(logits[0, 0, :30], EXPECTED_LOGITS, atol=1e-4))
-    except Exception:
-        error = f"{traceback.format_exc()}"
-
-    results = {"error": error}
-    out_queue.put(results, timeout=timeout)
-    out_queue.join()
-
-
-def _test_large_generation(in_queue, out_queue, timeout):
-    error = None
-    try:
-        _ = in_queue.get(timeout=timeout)
-
-        set_seed(0)
-        processor = WhisperProcessor.from_pretrained("openai/whisper-large")
-        model = TFWhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
-
-        input_speech = _load_datasamples(1)
-        input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="tf").input_features
-
-        generated_ids = model.generate(
-            input_features,
-            do_sample=False,
-            max_length=20,
-        )
-        transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
-        EXPECTED_TRANSCRIPT = " Mr. Quilter is the apostle of the middle classes and we are glad"
-        unittest.TestCase().assertEqual(transcript, EXPECTED_TRANSCRIPT)
-    except Exception:
-        error = f"{traceback.format_exc()}"
-
-    results = {"error": error}
-    out_queue.put(results, timeout=timeout)
-    out_queue.join()
-
-
-def _test_large_generation_multilingual(in_queue, out_queue, timeout):
-    error = None
-    try:
-        _ = in_queue.get(timeout=timeout)
-
-        set_seed(0)
-        processor = WhisperProcessor.from_pretrained("openai/whisper-large")
-        model = TFWhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
-
-        # update generation config
-        generation_config = GenerationConfig.from_pretrained("openai/whisper-large-v2")
-
-        token = os.getenv("HF_HUB_READ_TOKEN", True)
-        ds = load_dataset(
-            "mozilla-foundation/common_voice_6_1",
-            "ja",
-            split="test",
-            streaming=True,
-            trust_remote_code=True,
-            token=token,
-        )
-        ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
-        input_speech = next(iter(ds))["audio"]["array"]
-        input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="tf").input_features
-
-        generated_ids = model.generate(
-            input_features,
-            do_sample=False,
-            max_length=20,
-            language="<|ja|>",
-            task="transcribe",
-            generation_config=generation_config,
-        )
-        transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
-        EXPECTED_TRANSCRIPT = "木村さんに電話を貸してもらいました"
-        unittest.TestCase().assertEqual(transcript, EXPECTED_TRANSCRIPT)
-
-        generated_ids = model.generate(
-            input_features,
-            do_sample=False,
-            max_length=20,
-            language="<|en|>",
-            task="transcribe",
-            generation_config=generation_config,
-        )
-        transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
-        EXPECTED_TRANSCRIPT = " Kimura-san called me."
-        unittest.TestCase().assertEqual(transcript, EXPECTED_TRANSCRIPT)
-
-        generated_ids = model.generate(
-            input_features,
-            do_sample=False,
-            max_length=20,
-            language="<|ja|>",
-            task="translate",
-            generation_config=generation_config,
-        )
-        transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
-        EXPECTED_TRANSCRIPT = " I borrowed a phone from Kimura san"
-        unittest.TestCase().assertEqual(transcript, EXPECTED_TRANSCRIPT)
-    except Exception:
-        error = f"{traceback.format_exc()}"
-
-    results = {"error": error}
-    out_queue.put(results, timeout=timeout)
-    out_queue.join()
-
-
-def _test_large_batched_generation(in_queue, out_queue, timeout):
-    error = None
-    try:
-        _ = in_queue.get(timeout=timeout)
-
-        set_seed(0)
-        processor = WhisperProcessor.from_pretrained("openai/whisper-large")
-        model = TFWhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
-
-        input_speech = _load_datasamples(4)
-        input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="tf").input_features
-        generated_ids_1 = model.generate(input_features[0:2], max_length=20)
-        generated_ids_2 = model.generate(input_features[2:4], max_length=20)
-        generated_ids = np.concatenate([generated_ids_1, generated_ids_2])
-
-        # fmt: off
-        EXPECTED_IDS = [
-            [50258, 50259, 50359, 50363, 2221, 13, 2326, 388, 391, 307, 264, 50244, 295, 264, 2808, 5359, 293, 321, 366, 5404],
-            [50258, 50259, 50359, 50363, 6966, 307, 2221, 13, 2326, 388, 391, 311, 9060, 1570, 1880, 813, 702, 1871, 13, 50257],
-            [50258, 50259, 50359, 50363, 634, 5112, 505, 300, 412, 341, 42729, 3196, 295, 264, 1064, 11, 365, 5272, 293, 12904],
-            [50258, 50259, 50359, 50363, 634, 575, 12525, 22618, 1968, 6144, 35617, 20084, 1756, 311, 589, 307, 534, 10281, 934, 439]
-        ]
-        # fmt: on
-
-        unittest.TestCase().assertEqual(generated_ids.tolist(), EXPECTED_IDS)
-
-        # fmt: off
-        EXPECTED_TRANSCRIPT = [
-            " Mr. Quilter is the apostle of the middle classes and we are glad",
-            " Nor is Mr. Quilter's manner less interesting than his matter.",
-            " He tells us that at this festive season of the year, with Christmas and roast",
-            " He has grave doubts whether Sir Frederick Layton's work is really Greek after all"
-        ]
-        # fmt: on
-
-        transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)
-        unittest.TestCase().assertListEqual(transcript, EXPECTED_TRANSCRIPT)
-    except Exception:
-        error = f"{traceback.format_exc()}"
-
-    results = {"error": error}
-    out_queue.put(results, timeout=timeout)
-    out_queue.join()
-
-
-@require_tf
-@require_tokenizers
-class TFWhisperModelIntegrationTests(unittest.TestCase):
-    @cached_property
-    def default_processor(self):
-        return WhisperProcessor.from_pretrained("openai/whisper-base")
-
-    def _load_datasamples(self, num_samples):
-        return _load_datasamples(num_samples)
-
-    @slow
-    def test_tiny_logits_librispeech(self):
-        set_seed(0)
-        model = TFWhisperModel.from_pretrained("openai/whisper-tiny")
-        input_speech = self._load_datasamples(1)
-        feature_extractor = WhisperFeatureExtractor()
-        input_features = feature_extractor(input_speech, return_tensors="tf").input_features
-
-        logits = model(
-            input_features,
-            decoder_input_ids=tf.convert_to_tensor([[50258, 50259, 50359]]),
-            output_hidden_states=False,
-            output_attentions=False,
-            return_dict=False,
-            use_cache=False,
-        )
-
-        # fmt: off
-        EXPECTED_LOGITS = tf.convert_to_tensor(
-            [
-                2.9892, -6.7607, 5.7348, 3.6096, 0.2152, -5.7321, 4.8855, -1.6407,
-                0.2823, -1.5718, 10.4269, 3.4427, 0.0219, -8.0612, 3.4784, 8.4246,
-                4.0575, -2.2864, 11.1084, 0.9963, 0.9884, -8.5154, -3.5469, -9.3713,
-                0.9786, 3.5435, 7.4850, -5.2579, -1.4366, 10.4841
-            ]
-        )
-        # fmt: on
-        self.assertTrue(np.allclose(logits[0][0, 0, :30], EXPECTED_LOGITS, atol=1e-4))
-
-        # fmt: off
-        EXPECTED_GENERATION = tf.convert_to_tensor(
-            [
-                -1.4651, -2.6944, 2.7821, 2.3793, 4.0738, 0.0188, -3.3203, 1.9836,
-                0.0520, 0.7095, 1.1063, 0.2952, -3.6786, -0.5249, 0.3105, 4.7691,
-                1.1562, 1.3046, 0.5810, -0.3624, 1.7006, 1.3424, 0.9817, 2.1958,
-                1.8775, -5.7046, -0.7679, 4.0113, 2.6848, 2.8609
-            ]
-        )
-        # fmt: on
-
-        head_logits = logits[0] @ tf.transpose(model.model.decoder.embed_tokens.weights[0])
-        self.assertTrue(np.allclose(head_logits[0, 0, :30], EXPECTED_GENERATION, atol=1e-4))
-
-    @slow
-    def test_small_en_logits_librispeech(self):
-        set_seed(0)
-        model = TFWhisperModel.from_pretrained("openai/whisper-small.en")
-
-        input_speech = self._load_datasamples(1)
-
-        feaure_extractor = WhisperFeatureExtractor()
-        input_features = feaure_extractor(input_speech, return_tensors="tf").input_features
-
-        logits = model(
-            input_features,
-            decoder_input_ids=tf.convert_to_tensor([[model.config.decoder_start_token_id]]),
-            output_hidden_states=False,
-            output_attentions=False,
-            use_cache=False,
-        )
-
-        logits = logits.last_hidden_state @ tf.transpose(model.model.decoder.embed_tokens.weights[0])
-
-        # fmt: off
-        EXPECTED_LOGITS = tf.convert_to_tensor(
-            [
-                -3.6784, -7.7211, -9.5070, -11.9286, -7.6489, -9.7026, -5.6188,
-                -8.0104, -4.6238, -5.1833, -9.0485, -3.4079, -5.4874, -2.6935,
-                -6.3479, -7.3398, -6.9558, -7.6867, -7.4748, -8.3463, -9.9781,
-                -10.8389, -10.3105, -11.7201, -9.7261, -7.1590, -5.9272, -12.4509,
-                -11.1146, -8.1918
-            ]
-        )
-        # fmt: on
-        self.assertTrue(np.allclose(logits[0, 0, :30], EXPECTED_LOGITS, atol=1e-4))
-
-    @slow
-    def test_large_logits_librispeech(self):
-        run_test_in_subprocess(test_case=self, target_func=_test_large_logits_librispeech, inputs=None)
-
-    @slow
-    def test_tiny_en_generation(self):
-        set_seed(0)
-        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
-        model = TFWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
-        model.config.decoder_start_token_id = 50257
-
-        input_speech = self._load_datasamples(1)
-        input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="tf").input_features
-
-        generated_ids = model.generate(input_features, num_beams=5, max_length=20)
-        transcript = processor.tokenizer.batch_decode(generated_ids)[0]
-
-        EXPECTED_TRANSCRIPT = (
-            "<|startoftranscript|><|notimestamps|> Mr. Quilter is the apostle of the middle"
-            " classes, and we are glad to"
-        )
-        self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
-
-    @slow
-    def test_tiny_generation(self):
-        set_seed(0)
-        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
-        model = TFWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
-
-        input_speech = self._load_datasamples(1)
-        input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="tf").input_features
-
-        generated_ids = model.generate(input_features, num_beams=5, max_length=20)
-        transcript = processor.tokenizer.decode(generated_ids[0])
-
-        EXPECTED_TRANSCRIPT = (
-            "<|startoftranscript|><|en|><|transcribe|><|notimestamps|> Mr. Quilter is the apostle of the middle"
-            " classes and we are glad"
-        )
-        self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
-
-    @slow
-    def test_tiny_xla_generation(self):
-        set_seed(0)
-        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
-        model = TFWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
-
-        input_speech = self._load_datasamples(1)
-        input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="tf").input_features
-
-        xla_generate = tf.function(model.generate, jit_compile=True)
-
-        generated_ids = model.generate(input_features, num_beams=5, max_length=20)
-        generated_ids_xla = xla_generate(input_features, num_beams=5, max_length=20)
-
-        transcript = processor.tokenizer.decode(generated_ids[0])
-        transcript_xla = processor.tokenizer.decode(generated_ids_xla[0])
-
-        EXPECTED_TRANSCRIPT = (
-            "<|startoftranscript|><|en|><|transcribe|><|notimestamps|> Mr. Quilter is the apostle of the middle"
-            " classes and we are glad"
-        )
-        self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
-        self.assertEqual(transcript_xla, EXPECTED_TRANSCRIPT)
-
-    @slow
-    def test_large_generation(self):
-        run_test_in_subprocess(test_case=self, target_func=_test_large_generation, inputs=None)
-
-    @slow
-    @require_read_token
-    def test_large_generation_multilingual(self):
-        run_test_in_subprocess(test_case=self, target_func=_test_large_generation_multilingual, inputs=None)
-
-    @slow
-    def test_large_batched_generation(self):
-        run_test_in_subprocess(test_case=self, target_func=_test_large_batched_generation, inputs=None)
-
-    @slow
-    def test_tiny_en_batched_generation(self):
-        set_seed(0)
-        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
-        model = TFWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
-
-        input_speech = self._load_datasamples(4)
-        input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="tf").input_features
-        generated_ids = model.generate(input_features, max_length=20)
-
-        # fmt: off
-        EXPECTED_LOGITS = tf.convert_to_tensor(
-            [
-                [50257, 50362, 1770, 13, 2264, 346, 353, 318, 262, 46329, 286, 262, 3504, 6097, 11, 290, 356, 389, 9675, 284],
-                [50257, 50362, 5414, 318, 1770, 13, 2264, 346, 353, 338, 5642, 1342, 3499, 621, 465, 2300, 13, 50256, 50256, 50256],
-                [50257, 50362, 679, 4952, 514, 326, 379, 428, 43856, 1622, 286, 262, 614, 11, 351, 6786, 290, 32595, 12023, 28236],
-                [50257, 50362, 679, 468, 12296, 17188, 1771, 7361, 26113, 18881, 1122, 338, 670, 318, 1107, 8312, 706, 477, 290, 460]
-            ]
-
-        )
-        # fmt: on
-
-        self.assertTrue(np.allclose(generated_ids, EXPECTED_LOGITS))
-
-        # fmt: off
-        EXPECTED_TRANSCRIPT = [
-            " Mr. Quilter is the apostle of the middle classes, and we are glad to",
-            " Nor is Mr. Quilter's manner less interesting than his matter.",
-            " He tells us that at this festive season of the year, with Christmas and roast beef looming",
-            " He has grave doubts whether Sir Frederick Layton's work is really Greek after all and can",
-        ]
-        # fmt: on
-
-        transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)
-        self.assertListEqual(transcript, EXPECTED_TRANSCRIPT)
-
-    @slow
-    def test_tiny_en_batched_xla_generation(self):
-        set_seed(0)
-        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
-        model = TFWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
-
-        input_speech = self._load_datasamples(4)
-        input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="tf").input_features
-
-        xla_generate = tf.function(model.generate, jit_compile=True)
-
-        generated_ids = model.generate(input_features, max_length=20)
-        generated_ids_xla = xla_generate(input_features, max_length=20)
-
-        # fmt: off
-        EXPECTED_LOGITS = tf.convert_to_tensor(
-            [
-                [50257, 50362, 1770, 13, 2264, 346, 353, 318, 262, 46329, 286, 262, 3504, 6097, 11, 290, 356, 389, 9675, 284],
-                [50257, 50362, 5414, 318, 1770, 13, 2264, 346, 353, 338, 5642, 1342, 3499, 621, 465, 2300, 13, 50256, 50256, 50256],
-                [50257, 50362, 679, 4952, 514, 326, 379, 428, 43856, 1622, 286, 262, 614, 11, 351, 6786, 290, 32595, 12023, 28236],
-                [50257, 50362, 679, 468, 12296, 17188, 1771, 7361, 26113, 18881, 1122, 338, 670, 318, 1107, 8312, 706, 477, 290, 460]
-            ]
-
-        )
-        # fmt: on
-
-        self.assertTrue(np.allclose(generated_ids, EXPECTED_LOGITS))
-        self.assertTrue(np.allclose(generated_ids_xla, EXPECTED_LOGITS))
-
-        # fmt: off
-        EXPECTED_TRANSCRIPT = [
-            " Mr. Quilter is the apostle of the middle classes, and we are glad to",
-            " Nor is Mr. Quilter's manner less interesting than his matter.",
-            " He tells us that at this festive season of the year, with Christmas and roast beef looming",
-            " He has grave doubts whether Sir Frederick Layton's work is really Greek after all and can",
-        ]
-        # fmt: on
-
-        transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)
-        transcript_xla = processor.batch_decode(generated_ids_xla, skip_special_tokens=True)
-        self.assertListEqual(transcript, EXPECTED_TRANSCRIPT)
-        self.assertListEqual(transcript_xla, EXPECTED_TRANSCRIPT)
diff --git a/tests/models/xglm/test_modeling_flax_xglm.py b/tests/models/xglm/test_modeling_flax_xglm.py
deleted file mode 100644
index 8e0d644885..0000000000
--- a/tests/models/xglm/test_modeling_flax_xglm.py
+++ /dev/null
@@ -1,217 +0,0 @@
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import XGLMConfig, XGLMTokenizer, is_flax_available
-from transformers.testing_utils import require_flax, require_sentencepiece, slow
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask
-
-
-if is_flax_available():
-    import jax
-    import jax.numpy as jnp
-    import numpy as np
-
-    from transformers.models.xglm.modeling_flax_xglm import FlaxXGLMForCausalLM, FlaxXGLMModel
-
-
-@require_flax
-class FlaxXGLMModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=14,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_labels=True,
-        vocab_size=99,
-        d_model=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        ffn_dim=37,
-        activation_function="gelu",
-        activation_dropout=0.1,
-        attention_dropout=0.1,
-        max_position_embeddings=512,
-        initializer_range=0.02,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = d_model
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.ffn_dim = ffn_dim
-        self.activation_function = activation_function
-        self.activation_dropout = activation_dropout
-        self.attention_dropout = attention_dropout
-        self.max_position_embeddings = max_position_embeddings
-        self.initializer_range = initializer_range
-        self.scope = None
-        self.bos_token_id = 0
-        self.eos_token_id = 2
-        self.pad_token_id = 1
-
-    def prepare_config_and_inputs(self):
-        input_ids = np.clip(ids_tensor([self.batch_size, self.seq_length], self.vocab_size), 3, self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        config = XGLMConfig(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            num_layers=self.num_hidden_layers,
-            attention_heads=self.num_attention_heads,
-            ffn_dim=self.ffn_dim,
-            activation_function=self.activation_function,
-            activation_dropout=self.activation_dropout,
-            attention_dropout=self.attention_dropout,
-            max_position_embeddings=self.max_position_embeddings,
-            initializer_range=self.initializer_range,
-            use_cache=True,
-            bos_token_id=self.bos_token_id,
-            eos_token_id=self.eos_token_id,
-            pad_token_id=self.pad_token_id,
-        )
-
-        return (config, input_ids, input_mask)
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, attention_mask = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask}
-        return config, inputs_dict
-
-    def check_use_cache_forward(self, model_class_name, config, input_ids, attention_mask):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length)
-        attention_mask = jnp.ones((input_ids.shape[0], max_decoder_length), dtype="i4")
-
-        position_ids = jnp.broadcast_to(
-            jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1)
-        )
-        outputs_cache = model(
-            input_ids[:, :-1],
-            attention_mask=attention_mask,
-            past_key_values=past_key_values,
-            position_ids=position_ids,
-        )
-
-        position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model(
-            input_ids[:, -1:],
-            attention_mask=attention_mask,
-            past_key_values=outputs_cache.past_key_values,
-            position_ids=position_ids,
-        )
-
-        outputs = model(input_ids)
-
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-    def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input_ids, attention_mask):
-        max_decoder_length = 20
-        model = model_class_name(config)
-
-        attention_mask_cache = jnp.concatenate(
-            [attention_mask, jnp.zeros((attention_mask.shape[0], max_decoder_length - attention_mask.shape[1]))],
-            axis=-1,
-        )
-
-        past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length)
-        position_ids = jnp.broadcast_to(
-            jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1)
-        )
-
-        outputs_cache = model(
-            input_ids[:, :-1],
-            attention_mask=attention_mask_cache,
-            past_key_values=past_key_values,
-            position_ids=position_ids,
-        )
-        position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4")
-        outputs_cache_next = model(
-            input_ids[:, -1:],
-            past_key_values=outputs_cache.past_key_values,
-            attention_mask=attention_mask_cache,
-            position_ids=position_ids,
-        )
-
-        outputs = model(input_ids, attention_mask=attention_mask)
-        diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))
-        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
-
-
-@require_sentencepiece
-@require_flax
-class FlaxXGLMModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    all_model_classes = (FlaxXGLMModel, FlaxXGLMForCausalLM) if is_flax_available() else ()
-
-    def setUp(self):
-        self.model_tester = FlaxXGLMModelTester(self)
-
-    def test_use_cache_forward(self):
-        for model_class_name in self.all_model_classes:
-            config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs()
-            self.model_tester.check_use_cache_forward(model_class_name, config, input_ids, attention_mask)
-
-    def test_use_cache_forward_with_attn_mask(self):
-        for model_class_name in self.all_model_classes:
-            config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs()
-            self.model_tester.check_use_cache_forward_with_attn_mask(
-                model_class_name, config, input_ids, attention_mask
-            )
-
-    @slow
-    def test_batch_generation(self):
-        tokenizer = XGLMTokenizer.from_pretrained("XGLM", padding_side="left")
-        inputs = tokenizer(["Hello this is a long string", "Hey"], return_tensors="np", padding=True, truncation=True)
-
-        model = FlaxXGLMForCausalLM.from_pretrained("facebook/xglm-564M")
-        model.config.num_beams = 1
-        model.config.do_sample = False
-
-        jit_generate = jax.jit(model.generate)
-
-        output_sequences = jit_generate(inputs["input_ids"], attention_mask=inputs["attention_mask"]).sequences
-
-        output_string = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
-
-        expected_string = [
-            "Hello this is a long string of questions, but I'm not sure if I'm",
-            "Hey, I'm a newbie to the forum and I'",
-        ]
-
-        self.assertListEqual(output_string, expected_string)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("facebook/xglm-564M")
-            outputs = model(np.ones((1, 1)))
-            self.assertIsNotNone(outputs)
diff --git a/tests/models/xglm/test_modeling_tf_xglm.py b/tests/models/xglm/test_modeling_tf_xglm.py
deleted file mode 100644
index 1fa58bd3d3..0000000000
--- a/tests/models/xglm/test_modeling_tf_xglm.py
+++ /dev/null
@@ -1,259 +0,0 @@
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import XGLMConfig, XGLMTokenizer, is_tf_available
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers.models.xglm.modeling_tf_xglm import (
-        TFXGLMForCausalLM,
-        TFXGLMModel,
-    )
-
-
-@require_tf
-class TFXGLMModelTester:
-    config_cls = XGLMConfig
-    config_updates = {}
-    hidden_act = "gelu"
-
-    def __init__(
-        self,
-        parent,
-        batch_size=14,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_labels=True,
-        vocab_size=99,
-        d_model=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        ffn_dim=37,
-        activation_function="gelu",
-        activation_dropout=0.1,
-        attention_dropout=0.1,
-        max_position_embeddings=512,
-        initializer_range=0.02,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = d_model
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.ffn_dim = ffn_dim
-        self.activation_function = activation_function
-        self.activation_dropout = activation_dropout
-        self.attention_dropout = attention_dropout
-        self.max_position_embeddings = max_position_embeddings
-        self.initializer_range = initializer_range
-        self.scope = None
-        self.bos_token_id = 0
-        self.eos_token_id = 2
-        self.pad_token_id = 1
-
-    def get_large_model_config(self):
-        return XGLMConfig.from_pretrained("facebook/xglm-564M")
-
-    def prepare_config_and_inputs(self):
-        input_ids = tf.clip_by_value(
-            ids_tensor([self.batch_size, self.seq_length], self.vocab_size), clip_value_min=0, clip_value_max=3
-        )
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        config = self.get_config()
-
-        head_mask = floats_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
-
-        return (
-            config,
-            input_ids,
-            input_mask,
-            head_mask,
-        )
-
-    def get_config(self):
-        return XGLMConfig(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            num_layers=self.num_hidden_layers,
-            attention_heads=self.num_attention_heads,
-            ffn_dim=self.ffn_dim,
-            activation_function=self.activation_function,
-            activation_dropout=self.activation_dropout,
-            attention_dropout=self.attention_dropout,
-            max_position_embeddings=self.max_position_embeddings,
-            initializer_range=self.initializer_range,
-            use_cache=True,
-            bos_token_id=self.bos_token_id,
-            eos_token_id=self.eos_token_id,
-            pad_token_id=self.pad_token_id,
-            return_dict=True,
-        )
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-
-        (
-            config,
-            input_ids,
-            input_mask,
-            head_mask,
-        ) = config_and_inputs
-
-        inputs_dict = {
-            "input_ids": input_ids,
-            "head_mask": head_mask,
-        }
-
-        return config, inputs_dict
-
-
-@require_tf
-class TFXGLMModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (TFXGLMModel, TFXGLMForCausalLM) if is_tf_available() else ()
-    all_generative_model_classes = (TFXGLMForCausalLM,) if is_tf_available() else ()
-    pipeline_model_mapping = (
-        {"feature-extraction": TFXGLMModel, "text-generation": TFXGLMForCausalLM} if is_tf_available() else {}
-    )
-    test_onnx = False
-    test_missing_keys = False
-    test_pruning = False
-
-    def setUp(self):
-        self.model_tester = TFXGLMModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=XGLMConfig, n_embd=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "facebook/xglm-564M"
-        model = TFXGLMModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    @unittest.skip(reason="Currently, model embeddings are going to undergo a major refactor.")
-    def test_resize_token_embeddings(self):
-        super().test_resize_token_embeddings()
-
-
-@require_tf
-class TFXGLMModelLanguageGenerationTest(unittest.TestCase):
-    @slow
-    def test_lm_generate_xglm(self, verify_outputs=True):
-        model = TFXGLMForCausalLM.from_pretrained("facebook/xglm-564M")
-        input_ids = tf.convert_to_tensor([[2, 268, 9865]], dtype=tf.int32)  # The dog
-        # </s> The dog is a very friendly dog. He is very affectionate and loves to play with other
-        expected_output_ids = [2, 268, 9865, 67, 11, 1988, 57252, 9865, 5, 984, 67, 1988, 213838, 1658, 53, 70446, 33, 6657, 278, 1581]  # fmt: skip
-        output_ids = model.generate(input_ids, do_sample=False, num_beams=1)
-        if verify_outputs:
-            self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids)
-
-    @slow
-    def test_xglm_sample(self):
-        tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
-        model = TFXGLMForCausalLM.from_pretrained("facebook/xglm-564M")
-
-        tf.random.set_seed(0)
-        tokenized = tokenizer("Today is a nice day and", return_tensors="tf")
-        input_ids = tokenized.input_ids
-        # forces the generation to happen on CPU, to avoid GPU-related quirks (and assure same output regardless of the available devices)
-        with tf.device(":/CPU:0"):
-            output_ids = model.generate(input_ids, do_sample=True, seed=[7, 0])
-        output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-
-        EXPECTED_OUTPUT_STR = (
-            "Today is a nice day and warm evening here over Southern Alberta!! Today when they closed schools due"
-        )
-        self.assertEqual(output_str, EXPECTED_OUTPUT_STR)
-
-    @slow
-    def test_batch_generation(self):
-        model = TFXGLMForCausalLM.from_pretrained("facebook/xglm-564M")
-        tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
-
-        tokenizer.padding_side = "left"
-
-        # use different length sentences to test batching
-        sentences = [
-            "This is an extremely long sentence that only exists to test the ability of the model to cope with "
-            "left-padding, such as in batched generation. The output for the sequence below should be the same "
-            "regardless of whether left padding is applied or not. When",
-            "Hello, my dog is a little",
-        ]
-
-        inputs = tokenizer(sentences, return_tensors="tf", padding=True)
-        input_ids = inputs["input_ids"]
-
-        outputs = model.generate(input_ids=input_ids, attention_mask=inputs["attention_mask"], max_new_tokens=12)
-
-        inputs_non_padded = tokenizer(sentences[0], return_tensors="tf").input_ids
-        output_non_padded = model.generate(input_ids=inputs_non_padded, max_new_tokens=12)
-
-        inputs_padded = tokenizer(sentences[1], return_tensors="tf").input_ids
-        output_padded = model.generate(input_ids=inputs_padded, max_new_tokens=12)
-
-        batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-        non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
-        padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)
-
-        expected_output_sentence = [
-            "This is an extremely long sentence that only exists to test the ability of the model to cope with "
-            "left-padding, such as in batched generation. The output for the sequence below should be the same "
-            "regardless of whether left padding is applied or not. When left padding is applied, the sequence will be "
-            "a single",
-            "Hello, my dog is a little bit of a shy one, but he is very friendly",
-        ]
-        self.assertListEqual(expected_output_sentence, batch_out_sentence)
-        self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])
-
-    @slow
-    def test_loss_with_padding(self):
-        tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
-        model = TFXGLMForCausalLM.from_pretrained("facebook/xglm-564M")
-
-        tokenizer.padding_side = "right"
-
-        sequence = "Sequence"
-
-        tokenized_non_padded = tokenizer(sequence, return_tensors="tf")
-        labels_non_padded = tokenized_non_padded.input_ids
-        loss_non_padded = model(tokenized_non_padded, labels=labels_non_padded).loss
-
-        tokenized_padded = tokenizer(sequence, padding="max_length", max_length=16, return_tensors="tf")
-        labels_padded = tokenized_padded.input_ids
-        labels_padded = tf.where(labels_padded == tokenizer.pad_token_id, -100, labels_padded)
-        loss_padded = model(tokenized_padded, labels=labels_padded).loss
-
-        tf.debugging.assert_near(loss_non_padded, loss_padded, atol=1e-3)
diff --git a/tests/models/xlm/test_modeling_tf_xlm.py b/tests/models/xlm/test_modeling_tf_xlm.py
deleted file mode 100644
index b0a20ce0c6..0000000000
--- a/tests/models/xlm/test_modeling_tf_xlm.py
+++ /dev/null
@@ -1,403 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import is_tf_available
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import (
-        TFXLMForMultipleChoice,
-        TFXLMForQuestionAnsweringSimple,
-        TFXLMForSequenceClassification,
-        TFXLMForTokenClassification,
-        TFXLMModel,
-        TFXLMWithLMHeadModel,
-        XLMConfig,
-    )
-
-
-class TFXLMModelTester:
-    def __init__(
-        self,
-        parent,
-    ):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.is_training = True
-        self.use_input_lengths = True
-        self.use_token_type_ids = True
-        self.use_labels = True
-        self.gelu_activation = True
-        self.sinusoidal_embeddings = False
-        self.causal = False
-        self.asm = False
-        self.n_langs = 2
-        self.vocab_size = 99
-        self.n_special = 0
-        self.hidden_size = 32
-        self.num_hidden_layers = 2
-        self.num_attention_heads = 4
-        self.hidden_dropout_prob = 0.1
-        self.attention_probs_dropout_prob = 0.1
-        self.max_position_embeddings = 512
-        self.type_vocab_size = 16
-        self.type_sequence_label_size = 2
-        self.initializer_range = 0.02
-        self.num_labels = 3
-        self.num_choices = 4
-        self.summary_type = "last"
-        self.use_proj = True
-        self.scope = None
-        self.bos_token_id = 0
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-        input_mask = random_attention_mask([self.batch_size, self.seq_length], dtype=tf.float32)
-
-        input_lengths = None
-        if self.use_input_lengths:
-            input_lengths = (
-                ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2
-            )  # small variation of seq_length
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs)
-
-        sequence_labels = None
-        token_labels = None
-        is_impossible_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = XLMConfig(
-            vocab_size=self.vocab_size,
-            n_special=self.n_special,
-            emb_dim=self.hidden_size,
-            n_layers=self.num_hidden_layers,
-            n_heads=self.num_attention_heads,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            gelu_activation=self.gelu_activation,
-            sinusoidal_embeddings=self.sinusoidal_embeddings,
-            asm=self.asm,
-            causal=self.causal,
-            n_langs=self.n_langs,
-            max_position_embeddings=self.max_position_embeddings,
-            initializer_range=self.initializer_range,
-            summary_type=self.summary_type,
-            use_proj=self.use_proj,
-            bos_token_id=self.bos_token_id,
-        )
-
-        return (
-            config,
-            input_ids,
-            token_type_ids,
-            input_lengths,
-            sequence_labels,
-            token_labels,
-            is_impossible_labels,
-            choice_labels,
-            input_mask,
-        )
-
-    def create_and_check_xlm_model(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_lengths,
-        sequence_labels,
-        token_labels,
-        is_impossible_labels,
-        choice_labels,
-        input_mask,
-    ):
-        model = TFXLMModel(config=config)
-        inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids}
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_xlm_lm_head(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_lengths,
-        sequence_labels,
-        token_labels,
-        is_impossible_labels,
-        choice_labels,
-        input_mask,
-    ):
-        model = TFXLMWithLMHeadModel(config)
-
-        inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids}
-        outputs = model(inputs)
-
-        result = outputs
-
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_xlm_qa(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_lengths,
-        sequence_labels,
-        token_labels,
-        is_impossible_labels,
-        choice_labels,
-        input_mask,
-    ):
-        model = TFXLMForQuestionAnsweringSimple(config)
-
-        inputs = {"input_ids": input_ids, "lengths": input_lengths}
-
-        result = model(inputs)
-
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def create_and_check_xlm_sequence_classif(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_lengths,
-        sequence_labels,
-        token_labels,
-        is_impossible_labels,
-        choice_labels,
-        input_mask,
-    ):
-        model = TFXLMForSequenceClassification(config)
-
-        inputs = {"input_ids": input_ids, "lengths": input_lengths}
-
-        result = model(inputs)
-
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-    def create_and_check_xlm_for_token_classification(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_lengths,
-        sequence_labels,
-        token_labels,
-        is_impossible_labels,
-        choice_labels,
-        input_mask,
-    ):
-        config.num_labels = self.num_labels
-        model = TFXLMForTokenClassification(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def create_and_check_xlm_for_multiple_choice(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_lengths,
-        sequence_labels,
-        token_labels,
-        is_impossible_labels,
-        choice_labels,
-        input_mask,
-    ):
-        config.num_choices = self.num_choices
-        model = TFXLMForMultipleChoice(config=config)
-        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
-        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
-        inputs = {
-            "input_ids": multiple_choice_inputs_ids,
-            "attention_mask": multiple_choice_input_mask,
-            "token_type_ids": multiple_choice_token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_lengths,
-            sequence_labels,
-            token_labels,
-            is_impossible_labels,
-            choice_labels,
-            input_mask,
-        ) = config_and_inputs
-        inputs_dict = {
-            "input_ids": input_ids,
-            "token_type_ids": token_type_ids,
-            "langs": token_type_ids,
-            "lengths": input_lengths,
-        }
-        return config, inputs_dict
-
-
-@require_tf
-class TFXLMModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFXLMModel,
-            TFXLMWithLMHeadModel,
-            TFXLMForSequenceClassification,
-            TFXLMForQuestionAnsweringSimple,
-            TFXLMForTokenClassification,
-            TFXLMForMultipleChoice,
-        )
-        if is_tf_available()
-        else ()
-    )
-    all_generative_model_classes = (
-        (TFXLMWithLMHeadModel,) if is_tf_available() else ()
-    )  # TODO (PVP): Check other models whether language generation is also applicable
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFXLMModel,
-            "fill-mask": TFXLMWithLMHeadModel,
-            "question-answering": TFXLMForQuestionAnsweringSimple,
-            "text-classification": TFXLMForSequenceClassification,
-            "text-generation": TFXLMWithLMHeadModel,
-            "token-classification": TFXLMForTokenClassification,
-            "zero-shot": TFXLMForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_head_masking = False
-    test_onnx = False
-
-    # TODO: Fix the failed tests
-    def is_pipeline_test_to_skip(
-        self,
-        pipeline_test_case_name,
-        config_class,
-        model_architecture,
-        tokenizer_name,
-        image_processor_name,
-        feature_extractor_name,
-        processor_name,
-    ):
-        if (
-            pipeline_test_case_name == "QAPipelineTests"
-            and tokenizer_name is not None
-            and not tokenizer_name.endswith("Fast")
-        ):
-            # `QAPipelineTests` fails for a few models when the slower tokenizer are used.
-            # (The slower tokenizers were never used for pipeline tests before the pipeline testing rework)
-            # TODO: check (and possibly fix) the `QAPipelineTests` with slower tokenizer
-            return True
-
-        return False
-
-    def setUp(self):
-        self.model_tester = TFXLMModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_xlm_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlm_model(*config_and_inputs)
-
-    def test_xlm_lm_head(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlm_lm_head(*config_and_inputs)
-
-    def test_xlm_qa(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlm_qa(*config_and_inputs)
-
-    def test_xlm_sequence_classif(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlm_for_token_classification(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlm_for_multiple_choice(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "FacebookAI/xlm-mlm-en-2048"
-        model = TFXLMModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-@require_tf
-class TFXLMModelLanguageGenerationTest(unittest.TestCase):
-    @slow
-    def test_lm_generate_xlm_mlm_en_2048(self):
-        model = TFXLMWithLMHeadModel.from_pretrained("FacebookAI/xlm-mlm-en-2048")
-        input_ids = tf.convert_to_tensor([[14, 447]], dtype=tf.int32)  # the president
-        expected_output_ids = [
-            14,
-            447,
-            14,
-            447,
-            14,
-            447,
-            14,
-            447,
-            14,
-            447,
-            14,
-            447,
-            14,
-            447,
-            14,
-            447,
-            14,
-            447,
-            14,
-            447,
-        ]  # the president the president the president the president the president the president the president the president the president the president
-        # TODO(PVP): this and other input_ids I tried for generation give pretty bad results. Not sure why. Model might just not be made for auto-regressive inference
-        output_ids = model.generate(input_ids, do_sample=False)
-        self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids)
diff --git a/tests/models/xlm_roberta/test_modeling_flax_xlm_roberta.py b/tests/models/xlm_roberta/test_modeling_flax_xlm_roberta.py
deleted file mode 100644
index 17ae593b16..0000000000
--- a/tests/models/xlm_roberta/test_modeling_flax_xlm_roberta.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import AutoTokenizer, is_flax_available
-from transformers.testing_utils import require_flax, require_sentencepiece, require_tokenizers, slow
-
-
-if is_flax_available():
-    import jax.numpy as jnp
-
-    from transformers import FlaxXLMRobertaModel
-
-
-@require_sentencepiece
-@require_tokenizers
-@require_flax
-class FlaxXLMRobertaModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_flax_xlm_roberta_base(self):
-        model = FlaxXLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
-        tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
-        text = "The dog is cute and lives in the garden house"
-        input_ids = jnp.array([tokenizer.encode(text)])
-
-        expected_output_shape = (1, 12, 768)  # batch_size, sequence_length, embedding_vector_dim
-        expected_output_values_last_dim = jnp.array(
-            [[-0.0101, 0.1218, -0.0803, 0.0801, 0.1327, 0.0776, -0.1215, 0.2383, 0.3338, 0.3106, 0.0300, 0.0252]]
-        )
-
-        output = model(input_ids)["last_hidden_state"]
-        self.assertEqual(output.shape, expected_output_shape)
-        # compare the actual values for a slice of last dim
-        self.assertTrue(jnp.allclose(output[:, :, -1], expected_output_values_last_dim, atol=1e-3))
diff --git a/tests/models/xlm_roberta/test_modeling_tf_xlm_roberta.py b/tests/models/xlm_roberta/test_modeling_tf_xlm_roberta.py
deleted file mode 100644
index f28038bc22..0000000000
--- a/tests/models/xlm_roberta/test_modeling_tf_xlm_roberta.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import is_tf_available
-from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
-
-
-if is_tf_available():
-    import numpy as np
-    import tensorflow as tf
-
-    from transformers import TFXLMRobertaModel
-
-
-@require_tf
-@require_sentencepiece
-@require_tokenizers
-class TFFlaubertModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_output_embeds_base_model(self):
-        model = TFXLMRobertaModel.from_pretrained("jplu/tf-xlm-roberta-base")
-
-        features = {
-            "input_ids": tf.convert_to_tensor([[0, 2646, 10269, 83, 99942, 2]], dtype=tf.int32),  # "My dog is cute"
-            "attention_mask": tf.convert_to_tensor([[1, 1, 1, 1, 1, 1]], dtype=tf.int32),
-        }
-
-        output = model(features)["last_hidden_state"]
-        expected_shape = tf.TensorShape((1, 6, 768))
-        self.assertEqual(output.shape, expected_shape)
-        # compare the actual values for a slice.
-        expected_slice = tf.convert_to_tensor(
-            [
-                [
-                    [0.0681762, 0.10894451, 0.06772504],
-                    [-0.06423668, 0.02366615, 0.04329344],
-                    [-0.06057295, 0.09974135, -0.00070584],
-                ]
-            ],
-            dtype=tf.float32,
-        )
-
-        self.assertTrue(np.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4))
diff --git a/tests/models/xlnet/test_modeling_tf_xlnet.py b/tests/models/xlnet/test_modeling_tf_xlnet.py
deleted file mode 100644
index e9708a5807..0000000000
--- a/tests/models/xlnet/test_modeling_tf_xlnet.py
+++ /dev/null
@@ -1,540 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import inspect
-import random
-import unittest
-
-from transformers import XLNetConfig, is_tf_available
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers.models.xlnet.modeling_tf_xlnet import (
-        TFXLNetForMultipleChoice,
-        TFXLNetForQuestionAnsweringSimple,
-        TFXLNetForSequenceClassification,
-        TFXLNetForTokenClassification,
-        TFXLNetLMHeadModel,
-        TFXLNetModel,
-    )
-
-
-class TFXLNetModelTester:
-    def __init__(
-        self,
-        parent,
-    ):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.mem_len = 10
-        # self.key_len = seq_length + mem_len
-        self.clamp_len = -1
-        self.reuse_len = 15
-        self.is_training = True
-        self.use_labels = True
-        self.vocab_size = 99
-        self.cutoffs = [10, 50, 80]
-        self.hidden_size = 32
-        self.num_attention_heads = 4
-        self.d_inner = 128
-        self.num_hidden_layers = 2
-        self.type_sequence_label_size = 2
-        self.untie_r = True
-        self.bi_data = False
-        self.same_length = False
-        self.initializer_range = 0.05
-        self.seed = 1
-        self.type_vocab_size = 2
-        self.bos_token_id = 1
-        self.eos_token_id = 2
-        self.pad_token_id = 5
-        self.num_choices = 4
-
-    def prepare_config_and_inputs(self):
-        input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-        input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-        segment_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-        input_mask = random_attention_mask([self.batch_size, self.seq_length], dtype=tf.float32)
-
-        input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size)
-        perm_mask = tf.zeros((self.batch_size, self.seq_length + 1, self.seq_length), dtype=tf.float32)
-        perm_mask_last = tf.ones((self.batch_size, self.seq_length + 1, 1), dtype=tf.float32)
-        perm_mask = tf.concat([perm_mask, perm_mask_last], axis=-1)
-        # perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
-        target_mapping = tf.zeros((self.batch_size, 1, self.seq_length), dtype=tf.float32)
-        target_mapping_last = tf.ones((self.batch_size, 1, 1), dtype=tf.float32)
-        target_mapping = tf.concat([target_mapping, target_mapping_last], axis=-1)
-        # target_mapping[:, 0, -1] = 1.0  # predict last token
-
-        sequence_labels = None
-        lm_labels = None
-        is_impossible_labels = None
-        if self.use_labels:
-            lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
-
-        config = XLNetConfig(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            n_head=self.num_attention_heads,
-            d_inner=self.d_inner,
-            n_layer=self.num_hidden_layers,
-            untie_r=self.untie_r,
-            mem_len=self.mem_len,
-            clamp_len=self.clamp_len,
-            same_length=self.same_length,
-            reuse_len=self.reuse_len,
-            bi_data=self.bi_data,
-            initializer_range=self.initializer_range,
-            num_labels=self.type_sequence_label_size,
-            bos_token_id=self.bos_token_id,
-            pad_token_id=self.pad_token_id,
-            eos_token_id=self.eos_token_id,
-        )
-
-        return (
-            config,
-            input_ids_1,
-            input_ids_2,
-            input_ids_q,
-            perm_mask,
-            input_mask,
-            target_mapping,
-            segment_ids,
-            lm_labels,
-            sequence_labels,
-            is_impossible_labels,
-        )
-
-    def set_seed(self):
-        random.seed(self.seed)
-        tf.random.set_seed(self.seed)
-
-    def create_and_check_xlnet_base_model(
-        self,
-        config,
-        input_ids_1,
-        input_ids_2,
-        input_ids_q,
-        perm_mask,
-        input_mask,
-        target_mapping,
-        segment_ids,
-        lm_labels,
-        sequence_labels,
-        is_impossible_labels,
-    ):
-        model = TFXLNetModel(config)
-
-        inputs = {"input_ids": input_ids_1, "input_mask": input_mask, "token_type_ids": segment_ids}
-        result = model(inputs)
-
-        inputs = [input_ids_1, input_mask]
-        result = model(inputs)
-
-        config.use_mems_eval = False
-        model = TFXLNetModel(config)
-        no_mems_outputs = model(inputs)
-        self.parent.assertEqual(len(no_mems_outputs), 1)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertListEqual(
-            [mem.shape for mem in result.mems],
-            [(self.seq_length, self.batch_size, self.hidden_size)] * self.num_hidden_layers,
-        )
-
-    def create_and_check_xlnet_lm_head(
-        self,
-        config,
-        input_ids_1,
-        input_ids_2,
-        input_ids_q,
-        perm_mask,
-        input_mask,
-        target_mapping,
-        segment_ids,
-        lm_labels,
-        sequence_labels,
-        is_impossible_labels,
-    ):
-        model = TFXLNetLMHeadModel(config)
-
-        inputs_1 = {"input_ids": input_ids_1, "token_type_ids": segment_ids}
-        all_logits_1, mems_1 = model(inputs_1).to_tuple()
-
-        inputs_2 = {"input_ids": input_ids_2, "mems": mems_1, "token_type_ids": segment_ids}
-        all_logits_2, mems_2 = model(inputs_2).to_tuple()
-
-        inputs_3 = {"input_ids": input_ids_q, "perm_mask": perm_mask, "target_mapping": target_mapping}
-        logits, _ = model(inputs_3).to_tuple()
-
-        self.parent.assertEqual(all_logits_1.shape, (self.batch_size, self.seq_length, self.vocab_size))
-        self.parent.assertListEqual(
-            [mem.shape for mem in mems_1],
-            [(self.seq_length, self.batch_size, self.hidden_size)] * self.num_hidden_layers,
-        )
-        self.parent.assertEqual(all_logits_2.shape, (self.batch_size, self.seq_length, self.vocab_size))
-        self.parent.assertListEqual(
-            [mem.shape for mem in mems_2],
-            [(self.mem_len, self.batch_size, self.hidden_size)] * self.num_hidden_layers,
-        )
-
-    def create_and_check_xlnet_qa(
-        self,
-        config,
-        input_ids_1,
-        input_ids_2,
-        input_ids_q,
-        perm_mask,
-        input_mask,
-        target_mapping,
-        segment_ids,
-        lm_labels,
-        sequence_labels,
-        is_impossible_labels,
-    ):
-        model = TFXLNetForQuestionAnsweringSimple(config)
-
-        inputs = {"input_ids": input_ids_1, "attention_mask": input_mask, "token_type_ids": segment_ids}
-        result = model(inputs)
-
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertListEqual(
-            [mem.shape for mem in result.mems],
-            [(self.seq_length, self.batch_size, self.hidden_size)] * self.num_hidden_layers,
-        )
-
-    def create_and_check_xlnet_sequence_classif(
-        self,
-        config,
-        input_ids_1,
-        input_ids_2,
-        input_ids_q,
-        perm_mask,
-        input_mask,
-        target_mapping,
-        segment_ids,
-        lm_labels,
-        sequence_labels,
-        is_impossible_labels,
-    ):
-        model = TFXLNetForSequenceClassification(config)
-
-        result = model(input_ids_1)
-
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-        self.parent.assertListEqual(
-            [mem.shape for mem in result.mems],
-            [(self.seq_length, self.batch_size, self.hidden_size)] * self.num_hidden_layers,
-        )
-
-    def create_and_check_xlnet_for_token_classification(
-        self,
-        config,
-        input_ids_1,
-        input_ids_2,
-        input_ids_q,
-        perm_mask,
-        input_mask,
-        target_mapping,
-        segment_ids,
-        lm_labels,
-        sequence_labels,
-        is_impossible_labels,
-    ):
-        config.num_labels = input_ids_1.shape[1]
-        model = TFXLNetForTokenClassification(config)
-        inputs = {
-            "input_ids": input_ids_1,
-            "attention_mask": input_mask,
-            # 'token_type_ids': token_type_ids
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, config.num_labels))
-        self.parent.assertListEqual(
-            [mem.shape for mem in result.mems],
-            [(self.seq_length, self.batch_size, self.hidden_size)] * self.num_hidden_layers,
-        )
-
-    def create_and_check_xlnet_for_multiple_choice(
-        self,
-        config,
-        input_ids_1,
-        input_ids_2,
-        input_ids_q,
-        perm_mask,
-        input_mask,
-        target_mapping,
-        segment_ids,
-        lm_labels,
-        sequence_labels,
-        is_impossible_labels,
-    ):
-        config.num_choices = self.num_choices
-        model = TFXLNetForMultipleChoice(config=config)
-        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids_1, 1), (1, self.num_choices, 1))
-        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(segment_ids, 1), (1, self.num_choices, 1))
-        inputs = {
-            "input_ids": multiple_choice_inputs_ids,
-            "attention_mask": multiple_choice_input_mask,
-            "token_type_ids": multiple_choice_token_type_ids,
-        }
-        result = model(inputs)
-
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-        self.parent.assertListEqual(
-            [mem.shape for mem in result.mems],
-            [(self.seq_length, self.batch_size * self.num_choices, self.hidden_size)] * self.num_hidden_layers,
-        )
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids_1,
-            input_ids_2,
-            input_ids_q,
-            perm_mask,
-            input_mask,
-            target_mapping,
-            segment_ids,
-            lm_labels,
-            sequence_labels,
-            is_impossible_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids_1}
-        return config, inputs_dict
-
-
-@require_tf
-class TFXLNetModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFXLNetModel,
-            TFXLNetLMHeadModel,
-            TFXLNetForSequenceClassification,
-            TFXLNetForTokenClassification,
-            TFXLNetForQuestionAnsweringSimple,
-            TFXLNetForMultipleChoice,
-        )
-        if is_tf_available()
-        else ()
-    )
-    all_generative_model_classes = (
-        (TFXLNetLMHeadModel,) if is_tf_available() else ()
-    )  # TODO (PVP): Check other models whether language generation is also applicable
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFXLNetModel,
-            "question-answering": TFXLNetForQuestionAnsweringSimple,
-            "text-classification": TFXLNetForSequenceClassification,
-            "text-generation": TFXLNetLMHeadModel,
-            "token-classification": TFXLNetForTokenClassification,
-            "zero-shot": TFXLNetForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_head_masking = False
-    test_onnx = False
-
-    # Note that `TFXLNetModelTest` is not a subclass of `GenerationTesterMixin`, so no contrastive generation tests
-    # from there is run against `TFXLNetModel`.
-    @unittest.skip("XLNet has special cache mechanism and is currently not working with contrastive generation")
-    def test_xla_generate_contrastive(self):
-        super().test_xla_generate_contrastive()
-
-    # TODO: Fix the failed tests
-    def is_pipeline_test_to_skip(
-        self,
-        pipeline_test_case_name,
-        config_class,
-        model_architecture,
-        tokenizer_name,
-        image_processor_name,
-        feature_extractor_name,
-        processor_name,
-    ):
-        # Exception encountered when calling layer '...'
-        return True
-
-    def setUp(self):
-        self.model_tester = TFXLNetModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_xlnet_base_model(self):
-        self.model_tester.set_seed()
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlnet_base_model(*config_and_inputs)
-
-    def test_xlnet_lm_head(self):
-        self.model_tester.set_seed()
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs)
-
-    def test_xlnet_sequence_classif(self):
-        self.model_tester.set_seed()
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlnet_sequence_classif(*config_and_inputs)
-
-    def test_xlnet_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlnet_for_token_classification(*config_and_inputs)
-
-    def test_xlnet_qa(self):
-        self.model_tester.set_seed()
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlnet_qa(*config_and_inputs)
-
-    def test_xlnet_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlnet_for_multiple_choice(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "xlnet/xlnet-base-cased"
-        model = TFXLNetModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    @unittest.skip("Some of the XLNet models misbehave with flexible input shapes.")
-    def test_compile_tf_model(self):
-        pass
-
-    # overwrite since `TFXLNetLMHeadModel` doesn't cut logits/labels
-    def test_loss_computation(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            if getattr(model, "hf_compute_loss", None):
-                # The number of elements in the loss should be the same as the number of elements in the label
-                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-                added_label = prepared_for_class[
-                    sorted(prepared_for_class.keys() - inputs_dict.keys(), reverse=True)[0]
-                ]
-                expected_loss_size = added_label.shape.as_list()[:1]
-
-                # `TFXLNetLMHeadModel` doesn't cut logits/labels
-                # if model.__class__ in get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING):
-                #     # if loss is causal lm loss, labels are shift, so that one label per batch
-                #     # is cut
-                #     loss_size = loss_size - self.model_tester.batch_size
-
-                # Test that model correctly compute the loss with kwargs
-                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-                input_name = "input_ids" if "input_ids" in prepared_for_class else "pixel_values"
-                input_ids = prepared_for_class.pop(input_name)
-
-                loss = model(input_ids, **prepared_for_class)[0]
-                self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
-
-                # Test that model correctly compute the loss with a dict
-                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-                loss = model(prepared_for_class)[0]
-                self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
-
-                # Test that model correctly compute the loss with a tuple
-                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-
-                # Get keys that were added with the _prepare_for_class function
-                label_keys = prepared_for_class.keys() - inputs_dict.keys()
-                signature = inspect.signature(model.call).parameters
-                signature_names = list(signature.keys())
-
-                # Create a dictionary holding the location of the tensors in the tuple
-                tuple_index_mapping = {0: input_name}
-                for label_key in label_keys:
-                    label_key_index = signature_names.index(label_key)
-                    tuple_index_mapping[label_key_index] = label_key
-                sorted_tuple_index_mapping = sorted(tuple_index_mapping.items())
-                # Initialize a list with their default values, update the values and convert to a tuple
-                list_input = []
-
-                for name in signature_names:
-                    if name != "kwargs":
-                        list_input.append(signature[name].default)
-
-                for index, value in sorted_tuple_index_mapping:
-                    list_input[index] = prepared_for_class[value]
-
-                tuple_input = tuple(list_input)
-
-                # Send to model
-                loss = model(tuple_input[:-1])[0]
-
-                self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
-
-
-@require_tf
-class TFXLNetModelLanguageGenerationTest(unittest.TestCase):
-    @slow
-    def test_lm_generate_xlnet_base_cased(self):
-        model = TFXLNetLMHeadModel.from_pretrained("xlnet/xlnet-base-cased")
-        # fmt: off
-        input_ids = tf.convert_to_tensor(
-            [
-                [
-                    67, 2840, 19, 18, 1484, 20, 965, 29077, 8719, 1273, 21, 45, 273, 17, 10, 15048, 28, 27511, 21, 4185, 11, 41, 2444, 9, 32, 1025, 20, 8719, 26, 23, 673, 966, 19, 29077, 20643, 27511, 20822, 20643, 19, 17, 6616, 17511, 18, 8978, 20, 18, 777, 9, 19233, 1527, 17669, 19, 24, 673, 17, 28756, 150, 12943, 4354, 153, 27, 442, 37, 45, 668, 21, 24, 256, 20, 416, 22, 2771, 4901, 9, 12943, 4354, 153, 51, 24, 3004, 21, 28142, 23, 65, 20, 18, 416, 34, 24, 2958, 22947, 9, 1177, 45, 668, 3097, 13768, 23, 103, 28, 441, 148, 48, 20522, 19, 12943, 4354, 153, 12860, 34, 18, 326, 27, 17492, 684, 21, 6709, 9, 8585, 123, 266, 19, 12943, 4354, 153, 6872, 24, 3004, 20, 18, 9225, 2198, 19, 12717, 103, 22, 401, 24, 6348, 9, 12943, 4354, 153, 1068, 2768, 2286, 19, 33, 104, 19, 176, 24, 9313, 19, 20086, 28, 45, 10292, 9, 4, 3,
-                ]
-            ],
-            dtype=tf.int32,
-        )
-        # fmt: on
-
-        #  In 1991, the remains of Russian Tsar Nicholas II and his family
-        #  (except for Alexei and Maria) are discovered.
-        #  The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
-        #  remainder of the story. 1883 Western Siberia,
-        #  a young Grigori Rasputin is asked by his father and a group of men to perform magic.
-        #  Rasputin has a vision and denounces one of the men as a horse thief. Although his
-        #  father initially slaps him for making such an accusation, Rasputin watches as the
-        #  man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
-        #  the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
-        #  with people, even a bishop, begging for his blessing. """
-
-        # fmt: off
-        expected_output_ids = [
-            67, 2840, 19, 18, 1484, 20, 965, 29077, 8719, 1273, 21, 45, 273, 17, 10, 15048, 28, 27511, 21, 4185, 11, 41, 2444, 9, 32, 1025, 20, 8719, 26, 23, 673, 966, 19, 29077, 20643, 27511, 20822, 20643, 19, 17, 6616, 17511, 18, 8978, 20, 18, 777, 9, 19233, 1527, 17669, 19, 24, 673, 17, 28756, 150, 12943, 4354, 153, 27, 442, 37, 45, 668, 21, 24, 256, 20, 416, 22, 2771, 4901, 9, 12943, 4354, 153, 51, 24, 3004, 21, 28142, 23, 65, 20, 18, 416, 34, 24, 2958, 22947, 9, 1177, 45, 668, 3097, 13768, 23, 103, 28, 441, 148, 48, 20522, 19, 12943, 4354, 153, 12860, 34, 18, 326, 27, 17492, 684, 21, 6709, 9, 8585, 123, 266, 19, 12943, 4354, 153, 6872, 24, 3004, 20, 18, 9225, 2198, 19, 12717, 103, 22, 401, 24, 6348, 9, 12943, 4354, 153, 1068, 2768, 2286, 19, 33, 104, 19, 176, 24, 9313, 19, 20086, 28, 45, 10292, 9, 4, 3, 19, 12943, 4354, 153, 27, 442, 22, 2771, 4901, 9, 69, 27, 442, 22, 2771, 24, 11335, 20, 18, 9225, 2198, 9, 69, 27, 442, 22, 2771, 24, 11335, 20, 18, 9225, 2198, 9, 69, 27, 442, 22, 2771,
-        ]
-        # fmt: on
-        #  In 1991, the remains of Russian Tsar Nicholas II and his family (except for Alexei and Maria)
-        #  are discovered. The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich,
-        #  narrates the remainder of the story. 1883 Western Siberia, a young Grigori Rasputin
-        #  is asked by his father and a group of men to perform magic. Rasputin has a vision and
-        #  denounces one of the men as a horse thief. Although his father initially slaps
-        #  him for making such an accusation, Rasputin watches as the man is chased outside and beaten.
-        #  Twenty years later, Rasputin sees a vision of the Virgin Mary, prompting him to become a priest.
-        #  Rasputin quickly becomes famous, with people, even a bishop, begging for his blessing.
-        #  <sep><cls>, Rasputin is asked to perform magic. He is asked to perform a ritual of the Virgin Mary.
-        #  He is asked to perform a ritual of the Virgin Mary. He is asked to perform
-
-        output_ids = model.generate(input_ids, max_length=200, do_sample=False)
-
-        self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids)
diff --git a/tests/sagemaker/scripts/tensorflow/run_tf_dist.py b/tests/sagemaker/scripts/tensorflow/run_tf_dist.py
deleted file mode 100644
index 87e9626826..0000000000
--- a/tests/sagemaker/scripts/tensorflow/run_tf_dist.py
+++ /dev/null
@@ -1,192 +0,0 @@
-import argparse
-import logging
-import os
-import sys
-import time
-
-import tensorflow as tf
-from datasets import load_dataset
-from tqdm import tqdm
-
-from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
-from transformers.modeling_tf_utils import keras
-from transformers.utils import is_sagemaker_dp_enabled
-
-
-if os.environ.get("SDP_ENABLED") or is_sagemaker_dp_enabled():
-    SDP_ENABLED = True
-    os.environ["SAGEMAKER_INSTANCE_TYPE"] = "p3dn.24xlarge"
-    import smdistributed.dataparallel.tensorflow as sdp
-else:
-    SDP_ENABLED = False
-
-
-def fit(model, loss, opt, train_dataset, epochs, train_batch_size, max_steps=None):
-    pbar = tqdm(train_dataset)
-    for i, batch in enumerate(pbar):
-        with tf.GradientTape() as tape:
-            inputs, targets = batch
-            outputs = model(batch)
-            loss_value = loss(targets, outputs.logits)
-
-        if SDP_ENABLED:
-            tape = sdp.DistributedGradientTape(tape, sparse_as_dense=True)
-
-        grads = tape.gradient(loss_value, model.trainable_variables)
-        opt.apply_gradients(zip(grads, model.trainable_variables))
-
-        pbar.set_description(f"Loss: {loss_value:.4f}")
-
-        if SDP_ENABLED and i == 0:
-            sdp.broadcast_variables(model.variables, root_rank=0)
-            sdp.broadcast_variables(opt.variables(), root_rank=0)
-
-        if max_steps and i >= max_steps:
-            break
-
-    train_results = {"loss": loss_value.numpy()}
-    return train_results
-
-
-def get_datasets(tokenizer, train_batch_size, eval_batch_size):
-    # Load dataset
-    train_dataset, test_dataset = load_dataset("stanfordnlp/imdb", split=["train", "test"])
-
-    # Preprocess train dataset
-    train_dataset = train_dataset.map(
-        lambda e: tokenizer(e["text"], truncation=True, padding="max_length"), batched=True
-    )
-    train_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"])
-
-    train_features = {
-        x: train_dataset[x].to_tensor(default_value=0, shape=[None, tokenizer.model_max_length])
-        for x in ["input_ids", "attention_mask"]
-    }
-    tf_train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_dataset["label"]))
-
-    # Preprocess test dataset
-    test_dataset = test_dataset.map(
-        lambda e: tokenizer(e["text"], truncation=True, padding="max_length"), batched=True
-    )
-    test_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"])
-
-    test_features = {
-        x: test_dataset[x].to_tensor(default_value=0, shape=[None, tokenizer.model_max_length])
-        for x in ["input_ids", "attention_mask"]
-    }
-    tf_test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_dataset["label"]))
-
-    if SDP_ENABLED:
-        tf_train_dataset = tf_train_dataset.shard(sdp.size(), sdp.rank())
-        tf_test_dataset = tf_test_dataset.shard(sdp.size(), sdp.rank())
-    tf_train_dataset = tf_train_dataset.batch(train_batch_size, drop_remainder=True)
-    tf_test_dataset = tf_test_dataset.batch(eval_batch_size, drop_remainder=True)
-
-    return tf_train_dataset, tf_test_dataset
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-
-    # Hyperparameters sent by the client are passed as command-line arguments to the script.
-    parser.add_argument("--epochs", type=int, default=3)
-    parser.add_argument("--per_device_train_batch_size", type=int, default=16)
-    parser.add_argument("--per_device_eval_batch_size", type=int, default=8)
-    parser.add_argument("--model_name_or_path", type=str)
-    parser.add_argument("--learning_rate", type=str, default=5e-5)
-    parser.add_argument("--do_train", type=bool, default=True)
-    parser.add_argument("--do_eval", type=bool, default=True)
-    parser.add_argument("--output_dir", type=str)
-    parser.add_argument("--max_steps", type=int, default=None)
-
-    # Data, model, and output directories
-    parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
-    parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
-    parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
-
-    args, _ = parser.parse_known_args()
-
-    # Set up logging
-    logger = logging.getLogger(__name__)
-
-    logging.basicConfig(
-        level=logging.getLevelName("INFO"),
-        handlers=[logging.StreamHandler(sys.stdout)],
-        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
-    )
-
-    if SDP_ENABLED:
-        sdp.init()
-
-        gpus = tf.config.experimental.list_physical_devices("GPU")
-        for gpu in gpus:
-            tf.config.experimental.set_memory_growth(gpu, True)
-        if gpus:
-            tf.config.experimental.set_visible_devices(gpus[sdp.local_rank()], "GPU")
-
-    # Load model and tokenizer
-    model = TFAutoModelForSequenceClassification.from_pretrained(args.model_name_or_path)
-    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
-
-    # get datasets
-    tf_train_dataset, tf_test_dataset = get_datasets(
-        tokenizer=tokenizer,
-        train_batch_size=args.per_device_train_batch_size,
-        eval_batch_size=args.per_device_eval_batch_size,
-    )
-
-    # fine optimizer and loss
-    optimizer = keras.optimizers.Adam(learning_rate=args.learning_rate)
-    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
-    metrics = [keras.metrics.SparseCategoricalAccuracy()]
-    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
-
-    # Training
-    if args.do_train:
-        # train_results = model.fit(tf_train_dataset, epochs=args.epochs, batch_size=args.train_batch_size)
-        start_train_time = time.time()
-        train_results = fit(
-            model,
-            loss,
-            optimizer,
-            tf_train_dataset,
-            args.epochs,
-            args.per_device_train_batch_size,
-            max_steps=args.max_steps,
-        )
-        end_train_time = time.time() - start_train_time
-        logger.info("*** Train ***")
-        logger.info(f"train_runtime = {end_train_time}")
-
-        output_eval_file = os.path.join(args.output_dir, "train_results.txt")
-
-        if not SDP_ENABLED or sdp.rank() == 0:
-            with open(output_eval_file, "w") as writer:
-                logger.info("***** Train results *****")
-                logger.info(train_results)
-                for key, value in train_results.items():
-                    logger.info(f"  {key} = {value}")
-                    writer.write(f"{key} = {value}\n")
-
-    # Evaluation
-    if args.do_eval and (not SDP_ENABLED or sdp.rank() == 0):
-        result = model.evaluate(tf_test_dataset, batch_size=args.per_device_eval_batch_size, return_dict=True)
-        logger.info("*** Evaluate ***")
-
-        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
-
-        with open(output_eval_file, "w") as writer:
-            logger.info("***** Eval results *****")
-            logger.info(result)
-            for key, value in result.items():
-                logger.info(f"  {key} = {value}")
-                writer.write(f"{key} = {value}\n")
-
-    # Save result
-    if SDP_ENABLED:
-        if sdp.rank() == 0:
-            model.save_pretrained(args.output_dir)
-            tokenizer.save_pretrained(args.output_dir)
-    else:
-        model.save_pretrained(args.output_dir)
-        tokenizer.save_pretrained(args.output_dir)
diff --git a/tests/test_modeling_flax_common.py b/tests/test_modeling_flax_common.py
deleted file mode 100644
index 97aedcb8e4..0000000000
--- a/tests/test_modeling_flax_common.py
+++ /dev/null
@@ -1,840 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import copy
-import inspect
-import json
-import random
-import tempfile
-
-import numpy as np
-
-from transformers import is_flax_available
-from transformers.models.auto import get_values
-from transformers.testing_utils import CaptureLogger, require_flax
-from transformers.utils import CONFIG_NAME, GENERATION_CONFIG_NAME, logging
-
-
-if is_flax_available():
-    import os
-
-    import jax
-    import jax.numpy as jnp
-    from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
-    from flax.serialization import from_bytes
-    from flax.traverse_util import flatten_dict, unflatten_dict
-
-    from transformers import (
-        FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
-        FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
-        FLAX_MODEL_MAPPING,
-        FlaxAutoModel,
-        FlaxAutoModelForSequenceClassification,
-        FlaxBertModel,
-    )
-    from transformers.modeling_flax_utils import FLAX_WEIGHTS_INDEX_NAME, FLAX_WEIGHTS_NAME
-
-    os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"] = "0.12"  # assumed parallelism: 8
-
-
-def ids_tensor(shape, vocab_size, rng=None):
-    """Creates a random int32 tensor of the shape within the vocab size."""
-    if rng is None:
-        rng = random.Random()
-
-    total_dims = 1
-    for dim in shape:
-        total_dims *= dim
-
-    values = []
-    for _ in range(total_dims):
-        values.append(rng.randint(0, vocab_size - 1))
-
-    output = np.array(values, dtype=jnp.int32).reshape(shape)
-
-    return output
-
-
-def floats_tensor(shape, scale=1.0, rng=None, name=None):
-    """Creates a random float32 tensor"""
-    if rng is None:
-        rng = random.Random()
-
-    total_dims = 1
-    for dim in shape:
-        total_dims *= dim
-
-    values = []
-    for _ in range(total_dims):
-        values.append(rng.random() * scale)
-
-    return np.array(values, dtype=jnp.float32).reshape(shape)
-
-
-def random_attention_mask(shape, rng=None):
-    attn_mask = ids_tensor(shape, vocab_size=2, rng=rng)
-    # make sure that at least one token is attended to for each batch
-    attn_mask[:, -1] = 1
-    return attn_mask
-
-
-def get_params(params, from_head_prefix=None):
-    """Function extracts relevant parameters into flatten dict from model params,
-    appends batch normalization statistics if present"""
-
-    # If Both parameters and batch normalization statistics are present
-    if "batch_stats" in params:
-        # Extract only parameters for the specified head prefix (if specified) and add batch statistics
-        if from_head_prefix is not None:
-            extracted_params = flatten_dict(unfreeze(params["params"][from_head_prefix]))
-            extracted_params.update(flatten_dict(params["batch_stats"][from_head_prefix]))
-        else:
-            extracted_params = flatten_dict(unfreeze(params["params"]))
-            extracted_params.update(flatten_dict(params["batch_stats"]))
-
-    # Only parameters are present
-    else:
-        if from_head_prefix is not None:
-            extracted_params = flatten_dict(unfreeze(params[from_head_prefix]))
-        else:
-            extracted_params = flatten_dict(unfreeze(params))
-
-    return extracted_params
-
-
-@require_flax
-class FlaxModelTesterMixin:
-    model_tester = None
-    all_model_classes = ()
-    test_mismatched_shapes = True
-    is_encoder_decoder = False
-    test_head_masking = False
-    has_attentions = True
-
-    @property
-    def all_generative_model_classes(self):
-        return tuple(model_class for model_class in self.all_model_classes if model_class.can_generate())
-
-    def _prepare_for_class(self, inputs_dict, model_class):
-        inputs_dict = copy.deepcopy(inputs_dict)
-
-        # hack for now until we have AutoModel classes
-        if "ForMultipleChoice" in model_class.__name__:
-            inputs_dict = {
-                k: jnp.broadcast_to(v[:, None], (v.shape[0], self.model_tester.num_choices, v.shape[-1]))
-                if isinstance(v, (jnp.ndarray, np.ndarray)) and k != "indices_prng_key"
-                else v
-                for k, v in inputs_dict.items()
-            }
-
-        return inputs_dict
-
-    def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float):
-        diff = np.abs(a - b).max()
-        self.assertLessEqual(diff, tol, f"Difference between torch and flax is {diff} (>= {tol}).")
-
-    def test_model_outputs_equivalence(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
-            tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs)
-            dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple()
-
-            def recursive_check(tuple_object, dict_object):
-                if isinstance(tuple_object, (list, tuple)):
-                    for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
-                        recursive_check(tuple_iterable_value, dict_iterable_value)
-                elif tuple_object is None:
-                    return
-                else:
-                    self.assert_almost_equals(jnp.nan_to_num(tuple_object), jnp.nan_to_num(dict_object), 1e-5)
-
-            recursive_check(tuple_output, dict_output)
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
-            check_equivalence(model, tuple_inputs, dict_inputs)
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
-            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
-
-    def test_from_pretrained_save_pretrained(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                model = model_class(config)
-
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                outputs = model(**prepared_inputs_dict).to_tuple()
-
-                # verify that normal save_pretrained works as expected
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    model.save_pretrained(tmpdirname)
-
-                    # the config file (and the generation config file, if it can generate) should be saved
-                    self.assertTrue(os.path.exists(os.path.join(tmpdirname, CONFIG_NAME)))
-                    self.assertEqual(
-                        model.can_generate(), os.path.exists(os.path.join(tmpdirname, GENERATION_CONFIG_NAME))
-                    )
-
-                    model_loaded = model_class.from_pretrained(tmpdirname)
-
-                outputs_loaded = model_loaded(**prepared_inputs_dict).to_tuple()
-                for output_loaded, output in zip(outputs_loaded, outputs):
-                    self.assert_almost_equals(output_loaded, output, 1e-3)
-
-                # verify that save_pretrained for distributed training
-                # with `params=params` works as expected
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    model.save_pretrained(tmpdirname, params=model.params)
-                    model_loaded = model_class.from_pretrained(tmpdirname)
-
-                outputs_loaded = model_loaded(**prepared_inputs_dict).to_tuple()
-                for output_loaded, output in zip(outputs_loaded, outputs):
-                    self.assert_almost_equals(output_loaded, output, 1e-3)
-
-    def test_save_load_from_base(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = FLAX_MODEL_MAPPING[config.__class__]
-
-        for model_class in self.all_model_classes:
-            if model_class == base_class:
-                continue
-
-            model = base_class(config)
-            base_params = get_params(model.params)
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                head_model = model_class.from_pretrained(tmpdirname)
-
-                base_param_from_head = get_params(head_model.params, from_head_prefix=head_model.base_model_prefix)
-
-                for key in base_param_from_head.keys():
-                    max_diff = (base_params[key] - base_param_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-    def test_save_load_to_base(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = FLAX_MODEL_MAPPING[config.__class__]
-
-        for model_class in self.all_model_classes:
-            if model_class == base_class:
-                continue
-
-            model = model_class(config)
-            base_params_from_head = get_params(model.params, from_head_prefix=model.base_model_prefix)
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                base_model = base_class.from_pretrained(tmpdirname)
-
-                base_params = get_params(base_model.params)
-
-                for key in base_params_from_head.keys():
-                    max_diff = (base_params[key] - base_params_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-    def test_jit_compilation(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                model = model_class(config)
-
-                @jax.jit
-                def model_jitted(input_ids, attention_mask=None, **kwargs):
-                    return model(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
-
-                with self.subTest("JIT Enabled"):
-                    jitted_outputs = model_jitted(**prepared_inputs_dict).to_tuple()
-
-                with self.subTest("JIT Disabled"):
-                    with jax.disable_jit():
-                        outputs = model_jitted(**prepared_inputs_dict).to_tuple()
-
-                self.assertEqual(len(outputs), len(jitted_outputs))
-                for jitted_output, output in zip(jitted_outputs, outputs):
-                    self.assertEqual(jitted_output.shape, output.shape)
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.__call__)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            if model.config.is_encoder_decoder:
-                expected_arg_names = [
-                    "input_ids",
-                    "attention_mask",
-                    "decoder_input_ids",
-                    "decoder_attention_mask",
-                ]
-                self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
-            else:
-                expected_arg_names = ["input_ids", "attention_mask"]
-                self.assertListEqual(arg_names[:2], expected_arg_names)
-
-    def test_naming_convention(self):
-        for model_class in self.all_model_classes:
-            model_class_name = model_class.__name__
-            module_class_name = (
-                model_class_name[:-5] + "Module" if model_class_name[-5:] == "Model" else model_class_name + "Module"
-            )
-            bert_modeling_flax_module = __import__(model_class.__module__, fromlist=[module_class_name])
-            module_cls = getattr(bert_modeling_flax_module, module_class_name)
-
-            self.assertIsNotNone(module_cls)
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
-
-            expected_num_layers = getattr(
-                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-            )
-            self.assertEqual(len(hidden_states), expected_num_layers)
-
-            if hasattr(self.model_tester, "encoder_seq_length"):
-                seq_length = self.model_tester.encoder_seq_length
-            else:
-                seq_length = self.model_tester.seq_length
-
-            self.assertListEqual(
-                list(hidden_states[0].shape[-2:]),
-                [seq_length, self.model_tester.hidden_size],
-            )
-
-            if config.is_encoder_decoder:
-                hidden_states = outputs.decoder_hidden_states
-
-                self.assertIsInstance(hidden_states, (list, tuple))
-                self.assertEqual(len(hidden_states), expected_num_layers)
-                seq_len = getattr(self.model_tester, "seq_length", None)
-                decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
-
-                self.assertListEqual(
-                    list(hidden_states[0].shape[-2:]),
-                    [decoder_seq_length, self.model_tester.hidden_size],
-                )
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-    def test_attention_outputs(self):
-        if not self.has_attentions:
-            self.skipTest(reason="Model does not output attentions")
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        seq_length = getattr(self.model_tester, "seq_length", None)
-        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_length)
-        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length)
-        decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length)
-        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = False
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-            # check that output_attentions also work using config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-            self.assertListEqual(
-                list(attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
-            )
-            out_len = len(outputs)
-
-            if self.is_encoder_decoder:
-                correct_outlen = 5
-
-                # Question Answering model returns start_logits and end_logits
-                if model_class in get_values(FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING):
-                    correct_outlen += 1  # start_logits and end_logits instead of only 1 output
-
-                self.assertEqual(out_len, correct_outlen)
-
-                # decoder attentions
-                decoder_attentions = outputs.decoder_attentions
-                self.assertIsInstance(decoder_attentions, (list, tuple))
-                self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
-                self.assertListEqual(
-                    list(decoder_attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
-                )
-
-                # cross attentions
-                cross_attentions = outputs.cross_attentions
-                self.assertIsInstance(cross_attentions, (list, tuple))
-                self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
-                self.assertListEqual(
-                    list(cross_attentions[0].shape[-3:]),
-                    [
-                        self.model_tester.num_attention_heads,
-                        decoder_seq_length,
-                        encoder_key_length,
-                    ],
-                )
-
-            # Check attention is always last and order is fine
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            if hasattr(self.model_tester, "num_hidden_states_types"):
-                added_hidden_states = self.model_tester.num_hidden_states_types
-            elif self.is_encoder_decoder:
-                added_hidden_states = 2
-            else:
-                added_hidden_states = 1
-            self.assertEqual(out_len + added_hidden_states, len(outputs))
-
-            self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
-
-            self.assertListEqual(
-                list(self_attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
-            )
-
-    def test_load_with_mismatched_shapes(self):
-        if not self.test_mismatched_shapes:
-            return
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            if model_class not in get_values(FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
-                continue
-
-            with self.subTest(msg=f"Testing {model_class}"):
-                with tempfile.TemporaryDirectory() as tmp_dir:
-                    model = model_class(config)
-                    model.save_pretrained(tmp_dir)
-
-                    # Fails when we don't set ignore_mismatched_sizes=True
-                    with self.assertRaises(ValueError):
-                        new_model = FlaxAutoModelForSequenceClassification.from_pretrained(tmp_dir, num_labels=42)
-                    with self.assertRaises(ValueError):
-                        new_model_without_prefix = FlaxAutoModel.from_pretrained(tmp_dir, vocab_size=10)
-
-                    logger = logging.get_logger("transformers.modeling_flax_utils")
-                    with CaptureLogger(logger) as cl:
-                        new_model = FlaxAutoModelForSequenceClassification.from_pretrained(
-                            tmp_dir, num_labels=42, ignore_mismatched_sizes=True
-                        )
-                    self.assertIn("the shapes did not match", cl.out)
-
-                    logits = new_model(**inputs_dict)["logits"]
-                    self.assertEqual(logits.shape[1], 42)
-
-                    with CaptureLogger(logger) as cl:
-                        new_model_without_prefix = FlaxAutoModel.from_pretrained(
-                            tmp_dir, vocab_size=10, ignore_mismatched_sizes=True
-                        )
-                    self.assertIn("the shapes did not match", cl.out)
-                    input_ids = ids_tensor((2, 8), 10)
-                    if self.is_encoder_decoder:
-                        new_model_without_prefix(input_ids, decoder_input_ids=input_ids)
-                    else:
-                        new_model_without_prefix(input_ids)
-
-    def test_default_params_dtype(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            # check if all params are still in float32 when dtype of computation is half-precision
-            model = model_class(config, dtype=jnp.float16)
-            types = jax.tree_util.tree_map(lambda x: x.dtype, model.params)
-            types = flatten_dict(types)
-
-            for name, type_ in types.items():
-                self.assertEqual(type_, jnp.float32, msg=f"param {name} is not initialized in fp32.")
-
-    def test_to_bf16(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-
-            # cast all params to bf16
-            params = model.to_bf16(model.params)
-            types = flatten_dict(jax.tree_util.tree_map(lambda x: x.dtype, params))
-            # test if all params are in bf16
-            for name, type_ in types.items():
-                self.assertEqual(type_, jnp.bfloat16, msg=f"param {name} is not in bf16.")
-
-            # test masking
-            flat_params = flatten_dict(params)
-            key = random.choice(list(flat_params.keys()))  # choose a random param
-            mask = {path: path != key for path in flat_params}  # don't cast the key
-            mask = unflatten_dict(mask)
-
-            params = model.to_bf16(model.params, mask)
-            types = flatten_dict(jax.tree_util.tree_map(lambda x: x.dtype, params))
-            # test if all params are in bf16 except key
-            for name, type_ in types.items():
-                if name == key:
-                    self.assertEqual(type_, jnp.float32, msg=f"param {name} should be in fp32.")
-                else:
-                    self.assertEqual(type_, jnp.bfloat16, msg=f"param {name} is not in bf16.")
-
-    def test_to_fp16(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-
-            # cast all params to fp16
-            params = model.to_fp16(model.params)
-            types = flatten_dict(jax.tree_util.tree_map(lambda x: x.dtype, params))
-            # test if all params are in fp16
-            for name, type_ in types.items():
-                self.assertEqual(type_, jnp.float16, msg=f"param {name} is not in fp16.")
-
-            # test masking
-            flat_params = flatten_dict(params)
-            key = random.choice(list(flat_params.keys()))  # choose a random param
-            mask = {path: path != key for path in flat_params}  # don't cast the key
-            mask = unflatten_dict(mask)
-
-            params = model.to_fp16(model.params, mask)
-            types = flatten_dict(jax.tree_util.tree_map(lambda x: x.dtype, params))
-            # test if all params are in fp16 except key
-            for name, type_ in types.items():
-                if name == key:
-                    self.assertEqual(type_, jnp.float32, msg=f"param {name} should be in fp32.")
-                else:
-                    self.assertEqual(type_, jnp.float16, msg=f"param {name} is not in fp16.")
-
-    def test_to_fp32(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-
-            # cast all params to fp16 and back to fp32
-            params = model.to_fp16(model.params)
-            params = model.to_fp32(params)
-
-            # test if all params are in fp32
-            types = flatten_dict(jax.tree_util.tree_map(lambda x: x.dtype, params))
-            for name, type_ in types.items():
-                self.assertEqual(type_, jnp.float32, msg=f"param {name} is not in fp32.")
-
-            # test masking
-            flat_params = flatten_dict(params)
-            key = random.choice(list(flat_params.keys()))  # choose a random param
-            mask = {path: path != key for path in flat_params}  # don't cast the key
-            mask = unflatten_dict(mask)
-
-            # cast to fp16 and back to fp32 with mask
-            params = model.to_fp16(model.params)
-            params = model.to_fp32(params, mask)
-
-            # test if all params are in fp32 except key
-            types = flatten_dict(jax.tree_util.tree_map(lambda x: x.dtype, params))
-            for name, type_ in types.items():
-                if name == key:
-                    self.assertEqual(type_, jnp.float16, msg=f"param {name} should be in fp16.")
-                else:
-                    self.assertEqual(type_, jnp.float32, msg=f"param {name} is not in fp32.")
-
-    def test_save_load_in_fp16(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-
-        # convert weights to fp16 and save
-        params = model.to_fp16(model.params)
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            model.save_pretrained(tmpdirname, params=params)
-
-            # load the weights again and check if they are still in fp16
-            model = model_class.from_pretrained(tmpdirname)
-            types = flatten_dict(jax.tree_util.tree_map(lambda x: x.dtype, model.params))
-            for name, type_ in types.items():
-                self.assertEqual(type_, jnp.float16, msg=f"param {name} is not in fp16.")
-
-    def test_save_load_in_bf16(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-
-        # convert weights to bf16 and save
-        params = model.to_bf16(model.params)
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            model.save_pretrained(tmpdirname, params=params)
-
-            # load the weights again and check if they are still in fp16
-            model = model_class.from_pretrained(tmpdirname)
-            types = flatten_dict(jax.tree_util.tree_map(lambda x: x.dtype, model.params))
-            for name, type_ in types.items():
-                self.assertEqual(type_, jnp.bfloat16, msg=f"param {name} is not in bf16.")
-
-    def test_model_main_input_name(self):
-        for model_class in self.all_model_classes:
-            model_signature = inspect.signature(getattr(model_class, "__call__"))
-            # The main input is the name of the argument after `self`
-            observed_main_input_name = list(model_signature.parameters.keys())[1]
-            self.assertEqual(model_class.main_input_name, observed_main_input_name)
-
-    def test_headmasking(self):
-        if not self.test_head_masking:
-            return
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        def _prepare_layer_head_mask(i, attention_heads, num_hidden_layers):
-            if i == 0:
-                return np.concatenate([np.zeros(1, dtype=jnp.int32), np.ones(attention_heads - 1, dtype=jnp.int32)])
-            if i == num_hidden_layers - 1:
-                return np.concatenate([np.zeros(attention_heads - 1, dtype=jnp.int32), np.ones(1, dtype=jnp.int32)])
-            return np.ones(attention_heads, dtype=jnp.int32)
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = False
-            inputs = self._prepare_for_class(inputs_dict, model_class).copy()
-            # Prepare head mask
-            inputs["head_mask"] = np.stack(
-                [
-                    _prepare_layer_head_mask(i, config.num_attention_heads, config.num_hidden_layers)
-                    for i in range(config.num_hidden_layers)
-                ]
-            )
-            outputs = model(**inputs)
-
-            def _check_attentions_validity(attentions):
-                # Remove NaN
-                for t in attentions:
-                    # Check we don't have more than 25% nans (arbitrary)
-                    self.assertLess(np.isnan(t).sum(), t.size / 4)
-                attentions = [np.where(np.isnan(t), 0.0, t) for t in attentions]
-
-                self.assertAlmostEqual(attentions[0][..., 0, :, :].sum(), 0.0)
-                self.assertNotEqual(attentions[0][..., -1, :, :].sum(), 0.0)
-                if len(attentions) > 2:  # encoder-decodere models have only 2 layers in each modules
-                    self.assertNotEqual(attentions[1][..., 0, :, :].sum(), 0.0)
-                self.assertAlmostEqual(attentions[-1][..., -2, :, :].sum(), 0.0)
-                self.assertNotEqual(attentions[-1][..., -1, :, :].sum(), 0.0)
-
-            if model.config.is_encoder_decoder:
-                raise NotImplementedError("The test has not been implemented for encoder-decoder models yet.")
-            else:
-                _check_attentions_validity(outputs.attentions)
-
-    def test_no_automatic_init(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        for model_class in self.all_model_classes:
-            model = model_class(config, _do_init=False)
-
-            # Check that accessing params raises an ValueError when _do_init is False
-            with self.assertRaises(ValueError):
-                params = model.params
-
-            # Check if we params can be properly initialized when calling init_weights
-            params = model.init_weights(model.key, model.input_shape)
-            assert isinstance(params, (dict, FrozenDict)), f"params are not an instance of {FrozenDict}"
-            # Check if all required params are initialized
-            keys = set(flatten_dict(unfreeze(params)).keys())
-            self.assertTrue(all(k in keys for k in model.required_params))
-            # Check if the shapes match
-            flat_params = flatten_dict(unfreeze(params))
-            for k, v in flatten_dict(unfreeze(model.params_shape_tree)).items():
-                self.assertEqual(
-                    v.shape,
-                    flat_params[k].shape,
-                    f"Shapes of {k} do not match. Expecting {v.shape}, got {flat_params[k].shape}.",
-                )
-
-            # Check that setting params raises an ValueError when _do_init is False
-            with self.assertRaises(ValueError):
-                model.params = params
-
-            # Check if we can do a forward pass
-            inputs_dict["output_hidden_states"] = True
-            inputs = self._prepare_for_class(inputs_dict, model_class).copy()
-            model(**inputs, params=params)
-
-    def test_from_pretrained_with_no_automatic_init(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        def _assert_all_params_initialised(model, params):
-            # Check if all required params are loaded
-            keys = set(flatten_dict(unfreeze(params)).keys())
-            self.assertTrue(all(k in keys for k in model.required_params))
-            # Check if the shapes match
-            flat_params = flatten_dict(unfreeze(params))
-            for k, v in flatten_dict(unfreeze(model.params_shape_tree)).items():
-                self.assertEqual(
-                    v.shape,
-                    flat_params[k].shape,
-                    f"Shapes of {k} do not match. Expecting {v.shape}, got {flat_params[k].shape}.",
-                )
-
-        for model_class in self.all_model_classes:
-            # init the model
-            model = model_class(config)
-
-            # save the model in the temporary directory
-            # load the saved model with _do_init=False
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                model, params = model_class.from_pretrained(tmpdirname, _do_init=False)
-
-            # Check that accessing params raises an ValueError when _do_init is False
-            with self.assertRaises(ValueError):
-                params = model.params
-
-            # Check if all required params are loaded
-            _assert_all_params_initialised(model, params)
-
-            # Check that setting params raises an ValueError when _do_init is False
-            with self.assertRaises(ValueError):
-                model.params = params
-
-            # Check if init_weights initializes missing keys from from_pretrained
-            flat_params = flatten_dict(unfreeze(params))
-            random_key = random.choice(list(flat_params.keys()))
-            flat_params.pop(random_key)
-            params = freeze(unflatten_dict(flat_params))
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname, params=params)
-                model, params = model_class.from_pretrained(tmpdirname, _do_init=False)
-
-                params = model.init_weights(model.key, model.input_shape, params=params)
-                # Check if all required params are loaded
-                _assert_all_params_initialised(model, params)
-
-    def test_checkpoint_sharding_from_hub(self):
-        model = FlaxBertModel.from_pretrained("ArthurZ/flax-tiny-random-bert-sharded")
-        # the model above is the same as the model below, just a sharded version.
-        ref_model = FlaxBertModel.from_pretrained("hf-internal-testing/tiny-bert-flax-only")
-        for p1, p2 in zip(flatten_dict(model.params).values(), flatten_dict(ref_model.params).values()):
-            assert np.allclose(np.array(p1), np.array(p2))
-
-    def test_checkpoint_sharding_local(self):
-        model = FlaxBertModel.from_pretrained("hf-internal-testing/tiny-bert-flax-only")
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            # We use the same folder for various sizes to make sure a new save erases the old checkpoint.
-            for max_size in ["150kB", "150kiB", "200kB", "200kiB"]:
-                model.save_pretrained(tmp_dir, max_shard_size=max_size)
-
-                # Get each shard file and its size
-                shard_to_size = {}
-                for shard in os.listdir(tmp_dir):
-                    if shard.endswith(".msgpack"):
-                        shard_file = os.path.join(tmp_dir, shard)
-                        shard_to_size[shard_file] = os.path.getsize(shard_file)
-
-                index_file = os.path.join(tmp_dir, FLAX_WEIGHTS_INDEX_NAME)
-                # Check there is an index but no regular weight file
-                self.assertTrue(os.path.isfile(index_file))
-                self.assertFalse(os.path.isfile(os.path.join(tmp_dir, FLAX_WEIGHTS_NAME)))
-
-                # Check a file is bigger than max_size only when it has a single weight
-                for shard_file, size in shard_to_size.items():
-                    if max_size.endswith("kiB"):
-                        max_size_int = int(max_size[:-3]) * 2**10
-                    else:
-                        max_size_int = int(max_size[:-2]) * 10**3
-                    # Note: pickle adds some junk so the weight of the file can end up being slightly bigger than
-                    # the size asked for (since we count parameters)
-                    if size >= max_size_int + 50000:
-                        with open(shard_file, "rb") as state_f:
-                            state_file = from_bytes(FlaxBertModel, state_f.read())
-                            self.assertEqual(len(state_file), 1)
-
-                # Check the index and the shard files found match
-                with open(index_file, encoding="utf-8") as f:
-                    index = json.loads(f.read())
-
-                all_shards = set(index["weight_map"].values())
-                shards_found = {f for f in os.listdir(tmp_dir) if f.endswith(".msgpack")}
-                self.assertSetEqual(all_shards, shards_found)
-
-                # Finally, check the model can be reloaded
-                new_model = FlaxBertModel.from_pretrained(tmp_dir)
-                for p1, p2 in zip(flatten_dict(model.params).values(), flatten_dict(new_model.params).values()):
-                    self.assertTrue(np.allclose(np.array(p1), np.array(p2)))
-
-    def test_gradient_checkpointing(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            # prepare inputs
-            prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-            model = model_class(config)
-            remat_model = model_class(config)
-
-            try:
-                remat_model.enable_gradient_checkpointing()
-            except NotImplementedError:
-                continue
-
-            outputs = model(**prepared_inputs_dict)
-            remat_outputs = remat_model(**prepared_inputs_dict)
-
-            # ensure that the dicts of outputs contain the same keys
-            self.assertEqual(outputs.keys(), remat_outputs.keys())
-
-            outputs = outputs.to_tuple()
-            remat_outputs = remat_outputs.to_tuple()
-
-            # ensure that the outputs remain precisely equal
-            for output, remat_output in zip(outputs, remat_outputs):
-                self.assertTrue((output == remat_output).all())
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
deleted file mode 100644
index 8b9d7dc391..0000000000
--- a/tests/test_modeling_tf_common.py
+++ /dev/null
@@ -1,1340 +0,0 @@
-# Copyright 2019 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import copy
-import inspect
-import json
-import os
-import random
-import tempfile
-import unittest
-from importlib import import_module
-from math import isnan
-
-from datasets import Dataset
-
-from transformers import is_tf_available
-from transformers.models.auto import get_values
-from transformers.testing_utils import (
-    CaptureLogger,
-    require_tf,
-    require_tf2onnx,
-    slow,
-)
-from transformers.utils import CONFIG_NAME, GENERATION_CONFIG_NAME, logging
-from transformers.utils.generic import ModelOutput
-
-
-logger = logging.get_logger(__name__)
-
-
-if is_tf_available():
-    import numpy as np
-    import tensorflow as tf
-
-    from transformers import (
-        TF_MODEL_FOR_CAUSAL_LM_MAPPING,
-        TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
-        TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
-        TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING,
-        TF_MODEL_FOR_MASKED_LM_MAPPING,
-        TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
-        TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
-        TF_MODEL_FOR_PRETRAINING_MAPPING,
-        TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
-        TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
-        TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
-        TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
-        TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
-        TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
-        TFAutoModel,
-        TFAutoModelForSequenceClassification,
-        TFSharedEmbeddings,
-    )
-    from transformers.modeling_tf_utils import keras
-
-    tf.config.experimental.enable_tensor_float_32_execution(False)
-
-
-def _config_zero_init(config):
-    configs_no_init = copy.deepcopy(config)
-    for key in configs_no_init.__dict__.keys():
-        if "_range" in key or "_std" in key:
-            setattr(configs_no_init, key, 0.0)
-    return configs_no_init
-
-
-@require_tf
-class TFModelTesterMixin:
-    model_tester = None
-    all_model_classes = ()
-    all_generative_model_classes = ()
-    test_mismatched_shapes = True
-    test_resize_embeddings = True
-    test_head_masking = True
-    is_encoder_decoder = False
-    has_attentions = True
-
-    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> dict:
-        inputs_dict = copy.deepcopy(inputs_dict)
-
-        if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
-            inputs_dict = {
-                k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1))
-                if isinstance(v, tf.Tensor) and v.ndim > 0
-                else v
-                for k, v in inputs_dict.items()
-            }
-
-        if return_labels:
-            if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
-                inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32)
-            elif model_class in [
-                *get_values(TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING),
-                *get_values(TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING),
-            ]:
-                inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-                inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-            elif model_class in [
-                *get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING),
-                *get_values(TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING),
-            ]:
-                inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-            elif model_class in get_values(TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING):
-                inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-            elif model_class in [
-                *get_values(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
-                *get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING),
-                *get_values(TF_MODEL_FOR_MASKED_LM_MAPPING),
-                *get_values(TF_MODEL_FOR_PRETRAINING_MAPPING),
-                *get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
-                *get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING),
-            ] and "labels" in dict(inspect.signature(model_class.call).parameters):
-                inputs_dict["labels"] = tf.zeros(
-                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32
-                )
-            elif model_class in get_values(TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING):
-                num_patches = self.model_tester.image_size // self.model_tester.patch_size
-                inputs_dict["bool_masked_pos"] = tf.zeros(
-                    (self.model_tester.batch_size, num_patches**2), dtype=tf.int32
-                )
-            elif model_class in get_values(TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING):
-                batch_size, num_channels, height, width = inputs_dict["pixel_values"].shape
-                inputs_dict["labels"] = tf.zeros((self.model_tester.batch_size, height, width), dtype=tf.int32)
-            elif model_class.__name__.endswith("ForCTC"):
-                # When we have enough CTC models for an AutoClass, we should use their mapping instead of name checks
-                inputs_dict["labels"] = tf.zeros(
-                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32
-                )
-
-        return inputs_dict
-
-    def test_initialization(self):
-        pass
-
-    def test_save_load(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname, saved_model=False)
-
-                # the config file (and the generation config file, if it can generate) should be saved
-                self.assertTrue(os.path.exists(os.path.join(tmpdirname, CONFIG_NAME)))
-                self.assertEqual(
-                    model.can_generate(), os.path.exists(os.path.join(tmpdirname, GENERATION_CONFIG_NAME))
-                )
-
-                model = model_class.from_pretrained(tmpdirname)
-                after_outputs = model(self._prepare_for_class(inputs_dict, model_class))
-
-                self.assert_outputs_same(after_outputs, outputs)
-
-    def test_save_load_config(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-            model_config = model.get_config()
-            # make sure that returned config is jsonifiable, which is required by keras
-            json.dumps(model_config)
-            new_model = model_class.from_config(model.get_config())
-            # make sure it also accepts a normal config
-            _ = model_class.from_config(model.config)
-            _ = new_model(self._prepare_for_class(inputs_dict, model_class))  # Build model
-            new_model.set_weights(model.get_weights())
-            after_outputs = new_model(self._prepare_for_class(inputs_dict, model_class))
-
-            self.assert_outputs_same(after_outputs, outputs)
-
-    @slow
-    def test_saved_model_creation(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.output_hidden_states = False
-        config.output_attentions = False
-
-        if hasattr(config, "use_cache"):
-            config.use_cache = False
-
-        model_class = self.all_model_classes[0]
-
-        class_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-        model = model_class(config)
-
-        model(class_inputs_dict)
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            model.save_pretrained(tmpdirname, saved_model=True)
-            saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
-            self.assertTrue(os.path.exists(saved_model_dir))
-
-    def test_prepare_serving_output(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.output_hidden_states = True
-        config.output_attentions = self.has_attentions
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            inputs = self._prepare_for_class(inputs_dict, model_class)
-            outputs = model(inputs)
-            serving_outputs = model.serving_output(outputs)
-
-            for k, v in serving_outputs.items():
-                # Check that we have one of three possible outputs: None, tuple of tensors or a tensor
-                if isinstance(v, tuple):
-                    self.assertTrue(all(isinstance(elem, tf.Tensor) for elem in v))
-                elif v is not None:
-                    self.assertIsInstance(v, tf.Tensor)
-                else:
-                    self.assertIsNone(v)
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            if model.config.is_encoder_decoder:
-                expected_arg_names = [
-                    "input_ids",
-                    "attention_mask",
-                    "decoder_input_ids",
-                    "decoder_attention_mask",
-                ]
-                expected_arg_names.extend(["decoder_position_ids"] if "decoder_position_ids" in arg_names else [])
-                expected_arg_names.extend(
-                    ["head_mask", "decoder_head_mask"] if "head_mask" and "decoder_head_mask" in arg_names else []
-                )
-                expected_arg_names.extend(
-                    ["cross_attn_head_mask", "encoder_outputs"]
-                    if "cross_attn_head_mask" in arg_names
-                    else ["encoder_outputs"]
-                )
-                self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
-
-            else:
-                expected_arg_names = ["input_ids"]
-                self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_onnx_compliancy(self):
-        if not self.test_onnx:
-            return
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        INTERNAL_OPS = [
-            "Assert",
-            "AssignVariableOp",
-            "EmptyTensorList",
-            "ReadVariableOp",
-            "ResourceGather",
-            "TruncatedNormal",
-            "VarHandleOp",
-            "VarIsInitializedOp",
-        ]
-        onnx_ops = []
-
-        with open(os.path.join(".", "utils", "tf_ops", "onnx.json")) as f:
-            onnx_opsets = json.load(f)["opsets"]
-
-        for i in range(1, self.onnx_min_opset + 1):
-            onnx_ops.extend(onnx_opsets[str(i)])
-
-        for model_class in self.all_model_classes:
-            model_op_names = set()
-
-            with tf.Graph().as_default() as g:
-                model = model_class(config)
-                model.build_in_name_scope()
-
-                for op in g.get_operations():
-                    model_op_names.add(op.node_def.op)
-
-            model_op_names = sorted(model_op_names)
-            incompatible_ops = []
-
-            for op in model_op_names:
-                if op not in onnx_ops and op not in INTERNAL_OPS:
-                    incompatible_ops.append(op)
-
-            self.assertEqual(len(incompatible_ops), 0, incompatible_ops)
-
-    # `tf2onnx` issue page: https://github.com/onnx/tensorflow-onnx/issues/2172
-    # TODO: undo skip once a fix is done in `tf2onnx`
-    @unittest.skip("`tf2onnx` broke with TF 2.13")
-    @require_tf2onnx
-    @slow
-    def test_onnx_runtime_optimize(self):
-        if not self.test_onnx:
-            return
-
-        import onnxruntime
-        import tf2onnx
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes[:2]:
-            model = model_class(config)
-            model.build_in_name_scope()
-
-            onnx_model_proto, _ = tf2onnx.convert.from_keras(model, opset=self.onnx_min_opset)
-
-            onnxruntime.InferenceSession(onnx_model_proto.SerializeToString())
-
-    def test_keras_save_load(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        tf_main_layer_classes = {
-            module_member
-            for model_class in self.all_model_classes
-            for module in (import_module(model_class.__module__),)
-            for module_member_name in dir(module)
-            if module_member_name.endswith("MainLayer")
-            # This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`.
-            and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")]
-            for module_member in (getattr(module, module_member_name),)
-            if isinstance(module_member, type)
-            and keras.layers.Layer in module_member.__bases__
-            and getattr(module_member, "_keras_serializable", False)
-        }
-        for main_layer_class in tf_main_layer_classes:
-            # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter
-            if "T5" in main_layer_class.__name__:
-                # Take the same values than in TFT5ModelTester for this shared layer
-                shared = TFSharedEmbeddings(99, 32, name="shared")
-                config.use_cache = inputs_dict.pop("use_cache", None)
-                main_layer = main_layer_class(config, embed_tokens=shared)
-            else:
-                main_layer = main_layer_class(config)
-
-            symbolic_inputs = {
-                name: keras.Input(tensor.shape[1:], dtype=tensor.dtype)
-                for name, tensor in inputs_dict.items()
-                if tf.is_tensor(tensor)
-            }
-
-            model = keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs))
-            outputs = model(inputs_dict)
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                filepath = os.path.join(tmpdirname, "keras_model.h5")
-                model.save(filepath)
-                if "T5" in main_layer_class.__name__:
-                    model = keras.models.load_model(
-                        filepath,
-                        custom_objects={
-                            main_layer_class.__name__: main_layer_class,
-                            "TFSharedEmbeddings": TFSharedEmbeddings,
-                        },
-                    )
-                else:
-                    model = keras.models.load_model(
-                        filepath, custom_objects={main_layer_class.__name__: main_layer_class}
-                    )
-                assert isinstance(model, keras.Model)
-                after_outputs = model(inputs_dict)
-                self.assert_outputs_same(after_outputs, outputs)
-
-    def assert_outputs_same(self, after_outputs, outputs):
-        # Make sure we don't have nans
-        if isinstance(after_outputs, tf.Tensor):
-            out_1 = after_outputs.numpy()
-        elif isinstance(after_outputs, dict):
-            out_1 = after_outputs[list(after_outputs.keys())[0]].numpy()
-        else:
-            out_1 = after_outputs[0].numpy()
-        out_2 = outputs[0].numpy()
-        self.assertEqual(out_1.shape, out_2.shape)
-        out_1 = out_1[~np.isnan(out_1)]
-        out_2 = out_2[~np.isnan(out_2)]
-        max_diff = np.amax(np.abs(out_1 - out_2))
-        self.assertLessEqual(max_diff, 1e-5)
-
-    # Don't copy this method to model specific test file!
-    # TODO: remove this method once the issues are all fixed!
-    def _make_attention_mask_non_null(self, inputs_dict):
-        """Make sure no sequence has all zeros as attention mask"""
-
-        for k in ["attention_mask", "encoder_attention_mask", "decoder_attention_mask"]:
-            if k in inputs_dict:
-                attention_mask = inputs_dict[k]
-
-                # Make sure no all 0s attention masks - to avoid failure at this moment.
-                # Put `1` at the beginning of sequences to make it still work when combining causal attention masks.
-                # TODO: remove this line once a fix regarding large negative values for attention mask is done.
-                attention_mask = tf.concat(
-                    [tf.ones_like(attention_mask[:, :1], dtype=attention_mask.dtype), attention_mask[:, 1:]], axis=-1
-                )
-
-                # Here we make the first sequence with all 0s as attention mask.
-                # Currently, this will fail for `TFWav2Vec2Model`. This is caused by the different large negative
-                # values, like `1e-4`, `1e-9`, `1e-30` and `-inf` for attention mask across models/frameworks.
-                # TODO: enable this block once the large negative values thing is cleaned up.
-                # (see https://github.com/huggingface/transformers/issues/14859)
-                # attention_mask = tf.concat(
-                #     [
-                #         tf.zeros_like(attention_mask[:1], dtype=tf.int32),
-                #         tf.cast(attention_mask[1:], dtype=tf.int32)
-                #     ],
-                #     axis=0
-                # )
-
-                inputs_dict[k] = attention_mask
-
-    # Don't copy this method to model specific test file!
-    # TODO: remove this method once the issues are all fixed!
-    def _postprocessing_to_ignore_test_cases(self, tf_outputs, pt_outputs, model_class):
-        """For temporarily ignoring some failed test cases (issues to be fixed)"""
-
-        tf_keys = {k for k, v in tf_outputs.items() if v is not None}
-        pt_keys = {k for k, v in pt_outputs.items() if v is not None}
-
-        key_differences = tf_keys.symmetric_difference(pt_keys)
-
-        if model_class.__name__ in [
-            "TFFlaubertWithLMHeadModel",
-            "TFFunnelForPreTraining",
-            "TFElectraForPreTraining",
-            "TFXLMWithLMHeadModel",
-        ]:
-            for k in key_differences:
-                if k in ["loss", "losses"]:
-                    tf_keys.discard(k)
-                    pt_keys.discard(k)
-        elif model_class.__name__.startswith("TFGPT2"):
-            # `TFGPT2` has `past_key_values` as a tensor while `GPT2` has it as a tuple.
-            tf_keys.discard("past_key_values")
-            pt_keys.discard("past_key_values")
-
-        # create new outputs from the remaining fields
-        new_tf_outputs = type(tf_outputs)(**{k: tf_outputs[k] for k in tf_keys})
-        new_pt_outputs = type(pt_outputs)(**{k: pt_outputs[k] for k in pt_keys})
-
-        return new_tf_outputs, new_pt_outputs
-
-    @slow
-    def test_compile_tf_model(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes[:2]:
-            # Prepare our model
-            model = model_class(config)
-            # These are maximally general inputs for the model, with multiple None dimensions
-            # Hopefully this will catch any conditionals that fail for flexible shapes
-            functional_inputs = {
-                key: keras.Input(shape=val.shape[1:], dtype=val.dtype, name=key)
-                for key, val in model.input_signature.items()
-                if key in model.dummy_inputs
-            }
-            outputs_dict = model(functional_inputs)
-
-            hidden_states = outputs_dict[0]
-
-            # Compile extended model
-            functional_model = keras.Model(inputs=functional_inputs, outputs=hidden_states)
-            model_out = functional_model.predict(model.dummy_inputs)  # Check we can pass inputs with the Keras API
-            self.assertTrue(model_out is not None)
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                functional_model.save(tmpdirname)  # Ensure we can save/export the whole functional model
-
-    def test_keyword_and_dict_args(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            inputs = self._prepare_for_class(inputs_dict, model_class)
-
-            outputs_dict = model(inputs)
-
-            inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
-            outputs_keywords = model(**inputs_keywords)
-            output_dict = outputs_dict[0].numpy()
-            output_keywords = outputs_keywords[0].numpy()
-
-            self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6)
-
-    def test_attention_outputs(self):
-        if not self.has_attentions:
-            self.skipTest(reason="Model does not output attentions")
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", self.model_tester.seq_length)
-        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length)
-        decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length)
-        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
-
-        def check_decoder_attentions_output(outputs):
-            out_len = len(outputs)
-            self.assertEqual(min(out_len % 2, out_len % 5), 0)  # differentiation due to newly added cross_attentions
-            decoder_attentions = outputs.decoder_attentions
-            self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(decoder_attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
-            )
-
-        def check_encoder_attentions_output(outputs):
-            attentions = [
-                t.numpy() for t in (outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions)
-            ]
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
-            )
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            config.output_hidden_states = False
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-            out_len = len(outputs)
-            self.assertEqual(config.output_hidden_states, False)
-            check_encoder_attentions_output(outputs)
-
-            if self.is_encoder_decoder:
-                model = model_class(config)
-                outputs = model(self._prepare_for_class(inputs_dict, model_class))
-                self.assertEqual(config.output_hidden_states, False)
-                check_decoder_attentions_output(outputs)
-
-            # Check that output attentions can also be changed via the config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-            self.assertEqual(config.output_hidden_states, False)
-            check_encoder_attentions_output(outputs)
-
-            # Check attention is always last and order is fine
-            inputs_dict["output_attentions"] = True
-            config.output_hidden_states = True
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-
-            self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
-            self.assertEqual(model.config.output_hidden_states, True)
-            check_encoder_attentions_output(outputs)
-
-    def test_headmasking(self):
-        if not self.test_head_masking:
-            return
-
-        random.Random().seed(42)
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        random.Random().seed()
-
-        inputs_dict["output_attentions"] = True
-        config.output_hidden_states = True
-        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-
-            # Prepare head_mask
-            def prepare_layer_head_mask(i, attention_heads, num_hidden_layers):
-                if i == 0:
-                    return tf.concat(
-                        (tf.zeros(1, dtype=tf.float32), tf.ones(attention_heads - 1, dtype=tf.float32)), 0
-                    )
-                elif i == num_hidden_layers - 1:
-                    return tf.concat(
-                        (tf.zeros(attention_heads - 1, dtype=tf.float32), tf.ones(1, dtype=tf.float32)), 0
-                    )
-                else:
-                    return tf.ones(attention_heads, dtype=tf.float32)
-
-            head_mask = tf.stack(
-                [
-                    prepare_layer_head_mask(i, config.num_attention_heads, config.num_hidden_layers)
-                    for i in range(config.num_hidden_layers)
-                ],
-                0,
-            )
-
-            inputs = self._prepare_for_class(inputs_dict, model_class).copy()
-            inputs["head_mask"] = head_mask
-            if model.config.is_encoder_decoder:
-                signature = inspect.signature(model.call)
-                arg_names = [*signature.parameters.keys()]
-                if "decoder_head_mask" in arg_names:  # necessary differentiation because of T5 model
-                    inputs["decoder_head_mask"] = head_mask
-                if "cross_attn_head_mask" in arg_names:
-                    inputs["cross_attn_head_mask"] = head_mask
-
-            outputs = model(**inputs, return_dict=True)
-
-            def check_attentions_validity(attentions):
-                # Remove Nan
-                for t in attentions:
-                    self.assertLess(
-                        (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(), (tf.size(t) / 4).numpy()
-                    )  # Check we don't have more than 25% nans (arbitrary)
-
-                attentions = [
-                    tf.where(tf.math.is_nan(t), 0.0, t) for t in attentions
-                ]  # remove them (the test is less complete)
-
-                self.assertAlmostEqual(tf.math.reduce_sum(attentions[0][..., 0, :, :]).numpy(), 0.0)
-                self.assertNotEqual(tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(), 0.0)
-                if len(attentions) > 2:  # encoder-decodere models have only 2 layers in each modules
-                    self.assertNotEqual(tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(), 0.0)
-                self.assertAlmostEqual(tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(), 0.0)
-                self.assertNotEqual(tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(), 0.0)
-
-            if model.config.is_encoder_decoder:
-                check_attentions_validity(outputs.encoder_attentions)
-                check_attentions_validity(outputs.decoder_attentions)
-                if "cross_attn_head_mask" in arg_names:
-                    check_attentions_validity(outputs.cross_attentions)
-            else:
-                check_attentions_validity(outputs.attentions)
-
-    def test_hidden_states_output(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        def check_hidden_states_output(config, inputs_dict, model_class):
-            model = model_class(config)
-            outputs = model(self._prepare_for_class(inputs_dict, model_class))
-            expected_num_layers = getattr(
-                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-            )
-
-            if model.config.is_encoder_decoder:
-                encoder_hidden_states = outputs.encoder_hidden_states
-                decoder_hidden_states = outputs.decoder_hidden_states
-
-                self.assertEqual(config.output_attentions, False)
-                self.assertEqual(len(encoder_hidden_states), expected_num_layers)
-                self.assertListEqual(
-                    list(encoder_hidden_states[0].shape[-2:]),
-                    [self.model_tester.seq_length, self.model_tester.hidden_size],
-                )
-                self.assertEqual(len(decoder_hidden_states), expected_num_layers)
-                self.assertListEqual(
-                    list(decoder_hidden_states[0].shape[-2:]),
-                    [self.model_tester.seq_length, self.model_tester.hidden_size],
-                )
-            else:
-                hidden_states = outputs.hidden_states
-                self.assertEqual(config.output_attentions, False)
-                self.assertEqual(len(hidden_states), expected_num_layers)
-                self.assertListEqual(
-                    list(hidden_states[0].shape[-2:]),
-                    [self.model_tester.seq_length, self.model_tester.hidden_size],
-                )
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(config, inputs_dict, model_class)
-
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-            check_hidden_states_output(config, inputs_dict, model_class)
-
-    def test_model_common_attributes(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        text_in_text_out_models = (
-            get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING)
-            + get_values(TF_MODEL_FOR_MASKED_LM_MAPPING)
-            + get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING)
-        )
-        speech_in_text_out_models = get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING)
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), keras.layers.Layer)
-
-            legacy_text_in_text_out = model.get_lm_head() is not None
-            if model_class in text_in_text_out_models or legacy_text_in_text_out:
-                out_embeddings = model.get_output_embeddings()
-                self.assertIsInstance(out_embeddings, keras.layers.Layer)
-                bias = model.get_bias()
-                if bias is not None:
-                    self.assertIsInstance(bias, dict)
-                    for _, v in bias.items():
-                        self.assertIsInstance(v, tf.Variable)
-            elif model_class in speech_in_text_out_models:
-                out_embeddings = model.get_output_embeddings()
-                self.assertIsInstance(out_embeddings, keras.layers.Layer)
-                bias = model.get_bias()
-                self.assertIsNone(bias)
-            else:
-                out_embeddings = model.get_output_embeddings()
-                assert out_embeddings is None
-                bias = model.get_bias()
-                self.assertIsNone(bias)
-
-    def test_determinism(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            first, second = (
-                model(self._prepare_for_class(inputs_dict, model_class), training=False)[0],
-                model(self._prepare_for_class(inputs_dict, model_class), training=False)[0],
-            )
-            out_1 = first.numpy()
-            out_2 = second.numpy()
-            out_1 = out_1[~np.isnan(out_1)]
-            out_2 = out_2[~np.isnan(out_2)]
-            max_diff = np.amax(np.abs(out_1 - out_2))
-            self.assertLessEqual(max_diff, 1e-5)
-
-    def test_model_outputs_equivalence(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
-            tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs)
-            dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple()
-
-            def recursive_check(tuple_object, dict_object):
-                if isinstance(tuple_object, (list, tuple)):
-                    for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
-                        recursive_check(tuple_iterable_value, dict_iterable_value)
-                elif tuple_object is None:
-                    return
-                else:
-                    self.assertTrue(
-                        all(tf.equal(tuple_object, dict_object)),
-                        msg=(
-                            "Tuple and dict output are not equal. Difference:"
-                            f" {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}"
-                        ),
-                    )
-
-                recursive_check(tuple_output, dict_output)
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
-            check_equivalence(model, tuple_inputs, dict_inputs)
-
-            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
-            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
-            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
-
-            if self.has_attentions:
-                tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
-                dict_inputs = self._prepare_for_class(inputs_dict, model_class)
-                check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
-
-            # Not all models accept "labels" in the forward pass (yet :) )
-            if "labels" in inspect.signature(model.call).parameters.keys():
-                tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-                dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-                check_equivalence(model, tuple_inputs, dict_inputs)
-
-                tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-                dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-                check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
-
-                if self.has_attentions:
-                    tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-                    dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-                    check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
-
-                    tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-                    dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-                    check_equivalence(
-                        model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True}
-                    )
-
-    def test_inputs_embeds(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-
-            inputs = copy.deepcopy(inputs_dict)
-
-            if not self.is_encoder_decoder:
-                input_ids = inputs["input_ids"]
-                del inputs["input_ids"]
-            else:
-                encoder_input_ids = inputs["input_ids"]
-                decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
-                del inputs["input_ids"]
-                inputs.pop("decoder_input_ids", None)
-
-            if not self.is_encoder_decoder:
-                inputs["inputs_embeds"] = model.get_input_embeddings()(input_ids)
-            else:
-                inputs["inputs_embeds"] = model.get_input_embeddings()(encoder_input_ids)
-                inputs["decoder_inputs_embeds"] = model.get_input_embeddings()(decoder_input_ids)
-
-            inputs = self._prepare_for_class(inputs, model_class)
-
-            model(inputs)
-
-    def test_numpy_arrays_inputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        def prepare_numpy_arrays(inputs_dict):
-            inputs_np_dict = {}
-            for k, v in inputs_dict.items():
-                if tf.is_tensor(v):
-                    inputs_np_dict[k] = v.numpy()
-                else:
-                    inputs_np_dict[k] = np.array(k)
-
-            return inputs_np_dict
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-
-            inputs = self._prepare_for_class(inputs_dict, model_class)
-            inputs_np = prepare_numpy_arrays(inputs)
-
-            output_for_dict_input = model(inputs_np)
-            output_for_kw_input = model(**inputs_np)
-            self.assert_outputs_same(output_for_dict_input, output_for_kw_input)
-
-    def test_valid_input_signature_and_dummies(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            call_args = inspect.signature(model.call).parameters
-            for key in model.input_signature:
-                self.assertIn(key, call_args)
-            for key in model.dummy_inputs:
-                self.assertIn(key, call_args)
-
-    def test_resize_token_embeddings(self):
-        # TODO (joao): after the embeddings refactor is complete, rework this test so as to rely exclusively on
-        # keras.layers.Embedding
-
-        if not self.test_resize_embeddings:
-            return
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        def _get_word_embedding_weight(model, embedding_layer):
-            if isinstance(embedding_layer, keras.layers.Embedding):
-                # builds the embeddings layer
-                model.build_in_name_scope()
-                return embedding_layer.embeddings
-            else:
-                return model._get_word_embedding_weight(embedding_layer)
-
-        for model_class in self.all_model_classes:
-            for size in [config.vocab_size - 10, config.vocab_size + 10, None]:
-                # build the embeddings
-                model = model_class(config=copy.deepcopy(config))  # `resize_token_embeddings` mutates `config`
-                old_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings())
-                old_bias = model.get_bias()
-                old_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings())
-                # reshape the embeddings
-                model.resize_token_embeddings(size)
-                new_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings())
-                new_bias = model.get_bias()
-                new_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings())
-
-                # check that the resized embeddings size matches the desired size.
-                assert_size = size if size is not None else config.vocab_size
-                self.assertEqual(new_input_embeddings.shape[0], assert_size)
-
-                # check that weights remain the same after resizing
-                models_equal = True
-                for p1, p2 in zip(old_input_embeddings.value(), new_input_embeddings.value()):
-                    if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
-                        models_equal = False
-                self.assertTrue(models_equal)
-
-                if old_bias is not None and new_bias is not None:
-                    for old_weight, new_weight in zip(old_bias.values(), new_bias.values()):
-                        self.assertEqual(new_weight.shape[-1], assert_size)
-
-                        models_equal = True
-                        for p1, p2 in zip(tf.squeeze(old_weight), tf.squeeze(new_weight)):
-                            if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
-                                models_equal = False
-                        self.assertTrue(models_equal)
-
-                if old_output_embeddings is not None and new_output_embeddings is not None:
-                    self.assertEqual(new_output_embeddings.shape[0], assert_size)
-                    self.assertEqual(new_output_embeddings.shape[1], old_output_embeddings.shape[1])
-
-                    models_equal = True
-                    for p1, p2 in zip(old_output_embeddings.value(), new_output_embeddings.value()):
-                        if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
-                            models_equal = False
-                    self.assertTrue(models_equal)
-
-    # TODO (Joao): this test is not slow, but it's tagged as such to keep track of failures on the scheduled CI runs,
-    # while passing push CI. Fix the underlying issues and remove the tag.
-    @slow
-    def test_save_load_after_resize_token_embeddings(self):
-        if not self.test_resize_embeddings:
-            return
-        config, original_inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            # create a model with resized (expended) embeddings
-            new_tokens_size = 10
-            old_total_size = config.vocab_size
-            new_total_size = old_total_size + new_tokens_size
-            model = model_class(config=copy.deepcopy(config))  # `resize_token_embeddings` mutates `config`
-            model.build_in_name_scope()
-            model.resize_token_embeddings(new_total_size)
-
-            # fetch the output for an input exclusively made of new members of the vocabulary
-            inputs_dict = copy.deepcopy(original_inputs_dict)
-            ids_feat_name = None
-            if "input_ids" in inputs_dict:
-                ids_feat_name = "input_ids"
-            elif "decoder_input_ids" in inputs_dict:
-                ids_feat_name = "decoder_input_ids"
-            else:
-                assert False, "No input ids feature found in the inputs dict"
-
-            new_vocab_input_ids = ids_tensor(inputs_dict[ids_feat_name].shape, new_tokens_size)
-            new_vocab_input_ids += old_total_size
-            inputs_dict[ids_feat_name] = new_vocab_input_ids
-            if "input_ids" in inputs_dict:
-                inputs_dict["input_ids"] = new_vocab_input_ids
-            if "decoder_input_ids" in inputs_dict:
-                inputs_dict["decoder_input_ids"] = new_vocab_input_ids
-            prepared_inputs = self._prepare_for_class(inputs_dict, model_class)
-            outputs = model(**prepared_inputs)
-
-            # save and load the model
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname, saved_model=False)
-                model = model_class.from_pretrained(tmpdirname)
-                restored_model_outputs = model(**prepared_inputs)
-
-                # check that the output for the restored model is the same
-                self.assert_outputs_same(restored_model_outputs, outputs)
-
-    @unittest.skipIf(
-        not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0,
-        reason="This test always passes on CPU.",
-    )
-    def test_embeddings_out_of_bounds_raise_exception(self):
-        # TF embeddings layers don't raise an exception when an index is out of bounds on GPU, so we manually raise it.
-        # This test should only fail on GPU for models where we haven't added the safety check.
-        if not self.test_resize_embeddings:
-            return
-        config, original_inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config=config)
-            inputs_dict = copy.deepcopy(original_inputs_dict)
-            if "input_ids" in inputs_dict:
-                inputs_dict["input_ids"] = inputs_dict["input_ids"] * int(1e9)
-            if "decoder_input_ids" in inputs_dict:
-                inputs_dict["decoder_input_ids"] = inputs_dict["decoder_input_ids"] * int(1e9)
-            prepared_inputs = self._prepare_for_class(inputs_dict, model_class)
-            with self.assertRaises(tf.errors.InvalidArgumentError):
-                model(**prepared_inputs)
-
-    def test_loss_computation(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            # The number of elements in the loss should be the same as the number of elements in the label
-            prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-            added_label_names = sorted(prepared_for_class.keys() - inputs_dict.keys(), reverse=True)
-            if not added_label_names:
-                continue  # This test is only for models with easily-separable labels
-            added_label = prepared_for_class[added_label_names[0]]
-            expected_loss_size = added_label.shape.as_list()[:1]
-
-            # Test that model correctly compute the loss with kwargs
-            prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-            possible_input_names = {"input_ids", "pixel_values", "input_features", "input_values"}
-            input_name = possible_input_names.intersection(set(prepared_for_class)).pop()
-            model_input = prepared_for_class.pop(input_name)
-
-            outputs = model(model_input, **prepared_for_class)
-            if not isinstance(outputs, ModelOutput) or not hasattr(outputs, "loss"):
-                continue
-
-            loss = outputs.loss
-            self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
-
-            # Test that model correctly compute the loss when we mask some positions
-            prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-            possible_input_names = {"input_ids", "pixel_values", "input_features", "input_values"}
-            input_name = possible_input_names.intersection(set(prepared_for_class)).pop()
-            model_input = prepared_for_class.pop(input_name)
-            if "labels" in prepared_for_class:
-                labels = prepared_for_class["labels"].numpy()
-                if len(labels.shape) > 1 and labels.shape[1] != 1:
-                    labels[0] = -100
-                    prepared_for_class["labels"] = tf.convert_to_tensor(labels)
-                    loss = model(model_input, **prepared_for_class)[0]
-                    self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
-                    self.assertTrue(not np.any(np.isnan(loss.numpy())))
-
-            # Test that model correctly compute the loss with a dict
-            prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-            loss = model(prepared_for_class)[0]
-            self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
-
-            # Test that model correctly compute the loss with a tuple
-            prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-
-            # Get keys that were added with the _prepare_for_class function
-            label_keys = prepared_for_class.keys() - inputs_dict.keys()
-            signature = inspect.signature(model.call).parameters
-            signature_names = list(signature.keys())
-
-            # Create a dictionary holding the location of the tensors in the tuple
-            tuple_index_mapping = {0: input_name}
-            for label_key in label_keys:
-                label_key_index = signature_names.index(label_key)
-                tuple_index_mapping[label_key_index] = label_key
-            sorted_tuple_index_mapping = sorted(tuple_index_mapping.items())
-            # Initialize a list with their default values, update the values and convert to a tuple
-            list_input = []
-
-            for name in signature_names:
-                if name != "kwargs":
-                    list_input.append(signature[name].default)
-
-            for index, value in sorted_tuple_index_mapping:
-                list_input[index] = prepared_for_class[value]
-
-            tuple_input = tuple(list_input)
-
-            # Send to model
-            loss = model(tuple_input[:-1])[0]
-
-            self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
-
-    def check_keras_fit_results(self, val_loss1, val_loss2, atol=1e-2, rtol=1e-3):
-        self.assertTrue(np.allclose(val_loss1, val_loss2, atol=atol, rtol=rtol))
-
-    @slow
-    def test_keras_fit(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            # Test that model correctly compute the loss with kwargs
-            prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-            # We also remove "return_loss" as this is covered by the train_step when using fit()
-            prepared_for_class = {
-                key: val
-                for key, val in prepared_for_class.items()
-                if key not in ("head_mask", "decoder_head_mask", "cross_attn_head_mask", "return_loss")
-            }
-            if "labels" in prepared_for_class and "decoder_input_ids" in prepared_for_class:
-                del prepared_for_class["decoder_input_ids"]
-
-            accuracy_classes = [
-                "ForPreTraining",
-                "ForCausalLM",
-                "ForMaskedLM",
-                "ForQuestionAnswering",
-                "ForMultipleChoice",
-                "ForSequenceClassification",
-                "ForTokenClassification",
-                "ForNextSentencePrediction",
-                "LMHeadModel",
-            ]
-            for accuracy_class in accuracy_classes:
-                if model.__class__.__name__.endswith(accuracy_class):
-                    metrics = [keras.metrics.SparseCategoricalAccuracy()]
-                    break
-            else:
-                metrics = []
-
-            if hasattr(self.model_tester, "batch_size"):
-                sample_weight = tf.convert_to_tensor([0.5] * self.model_tester.batch_size, dtype=tf.float32)
-            else:
-                sample_weight = None
-            # Build the model so we can get some constant weights and check outputs
-            outputs = model(prepared_for_class)
-            if getattr(outputs, "loss", None) is None:
-                continue
-            model_weights = model.get_weights()
-
-            # Run eagerly to save some expensive compilation times
-            model.compile(optimizer=keras.optimizers.SGD(0.0), run_eagerly=True, metrics=metrics)
-            # Make sure the model fits without crashing regardless of where we pass the labels
-            history1 = model.fit(
-                prepared_for_class,
-                validation_data=prepared_for_class,
-                sample_weight=sample_weight,
-                steps_per_epoch=1,
-                validation_steps=1,
-                shuffle=False,
-            )
-            val_loss1 = history1.history["val_loss"][0]
-            self.assertTrue(not isnan(val_loss1))
-            accuracy1 = {key: val[0] for key, val in history1.history.items() if key.endswith("accuracy")}
-
-            possible_label_cols = {
-                "labels",
-                "label",
-                "label_ids",
-                "start_positions",
-                "start_position",
-                "end_positions",
-                "end_position",
-                "next_sentence_label",
-            }
-            label_names = possible_label_cols.intersection(set(prepared_for_class))
-            if len(label_names) == 0:
-                # The next tests only make sense for models with separate inputs and labels, and do not make
-                # sense for models that don't clearly distinguish between the two (e.g. CLIP)
-                return
-            labels = {key: val for key, val in prepared_for_class.items() if key in label_names}
-            inputs_minus_labels = {key: val for key, val in prepared_for_class.items() if key not in label_names}
-            self.assertGreater(len(inputs_minus_labels), 0)
-
-            # We reinitialize the model here even though our learning rate was zero
-            # because BatchNorm updates weights by means other than gradient descent.
-            model.set_weights(model_weights)
-
-            history2 = model.fit(
-                inputs_minus_labels,
-                labels,
-                validation_data=(inputs_minus_labels, labels),
-                sample_weight=sample_weight,
-                steps_per_epoch=1,
-                validation_steps=1,
-                shuffle=False,
-            )
-            val_loss2 = history2.history["val_loss"][0]
-            self.assertTrue(not isnan(val_loss2))
-            accuracy2 = {key: val[0] for key, val in history2.history.items() if key.endswith("accuracy")}
-            self.check_keras_fit_results(val_loss1, val_loss2)
-            self.assertEqual(history1.history.keys(), history2.history.keys())
-            for key in history1.history.keys():
-                if not key.startswith("val_"):
-                    self.assertTrue("val_" + key in history1.history.keys(), "Outputs differ in train/test step!")
-            if metrics:
-                self.assertTrue(len(accuracy1) == len(accuracy2) > 0, "Missing metrics!")
-
-    def test_int_support(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            prepared_for_class = self._prepare_for_class(
-                inputs_dict.copy(),
-                model_class,
-                return_labels=True if "labels" in inspect.signature(model_class.call).parameters.keys() else False,
-            )
-            if not any(
-                tensor.dtype.is_integer for tensor in prepared_for_class.values() if isinstance(tensor, tf.Tensor)
-            ):
-                return  # No integer inputs means no need for this test
-
-            prepared_for_class = {
-                key: tf.cast(tensor, tf.int64) if isinstance(tensor, tf.Tensor) and tensor.dtype.is_integer else tensor
-                for key, tensor in prepared_for_class.items()
-            }
-            model = model_class(config)
-            model(**prepared_for_class)  # No assertion, we're just checking this doesn't throw an error
-            int32_prepared_for_class = {
-                key: tf.cast(tensor, tf.int32) if isinstance(tensor, tf.Tensor) and tensor.dtype.is_integer else tensor
-                for key, tensor in prepared_for_class.items()
-            }
-            model(**int32_prepared_for_class)  # No assertion, we're just checking this doesn't throw an error
-
-            # After testing that the model accepts all int inputs, confirm that its dummies are int32
-            for key, tensor in model.dummy_inputs.items():
-                self.assertTrue(
-                    isinstance(tensor, tf.Tensor) or keras.backend.is_keras_tensor(tensor),
-                    "Dummy inputs should be tf.Tensor!",
-                )
-                if tensor.dtype.is_integer:
-                    self.assertTrue(tensor.dtype == tf.int32, "Integer dummy inputs should be tf.int32!")
-
-            # Also confirm that the input_signature uses int32
-            for key, tensor_spec in model.input_signature.items():
-                if tensor_spec.dtype.is_integer:
-                    self.assertTrue(tensor_spec.dtype == tf.int32, "Input signatures should use tf.int32 for ints!")
-
-    def test_load_with_mismatched_shapes(self):
-        if not self.test_mismatched_shapes:
-            return
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            if model_class not in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
-                continue
-
-            with self.subTest(msg=f"Testing {model_class}"):
-                with tempfile.TemporaryDirectory() as tmp_dir:
-                    model = model_class(config)
-                    inputs = self._prepare_for_class(inputs_dict, model_class)
-                    _ = model(**inputs)
-                    model.save_pretrained(tmp_dir)
-
-                    # Fails when we don't set ignore_mismatched_sizes=True
-                    with self.assertRaises(ValueError):
-                        new_model = TFAutoModelForSequenceClassification.from_pretrained(tmp_dir, num_labels=42)
-                    with self.assertRaises(ValueError):
-                        new_model_without_prefix = TFAutoModel.from_pretrained(tmp_dir, vocab_size=10)
-
-                    logger = logging.get_logger("transformers.modeling_tf_utils")
-                    with CaptureLogger(logger) as cl:
-                        new_model = TFAutoModelForSequenceClassification.from_pretrained(
-                            tmp_dir, num_labels=42, ignore_mismatched_sizes=True
-                        )
-                    self.assertIn("the shapes did not match", cl.out)
-
-                    logits = new_model(**inputs).logits
-                    self.assertEqual(logits.shape[1], 42)
-
-                    with CaptureLogger(logger) as cl:
-                        new_model_without_prefix = TFAutoModel.from_pretrained(
-                            tmp_dir, vocab_size=10, ignore_mismatched_sizes=True
-                        )
-                    self.assertIn("the shapes did not match", cl.out)
-
-                    # Although Tf models always have a prefix pointing to `MainLayer`,
-                    # we still add this "without prefix" test to keep a consistency between tf and pt tests.
-                    input_ids = ids_tensor((2, 8), 10)
-                    if self.is_encoder_decoder:
-                        new_model_without_prefix(input_ids, decoder_input_ids=input_ids)
-                    else:
-                        new_model_without_prefix(input_ids)
-
-    def test_model_main_input_name(self):
-        for model_class in self.all_model_classes:
-            model_signature = inspect.signature(getattr(model_class, "call"))
-            # The main input is the name of the argument after `self`
-            observed_main_input_name = list(model_signature.parameters.keys())[1]
-            self.assertEqual(model_class.main_input_name, observed_main_input_name)
-
-    def test_dataset_conversion(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            tf_inputs_dict = self._prepare_for_class(inputs_dict, model_class, return_labels=False)
-            if "labels" in tf_inputs_dict:
-                return  # This is some kinda funky decoder model that needs labels in its forward pass
-            tf_inputs_dict = {
-                key: val
-                for key, val in tf_inputs_dict.items()
-                if "head_mask" not in key and isinstance(val, tf.Tensor)
-            }
-            tf_inputs_dict["extra_unwanted_column"] = list(tf_inputs_dict.values())[0]  # Use a random other tensor
-            input_dataset = Dataset.from_dict(tf_inputs_dict)
-            tf_dataset = model.prepare_tf_dataset(
-                input_dataset, batch_size=len(input_dataset), drop_remainder=False, shuffle=False
-            )
-            test_batch = next(iter(tf_dataset))
-            if isinstance(test_batch, tf.Tensor):
-                self.assertEqual(len(test_batch), len(input_dataset))  # Assert we didn't lose any data
-            elif isinstance(test_batch, dict):
-                # Assert we discarded the unwanted extra column but kept everything else
-                self.assertEqual(len(test_batch), len(input_dataset.features) - 1)
-                self.assertNotIn("extra_unwanted_column", test_batch)
-                for tensor in test_batch.values():
-                    self.assertTrue(isinstance(tensor, tf.Tensor))
-                    self.assertEqual(len(tensor), len(input_dataset))  # Assert we didn't lose any data
-            model(test_batch, training=False)
-
-            if "labels" in inspect.signature(model_class.call).parameters.keys():
-                tf_inputs_dict = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-                if "labels" not in tf_inputs_dict:
-                    return  # This model isn't giving us labels after all, don't try training with it
-                tf_inputs_dict = {
-                    key: val
-                    for key, val in tf_inputs_dict.items()
-                    if "head_mask" not in key and isinstance(val, tf.Tensor)
-                }
-                tf_inputs_dict["extra_unwanted_column"] = list(tf_inputs_dict.values())[0]  # Use a random other tensor
-                input_dataset = Dataset.from_dict(tf_inputs_dict)
-                tf_dataset = model.prepare_tf_dataset(
-                    input_dataset, batch_size=len(input_dataset), drop_remainder=False, shuffle=False
-                )
-                test_batch, test_batch_labels = next(iter(tf_dataset))
-                self.assertGreater(len(test_batch_labels), 0)  # Assert the labels are present
-                feature_columns = 1 if isinstance(test_batch, tf.Tensor) else len(test_batch)
-                label_columns = 1 if isinstance(test_batch_labels, tf.Tensor) else len(test_batch_labels)
-                # Assert we discarded the unwanted extra column but kept everything else
-                self.assertEqual(feature_columns + label_columns, len(input_dataset.features) - 1)
-                if isinstance(test_batch, dict):
-                    self.assertNotIn("extra_unwanted_column", test_batch)
-                if isinstance(test_batch_labels, dict):
-                    self.assertNotIn("extra_unwanted_column", test_batch_labels)
-                model.compile(optimizer="sgd", run_eagerly=True)
-                model.train_on_batch(test_batch, test_batch_labels)
-
-
-def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
-    """Creates a random int32 tensor of the shape within the vocab size."""
-    if rng is None:
-        rng = random.Random()
-
-    total_dims = 1
-    for dim in shape:
-        total_dims *= dim
-
-    values = []
-    for _ in range(total_dims):
-        values.append(rng.randint(0, vocab_size - 1))
-
-    output = tf.constant(values, shape=shape, dtype=dtype if dtype is not None else tf.int32)
-
-    return output
-
-
-def random_attention_mask(shape, rng=None, name=None, dtype=None):
-    attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None, dtype=dtype)
-    # Mark the first token as 1 (matches behaviour of PyTorch/Flax function)
-    attn_mask = tf.concat([tf.ones_like(attn_mask[:, :1]), attn_mask[:, 1:]], axis=1)
-    return attn_mask
-
-
-def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None):
-    """Creates a random float32 tensor"""
-    if rng is None:
-        rng = random.Random()
-
-    total_dims = 1
-    for dim in shape:
-        total_dims *= dim
-
-    values = []
-    for _ in range(total_dims):
-        values.append(rng.random() * scale)
-
-    return tf.reshape(tf.constant(values, dtype=dtype if dtype is not None else tf.float32), shape=shape)
diff --git a/tests/utils/test_modeling_flax_utils.py b/tests/utils/test_modeling_flax_utils.py
deleted file mode 100644
index 7a2c516132..0000000000
--- a/tests/utils/test_modeling_flax_utils.py
+++ /dev/null
@@ -1,285 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import tempfile
-import unittest
-
-import numpy as np
-from huggingface_hub import HfFolder, snapshot_download
-
-from transformers import BertConfig, is_flax_available
-from transformers.testing_utils import (
-    TOKEN,
-    CaptureLogger,
-    TemporaryHubRepo,
-    is_staging_test,
-    require_flax,
-    require_safetensors,
-)
-from transformers.utils import FLAX_WEIGHTS_NAME, SAFE_WEIGHTS_NAME, logging
-
-
-if is_flax_available():
-    import os
-
-    from flax.core.frozen_dict import unfreeze
-    from flax.traverse_util import flatten_dict
-
-    from transformers import FlaxBertModel
-
-    os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"] = "0.12"  # assumed parallelism: 8
-
-
-@require_flax
-@is_staging_test
-class FlaxModelPushToHubTester(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls._token = TOKEN
-        HfFolder.save_token(TOKEN)
-
-    def test_push_to_hub(self):
-        with TemporaryHubRepo(token=self._token) as tmp_repo:
-            config = BertConfig(
-                vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
-            )
-            model = FlaxBertModel(config)
-            model.push_to_hub(tmp_repo.repo_id, token=self._token)
-
-            new_model = FlaxBertModel.from_pretrained(tmp_repo.repo_id)
-
-            base_params = flatten_dict(unfreeze(model.params))
-            new_params = flatten_dict(unfreeze(new_model.params))
-
-            for key in base_params.keys():
-                max_diff = (base_params[key] - new_params[key]).sum().item()
-                self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-    def test_push_to_hub_via_save_pretrained(self):
-        with TemporaryHubRepo(token=self._token) as tmp_repo:
-            config = BertConfig(
-                vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
-            )
-            model = FlaxBertModel(config)
-            # Push to hub via save_pretrained
-            with tempfile.TemporaryDirectory() as tmp_dir:
-                model.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, push_to_hub=True, token=self._token)
-
-            new_model = FlaxBertModel.from_pretrained(tmp_repo.repo_id)
-
-            base_params = flatten_dict(unfreeze(model.params))
-            new_params = flatten_dict(unfreeze(new_model.params))
-
-            for key in base_params.keys():
-                max_diff = (base_params[key] - new_params[key]).sum().item()
-                self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-    def test_push_to_hub_in_organization(self):
-        with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
-            config = BertConfig(
-                vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
-            )
-            model = FlaxBertModel(config)
-            model.push_to_hub(tmp_repo.repo_id, token=self._token)
-
-            new_model = FlaxBertModel.from_pretrained(tmp_repo.repo_id)
-
-            base_params = flatten_dict(unfreeze(model.params))
-            new_params = flatten_dict(unfreeze(new_model.params))
-
-            for key in base_params.keys():
-                max_diff = (base_params[key] - new_params[key]).sum().item()
-                self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-    def test_push_to_hub_in_organization_via_save_pretrained(self):
-        with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
-            config = BertConfig(
-                vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
-            )
-            model = FlaxBertModel(config)
-            # Push to hub via save_pretrained
-            with tempfile.TemporaryDirectory() as tmp_dir:
-                model.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, push_to_hub=True, token=self._token)
-
-            new_model = FlaxBertModel.from_pretrained(tmp_repo.repo_id)
-
-            base_params = flatten_dict(unfreeze(model.params))
-            new_params = flatten_dict(unfreeze(new_model.params))
-
-            for key in base_params.keys():
-                max_diff = (base_params[key] - new_params[key]).sum().item()
-                self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-
-def check_models_equal(model1, model2):
-    models_are_equal = True
-    flat_params_1 = flatten_dict(model1.params)
-    flat_params_2 = flatten_dict(model2.params)
-    for key in flat_params_1.keys():
-        if np.sum(np.abs(flat_params_1[key] - flat_params_2[key])) > 1e-4:
-            models_are_equal = False
-
-    return models_are_equal
-
-
-@require_flax
-class FlaxModelUtilsTest(unittest.TestCase):
-    def test_model_from_pretrained_subfolder(self):
-        config = BertConfig.from_pretrained("hf-internal-testing/tiny-bert-flax-only")
-        model = FlaxBertModel(config)
-
-        subfolder = "bert"
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(os.path.join(tmp_dir, subfolder))
-
-            with self.assertRaises(OSError):
-                _ = FlaxBertModel.from_pretrained(tmp_dir)
-
-            model_loaded = FlaxBertModel.from_pretrained(tmp_dir, subfolder=subfolder)
-
-        self.assertTrue(check_models_equal(model, model_loaded))
-
-    def test_model_from_pretrained_subfolder_sharded(self):
-        config = BertConfig.from_pretrained("hf-internal-testing/tiny-bert-flax-only")
-        model = FlaxBertModel(config)
-
-        subfolder = "bert"
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(os.path.join(tmp_dir, subfolder), max_shard_size="10KB")
-
-            with self.assertRaises(OSError):
-                _ = FlaxBertModel.from_pretrained(tmp_dir)
-
-            model_loaded = FlaxBertModel.from_pretrained(tmp_dir, subfolder=subfolder)
-
-        self.assertTrue(check_models_equal(model, model_loaded))
-
-    def test_model_from_pretrained_hub_subfolder(self):
-        subfolder = "bert"
-        model_id = "hf-internal-testing/tiny-random-bert-subfolder"
-
-        with self.assertRaises(OSError):
-            _ = FlaxBertModel.from_pretrained(model_id)
-
-        model = FlaxBertModel.from_pretrained(model_id, subfolder=subfolder)
-
-        self.assertIsNotNone(model)
-
-    def test_model_from_pretrained_hub_subfolder_sharded(self):
-        subfolder = "bert"
-        model_id = "hf-internal-testing/tiny-random-bert-sharded-subfolder"
-        with self.assertRaises(OSError):
-            _ = FlaxBertModel.from_pretrained(model_id)
-
-        model = FlaxBertModel.from_pretrained(model_id, subfolder=subfolder)
-
-        self.assertIsNotNone(model)
-
-    @require_safetensors
-    def test_safetensors_save_and_load(self):
-        model = FlaxBertModel.from_pretrained("hf-internal-testing/tiny-bert-flax-only")
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(tmp_dir, safe_serialization=True)
-
-            # No msgpack file, only a model.safetensors
-            self.assertTrue(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_NAME)))
-            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, FLAX_WEIGHTS_NAME)))
-
-            new_model = FlaxBertModel.from_pretrained(tmp_dir)
-
-        self.assertTrue(check_models_equal(model, new_model))
-
-    @require_safetensors
-    def test_safetensors_load_from_hub(self):
-        """
-        This test checks that we can load safetensors from a checkpoint that only has those on the Hub
-        """
-        flax_model = FlaxBertModel.from_pretrained("hf-internal-testing/tiny-bert-flax-only")
-
-        # Can load from the Flax-formatted checkpoint
-        safetensors_model = FlaxBertModel.from_pretrained("hf-internal-testing/tiny-bert-flax-safetensors-only")
-        self.assertTrue(check_models_equal(flax_model, safetensors_model))
-
-    @require_safetensors
-    def test_safetensors_load_from_local(self):
-        """
-        This test checks that we can load safetensors from a checkpoint that only has those on the Hub
-        """
-        with tempfile.TemporaryDirectory() as tmp:
-            location = snapshot_download("hf-internal-testing/tiny-bert-flax-only", cache_dir=tmp)
-            flax_model = FlaxBertModel.from_pretrained(location)
-
-        with tempfile.TemporaryDirectory() as tmp:
-            location = snapshot_download("hf-internal-testing/tiny-bert-flax-safetensors-only", cache_dir=tmp)
-            safetensors_model = FlaxBertModel.from_pretrained(location)
-
-        self.assertTrue(check_models_equal(flax_model, safetensors_model))
-
-    @require_safetensors
-    def test_safetensors_load_from_hub_msgpack_before_safetensors(self):
-        """
-        This test checks that we'll first download msgpack weights before safetensors
-        The safetensors file on that repo is a pt safetensors and therefore cannot be loaded without PyTorch
-        """
-        FlaxBertModel.from_pretrained("hf-internal-testing/tiny-bert-pt-safetensors-msgpack")
-
-    @require_safetensors
-    def test_safetensors_load_from_local_msgpack_before_safetensors(self):
-        """
-        This test checks that we'll first download msgpack weights before safetensors
-        The safetensors file on that repo is a pt safetensors and therefore cannot be loaded without PyTorch
-        """
-        with tempfile.TemporaryDirectory() as tmp:
-            location = snapshot_download("hf-internal-testing/tiny-bert-pt-safetensors-msgpack", cache_dir=tmp)
-            FlaxBertModel.from_pretrained(location)
-
-    @require_safetensors
-    def test_safetensors_flax_from_flax(self):
-        model = FlaxBertModel.from_pretrained("hf-internal-testing/tiny-bert-flax-only")
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(tmp_dir, safe_serialization=True)
-            new_model = FlaxBertModel.from_pretrained(tmp_dir)
-
-        self.assertTrue(check_models_equal(model, new_model))
-
-    @require_safetensors
-    def test_safetensors_flax_from_sharded_msgpack_with_sharded_safetensors_local(self):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            path = snapshot_download(
-                "hf-internal-testing/tiny-bert-flax-safetensors-msgpack-sharded", cache_dir=tmp_dir
-            )
-
-            # This should not raise even if there are two types of sharded weights
-            FlaxBertModel.from_pretrained(path)
-
-    @require_safetensors
-    def test_safetensors_flax_from_sharded_msgpack_with_sharded_safetensors_hub(self):
-        # This should not raise even if there are two types of sharded weights
-        # This should discard the safetensors weights in favor of the msgpack sharded weights
-        FlaxBertModel.from_pretrained("hf-internal-testing/tiny-bert-flax-safetensors-msgpack-sharded")
-
-    @require_safetensors
-    def test_safetensors_from_pt_bf16(self):
-        # This should not raise; should be able to load bf16-serialized torch safetensors without issue
-        # and without torch.
-        logger = logging.get_logger("transformers.modeling_flax_utils")
-
-        with CaptureLogger(logger) as cl:
-            FlaxBertModel.from_pretrained("hf-internal-testing/tiny-bert-pt-safetensors-bf16")
-
-        self.assertTrue(
-            "Some of the weights of FlaxBertModel were initialized in bfloat16 precision from the model checkpoint"
-            in cl.out
-        )
diff --git a/tests/utils/test_modeling_tf_core.py b/tests/utils/test_modeling_tf_core.py
deleted file mode 100644
index e15a804ef3..0000000000
--- a/tests/utils/test_modeling_tf_core.py
+++ /dev/null
@@ -1,403 +0,0 @@
-# Copyright 2019 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import copy
-import os
-import tempfile
-from importlib import import_module
-from math import isnan
-
-from transformers import is_tf_available
-from transformers.models.auto import get_values
-from transformers.testing_utils import require_tf, slow
-
-from ..test_modeling_tf_common import ids_tensor
-
-
-if is_tf_available():
-    import numpy as np
-    import tensorflow as tf
-
-    from transformers import (
-        TF_MODEL_FOR_CAUSAL_LM_MAPPING,
-        TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
-        TF_MODEL_FOR_MASKED_LM_MAPPING,
-        TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
-        TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
-        TF_MODEL_FOR_PRETRAINING_MAPPING,
-        TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
-        TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
-        TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
-        TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
-        TFSharedEmbeddings,
-    )
-    from transformers.modeling_tf_utils import keras
-
-
-@require_tf
-class TFCoreModelTesterMixin:
-    model_tester = None
-    all_model_classes = ()
-    all_generative_model_classes = ()
-    test_mismatched_shapes = True
-    test_resize_embeddings = True
-    test_head_masking = True
-    is_encoder_decoder = False
-
-    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> dict:
-        inputs_dict = copy.deepcopy(inputs_dict)
-
-        if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
-            inputs_dict = {
-                k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1))
-                if isinstance(v, tf.Tensor) and v.ndim > 0
-                else v
-                for k, v in inputs_dict.items()
-            }
-
-        if return_labels:
-            if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
-                inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32)
-            elif model_class in get_values(TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING):
-                inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-                inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-            elif model_class in [
-                *get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING),
-                *get_values(TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING),
-            ]:
-                inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-            elif model_class in get_values(TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING):
-                inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-            elif model_class in [
-                *get_values(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
-                *get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING),
-                *get_values(TF_MODEL_FOR_MASKED_LM_MAPPING),
-                *get_values(TF_MODEL_FOR_PRETRAINING_MAPPING),
-                *get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
-            ]:
-                inputs_dict["labels"] = tf.zeros(
-                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32
-                )
-        return inputs_dict
-
-    @slow
-    def test_graph_mode(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes[:2]:
-            inputs = self._prepare_for_class(inputs_dict, model_class)
-            model = model_class(config)
-
-            @tf.function
-            def run_in_graph_mode():
-                return model(inputs)
-
-            outputs = run_in_graph_mode()
-            self.assertIsNotNone(outputs)
-
-    @slow
-    def test_xla_mode(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes[:2]:
-            inputs = self._prepare_for_class(inputs_dict, model_class)
-            model = model_class(config)
-
-            @tf.function(experimental_compile=True)
-            def run_in_graph_mode():
-                return model(inputs)
-
-            outputs = run_in_graph_mode()
-            self.assertIsNotNone(outputs)
-
-    @slow
-    def test_xla_fit(self):
-        # This is a copy of the test_keras_fit method, but we use XLA compilation instead of eager
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes[:2]:
-            model = model_class(config)
-            if getattr(model, "hf_compute_loss", None):
-                # Test that model correctly compute the loss with kwargs
-                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
-                # Is there a better way to remove these decoder inputs?
-                prepared_for_class = {
-                    key: val
-                    for key, val in prepared_for_class.items()
-                    if key not in ("head_mask", "decoder_head_mask", "cross_attn_head_mask", "decoder_input_ids")
-                }
-
-                possible_label_cols = {
-                    "labels",
-                    "label",
-                    "label_ids",
-                    "start_positions",
-                    "start_position",
-                    "end_positions",
-                    "end_position",
-                    "next_sentence_label",
-                }
-                label_names = possible_label_cols.intersection(set(prepared_for_class))
-                self.assertGreater(len(label_names), 0, msg="No matching label names found!")
-                labels = {key: val for key, val in prepared_for_class.items() if key in label_names}
-                inputs_minus_labels = {key: val for key, val in prepared_for_class.items() if key not in label_names}
-                self.assertGreater(len(inputs_minus_labels), 0)
-
-                # Make sure it works with XLA!
-                model.compile(optimizer=keras.optimizers.SGD(0.0), jit_compile=True)
-                # Make sure the model fits without crashing regardless of where we pass the labels
-                history = model.fit(
-                    prepared_for_class,
-                    validation_data=prepared_for_class,
-                    steps_per_epoch=1,
-                    validation_steps=1,
-                    shuffle=False,
-                    verbose=0,
-                )
-                loss = history.history["loss"][0]
-                self.assertTrue(not isnan(loss))
-                val_loss = history.history["val_loss"][0]
-                self.assertTrue(not isnan(val_loss))
-
-                # Now test it with separate labels, to make sure that path works in XLA too.
-                model = model_class(config)
-                model.compile(optimizer=keras.optimizers.SGD(0.0), jit_compile=True)
-                history = model.fit(
-                    inputs_minus_labels,
-                    labels,
-                    validation_data=(inputs_minus_labels, labels),
-                    steps_per_epoch=1,
-                    validation_steps=1,
-                    shuffle=False,
-                    verbose=0,
-                )
-
-                loss = history.history["loss"][0]
-                self.assertTrue(not isnan(loss))
-                val_loss = history.history["val_loss"][0]
-                self.assertTrue(not isnan(val_loss))
-
-    @slow
-    def test_saved_model_creation_extended(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.output_hidden_states = True
-        config.output_attentions = True
-
-        if hasattr(config, "use_cache"):
-            config.use_cache = True
-
-        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length)
-        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
-
-        for model_class in self.all_model_classes[:2]:
-            class_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-            model = model_class(config)
-            model.build_in_name_scope()
-            num_out = len(model(class_inputs_dict))
-
-            for key in list(class_inputs_dict.keys()):
-                # Remove keys not in the serving signature, as the SavedModel will not be compiled to deal with them
-                if key not in model.input_signature:
-                    del class_inputs_dict[key]
-                # Check it's a tensor, in case the inputs dict has some bools in it too
-                elif isinstance(class_inputs_dict[key], tf.Tensor) and class_inputs_dict[key].dtype.is_integer:
-                    class_inputs_dict[key] = tf.cast(class_inputs_dict[key], tf.int32)
-
-            if set(class_inputs_dict.keys()) != set(model.input_signature.keys()):
-                continue  # Some models have inputs that the preparation functions don't create, we skip those
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname, saved_model=True)
-                saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
-                model = keras.models.load_model(saved_model_dir)
-                outputs = model(class_inputs_dict)
-
-                if self.is_encoder_decoder:
-                    output_hidden_states = outputs["encoder_hidden_states"]
-                    output_attentions = outputs["encoder_attentions"]
-                else:
-                    output_hidden_states = outputs["hidden_states"]
-                    output_attentions = outputs["attentions"]
-
-                self.assertEqual(len(outputs), num_out)
-
-                expected_num_layers = getattr(
-                    self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-                )
-
-                self.assertEqual(len(output_hidden_states), expected_num_layers)
-                self.assertListEqual(
-                    list(output_hidden_states[0].shape[-2:]),
-                    [self.model_tester.seq_length, self.model_tester.hidden_size],
-                )
-
-                self.assertEqual(len(output_attentions), self.model_tester.num_hidden_layers)
-                self.assertListEqual(
-                    list(output_attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
-                )
-
-    @slow
-    def test_mixed_precision(self):
-        keras.mixed_precision.set_global_policy("mixed_float16")
-
-        # try/finally block to ensure subsequent tests run in float32
-        try:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            for model_class in self.all_model_classes[:2]:
-                class_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                model = model_class(config)
-                outputs = model(class_inputs_dict)
-
-                self.assertIsNotNone(outputs)
-        finally:
-            keras.mixed_precision.set_global_policy("float32")
-
-    @slow
-    def test_train_pipeline_custom_model(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        # head_mask and decoder_head_mask has different shapes than other input args
-        if "head_mask" in inputs_dict:
-            del inputs_dict["head_mask"]
-        if "decoder_head_mask" in inputs_dict:
-            del inputs_dict["decoder_head_mask"]
-        if "cross_attn_head_mask" in inputs_dict:
-            del inputs_dict["cross_attn_head_mask"]
-        tf_main_layer_classes = {
-            module_member
-            for model_class in self.all_model_classes
-            for module in (import_module(model_class.__module__),)
-            for module_member_name in dir(module)
-            if module_member_name.endswith("MainLayer")
-            for module_member in (getattr(module, module_member_name),)
-            if isinstance(module_member, type)
-            and keras.layers.Layer in module_member.__bases__
-            and getattr(module_member, "_keras_serializable", False)
-        }
-
-        for main_layer_class in tf_main_layer_classes:
-            # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter
-            if "T5" in main_layer_class.__name__:
-                # Take the same values than in TFT5ModelTester for this shared layer
-                shared = TFSharedEmbeddings(self.model_tester.vocab_size, self.model_tester.hidden_size, name="shared")
-                config.use_cache = False
-                main_layer = main_layer_class(config, embed_tokens=shared)
-            else:
-                main_layer = main_layer_class(config)
-
-            symbolic_inputs = {
-                name: keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items()
-            }
-
-            if hasattr(self.model_tester, "num_labels"):
-                num_labels = self.model_tester.num_labels
-            else:
-                num_labels = 2
-
-            X = tf.data.Dataset.from_tensor_slices(
-                (inputs_dict, np.ones((self.model_tester.batch_size, self.model_tester.seq_length, num_labels, 1)))
-            ).batch(1)
-
-            hidden_states = main_layer(symbolic_inputs)[0]
-            outputs = keras.layers.Dense(num_labels, activation="softmax", name="outputs")(hidden_states)
-            model = keras.models.Model(inputs=symbolic_inputs, outputs=[outputs])
-
-            model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["binary_accuracy"])
-            model.fit(X, epochs=1)
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                filepath = os.path.join(tmpdirname, "keras_model.h5")
-                model.save(filepath)
-                if "T5" in main_layer_class.__name__:
-                    model = keras.models.load_model(
-                        filepath,
-                        custom_objects={
-                            main_layer_class.__name__: main_layer_class,
-                            "TFSharedEmbeddings": TFSharedEmbeddings,
-                        },
-                    )
-                else:
-                    model = keras.models.load_model(
-                        filepath, custom_objects={main_layer_class.__name__: main_layer_class}
-                    )
-                assert isinstance(model, keras.Model)
-                model(inputs_dict)
-
-    @slow
-    def test_graph_mode_with_inputs_embeds(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes[:2]:
-            model = model_class(config)
-
-            inputs = copy.deepcopy(inputs_dict)
-
-            if not self.is_encoder_decoder:
-                input_ids = inputs["input_ids"]
-                del inputs["input_ids"]
-            else:
-                encoder_input_ids = inputs["input_ids"]
-                decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
-                del inputs["input_ids"]
-                inputs.pop("decoder_input_ids", None)
-
-            if not self.is_encoder_decoder:
-                inputs["inputs_embeds"] = model.get_input_embeddings()(input_ids)
-            else:
-                inputs["inputs_embeds"] = model.get_input_embeddings()(encoder_input_ids)
-                inputs["decoder_inputs_embeds"] = model.get_input_embeddings()(decoder_input_ids)
-
-            inputs = self._prepare_for_class(inputs, model_class)
-
-            @tf.function
-            def run_in_graph_mode():
-                return model(inputs)
-
-            outputs = run_in_graph_mode()
-            self.assertIsNotNone(outputs)
-
-    def _generate_random_bad_tokens(self, num_bad_tokens, model):
-        # special tokens cannot be bad tokens
-        special_tokens = []
-        if model.config.bos_token_id is not None:
-            special_tokens.append(model.config.bos_token_id)
-        if model.config.pad_token_id is not None:
-            special_tokens.append(model.config.pad_token_id)
-        if model.config.eos_token_id is not None:
-            special_tokens.append(model.config.eos_token_id)
-
-        # create random bad tokens that are not special tokens
-        bad_tokens = []
-        while len(bad_tokens) < num_bad_tokens:
-            token = tf.squeeze(ids_tensor((1, 1), self.model_tester.vocab_size), 0).numpy()[0]
-            if token not in special_tokens:
-                bad_tokens.append(token)
-        return bad_tokens
-
-    def _check_generated_ids(self, output_ids):
-        for token_id in output_ids[0].numpy().tolist():
-            self.assertGreaterEqual(token_id, 0)
-            self.assertLess(token_id, self.model_tester.vocab_size)
-
-    def _check_match_tokens(self, generated_ids, bad_words_ids):
-        # for all bad word tokens
-        for bad_word_ids in bad_words_ids:
-            # for all slices in batch
-            for generated_ids_slice in generated_ids:
-                # for all word idx
-                for i in range(len(bad_word_ids), len(generated_ids_slice)):
-                    # if tokens match
-                    if generated_ids_slice[i - len(bad_word_ids) : i] == bad_word_ids:
-                        return True
-        return False
diff --git a/tests/utils/test_modeling_tf_utils.py b/tests/utils/test_modeling_tf_utils.py
deleted file mode 100644
index 079b385b51..0000000000
--- a/tests/utils/test_modeling_tf_utils.py
+++ /dev/null
@@ -1,662 +0,0 @@
-# Copyright 2019 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import json
-import os
-import random
-import tempfile
-import unittest
-import unittest.mock as mock
-
-from huggingface_hub import HfFolder, snapshot_download
-from requests.exceptions import HTTPError
-
-from transformers import is_tf_available
-from transformers.configuration_utils import PretrainedConfig
-from transformers.testing_utils import (  # noqa: F401
-    TOKEN,
-    USER,
-    CaptureLogger,
-    TemporaryHubRepo,
-    is_staging_test,
-    require_safetensors,
-    require_tf,
-    slow,
-)
-from transformers.utils import (
-    SAFE_WEIGHTS_INDEX_NAME,
-    SAFE_WEIGHTS_NAME,
-    TF2_WEIGHTS_INDEX_NAME,
-    TF2_WEIGHTS_NAME,
-    logging,
-)
-
-
-logger = logging.get_logger(__name__)
-
-
-if is_tf_available():
-    import h5py
-    import numpy as np
-    import tensorflow as tf
-
-    from transformers import (
-        BertConfig,
-        RagRetriever,
-        TFBertForSequenceClassification,
-        TFBertModel,
-        TFRagModel,
-    )
-    from transformers.modeling_tf_utils import keras, tf_shard_checkpoint, unpack_inputs
-    from transformers.tf_utils import stable_softmax
-
-    tf.config.experimental.enable_tensor_float_32_execution(False)
-
-
-@require_tf
-class TFModelUtilsTest(unittest.TestCase):
-    def test_cached_files_are_used_when_internet_is_down(self):
-        # A mock response for an HTTP head request to emulate server down
-        response_mock = mock.Mock()
-        response_mock.status_code = 500
-        response_mock.headers = {}
-        response_mock.raise_for_status.side_effect = HTTPError
-        response_mock.json.return_value = {}
-
-        # Download this model to make sure it's in the cache.
-        _ = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-
-        # Under the mock environment we get a 500 error when trying to reach the model.
-        with mock.patch("requests.Session.request", return_value=response_mock) as mock_head:
-            _ = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-            # This check we did call the fake head request
-            mock_head.assert_called()
-
-    # tests whether the unpack_inputs function behaves as expected
-    def test_unpack_inputs(self):
-        class DummyModel:
-            def __init__(self):
-                config_kwargs = {"output_attentions": False, "output_hidden_states": False, "return_dict": False}
-                self.config = PretrainedConfig(**config_kwargs)
-                self.main_input_name = "input_ids"
-
-            @unpack_inputs
-            def call(
-                self,
-                input_ids=None,
-                past_key_values=None,
-                output_attentions=None,
-                output_hidden_states=None,
-                return_dict=None,
-            ):
-                return input_ids, past_key_values, output_attentions, output_hidden_states, return_dict
-
-            @unpack_inputs
-            def foo(self, pixel_values, output_attentions=None, output_hidden_states=None, return_dict=None):
-                return pixel_values, output_attentions, output_hidden_states, return_dict
-
-        dummy_model = DummyModel()
-        input_ids = tf.constant([0, 1, 2, 3], dtype=tf.int32)
-        past_key_values = tf.constant([4, 5, 6, 7], dtype=tf.int32)
-        pixel_values = tf.constant([8, 9, 10, 11], dtype=tf.int32)
-
-        # test case 1: Pass inputs as keyword arguments; Booleans are inherited from the config.
-        output = dummy_model.call(input_ids=input_ids, past_key_values=past_key_values)
-        tf.debugging.assert_equal(output[0], input_ids)
-        tf.debugging.assert_equal(output[1], past_key_values)
-        self.assertFalse(output[2])
-        self.assertFalse(output[3])
-        self.assertFalse(output[4])
-
-        # test case 2: Same as above, but with positional arguments.
-        output = dummy_model.call(input_ids, past_key_values)
-        tf.debugging.assert_equal(output[0], input_ids)
-        tf.debugging.assert_equal(output[1], past_key_values)
-        self.assertFalse(output[2])
-        self.assertFalse(output[3])
-        self.assertFalse(output[4])
-
-        # test case 3: We can also pack everything in the first input.
-        output = dummy_model.call(input_ids={"input_ids": input_ids, "past_key_values": past_key_values})
-        tf.debugging.assert_equal(output[0], input_ids)
-        tf.debugging.assert_equal(output[1], past_key_values)
-        self.assertFalse(output[2])
-        self.assertFalse(output[3])
-        self.assertFalse(output[4])
-
-        # test case 4: Explicit boolean arguments should override the config.
-        output = dummy_model.call(
-            input_ids=input_ids, past_key_values=past_key_values, output_attentions=False, return_dict=True
-        )
-        tf.debugging.assert_equal(output[0], input_ids)
-        tf.debugging.assert_equal(output[1], past_key_values)
-        self.assertFalse(output[2])
-        self.assertFalse(output[3])
-        self.assertTrue(output[4])
-
-        # test case 5: Unexpected arguments should raise an exception.
-        with self.assertRaises(ValueError):
-            output = dummy_model.call(input_ids=input_ids, past_key_values=past_key_values, foo="bar")
-
-        # test case 6: the decorator is independent from `main_input_name` -- it treats the first argument of the
-        # decorated function as its main input.
-        output = dummy_model.foo(pixel_values=pixel_values)
-        tf.debugging.assert_equal(output[0], pixel_values)
-        self.assertFalse(output[1])
-        self.assertFalse(output[2])
-        self.assertFalse(output[3])
-
-    # Tests whether the stable softmax is stable on CPU, with and without XLA
-    def test_xla_stable_softmax(self):
-        large_penalty = -1e9
-        n_tokens = 10
-        batch_size = 8
-
-        def masked_softmax(x, boolean_mask):
-            numerical_mask = (1.0 - tf.cast(boolean_mask, dtype=tf.float32)) * large_penalty
-            masked_x = x + numerical_mask
-            return stable_softmax(masked_x)
-
-        xla_masked_softmax = tf.function(masked_softmax, jit_compile=True)
-        xla_stable_softmax = tf.function(stable_softmax, jit_compile=True)
-        x = tf.random.normal((batch_size, n_tokens))
-
-        # Same outcome regardless of the boolean mask here
-        masked_tokens = random.randint(0, n_tokens)
-        boolean_mask = tf.convert_to_tensor([[1] * (n_tokens - masked_tokens) + [0] * masked_tokens], dtype=tf.int32)
-
-        # We can randomly mask a random numerical input OUTSIDE XLA
-        numerical_mask = (1.0 - tf.cast(boolean_mask, dtype=tf.float32)) * large_penalty
-        masked_x = x + numerical_mask
-        xla_out = xla_stable_softmax(masked_x)
-        out = stable_softmax(masked_x)
-        assert tf.experimental.numpy.allclose(xla_out, out)
-
-        # The stable softmax has the same output as the original softmax
-        unstable_out = tf.nn.softmax(masked_x)
-        assert tf.experimental.numpy.allclose(unstable_out, out)
-
-        # We can randomly mask a random numerical input INSIDE XLA
-        xla_out = xla_masked_softmax(x, boolean_mask)
-        out = masked_softmax(x, boolean_mask)
-        assert tf.experimental.numpy.allclose(xla_out, out)
-
-    def test_checkpoint_sharding_from_hub(self):
-        model = TFBertModel.from_pretrained("ArthurZ/tiny-random-bert-sharded")
-        # the model above is the same as the model below, just a sharded version.
-        ref_model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-        for p1, p2 in zip(model.weights, ref_model.weights):
-            assert np.allclose(p1.numpy(), p2.numpy())
-
-    def test_sharded_checkpoint_with_prefix(self):
-        model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert", load_weight_prefix="a/b")
-        sharded_model = TFBertModel.from_pretrained("ArthurZ/tiny-random-bert-sharded", load_weight_prefix="a/b")
-        for p1, p2 in zip(model.weights, sharded_model.weights):
-            self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
-            self.assertTrue(p1.name.startswith("a/b/"))
-            self.assertTrue(p2.name.startswith("a/b/"))
-
-    def test_sharded_checkpoint_transfer(self):
-        # If this doesn't throw an error then the test passes
-        TFBertForSequenceClassification.from_pretrained("ArthurZ/tiny-random-bert-sharded")
-
-    def test_shard_checkpoint(self):
-        # This is the model we will use, total size 340,000 bytes.
-        model = keras.Sequential(
-            [
-                keras.layers.Dense(200, use_bias=False),  # size 80,000
-                keras.layers.Dense(200, use_bias=False),  # size 160,000
-                keras.layers.Dense(100, use_bias=False),  # size 80,000
-                keras.layers.Dense(50, use_bias=False),  # size 20,000
-            ]
-        )
-        inputs = tf.zeros((1, 100), dtype=tf.float32)
-        model(inputs)
-        weights = model.weights
-        weights_dict = {w.name: w for w in weights}
-        with self.subTest("No shard when max size is bigger than model size"):
-            shards, index = tf_shard_checkpoint(weights)
-            self.assertIsNone(index)
-            self.assertDictEqual(shards, {TF2_WEIGHTS_NAME: weights})
-
-        with self.subTest("Test sharding, no weights bigger than max size"):
-            shards, index = tf_shard_checkpoint(weights, max_shard_size="300kB")
-            # Split is first two layers then last two.
-            self.assertDictEqual(
-                index,
-                {
-                    "metadata": {"total_size": 340000},
-                    "weight_map": {
-                        "dense/kernel:0": "tf_model-00001-of-00002.h5",
-                        "dense_1/kernel:0": "tf_model-00001-of-00002.h5",
-                        "dense_2/kernel:0": "tf_model-00002-of-00002.h5",
-                        "dense_3/kernel:0": "tf_model-00002-of-00002.h5",
-                    },
-                },
-            )
-
-            shard1 = [weights_dict["dense/kernel:0"], weights_dict["dense_1/kernel:0"]]
-            shard2 = [weights_dict["dense_2/kernel:0"], weights_dict["dense_3/kernel:0"]]
-            self.assertDictEqual(shards, {"tf_model-00001-of-00002.h5": shard1, "tf_model-00002-of-00002.h5": shard2})
-
-        with self.subTest("Test sharding with weights bigger than max size"):
-            shards, index = tf_shard_checkpoint(weights, max_shard_size="100kB")
-            # Split is first layer, second layer then last 2.
-            self.assertDictEqual(
-                index,
-                {
-                    "metadata": {"total_size": 340000},
-                    "weight_map": {
-                        "dense/kernel:0": "tf_model-00001-of-00003.h5",
-                        "dense_1/kernel:0": "tf_model-00002-of-00003.h5",
-                        "dense_2/kernel:0": "tf_model-00003-of-00003.h5",
-                        "dense_3/kernel:0": "tf_model-00003-of-00003.h5",
-                    },
-                },
-            )
-
-            shard1 = [weights_dict["dense/kernel:0"]]
-            shard2 = [weights_dict["dense_1/kernel:0"]]
-            shard3 = [weights_dict["dense_2/kernel:0"], weights_dict["dense_3/kernel:0"]]
-            self.assertDictEqual(
-                shards,
-                {
-                    "tf_model-00001-of-00003.h5": shard1,
-                    "tf_model-00002-of-00003.h5": shard2,
-                    "tf_model-00003-of-00003.h5": shard3,
-                },
-            )
-
-    @slow
-    def test_special_layer_name_sharding(self):
-        retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
-        model = TFRagModel.from_pretrained("facebook/rag-token-nq", retriever=retriever)
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            for max_size in ["150kB", "150kiB", "200kB", "200kiB"]:
-                model.save_pretrained(tmp_dir, max_shard_size=max_size)
-                ref_model = TFRagModel.from_pretrained(tmp_dir, retriever=retriever)
-                for p1, p2 in zip(model.weights, ref_model.weights):
-                    assert np.allclose(p1.numpy(), p2.numpy())
-
-    @require_safetensors
-    def test_checkpoint_sharding_local(self):
-        model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            # We use the same folder for various sizes to make sure a new save erases the old checkpoint.
-            for max_size in ["150kB", "150kiB", "200kB", "200kiB"]:
-                model.save_pretrained(tmp_dir, max_shard_size=max_size)
-
-                # Get each shard file and its size
-                shard_to_size = {}
-                for shard in os.listdir(tmp_dir):
-                    if shard.endswith(".h5"):
-                        shard_file = os.path.join(tmp_dir, shard)
-                        shard_to_size[shard_file] = os.path.getsize(shard_file)
-
-                index_file = os.path.join(tmp_dir, TF2_WEIGHTS_INDEX_NAME)
-                # Check there is an index but no regular weight file
-                self.assertTrue(os.path.isfile(index_file))
-                self.assertFalse(os.path.isfile(os.path.join(tmp_dir, TF2_WEIGHTS_NAME)))
-
-                # Check a file is bigger than max_size only when it has a single weight
-                for shard_file, size in shard_to_size.items():
-                    if max_size.endswith("kiB"):
-                        max_size_int = int(max_size[:-3]) * 2**10
-                    else:
-                        max_size_int = int(max_size[:-2]) * 10**3
-                    # Note: pickle adds some junk so the weight of the file can end up being slightly bigger than
-                    # the size asked for (since we count parameters)
-                    if size >= max_size_int + 50000:
-                        with h5py.File(shard_file, "r") as state_file:
-                            self.assertEqual(len(state_file), 1)
-
-                # Check the index and the shard files found match
-                with open(index_file, encoding="utf-8") as f:
-                    index = json.loads(f.read())
-
-                all_shards = set(index["weight_map"].values())
-                shards_found = {f for f in os.listdir(tmp_dir) if f.endswith(".h5")}
-                self.assertSetEqual(all_shards, shards_found)
-
-                # Finally, check the model can be reloaded
-                new_model = TFBertModel.from_pretrained(tmp_dir)
-
-                model.build_in_name_scope()
-                new_model.build_in_name_scope()
-
-                for p1, p2 in zip(model.weights, new_model.weights):
-                    self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
-
-    def test_safetensors_checkpoint_sharding_local(self):
-        model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            # We use the same folder for various sizes to make sure a new save erases the old checkpoint.
-            for max_size in ["150kB", "150kiB", "200kB", "200kiB"]:
-                model.save_pretrained(tmp_dir, max_shard_size=max_size, safe_serialization=True)
-
-                # Get each shard file and its size
-                shard_to_size = {}
-                for shard in os.listdir(tmp_dir):
-                    if shard.endswith(".h5"):
-                        shard_file = os.path.join(tmp_dir, shard)
-                        shard_to_size[shard_file] = os.path.getsize(shard_file)
-
-                index_file = os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)
-                # Check there is an index but no regular weight file
-                self.assertTrue(os.path.isfile(index_file))
-                self.assertFalse(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_NAME)))
-                self.assertFalse(os.path.isfile(os.path.join(tmp_dir, TF2_WEIGHTS_NAME)))
-                self.assertFalse(os.path.isfile(os.path.join(tmp_dir, TF2_WEIGHTS_INDEX_NAME)))
-
-                # Check the index and the shard files found match
-                with open(index_file, encoding="utf-8") as f:
-                    index = json.loads(f.read())
-
-                all_shards = set(index["weight_map"].values())
-                shards_found = {f for f in os.listdir(tmp_dir) if f.endswith(".safetensors")}
-                self.assertSetEqual(all_shards, shards_found)
-
-                # Finally, check the model can be reloaded
-                new_model = TFBertModel.from_pretrained(tmp_dir)
-
-                model.build_in_name_scope()
-                new_model.build_in_name_scope()
-
-                for p1, p2 in zip(model.weights, new_model.weights):
-                    self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
-
-    @slow
-    def test_save_pretrained_signatures(self):
-        model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-
-        # Short custom TF signature function.
-        # `input_signature` is specific to BERT.
-        @tf.function(
-            input_signature=[
-                [
-                    tf.TensorSpec([None, None], tf.int32, name="input_ids"),
-                    tf.TensorSpec([None, None], tf.int32, name="token_type_ids"),
-                    tf.TensorSpec([None, None], tf.int32, name="attention_mask"),
-                ]
-            ]
-        )
-        def serving_fn(input):
-            return model(input)
-
-        # Using default signature (default behavior) overrides 'serving_default'
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(tmp_dir, saved_model=True, signatures=None)
-            model_loaded = keras.models.load_model(f"{tmp_dir}/saved_model/1")
-            self.assertTrue("serving_default" in list(model_loaded.signatures.keys()))
-
-        # Providing custom signature function
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(tmp_dir, saved_model=True, signatures={"custom_signature": serving_fn})
-            model_loaded = keras.models.load_model(f"{tmp_dir}/saved_model/1")
-            self.assertTrue("custom_signature" in list(model_loaded.signatures.keys()))
-
-        # Providing multiple custom signature function
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(
-                tmp_dir,
-                saved_model=True,
-                signatures={"custom_signature_1": serving_fn, "custom_signature_2": serving_fn},
-            )
-            model_loaded = keras.models.load_model(f"{tmp_dir}/saved_model/1")
-            self.assertTrue("custom_signature_1" in list(model_loaded.signatures.keys()))
-            self.assertTrue("custom_signature_2" in list(model_loaded.signatures.keys()))
-
-    @require_safetensors
-    def test_safetensors_save_and_load(self):
-        model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(tmp_dir, safe_serialization=True)
-            # No tf_model.h5 file, only a model.safetensors
-            self.assertTrue(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_NAME)))
-            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)))
-            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, TF2_WEIGHTS_NAME)))
-            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, TF2_WEIGHTS_INDEX_NAME)))
-
-            new_model = TFBertModel.from_pretrained(tmp_dir)
-
-            # Check models are equal
-            for p1, p2 in zip(model.weights, new_model.weights):
-                self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
-
-    @require_safetensors
-    def test_safetensors_sharded_save_and_load(self):
-        model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(tmp_dir, safe_serialization=True, max_shard_size="150kB")
-            # No tf weights or index file, only a safetensors index
-            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_NAME)))
-            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, TF2_WEIGHTS_NAME)))
-            self.assertTrue(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)))
-            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, TF2_WEIGHTS_INDEX_NAME)))
-
-            new_model = TFBertModel.from_pretrained(tmp_dir)
-
-            # Check models are equal
-            for p1, p2 in zip(model.weights, new_model.weights):
-                self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
-
-    @require_safetensors
-    def test_safetensors_load_from_hub(self):
-        tf_model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-
-        # Can load from the TF-formatted checkpoint
-        safetensors_model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert-safetensors-tf")
-
-        # Check models are equal
-        for p1, p2 in zip(safetensors_model.weights, tf_model.weights):
-            self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
-
-        # Can load from the PyTorch-formatted checkpoint
-        safetensors_model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert-safetensors")
-
-        # Check models are equal
-        for p1, p2 in zip(safetensors_model.weights, tf_model.weights):
-            self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
-
-    @require_safetensors
-    def test_safetensors_tf_from_tf(self):
-        model = TFBertModel.from_pretrained("hf-internal-testing/tiny-bert-tf-only")
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(tmp_dir, safe_serialization=True)
-            new_model = TFBertModel.from_pretrained(tmp_dir)
-
-        for p1, p2 in zip(model.weights, new_model.weights):
-            self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
-
-    @require_safetensors
-    def test_safetensors_tf_from_sharded_h5_with_sharded_safetensors_local(self):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            path = snapshot_download("hf-internal-testing/tiny-bert-tf-safetensors-h5-sharded", cache_dir=tmp_dir)
-
-            # This should not raise even if there are two types of sharded weights
-            TFBertModel.from_pretrained(path)
-
-    @require_safetensors
-    def test_safetensors_tf_from_sharded_h5_with_sharded_safetensors_hub(self):
-        # Confirm that we can correctly load the safetensors weights from a sharded hub repo even when TF weights present
-        TFBertModel.from_pretrained("hf-internal-testing/tiny-bert-tf-safetensors-h5-sharded", use_safetensors=True)
-        # Confirm that we can access the TF weights too
-        TFBertModel.from_pretrained("hf-internal-testing/tiny-bert-tf-safetensors-h5-sharded", use_safetensors=False)
-
-    @require_safetensors
-    def test_safetensors_load_from_local(self):
-        """
-        This test checks that we can load safetensors from a checkpoint that only has those on the Hub
-        """
-        with tempfile.TemporaryDirectory() as tmp:
-            location = snapshot_download("hf-internal-testing/tiny-bert-tf-only", cache_dir=tmp)
-            tf_model = TFBertModel.from_pretrained(location)
-
-        with tempfile.TemporaryDirectory() as tmp:
-            location = snapshot_download("hf-internal-testing/tiny-bert-tf-safetensors-only", cache_dir=tmp)
-            safetensors_model = TFBertModel.from_pretrained(location)
-
-        for p1, p2 in zip(tf_model.weights, safetensors_model.weights):
-            self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
-
-    @require_safetensors
-    def test_safetensors_load_from_hub_from_safetensors_pt(self):
-        """
-        This test checks that we can load safetensors from a checkpoint that only has those on the Hub.
-        saved in the "pt" format.
-        """
-        tf_model = TFBertModel.from_pretrained("hf-internal-testing/tiny-bert-h5")
-
-        # Can load from the PyTorch-formatted checkpoint
-        safetensors_model = TFBertModel.from_pretrained("hf-internal-testing/tiny-bert-pt-safetensors")
-        for p1, p2 in zip(tf_model.weights, safetensors_model.weights):
-            self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
-
-    @require_safetensors
-    def test_safetensors_load_from_local_from_safetensors_pt(self):
-        """
-        This test checks that we can load safetensors from a local checkpoint that only has those
-        saved in the "pt" format.
-        """
-        with tempfile.TemporaryDirectory() as tmp:
-            location = snapshot_download("hf-internal-testing/tiny-bert-h5", cache_dir=tmp)
-            tf_model = TFBertModel.from_pretrained(location)
-
-        # Can load from the PyTorch-formatted checkpoint
-        with tempfile.TemporaryDirectory() as tmp:
-            location = snapshot_download("hf-internal-testing/tiny-bert-pt-safetensors", cache_dir=tmp)
-            safetensors_model = TFBertModel.from_pretrained(location)
-
-        for p1, p2 in zip(tf_model.weights, safetensors_model.weights):
-            self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
-
-    @require_safetensors
-    def test_safetensors_load_from_hub_h5_before_safetensors(self):
-        """
-        This test checks that we'll first download h5 weights before safetensors
-        The safetensors file on that repo is a pt safetensors and therefore cannot be loaded without PyTorch
-        """
-        TFBertModel.from_pretrained("hf-internal-testing/tiny-bert-pt-safetensors-msgpack")
-
-    @require_safetensors
-    def test_safetensors_load_from_local_h5_before_safetensors(self):
-        """
-        This test checks that we'll first download h5 weights before safetensors
-        The safetensors file on that repo is a pt safetensors and therefore cannot be loaded without PyTorch
-        """
-        with tempfile.TemporaryDirectory() as tmp:
-            location = snapshot_download("hf-internal-testing/tiny-bert-pt-safetensors-msgpack", cache_dir=tmp)
-            TFBertModel.from_pretrained(location)
-
-
-@require_tf
-@is_staging_test
-class TFModelPushToHubTester(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls._token = TOKEN
-        HfFolder.save_token(TOKEN)
-
-    def test_push_to_hub(self):
-        with TemporaryHubRepo(token=self._token) as tmp_repo:
-            config = BertConfig(
-                vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
-            )
-            model = TFBertModel(config)
-            # Make sure model is properly initialized
-            model.build_in_name_scope()
-
-            logging.set_verbosity_info()
-            logger = logging.get_logger("transformers.utils.hub")
-            with CaptureLogger(logger) as cl:
-                model.push_to_hub(tmp_repo.repo_id, token=self._token)
-            logging.set_verbosity_warning()
-            # Check the model card was created and uploaded.
-            self.assertIn("Uploading the following files to __DUMMY_TRANSFORMERS_USER__/test-model-tf", cl.out)
-
-            new_model = TFBertModel.from_pretrained(tmp_repo.repo_id)
-            models_equal = True
-            for p1, p2 in zip(model.weights, new_model.weights):
-                if not tf.math.reduce_all(p1 == p2):
-                    models_equal = False
-                    break
-            self.assertTrue(models_equal)
-
-    def test_push_to_hub_via_save_pretrained(self):
-        with TemporaryHubRepo(token=self._token) as tmp_repo:
-            config = BertConfig(
-                vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
-            )
-            model = TFBertModel(config)
-            # Make sure model is properly initialized
-            model.build_in_name_scope()
-
-            # Push to hub via save_pretrained
-            with tempfile.TemporaryDirectory() as tmp_dir:
-                model.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, push_to_hub=True, token=self._token)
-
-            new_model = TFBertModel.from_pretrained(tmp_repo.repo_id)
-            models_equal = True
-            for p1, p2 in zip(model.weights, new_model.weights):
-                if not tf.math.reduce_all(p1 == p2):
-                    models_equal = False
-                    break
-            self.assertTrue(models_equal)
-
-    def test_push_to_hub_in_organization(self):
-        with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
-            config = BertConfig(
-                vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
-            )
-            model = TFBertModel(config)
-            # Make sure model is properly initialized
-            model.build_in_name_scope()
-
-            model.push_to_hub(tmp_repo.repo_id, token=self._token)
-
-            new_model = TFBertModel.from_pretrained(tmp_repo.repo_id)
-            models_equal = True
-            for p1, p2 in zip(model.weights, new_model.weights):
-                if not tf.math.reduce_all(p1 == p2):
-                    models_equal = False
-                    break
-            self.assertTrue(models_equal)
-
-    def test_push_to_hub_in_organization_via_save_pretrained(self):
-        with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
-            config = BertConfig(
-                vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
-            )
-            model = TFBertModel(config)
-            # Make sure model is properly initialized
-            model.build_in_name_scope()
-
-            # Push to hub via save_pretrained
-            with tempfile.TemporaryDirectory() as tmp_dir:
-                model.save_pretrained(tmp_dir, push_to_hub=True, token=self._token, repo_id=tmp_repo.repo_id)
-
-            new_model = TFBertModel.from_pretrained(tmp_repo.repo_id)
-            models_equal = True
-            for p1, p2 in zip(model.weights, new_model.weights):
-                if not tf.math.reduce_all(p1 == p2):
-                    models_equal = False
-                    break
-            self.assertTrue(models_equal)