Deprecate low use models (#30781)

* Deprecate models
  - graphormer
  - time_series_transformer
  - xlm_prophetnet
  - qdqbert
  - nat
  - ernie_m
  - tvlt
  - nezha
  - mega
  - jukebox
  - vit_hybrid
  - x_clip
  - deta
  - speech_to_text_2
  - efficientformer
  - realm
  - gptsan_japanese
* Fix up
* Fix speech2text2 imports
* Make sure message isn't indented
* Fix docstrings
* Correctly map for deprecated models from model_type
* Uncomment out
* Add back time series transformer and x-clip
* Import fix and fix-up
* Fix up with updated ruff
2024-05-28 18:07:07 +01:00
parent 7f08817be4
commit a564d10afe
142 changed files with 1308 additions and 11908 deletions
--- a/tests/models/deta/__init__.py
+++ b/tests/models/deta/__init__.py
--- a/tests/models/deta/test_image_processing_deta.py
+++ b/tests/models/deta/test_image_processing_deta.py
@@ -1,535 +0,0 @@
-# coding=utf-8
-# Copyright 2022 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import json
-import pathlib
-import unittest
-
-from transformers.testing_utils import require_torch, require_vision, slow
-from transformers.utils import is_torch_available, is_vision_available
-
-from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs
-
-
-if is_torch_available():
-    import torch
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import DetaImageProcessor
-
-
-class DetaImageProcessingTester(unittest.TestCase):
-    def __init__(
-        self,
-        parent,
-        batch_size=7,
-        num_channels=3,
-        min_resolution=30,
-        max_resolution=400,
-        do_resize=True,
-        size=None,
-        do_normalize=True,
-        image_mean=[0.5, 0.5, 0.5],
-        image_std=[0.5, 0.5, 0.5],
-        do_rescale=True,
-        rescale_factor=1 / 255,
-        do_pad=True,
-    ):
-        # by setting size["longest_edge"] > max_resolution we're effectively not testing this :p
-        size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333}
-        self.parent = parent
-        self.batch_size = batch_size
-        self.num_channels = num_channels
-        self.min_resolution = min_resolution
-        self.max_resolution = max_resolution
-        self.do_resize = do_resize
-        self.size = size
-        self.do_normalize = do_normalize
-        self.image_mean = image_mean
-        self.image_std = image_std
-        self.do_rescale = do_rescale
-        self.rescale_factor = rescale_factor
-        self.do_pad = do_pad
-
-    def prepare_image_processor_dict(self):
-        return {
-            "do_resize": self.do_resize,
-            "size": self.size,
-            "do_normalize": self.do_normalize,
-            "image_mean": self.image_mean,
-            "image_std": self.image_std,
-            "do_rescale": self.do_rescale,
-            "rescale_factor": self.rescale_factor,
-            "do_pad": self.do_pad,
-        }
-
-    def get_expected_values(self, image_inputs, batched=False):
-        """
-        This function computes the expected height and width when providing images to DetaImageProcessor,
-        assuming do_resize is set to True with a scalar size.
-        """
-        if not batched:
-            image = image_inputs[0]
-            if isinstance(image, Image.Image):
-                w, h = image.size
-            else:
-                h, w = image.shape[1], image.shape[2]
-            if w < h:
-                expected_height = int(self.size["shortest_edge"] * h / w)
-                expected_width = self.size["shortest_edge"]
-            elif w > h:
-                expected_height = self.size["shortest_edge"]
-                expected_width = int(self.size["shortest_edge"] * w / h)
-            else:
-                expected_height = self.size["shortest_edge"]
-                expected_width = self.size["shortest_edge"]
-
-        else:
-            expected_values = []
-            for image in image_inputs:
-                expected_height, expected_width = self.get_expected_values([image])
-                expected_values.append((expected_height, expected_width))
-            expected_height = max(expected_values, key=lambda item: item[0])[0]
-            expected_width = max(expected_values, key=lambda item: item[1])[1]
-
-        return expected_height, expected_width
-
-    def expected_output_image_shape(self, images):
-        height, width = self.get_expected_values(images, batched=True)
-        return self.num_channels, height, width
-
-    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
-        return prepare_image_inputs(
-            batch_size=self.batch_size,
-            num_channels=self.num_channels,
-            min_resolution=self.min_resolution,
-            max_resolution=self.max_resolution,
-            equal_resolution=equal_resolution,
-            numpify=numpify,
-            torchify=torchify,
-        )
-
-
-@require_torch
-@require_vision
-class DetaImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase):
-    image_processing_class = DetaImageProcessor if is_vision_available() else None
-
-    def setUp(self):
-        self.image_processor_tester = DetaImageProcessingTester(self)
-
-    @property
-    def image_processor_dict(self):
-        return self.image_processor_tester.prepare_image_processor_dict()
-
-    def test_image_processor_properties(self):
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        self.assertTrue(hasattr(image_processing, "image_mean"))
-        self.assertTrue(hasattr(image_processing, "image_std"))
-        self.assertTrue(hasattr(image_processing, "do_normalize"))
-        self.assertTrue(hasattr(image_processing, "do_resize"))
-        self.assertTrue(hasattr(image_processing, "do_rescale"))
-        self.assertTrue(hasattr(image_processing, "do_pad"))
-        self.assertTrue(hasattr(image_processing, "size"))
-
-    def test_image_processor_from_dict_with_kwargs(self):
-        image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
-        self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333})
-        self.assertEqual(image_processor.do_pad, True)
-
-    @slow
-    def test_call_pytorch_with_coco_detection_annotations(self):
-        # prepare image and target
-        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-        with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
-            target = json.loads(f.read())
-
-        target = {"image_id": 39769, "annotations": target}
-
-        # encode them
-        image_processing = DetaImageProcessor()
-        encoding = image_processing(images=image, annotations=target, return_tensors="pt")
-
-        # verify pixel values
-        expected_shape = torch.Size([1, 3, 800, 1066])
-        self.assertEqual(encoding["pixel_values"].shape, expected_shape)
-
-        expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
-        self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4))
-
-        # verify area
-        expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438])
-        self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area))
-        # verify boxes
-        expected_boxes_shape = torch.Size([6, 4])
-        self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
-        expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215])
-        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3))
-        # verify image_id
-        expected_image_id = torch.tensor([39769])
-        self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id))
-        # verify is_crowd
-        expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
-        self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd))
-        # verify class_labels
-        expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17])
-        self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels))
-        # verify orig_size
-        expected_orig_size = torch.tensor([480, 640])
-        self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size))
-        # verify size
-        expected_size = torch.tensor([800, 1066])
-        self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
-
-    @slow
-    def test_call_pytorch_with_coco_panoptic_annotations(self):
-        # prepare image, target and masks_path
-        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-        with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
-            target = json.loads(f.read())
-
-        target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
-
-        masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
-
-        # encode them
-        image_processing = DetaImageProcessor(format="coco_panoptic")
-        encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt")
-
-        # verify pixel values
-        expected_shape = torch.Size([1, 3, 800, 1066])
-        self.assertEqual(encoding["pixel_values"].shape, expected_shape)
-
-        expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
-        self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4))
-
-        # verify area
-        expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147])
-        self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area))
-        # verify boxes
-        expected_boxes_shape = torch.Size([6, 4])
-        self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
-        expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625])
-        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3))
-        # verify image_id
-        expected_image_id = torch.tensor([39769])
-        self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id))
-        # verify is_crowd
-        expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
-        self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd))
-        # verify class_labels
-        expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93])
-        self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels))
-        # verify masks
-        expected_masks_sum = 822873
-        self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum)
-        # verify orig_size
-        expected_orig_size = torch.tensor([480, 640])
-        self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size))
-        # verify size
-        expected_size = torch.tensor([800, 1066])
-        self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
-
-    @slow
-    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->Deta
-    def test_batched_coco_detection_annotations(self):
-        image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-        image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
-
-        with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
-            target = json.loads(f.read())
-
-        annotations_0 = {"image_id": 39769, "annotations": target}
-        annotations_1 = {"image_id": 39769, "annotations": target}
-
-        # Adjust the bounding boxes for the resized image
-        w_0, h_0 = image_0.size
-        w_1, h_1 = image_1.size
-        for i in range(len(annotations_1["annotations"])):
-            coords = annotations_1["annotations"][i]["bbox"]
-            new_bbox = [
-                coords[0] * w_1 / w_0,
-                coords[1] * h_1 / h_0,
-                coords[2] * w_1 / w_0,
-                coords[3] * h_1 / h_0,
-            ]
-            annotations_1["annotations"][i]["bbox"] = new_bbox
-
-        images = [image_0, image_1]
-        annotations = [annotations_0, annotations_1]
-
-        image_processing = DetaImageProcessor()
-        encoding = image_processing(
-            images=images,
-            annotations=annotations,
-            return_segmentation_masks=True,
-            return_tensors="pt",  # do_convert_annotations=True
-        )
-
-        # Check the pixel values have been padded
-        postprocessed_height, postprocessed_width = 800, 1066
-        expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
-        self.assertEqual(encoding["pixel_values"].shape, expected_shape)
-
-        # Check the bounding boxes have been adjusted for padded images
-        self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
-        self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
-        expected_boxes_0 = torch.tensor(
-            [
-                [0.6879, 0.4609, 0.0755, 0.3691],
-                [0.2118, 0.3359, 0.2601, 0.1566],
-                [0.5011, 0.5000, 0.9979, 1.0000],
-                [0.5010, 0.5020, 0.9979, 0.9959],
-                [0.3284, 0.5944, 0.5884, 0.8112],
-                [0.8394, 0.5445, 0.3213, 0.9110],
-            ]
-        )
-        expected_boxes_1 = torch.tensor(
-            [
-                [0.4130, 0.2765, 0.0453, 0.2215],
-                [0.1272, 0.2016, 0.1561, 0.0940],
-                [0.3757, 0.4933, 0.7488, 0.9865],
-                [0.3759, 0.5002, 0.7492, 0.9955],
-                [0.1971, 0.5456, 0.3532, 0.8646],
-                [0.5790, 0.4115, 0.3430, 0.7161],
-            ]
-        )
-        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
-        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
-
-        # Check the masks have also been padded
-        self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
-        self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
-
-        # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
-        # format and not in the range [0, 1]
-        encoding = image_processing(
-            images=images,
-            annotations=annotations,
-            return_segmentation_masks=True,
-            do_convert_annotations=False,
-            return_tensors="pt",
-        )
-        self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
-        self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
-        # Convert to absolute coordinates
-        unnormalized_boxes_0 = torch.vstack(
-            [
-                expected_boxes_0[:, 0] * postprocessed_width,
-                expected_boxes_0[:, 1] * postprocessed_height,
-                expected_boxes_0[:, 2] * postprocessed_width,
-                expected_boxes_0[:, 3] * postprocessed_height,
-            ]
-        ).T
-        unnormalized_boxes_1 = torch.vstack(
-            [
-                expected_boxes_1[:, 0] * postprocessed_width,
-                expected_boxes_1[:, 1] * postprocessed_height,
-                expected_boxes_1[:, 2] * postprocessed_width,
-                expected_boxes_1[:, 3] * postprocessed_height,
-            ]
-        ).T
-        # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
-        expected_boxes_0 = torch.vstack(
-            [
-                unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
-                unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
-                unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
-                unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
-            ]
-        ).T
-        expected_boxes_1 = torch.vstack(
-            [
-                unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
-                unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
-                unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
-                unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
-            ]
-        ).T
-        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
-        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
-
-    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->Deta
-    def test_batched_coco_panoptic_annotations(self):
-        # prepare image, target and masks_path
-        image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-        image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
-
-        with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
-            target = json.loads(f.read())
-
-        annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
-        annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
-
-        w_0, h_0 = image_0.size
-        w_1, h_1 = image_1.size
-        for i in range(len(annotation_1["segments_info"])):
-            coords = annotation_1["segments_info"][i]["bbox"]
-            new_bbox = [
-                coords[0] * w_1 / w_0,
-                coords[1] * h_1 / h_0,
-                coords[2] * w_1 / w_0,
-                coords[3] * h_1 / h_0,
-            ]
-            annotation_1["segments_info"][i]["bbox"] = new_bbox
-
-        masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
-
-        images = [image_0, image_1]
-        annotations = [annotation_0, annotation_1]
-
-        # encode them
-        image_processing = DetaImageProcessor(format="coco_panoptic")
-        encoding = image_processing(
-            images=images,
-            annotations=annotations,
-            masks_path=masks_path,
-            return_tensors="pt",
-            return_segmentation_masks=True,
-        )
-
-        # Check the pixel values have been padded
-        postprocessed_height, postprocessed_width = 800, 1066
-        expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
-        self.assertEqual(encoding["pixel_values"].shape, expected_shape)
-
-        # Check the bounding boxes have been adjusted for padded images
-        self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
-        self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
-        expected_boxes_0 = torch.tensor(
-            [
-                [0.2625, 0.5437, 0.4688, 0.8625],
-                [0.7719, 0.4104, 0.4531, 0.7125],
-                [0.5000, 0.4927, 0.9969, 0.9854],
-                [0.1688, 0.2000, 0.2063, 0.0917],
-                [0.5492, 0.2760, 0.0578, 0.2187],
-                [0.4992, 0.4990, 0.9984, 0.9979],
-            ]
-        )
-        expected_boxes_1 = torch.tensor(
-            [
-                [0.1576, 0.3262, 0.2814, 0.5175],
-                [0.4634, 0.2463, 0.2720, 0.4275],
-                [0.3002, 0.2956, 0.5985, 0.5913],
-                [0.1013, 0.1200, 0.1238, 0.0550],
-                [0.3297, 0.1656, 0.0347, 0.1312],
-                [0.2997, 0.2994, 0.5994, 0.5987],
-            ]
-        )
-        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
-        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
-
-        # Check the masks have also been padded
-        self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
-        self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
-
-        # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
-        # format and not in the range [0, 1]
-        encoding = image_processing(
-            images=images,
-            annotations=annotations,
-            masks_path=masks_path,
-            return_segmentation_masks=True,
-            do_convert_annotations=False,
-            return_tensors="pt",
-        )
-        self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
-        self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
-        # Convert to absolute coordinates
-        unnormalized_boxes_0 = torch.vstack(
-            [
-                expected_boxes_0[:, 0] * postprocessed_width,
-                expected_boxes_0[:, 1] * postprocessed_height,
-                expected_boxes_0[:, 2] * postprocessed_width,
-                expected_boxes_0[:, 3] * postprocessed_height,
-            ]
-        ).T
-        unnormalized_boxes_1 = torch.vstack(
-            [
-                expected_boxes_1[:, 0] * postprocessed_width,
-                expected_boxes_1[:, 1] * postprocessed_height,
-                expected_boxes_1[:, 2] * postprocessed_width,
-                expected_boxes_1[:, 3] * postprocessed_height,
-            ]
-        ).T
-        # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
-        expected_boxes_0 = torch.vstack(
-            [
-                unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
-                unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
-                unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
-                unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
-            ]
-        ).T
-        expected_boxes_1 = torch.vstack(
-            [
-                unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
-                unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
-                unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
-                unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
-            ]
-        ).T
-        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
-        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
-
-    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->Deta
-    def test_max_width_max_height_resizing_and_pad_strategy(self):
-        image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
-
-        # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
-        image_processor = DetaImageProcessor(
-            size={"max_height": 100, "max_width": 100},
-            do_pad=False,
-        )
-        inputs = image_processor(images=[image_1], return_tensors="pt")
-        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
-
-        # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
-        image_processor = DetaImageProcessor(
-            size={"max_height": 300, "max_width": 100},
-            do_pad=False,
-        )
-        inputs = image_processor(images=[image_1], return_tensors="pt")
-
-        # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
-        image_processor = DetaImageProcessor(
-            size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
-        )
-        inputs = image_processor(images=[image_1], return_tensors="pt")
-        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
-
-        # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100
-        image_processor = DetaImageProcessor(
-            size={"max_height": 300, "max_width": 100},
-            do_pad=True,
-            pad_size={"height": 301, "width": 101},
-        )
-        inputs = image_processor(images=[image_1], return_tensors="pt")
-        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
-
-        ### Check for batch
-        image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
-
-        # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
-        image_processor = DetaImageProcessor(
-            size={"max_height": 150, "max_width": 100},
-            do_pad=True,
-            pad_size={"height": 150, "width": 100},
-        )
-        inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
-        self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
--- a/tests/models/deta/test_modeling_deta.py
+++ b/tests/models/deta/test_modeling_deta.py
@@ -1,671 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch DETA model."""
-
-import collections
-import inspect
-import math
-import re
-import unittest
-
-from transformers import DetaConfig, ResNetConfig, is_torch_available, is_torchvision_available, is_vision_available
-from transformers.file_utils import cached_property
-from transformers.testing_utils import require_torchvision, require_vision, slow, torch_device
-
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
-    import torch
-
-    from transformers.pytorch_utils import id_tensor_storage
-
-if is_torchvision_available():
-    from transformers import DetaForObjectDetection, DetaModel
-
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import AutoImageProcessor
-
-
-class DetaModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=8,
-        is_training=True,
-        use_labels=True,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=8,
-        intermediate_size=4,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        num_queries=12,
-        two_stage_num_proposals=12,
-        num_channels=3,
-        image_size=224,
-        n_targets=8,
-        num_labels=91,
-        num_feature_levels=4,
-        encoder_n_points=2,
-        decoder_n_points=6,
-        two_stage=True,
-        assign_first_stage=True,
-        assign_second_stage=True,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.num_queries = num_queries
-        self.two_stage_num_proposals = two_stage_num_proposals
-        self.num_channels = num_channels
-        self.image_size = image_size
-        self.n_targets = n_targets
-        self.num_labels = num_labels
-        self.num_feature_levels = num_feature_levels
-        self.encoder_n_points = encoder_n_points
-        self.decoder_n_points = decoder_n_points
-        self.two_stage = two_stage
-        self.assign_first_stage = assign_first_stage
-        self.assign_second_stage = assign_second_stage
-
-        # we also set the expected seq length for both encoder and decoder
-        self.encoder_seq_length = (
-            math.ceil(self.image_size / 8) ** 2
-            + math.ceil(self.image_size / 16) ** 2
-            + math.ceil(self.image_size / 32) ** 2
-            + math.ceil(self.image_size / 64) ** 2
-        )
-        self.decoder_seq_length = self.num_queries
-
-    def prepare_config_and_inputs(self, model_class_name):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        pixel_mask = torch.ones([self.batch_size, self.image_size, self.image_size], device=torch_device)
-
-        labels = None
-        if self.use_labels:
-            # labels is a list of Dict (each Dict being the labels for a given example in the batch)
-            labels = []
-            for i in range(self.batch_size):
-                target = {}
-                target["class_labels"] = torch.randint(
-                    high=self.num_labels, size=(self.n_targets,), device=torch_device
-                )
-                target["boxes"] = torch.rand(self.n_targets, 4, device=torch_device)
-                target["masks"] = torch.rand(self.n_targets, self.image_size, self.image_size, device=torch_device)
-                labels.append(target)
-
-        config = self.get_config(model_class_name)
-        return config, pixel_values, pixel_mask, labels
-
-    def get_config(self, model_class_name):
-        resnet_config = ResNetConfig(
-            num_channels=3,
-            embeddings_size=10,
-            hidden_sizes=[10, 20, 30, 40],
-            depths=[1, 1, 2, 1],
-            hidden_act="relu",
-            num_labels=3,
-            out_features=["stage2", "stage3", "stage4"],
-            out_indices=[2, 3, 4],
-        )
-        two_stage = model_class_name == "DetaForObjectDetection"
-        assign_first_stage = model_class_name == "DetaForObjectDetection"
-        assign_second_stage = model_class_name == "DetaForObjectDetection"
-        return DetaConfig(
-            d_model=self.hidden_size,
-            encoder_layers=self.num_hidden_layers,
-            decoder_layers=self.num_hidden_layers,
-            encoder_attention_heads=self.num_attention_heads,
-            decoder_attention_heads=self.num_attention_heads,
-            encoder_ffn_dim=self.intermediate_size,
-            decoder_ffn_dim=self.intermediate_size,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            num_queries=self.num_queries,
-            two_stage_num_proposals=self.two_stage_num_proposals,
-            num_labels=self.num_labels,
-            num_feature_levels=self.num_feature_levels,
-            encoder_n_points=self.encoder_n_points,
-            decoder_n_points=self.decoder_n_points,
-            two_stage=two_stage,
-            assign_first_stage=assign_first_stage,
-            assign_second_stage=assign_second_stage,
-            backbone_config=resnet_config,
-            backbone=None,
-        )
-
-    def prepare_config_and_inputs_for_common(self, model_class_name="DetaModel"):
-        config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs(model_class_name)
-        inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask}
-        return config, inputs_dict
-
-    def create_and_check_deta_model(self, config, pixel_values, pixel_mask, labels):
-        model = DetaModel(config=config)
-        model.to(torch_device)
-        model.eval()
-
-        result = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
-        result = model(pixel_values)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_queries, self.hidden_size))
-
-    def create_and_check_deta_freeze_backbone(self, config, pixel_values, pixel_mask, labels):
-        model = DetaModel(config=config)
-        model.to(torch_device)
-        model.eval()
-
-        model.freeze_backbone()
-
-        for _, param in model.backbone.model.named_parameters():
-            self.parent.assertEqual(False, param.requires_grad)
-
-    def create_and_check_deta_unfreeze_backbone(self, config, pixel_values, pixel_mask, labels):
-        model = DetaModel(config=config)
-        model.to(torch_device)
-        model.eval()
-
-        model.unfreeze_backbone()
-
-        for _, param in model.backbone.model.named_parameters():
-            self.parent.assertEqual(True, param.requires_grad)
-
-    def create_and_check_deta_object_detection_head_model(self, config, pixel_values, pixel_mask, labels):
-        model = DetaForObjectDetection(config=config)
-        model.to(torch_device)
-        model.eval()
-
-        result = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
-        result = model(pixel_values)
-
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.two_stage_num_proposals, self.num_labels))
-        self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.two_stage_num_proposals, 4))
-
-        result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels)
-
-        self.parent.assertEqual(result.loss.shape, ())
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.two_stage_num_proposals, self.num_labels))
-        self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.two_stage_num_proposals, 4))
-
-
-@require_torchvision
-class DetaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (DetaModel, DetaForObjectDetection) if is_torchvision_available() else ()
-    pipeline_model_mapping = (
-        {"image-feature-extraction": DetaModel, "object-detection": DetaForObjectDetection}
-        if is_torchvision_available()
-        else {}
-    )
-    is_encoder_decoder = True
-    test_torchscript = False
-    test_pruning = False
-    test_head_masking = False
-    test_missing_keys = False
-
-    # TODO: Fix the failed tests when this model gets more usage
-    def is_pipeline_test_to_skip(
-        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
-    ):
-        if pipeline_test_casse_name == "ObjectDetectionPipelineTests":
-            return True
-
-        return False
-
-    @unittest.skip("Skip for now. PR #22437 causes some loading issue. See (not merged) #22656 for some discussions.")
-    def test_can_use_safetensors(self):
-        super().test_can_use_safetensors()
-
-    # special case for head models
-    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
-        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
-
-        if return_labels:
-            if model_class.__name__ == "DetaForObjectDetection":
-                labels = []
-                for i in range(self.model_tester.batch_size):
-                    target = {}
-                    target["class_labels"] = torch.ones(
-                        size=(self.model_tester.n_targets,), device=torch_device, dtype=torch.long
-                    )
-                    target["boxes"] = torch.ones(
-                        self.model_tester.n_targets, 4, device=torch_device, dtype=torch.float
-                    )
-                    target["masks"] = torch.ones(
-                        self.model_tester.n_targets,
-                        self.model_tester.image_size,
-                        self.model_tester.image_size,
-                        device=torch_device,
-                        dtype=torch.float,
-                    )
-                    labels.append(target)
-                inputs_dict["labels"] = labels
-
-        return inputs_dict
-
-    def setUp(self):
-        self.model_tester = DetaModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=DetaConfig, has_text_modality=False)
-
-    def test_config(self):
-        # we don't test common_properties and arguments_init as these don't apply for DETA
-        self.config_tester.create_and_test_config_to_json_string()
-        self.config_tester.create_and_test_config_to_json_file()
-        self.config_tester.create_and_test_config_from_and_save_pretrained()
-        self.config_tester.create_and_test_config_with_num_labels()
-        self.config_tester.check_config_can_be_init_without_params()
-
-    def test_deta_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs(model_class_name="DetaModel")
-        self.model_tester.create_and_check_deta_model(*config_and_inputs)
-
-    def test_deta_freeze_backbone(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs(model_class_name="DetaModel")
-        self.model_tester.create_and_check_deta_freeze_backbone(*config_and_inputs)
-
-    def test_deta_unfreeze_backbone(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs(model_class_name="DetaModel")
-        self.model_tester.create_and_check_deta_unfreeze_backbone(*config_and_inputs)
-
-    def test_deta_object_detection_head_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs(model_class_name="DetaForObjectDetection")
-        self.model_tester.create_and_check_deta_object_detection_head_model(*config_and_inputs)
-
-    @unittest.skip(reason="DETA does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="DETA does not use inputs_embeds")
-    def test_inputs_embeds_matches_input_ids(self):
-        pass
-
-    @unittest.skip(reason="DETA does not have a get_input_embeddings method")
-    def test_model_common_attributes(self):
-        pass
-
-    @unittest.skip(reason="DETA is not a generative model")
-    def test_generate_without_input_ids(self):
-        pass
-
-    @unittest.skip(reason="DETA does not use token embeddings")
-    def test_resize_tokens_embeddings(self):
-        pass
-
-    @unittest.skip(reason="Feed forward chunking is not implemented")
-    def test_feed_forward_chunking(self):
-        pass
-
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = False
-            config.return_dict = True
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.encoder_attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-            # check that output_attentions also work using config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.encoder_attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-            self.assertListEqual(
-                list(attentions[0].shape[-3:]),
-                [
-                    self.model_tester.num_attention_heads,
-                    self.model_tester.num_feature_levels,
-                    self.model_tester.encoder_n_points,
-                ],
-            )
-            out_len = len(outputs)
-
-            correct_outlen = 8
-
-            # loss is at first position
-            if "labels" in inputs_dict:
-                correct_outlen += 1  # loss is added to beginning
-            # Object Detection model returns pred_logits and pred_boxes
-            if model_class.__name__ == "DetaForObjectDetection":
-                correct_outlen += 2
-
-            self.assertEqual(out_len, correct_outlen)
-
-            # decoder attentions
-            decoder_attentions = outputs.decoder_attentions
-            self.assertIsInstance(decoder_attentions, (list, tuple))
-            self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(decoder_attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, self.model_tester.num_queries, self.model_tester.num_queries],
-            )
-
-            # cross attentions
-            cross_attentions = outputs.cross_attentions
-            self.assertIsInstance(cross_attentions, (list, tuple))
-            self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(cross_attentions[0].shape[-3:]),
-                [
-                    self.model_tester.num_attention_heads,
-                    self.model_tester.num_feature_levels,
-                    self.model_tester.decoder_n_points,
-                ],
-            )
-
-            # Check attention is always last and order is fine
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = True
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            if hasattr(self.model_tester, "num_hidden_states_types"):
-                added_hidden_states = self.model_tester.num_hidden_states_types
-            elif self.is_encoder_decoder:
-                added_hidden_states = 2
-            else:
-                added_hidden_states = 1
-            self.assertEqual(out_len + added_hidden_states, len(outputs))
-
-            self_attentions = outputs.encoder_attentions
-
-            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(self_attentions[0].shape[-3:]),
-                [
-                    self.model_tester.num_attention_heads,
-                    self.model_tester.num_feature_levels,
-                    self.model_tester.encoder_n_points,
-                ],
-            )
-
-    # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad
-    def test_retain_grad_hidden_states_attentions(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.output_hidden_states = True
-        config.output_attentions = True
-
-        # no need to test all models as different heads yield the same functionality
-        model_class = self.all_model_classes[0]
-        model = model_class(config)
-        model.to(torch_device)
-
-        inputs = self._prepare_for_class(inputs_dict, model_class)
-
-        outputs = model(**inputs)
-
-        # we take the second output since last_hidden_state is the second item
-        output = outputs[1]
-
-        encoder_hidden_states = outputs.encoder_hidden_states[0]
-        encoder_attentions = outputs.encoder_attentions[0]
-        encoder_hidden_states.retain_grad()
-        encoder_attentions.retain_grad()
-
-        decoder_attentions = outputs.decoder_attentions[0]
-        decoder_attentions.retain_grad()
-
-        cross_attentions = outputs.cross_attentions[0]
-        cross_attentions.retain_grad()
-
-        output.flatten()[0].backward(retain_graph=True)
-
-        self.assertIsNotNone(encoder_hidden_states.grad)
-        self.assertIsNotNone(encoder_attentions.grad)
-        self.assertIsNotNone(decoder_attentions.grad)
-        self.assertIsNotNone(cross_attentions.grad)
-
-    def test_forward_auxiliary_loss(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.auxiliary_loss = True
-
-        # only test for object detection and segmentation model
-        for model_class in self.all_model_classes[1:]:
-            model = model_class(config)
-            model.to(torch_device)
-
-            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-
-            outputs = model(**inputs)
-
-            self.assertIsNotNone(outputs.auxiliary_outputs)
-            self.assertEqual(len(outputs.auxiliary_outputs), self.model_tester.num_hidden_layers - 1)
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            if model.config.is_encoder_decoder:
-                expected_arg_names = ["pixel_values", "pixel_mask"]
-                expected_arg_names.extend(
-                    ["head_mask", "decoder_head_mask", "encoder_outputs"]
-                    if "head_mask" and "decoder_head_mask" in arg_names
-                    else []
-                )
-                self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
-            else:
-                expected_arg_names = ["pixel_values", "pixel_mask"]
-                self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    @unittest.skip(reason="Model doesn't use tied weights")
-    def test_tied_model_weights_key_ignore(self):
-        pass
-
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            # Skip the check for the backbone
-            for name, module in model.named_modules():
-                if module.__class__.__name__ == "DetaBackboneWithPositionalEncodings":
-                    backbone_params = [f"{name}.{key}" for key in module.state_dict().keys()]
-                    break
-
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if (
-                        "level_embed" in name
-                        or "sampling_offsets.bias" in name
-                        or "value_proj" in name
-                        or "output_proj" in name
-                        or "reference_points" in name
-                        or name in backbone_params
-                    ):
-                        continue
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
-    @unittest.skip("No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage(self):
-        pass
-
-    @unittest.skip("No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage_checkpoints(self):
-        pass
-
-    @unittest.skip("No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage_no_safetensors(self):
-        pass
-
-    # Inspired by tests.test_modeling_common.ModelTesterMixin.test_tied_weights_keys
-    def test_tied_weights_keys(self):
-        for model_class in self.all_model_classes:
-            # We need to pass model class name to correctly initialize the config.
-            # If we don't pass it, the config for `DetaForObjectDetection`` will be initialized
-            # with `two_stage=False` and the test will fail because for that case `class_embed`
-            # weights are not tied.
-            config, _ = self.model_tester.prepare_config_and_inputs_for_common(model_class_name=model_class.__name__)
-            config.tie_word_embeddings = True
-
-            model_tied = model_class(config)
-
-            ptrs = collections.defaultdict(list)
-            for name, tensor in model_tied.state_dict().items():
-                ptrs[id_tensor_storage(tensor)].append(name)
-
-            # These are all the pointers of shared tensors.
-            tied_params = [names for _, names in ptrs.items() if len(names) > 1]
-
-            tied_weight_keys = model_tied._tied_weights_keys if model_tied._tied_weights_keys is not None else []
-            # Detect we get a hit for each key
-            for key in tied_weight_keys:
-                is_tied_key = any(re.search(key, p) for group in tied_params for p in group)
-                self.assertTrue(is_tied_key, f"{key} is not a tied weight key for {model_class}.")
-
-            # Removed tied weights found from tied params -> there should only be one left after
-            for key in tied_weight_keys:
-                for i in range(len(tied_params)):
-                    tied_params[i] = [p for p in tied_params[i] if re.search(key, p) is None]
-
-            tied_params = [group for group in tied_params if len(group) > 1]
-            self.assertListEqual(
-                tied_params,
-                [],
-                f"Missing `_tied_weights_keys` for {model_class}: add all of {tied_params} except one.",
-            )
-
-
-TOLERANCE = 1e-4
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_torchvision
-@require_vision
-@slow
-class DetaModelIntegrationTests(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return AutoImageProcessor.from_pretrained("jozhang97/deta-resnet-50") if is_vision_available() else None
-
-    def test_inference_object_detection_head(self):
-        model = DetaForObjectDetection.from_pretrained("jozhang97/deta-resnet-50").to(torch_device)
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
-
-        with torch.no_grad():
-            outputs = model(**inputs)
-
-        expected_shape_logits = torch.Size((1, 300, model.config.num_labels))
-        self.assertEqual(outputs.logits.shape, expected_shape_logits)
-
-        expected_logits = torch.tensor(
-            [[-7.3978, -2.5406, -4.1668], [-8.2684, -3.9933, -3.8096], [-7.0515, -3.7973, -5.8516]]
-        ).to(torch_device)
-        expected_boxes = torch.tensor(
-            [[0.5043, 0.4973, 0.9998], [0.2542, 0.5489, 0.4748], [0.5490, 0.2765, 0.0570]]
-        ).to(torch_device)
-
-        self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4))
-
-        expected_shape_boxes = torch.Size((1, 300, 4))
-        self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes)
-        self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4))
-
-        # verify postprocessing
-        results = image_processor.post_process_object_detection(
-            outputs, threshold=0.3, target_sizes=[image.size[::-1]]
-        )[0]
-        expected_scores = torch.tensor([0.6392, 0.6276, 0.5546, 0.5260, 0.4706], device=torch_device)
-        expected_labels = [75, 17, 17, 75, 63]
-        expected_slice_boxes = torch.tensor([40.5866, 73.2107, 176.1421, 117.1751], device=torch_device)
-
-        self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-4))
-        self.assertSequenceEqual(results["labels"].tolist(), expected_labels)
-        self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes))
-
-    def test_inference_object_detection_head_swin_backbone(self):
-        model = DetaForObjectDetection.from_pretrained("jozhang97/deta-swin-large").to(torch_device)
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
-
-        with torch.no_grad():
-            outputs = model(**inputs)
-
-        expected_shape_logits = torch.Size((1, 300, model.config.num_labels))
-        self.assertEqual(outputs.logits.shape, expected_shape_logits)
-
-        expected_logits = torch.tensor(
-            [[-7.6308, -2.8485, -5.3737], [-7.2037, -4.5505, -4.8027], [-7.2943, -4.2611, -4.6617]]
-        ).to(torch_device)
-        expected_boxes = torch.tensor(
-            [[0.4987, 0.4969, 0.9999], [0.2549, 0.5498, 0.4805], [0.5498, 0.2757, 0.0569]]
-        ).to(torch_device)
-
-        self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4))
-
-        expected_shape_boxes = torch.Size((1, 300, 4))
-        self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes)
-        self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4))
-
-        # verify postprocessing
-        results = image_processor.post_process_object_detection(
-            outputs, threshold=0.3, target_sizes=[image.size[::-1]]
-        )[0]
-        expected_scores = torch.tensor([0.6831, 0.6826, 0.5684, 0.5464, 0.4392], device=torch_device)
-        expected_labels = [17, 17, 75, 75, 63]
-        expected_slice_boxes = torch.tensor([345.8478, 23.6754, 639.8562, 372.8265], device=torch_device)
-
-        self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-4))
-        self.assertSequenceEqual(results["labels"].tolist(), expected_labels)
-        self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes))
--- a/tests/models/efficientformer/init.py
+++ b/tests/models/efficientformer/init.py
--- a/tests/models/efficientformer/test_image_processing_efficientformer.py
+++ b/tests/models/efficientformer/test_image_processing_efficientformer.py
@@ -1,99 +0,0 @@
-# coding=utf-8
-# Copyright 2021 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers.testing_utils import require_torch, require_vision
-from transformers.utils import is_vision_available
-
-from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
-
-
-if is_vision_available():
-    from transformers import ViTImageProcessor
-
-
-class EfficientFormerImageProcessorTester(unittest.TestCase):
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        num_channels=3,
-        image_size=224,
-        min_resolution=30,
-        max_resolution=400,
-        do_resize=True,
-        size=None,
-        do_normalize=True,
-        image_mean=[0.5, 0.5, 0.5],
-        image_std=[0.5, 0.5, 0.5],
-    ):
-        size = size if size is not None else {"height": 18, "width": 18}
-        self.parent = parent
-        self.batch_size = batch_size
-        self.num_channels = num_channels
-        self.image_size = image_size
-        self.min_resolution = min_resolution
-        self.max_resolution = max_resolution
-        self.do_resize = do_resize
-        self.size = size
-        self.do_normalize = do_normalize
-        self.image_mean = image_mean
-        self.image_std = image_std
-
-    def prepare_image_processor_dict(self):
-        return {
-            "image_mean": self.image_mean,
-            "image_std": self.image_std,
-            "do_normalize": self.do_normalize,
-            "do_resize": self.do_resize,
-            "size": self.size,
-        }
-
-    def expected_output_image_shape(self, images):
-        return self.num_channels, self.size["height"], self.size["width"]
-
-    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
-        return prepare_image_inputs(
-            batch_size=self.batch_size,
-            num_channels=self.num_channels,
-            min_resolution=self.min_resolution,
-            max_resolution=self.max_resolution,
-            equal_resolution=equal_resolution,
-            numpify=numpify,
-            torchify=torchify,
-        )
-
-
-@require_torch
-@require_vision
-class EfficientFormerImageProcessorTest(ImageProcessingTestMixin, unittest.TestCase):
-    image_processing_class = ViTImageProcessor if is_vision_available() else None
-
-    def setUp(self):
-        self.image_processor_tester = EfficientFormerImageProcessorTester(self)
-
-    @property
-    def image_processor_dict(self):
-        return self.image_processor_tester.prepare_image_processor_dict()
-
-    def test_image_proc_properties(self):
-        image_processor = self.image_processing_class(**self.image_processor_dict)
-        self.assertTrue(hasattr(image_processor, "image_mean"))
-        self.assertTrue(hasattr(image_processor, "image_std"))
-        self.assertTrue(hasattr(image_processor, "do_normalize"))
-        self.assertTrue(hasattr(image_processor, "do_resize"))
-        self.assertTrue(hasattr(image_processor, "size"))
--- a/tests/models/efficientformer/test_modeling_efficientformer.py
+++ b/tests/models/efficientformer/test_modeling_efficientformer.py
@@ -1,478 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch EfficientFormer model."""
-
-import unittest
-import warnings
-from typing import List
-
-from transformers import EfficientFormerConfig
-from transformers.testing_utils import require_torch, require_vision, slow, torch_device
-from transformers.utils import cached_property, is_torch_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
-    import torch
-
-    from transformers import (
-        EfficientFormerForImageClassification,
-        EfficientFormerForImageClassificationWithTeacher,
-        EfficientFormerModel,
-    )
-    from transformers.models.auto.modeling_auto import (
-        MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES,
-        MODEL_MAPPING_NAMES,
-    )
-
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import EfficientFormerImageProcessor
-
-
-class EfficientFormerModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size: int = 13,
-        image_size: int = 64,
-        patch_size: int = 2,
-        embed_dim: int = 3,
-        num_channels: int = 3,
-        is_training: bool = True,
-        use_labels: bool = True,
-        hidden_size: int = 128,
-        hidden_sizes=[16, 32, 64, 128],
-        num_hidden_layers: int = 7,
-        num_attention_heads: int = 4,
-        intermediate_size: int = 37,
-        hidden_act: str = "gelu",
-        hidden_dropout_prob: float = 0.1,
-        attention_probs_dropout_prob: float = 0.1,
-        type_sequence_label_size: int = 10,
-        initializer_range: float = 0.02,
-        encoder_stride: int = 2,
-        num_attention_outputs: int = 1,
-        dim: int = 128,
-        depths: List[int] = [2, 2, 2, 2],
-        resolution: int = 2,
-        mlp_expansion_ratio: int = 2,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.encoder_stride = encoder_stride
-        self.num_attention_outputs = num_attention_outputs
-        self.embed_dim = embed_dim
-        self.seq_length = embed_dim + 1
-        self.resolution = resolution
-        self.depths = depths
-        self.hidden_sizes = hidden_sizes
-        self.dim = dim
-        self.mlp_expansion_ratio = mlp_expansion_ratio
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        labels = None
-        if self.use_labels:
-            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-
-        config = self.get_config()
-        return config, pixel_values, labels
-
-    def get_config(self):
-        return EfficientFormerConfig(
-            image_size=self.image_size,
-            patch_size=self.patch_size,
-            num_channels=self.num_channels,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-            encoder_stride=self.encoder_stride,
-            resolution=self.resolution,
-            depths=self.depths,
-            hidden_sizes=self.hidden_sizes,
-            dim=self.dim,
-            mlp_expansion_ratio=self.mlp_expansion_ratio,
-        )
-
-    def create_and_check_model(self, config, pixel_values, labels):
-        model = EfficientFormerModel(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(pixel_values)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_for_image_classification(self, config, pixel_values, labels):
-        config.num_labels = self.type_sequence_label_size
-        model = EfficientFormerForImageClassification(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(pixel_values, labels=labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-        # test greyscale images
-        config.num_channels = 1
-        model = EfficientFormerForImageClassification(config)
-        model.to(torch_device)
-        model.eval()
-
-        pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
-        result = model(pixel_values)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            pixel_values,
-            labels,
-        ) = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_torch
-class EfficientFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    """
-    Here we also overwrite some of the tests of test_modeling_common.py, as EfficientFormer does not use input_ids, inputs_embeds,
-    attention_mask and seq_length.
-    """
-
-    all_model_classes = (
-        (
-            EfficientFormerModel,
-            EfficientFormerForImageClassificationWithTeacher,
-            EfficientFormerForImageClassification,
-        )
-        if is_torch_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "image-feature-extraction": EfficientFormerModel,
-            "image-classification": (
-                EfficientFormerForImageClassification,
-                EfficientFormerForImageClassificationWithTeacher,
-            ),
-        }
-        if is_torch_available()
-        else {}
-    )
-    fx_compatible = False
-
-    test_pruning = False
-    test_resize_embeddings = False
-    test_head_masking = False
-
-    def setUp(self):
-        self.model_tester = EfficientFormerModelTester(self)
-        self.config_tester = ConfigTester(
-            self, config_class=EfficientFormerConfig, has_text_modality=False, hidden_size=37
-        )
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    @unittest.skip(reason="EfficientFormer does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="EfficientFormer does not support input and output embeddings")
-    def test_model_common_attributes(self):
-        pass
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-
-            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
-
-            expected_num_layers = getattr(
-                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-            )
-            self.assertEqual(len(hidden_states), expected_num_layers)
-
-            if hasattr(self.model_tester, "encoder_seq_length"):
-                seq_length = self.model_tester.encoder_seq_length
-                if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1:
-                    seq_length = seq_length * self.model_tester.chunk_length
-            else:
-                seq_length = self.model_tester.seq_length
-
-            self.assertListEqual(
-                list(hidden_states[-1].shape[-2:]),
-                [seq_length, self.model_tester.hidden_size],
-            )
-
-            if config.is_encoder_decoder:
-                hidden_states = outputs.decoder_hidden_states
-
-                self.assertIsInstance(hidden_states, (list, tuple))
-                self.assertEqual(len(hidden_states), expected_num_layers)
-                seq_len = getattr(self.model_tester, "seq_length", None)
-                decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
-
-                self.assertListEqual(
-                    list(hidden_states[-1].shape[-2:]),
-                    [decoder_seq_length, self.model_tester.hidden_size],
-                )
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
-        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
-
-        if return_labels:
-            if model_class.__name__ == "EfficientFormerForImageClassificationWithTeacher":
-                del inputs_dict["labels"]
-
-        return inputs_dict
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    @unittest.skip(reason="EfficientFormer does not implement masked image modeling yet")
-    def test_for_masked_image_modeling(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs)
-
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
-    # special case for EfficientFormerForImageClassificationWithTeacher model
-    def test_training(self):
-        if not self.model_tester.is_training:
-            return
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        for model_class in self.all_model_classes:
-            # EfficientFormerForImageClassificationWithTeacher supports inference-only
-            if (
-                model_class.__name__ in MODEL_MAPPING_NAMES.values()
-                or model_class.__name__ == "EfficientFormerForImageClassificationWithTeacher"
-            ):
-                continue
-            model = model_class(config)
-            model.to(torch_device)
-            model.train()
-            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            loss = model(**inputs).loss
-            loss.backward()
-
-    def test_problem_types(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        problem_types = [
-            {"title": "multi_label_classification", "num_labels": 2, "dtype": torch.float},
-            {"title": "single_label_classification", "num_labels": 1, "dtype": torch.long},
-            {"title": "regression", "num_labels": 1, "dtype": torch.float},
-        ]
-
-        for model_class in self.all_model_classes:
-            if (
-                model_class.__name__
-                not in [
-                    *MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES.values(),
-                ]
-                or model_class.__name__ == "EfficientFormerForImageClassificationWithTeacher"
-            ):
-                continue
-
-            for problem_type in problem_types:
-                with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"):
-                    config.problem_type = problem_type["title"]
-                    config.num_labels = problem_type["num_labels"]
-
-                    model = model_class(config)
-                    model.to(torch_device)
-                    model.train()
-
-                    inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-
-                    if problem_type["num_labels"] > 1:
-                        inputs["labels"] = inputs["labels"].unsqueeze(1).repeat(1, problem_type["num_labels"])
-
-                    inputs["labels"] = inputs["labels"].to(problem_type["dtype"])
-
-                    # This tests that we do not trigger the warning form PyTorch "Using a target size that is different
-                    # to the input size. This will likely lead to incorrect results due to broadcasting. Please ensure
-                    # they have the same size." which is a symptom something in wrong for the regression problem.
-                    # See https://github.com/huggingface/transformers/issues/11780
-                    with warnings.catch_warnings(record=True) as warning_list:
-                        loss = model(**inputs).loss
-                    for w in warning_list:
-                        if "Using a target size that is different to the input size" in str(w.message):
-                            raise ValueError(
-                                f"Something is going wrong in the regression problem: intercepted {w.message}"
-                            )
-
-                    loss.backward()
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "snap-research/efficientformer-l1-300"
-        model = EfficientFormerModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        seq_len = getattr(self.model_tester, "seq_length", None)
-        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
-        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
-        chunk_length = getattr(self.model_tester, "chunk_length", None)
-
-        if chunk_length is not None and hasattr(self.model_tester, "num_hashes"):
-            encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = False
-            config.return_dict = True
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_attention_outputs)
-
-            # check that output_attentions also work using config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_attention_outputs)
-
-            if chunk_length is not None:
-                self.assertListEqual(
-                    list(attentions[0].shape[-4:]),
-                    [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length],
-                )
-            else:
-                self.assertListEqual(
-                    list(attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
-                )
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_torch
-@require_vision
-class EfficientFormerModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return (
-            EfficientFormerImageProcessor.from_pretrained("snap-research/efficientformer-l1-300")
-            if is_vision_available()
-            else None
-        )
-
-    @slow
-    def test_inference_image_classification_head(self):
-        model = EfficientFormerForImageClassification.from_pretrained("snap-research/efficientformer-l1-300").to(
-            torch_device
-        )
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
-
-        # forward pass
-        with torch.no_grad():
-            outputs = model(**inputs)
-
-        # verify the logits
-        expected_shape = (1, 1000)
-        self.assertEqual(outputs.logits.shape, expected_shape)
-
-        expected_slice = torch.tensor([-0.0555, 0.4825, -0.0852]).to(torch_device)
-        self.assertTrue(torch.allclose(outputs.logits[0][:3], expected_slice, atol=1e-4))
-
-    @slow
-    def test_inference_image_classification_head_with_teacher(self):
-        model = EfficientFormerForImageClassificationWithTeacher.from_pretrained(
-            "snap-research/efficientformer-l1-300"
-        ).to(torch_device)
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
-
-        # forward pass
-        with torch.no_grad():
-            outputs = model(**inputs)
-
-        # verify the logits
-        expected_shape = (1, 1000)
-        self.assertEqual(outputs.logits.shape, expected_shape)
-
-        expected_slice = torch.tensor([-0.1312, 0.4353, -1.0499]).to(torch_device)
-        self.assertTrue(torch.allclose(outputs.logits[0][:3], expected_slice, atol=1e-4))
--- a/tests/models/efficientformer/test_modeling_tf_efficientformer.py
+++ b/tests/models/efficientformer/test_modeling_tf_efficientformer.py
@@ -1,409 +0,0 @@
-# coding=utf-8
-# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TensorFlow EfficientFormer model."""
-
-import inspect
-import unittest
-from typing import List
-
-import numpy as np
-
-from transformers import EfficientFormerConfig
-from transformers.testing_utils import require_tf, require_vision, slow
-from transformers.utils import cached_property, is_tf_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import (
-        TFEfficientFormerForImageClassification,
-        TFEfficientFormerForImageClassificationWithTeacher,
-        TFEfficientFormerModel,
-    )
-    from transformers.modeling_tf_utils import keras
-
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import EfficientFormerImageProcessor
-
-
-class TFEfficientFormerModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size: int = 13,
-        image_size: int = 64,
-        patch_size: int = 2,
-        embed_dim: int = 3,
-        num_channels: int = 3,
-        is_training: bool = True,
-        use_labels: bool = True,
-        hidden_size: int = 128,
-        hidden_sizes=[16, 32, 64, 128],
-        num_hidden_layers: int = 7,
-        num_attention_heads: int = 4,
-        intermediate_size: int = 37,
-        hidden_act: str = "gelu",
-        hidden_dropout_prob: float = 0.1,
-        attention_probs_dropout_prob: float = 0.1,
-        type_sequence_label_size: int = 10,
-        initializer_range: float = 0.02,
-        encoder_stride: int = 2,
-        num_attention_outputs: int = 1,
-        dim: int = 128,
-        depths: List[int] = [2, 2, 2, 2],
-        resolution: int = 2,
-        mlp_expansion_ratio: int = 2,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.encoder_stride = encoder_stride
-        self.num_attention_outputs = num_attention_outputs
-        self.embed_dim = embed_dim
-        self.seq_length = embed_dim + 1
-        self.resolution = resolution
-        self.depths = depths
-        self.hidden_sizes = hidden_sizes
-        self.dim = dim
-        self.mlp_expansion_ratio = mlp_expansion_ratio
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        labels = None
-        if self.use_labels:
-            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-
-        config = self.get_config()
-
-        return config, pixel_values, labels
-
-    def get_config(self):
-        return EfficientFormerConfig(
-            image_size=self.image_size,
-            patch_size=self.patch_size,
-            num_channels=self.num_channels,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-            encoder_stride=self.encoder_stride,
-            resolution=self.resolution,
-            depths=self.depths,
-            hidden_sizes=self.hidden_sizes,
-            dim=self.dim,
-            mlp_expansion_ratio=self.mlp_expansion_ratio,
-        )
-
-    def create_and_check_model(self, config, pixel_values, labels):
-        model = TFEfficientFormerModel(config=config)
-        result = model(pixel_values, training=False)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_for_image_classification(self, config, pixel_values, labels):
-        config.num_labels = self.type_sequence_label_size
-        model = TFEfficientFormerForImageClassification(config)
-        result = model(pixel_values, labels=labels, training=False)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-        # test greyscale images
-        config.num_channels = 1
-        model = TFEfficientFormerForImageClassification(config)
-
-        pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
-        result = model(pixel_values, labels=labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values, labels = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_tf
-class TFEfficientFormerModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    """
-    Here we also overwrite some of the tests of test_modeling_tf_common.py, as EfficientFormer does not use input_ids,
-    inputs_embeds, attention_mask and seq_length.
-    """
-
-    all_model_classes = (
-        (
-            TFEfficientFormerModel,
-            TFEfficientFormerForImageClassificationWithTeacher,
-            TFEfficientFormerForImageClassification,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFEfficientFormerModel,
-            "image-classification": (
-                TFEfficientFormerForImageClassification,
-                TFEfficientFormerForImageClassificationWithTeacher,
-            ),
-        }
-        if is_tf_available()
-        else {}
-    )
-
-    fx_compatible = False
-
-    test_pruning = False
-    test_resize_embeddings = False
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFEfficientFormerModelTester(self)
-        self.config_tester = ConfigTester(
-            self, config_class=EfficientFormerConfig, has_text_modality=False, hidden_size=37
-        )
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    @unittest.skip(reason="EfficientFormer does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="EfficientFormer does not support input and output embeddings")
-    def test_model_common_attributes(self):
-        pass
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
-            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
-
-            expected_num_layers = getattr(
-                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-            )
-            self.assertEqual(len(hidden_states), expected_num_layers)
-
-            if hasattr(self.model_tester, "encoder_seq_length"):
-                seq_length = self.model_tester.encoder_seq_length
-                if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1:
-                    seq_length = seq_length * self.model_tester.chunk_length
-            else:
-                seq_length = self.model_tester.seq_length
-
-            self.assertListEqual(
-                list(hidden_states[-1].shape[-2:]),
-                [seq_length, self.model_tester.hidden_size],
-            )
-
-            if config.is_encoder_decoder:
-                hidden_states = outputs.decoder_hidden_states
-
-                self.asseretIsInstance(hidden_states, (list, tuple))
-                self.assertEqual(len(hidden_states), expected_num_layers)
-                seq_len = getattr(self.model_tester, "seq_length", None)
-                decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
-
-                self.assertListEqual(
-                    list(hidden_states[-1].shape[-2:]),
-                    [decoder_seq_length, self.model_tester.hidden_size],
-                )
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
-        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
-
-        if return_labels:
-            if model_class.__name__ == "TFEfficientFormerForImageClassificationWithTeacher":
-                del inputs_dict["labels"]
-
-        return inputs_dict
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    @unittest.skip(reason="EfficientFormer does not implement masked image modeling yet")
-    def test_for_masked_image_modeling(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs)
-
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "snap-research/efficientformer-l1-300"
-        model = TFEfficientFormerModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        seq_len = getattr(self.model_tester, "seq_length", None)
-        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
-        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
-        chunk_length = getattr(self.model_tester, "chunk_length", None)
-
-        if chunk_length is not None and hasattr(self.model_tester, "num_hashes"):
-            encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = False
-            config.return_dict = True
-            model = model_class(config)
-
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
-            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_attention_outputs)
-
-            # check that output_attentions also work using config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
-
-            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_attention_outputs)
-
-            if chunk_length is not None:
-                self.assertListEqual(
-                    list(attentions[0].shape[-4:]),
-                    [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length],
-                )
-            else:
-                self.assertListEqual(
-                    list(attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
-                )
-
-    def test_compile_tf_model(self):
-        # We use a simplified version of this test for EfficientFormer because it requires training=False
-        # and Keras refuses to let us force that during functional construction
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            # Prepare our model
-            model = model_class(config)
-            # These are maximally general inputs for the model, with multiple None dimensions
-            # Hopefully this will catch any conditionals that fail for flexible shapes
-            functional_inputs = {
-                key: keras.Input(shape=val.shape[1:], dtype=val.dtype, name=key)
-                for key, val in model.input_signature.items()
-                if key in model.dummy_inputs
-            }
-            outputs_dict = model(functional_inputs)
-            self.assertTrue(outputs_dict is not None)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_tf
-@require_vision
-class EfficientFormerModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return (
-            EfficientFormerImageProcessor.from_pretrained("snap-research/efficientformer-l1-300")
-            if is_vision_available()
-            else None
-        )
-
-    @slow
-    def test_inference_image_classification_head(self):
-        model = TFEfficientFormerForImageClassification.from_pretrained("snap-research/efficientformer-l1-300")
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="tf")
-        # forward pass
-        outputs = model(**inputs, training=False)
-        # verify the logits
-        expected_shape = tf.TensorShape((1, 1000))
-        self.assertEqual(outputs.logits.shape, expected_shape)
-        expected_slice = tf.constant([-0.0555, 0.4825, -0.0852])
-        self.assertTrue(np.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
-
-    @slow
-    def test_inference_image_classification_head_with_teacher(self):
-        model = TFEfficientFormerForImageClassificationWithTeacher.from_pretrained(
-            "snap-research/efficientformer-l1-300"
-        )
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="tf")
-        # forward pass
-        outputs = model(**inputs, training=False)
-        # verify the logits
-        expected_shape = tf.TensorShape((1, 1000))
-        self.assertEqual(outputs.logits.shape, expected_shape)
-        expected_slice = tf.constant([-0.1312, 0.4353, -1.0499])
-        self.assertTrue(np.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
--- a/tests/models/ernie_m/init.py
+++ b/tests/models/ernie_m/init.py
--- a/tests/models/ernie_m/test_modeling_ernie_m.py
+++ b/tests/models/ernie_m/test_modeling_ernie_m.py
@@ -1,323 +0,0 @@
-# coding=utf-8
-# Copyright 2023 The HuggingFace Inc. and Baidu team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch ErnieM model."""
-
-import unittest
-
-from transformers import ErnieMConfig, is_torch_available
-from transformers.testing_utils import require_torch, slow, torch_device
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
-    import torch
-
-    from transformers import (
-        ErnieMForInformationExtraction,
-        ErnieMForMultipleChoice,
-        ErnieMForQuestionAnswering,
-        ErnieMForSequenceClassification,
-        ErnieMForTokenClassification,
-        ErnieMModel,
-    )
-
-
-class ErnieMModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_labels=3,
-        num_choices=4,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.num_labels = num_labels
-        self.num_choices = num_choices
-        self.scope = scope
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = self.get_config()
-
-        return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def prepare_config_and_inputs_for_uiem(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-        config = self.get_config()
-
-        return config, input_ids, input_mask
-
-    def get_config(self):
-        return ErnieMConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            initializer_range=self.initializer_range,
-        )
-
-    def create_and_check_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
-        model = ErnieMModel(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, return_dict=True)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_for_question_answering(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = ErnieMForQuestionAnswering(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            start_positions=sequence_labels,
-            end_positions=sequence_labels,
-        )
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def create_and_check_for_information_extraction(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = ErnieMForInformationExtraction(config=config)
-        model.to(torch_device)
-        model.eval()
-        sequence_labels = torch.ones_like(input_ids, dtype=torch.float32)
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            start_positions=sequence_labels,
-            end_positions=sequence_labels,
-        )
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def create_and_check_for_sequence_classification(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = ErnieMForSequenceClassification(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_for_token_classification(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = ErnieMForTokenClassification(config=config)
-        model.to(torch_device)
-        model.eval()
-        input_ids.to(torch_device)
-        input_mask.to(torch_device)
-        token_labels.to(torch_device)
-
-        result = model(input_ids, attention_mask=input_mask, labels=token_labels)
-
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def create_and_check_for_multiple_choice(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_choices = self.num_choices
-        model = ErnieMForMultipleChoice(config=config)
-        model.to(torch_device)
-        model.eval()
-        multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        result = model(
-            multiple_choice_inputs_ids,
-            attention_mask=multiple_choice_input_mask,
-            labels=choice_labels,
-        )
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_torch
-class ErnieMModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            ErnieMModel,
-            ErnieMForMultipleChoice,
-            ErnieMForQuestionAnswering,
-            ErnieMForSequenceClassification,
-            ErnieMForTokenClassification,
-        )
-        if is_torch_available()
-        else ()
-    )
-    all_generative_model_classes = ()
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": ErnieMModel,
-            "question-answering": ErnieMForQuestionAnswering,
-            "text-classification": ErnieMForSequenceClassification,
-            "token-classification": ErnieMForTokenClassification,
-            "zero-shot": ErnieMForSequenceClassification,
-        }
-        if is_torch_available()
-        else {}
-    )
-    test_torchscript = False
-
-    # TODO: Fix the failed tests when this model gets more usage
-    def is_pipeline_test_to_skip(
-        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
-    ):
-        if pipeline_test_casse_name == "QAPipelineTests":
-            return True
-
-        return False
-
-    def setUp(self):
-        self.model_tester = ErnieMModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=ErnieMConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_model_various_embeddings(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        for type in ["absolute", "relative_key", "relative_key_query"]:
-            config_and_inputs[0].position_embedding_type = type
-            self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
-    def test_for_information_extraction(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_information_extraction(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "susnato/ernie-m-base_pytorch"
-        model = ErnieMModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-@require_torch
-class ErnieMModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_model(self):
-        model = ErnieMModel.from_pretrained("susnato/ernie-m-base_pytorch")
-        model.eval()
-        input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]])
-        output = model(input_ids)[0]
-
-        # TODO Replace vocab size
-        hidden_size = 768
-
-        expected_shape = torch.Size((1, 6, hidden_size))
-        self.assertEqual(output.shape, expected_shape)
-
-        expected_slice = torch.tensor(
-            [[[-0.0012, 0.1245, -0.0214], [-0.0742, 0.0244, -0.0771], [-0.0333, 0.1164, -0.1554]]]
-        )
-
-        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))
--- a/tests/models/ernie_m/test_tokenization_ernie_m.py
+++ b/tests/models/ernie_m/test_tokenization_ernie_m.py
@@ -1,143 +0,0 @@
-# coding=utf-8
-# Copyright 2023 The HuggingFace Inc. and Baidu team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch ErnieM model."""
-
-import unittest
-
-from transformers import ErnieMTokenizer
-from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow
-
-from ...test_tokenization_common import TokenizerTesterMixin
-
-
-SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model")
-
-
-@require_sentencepiece
-@require_tokenizers
-class ErnieMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    from_pretrained_id = "susnato/ernie-m-base_pytorch"
-    tokenizer_class = ErnieMTokenizer
-    test_seq2seq = False
-    test_sentencepiece = True
-    test_rust_tokenizer = False
-    test_sentencepiece_ignore_case = False
-
-    def setUp(self):
-        super().setUp()
-
-        # We have a SentencePiece fixture for testing
-        tokenizer = ErnieMTokenizer(SAMPLE_VOCAB, unk_token="<unk>", pad_token="<pad>")
-        tokenizer.save_pretrained(self.tmpdirname)
-
-    def get_input_output_texts(self, tokenizer):
-        input_text = "this is a test"
-        output_text = "this is a test"
-        return input_text, output_text
-
-    def test_convert_token_and_id(self):
-        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
-        token = "<pad>"
-        token_id = 0
-
-        self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
-        self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
-
-    def test_get_vocab(self):
-        vocab_keys = list(self.get_tokenizer().get_vocab().keys())
-
-        self.assertEqual(vocab_keys[0], "<pad>")
-        self.assertEqual(vocab_keys[1], "<unk>")
-        self.assertEqual(vocab_keys[-1], "▁eloquent")
-        self.assertEqual(len(vocab_keys), 30_000)
-
-    def test_vocab_size(self):
-        self.assertEqual(self.get_tokenizer().vocab_size, 30_000)
-
-    def test_rust_and_python_full_tokenizers(self):
-        if not self.test_rust_tokenizer:
-            return
-
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer()
-
-        sequence = "I was born in 92000, and this is falsé."
-
-        tokens = tokenizer.tokenize(sequence)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(tokens, rust_tokens)
-
-        ids = tokenizer.encode(sequence, add_special_tokens=False)
-        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        rust_tokenizer = self.get_rust_tokenizer()
-        ids = tokenizer.encode(sequence)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-    def test_full_tokenizer(self):
-        tokenizer = ErnieMTokenizer(SAMPLE_VOCAB, do_lower_case=True, unk_token="<unk>", pad_token="<pad>")
-
-        tokens = tokenizer.tokenize("This is a test")
-        self.assertListEqual(tokens, ["▁this", "▁is", "▁a", "▁test"])
-
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289])
-
-        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
-        # ErnieMTokenizer(paddlenlp implementation) outputs '9' instead of '_9' so to mimic that '_9' is changed to '9'
-        self.assertListEqual(
-            tokens, ["▁i", "▁was", "▁born", "▁in", "9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", "."]
-        )
-        ids = tokenizer.convert_tokens_to_ids(tokens)
-        self.assertListEqual(ids, [31, 23, 386, 19, 518, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])
-
-        back_tokens = tokenizer.convert_ids_to_tokens(ids)
-        self.assertListEqual(
-            back_tokens,
-            ["▁i", "▁was", "▁born", "▁in", "9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "."],
-        )
-
-    def test_sequence_builders(self):
-        tokenizer = ErnieMTokenizer(SAMPLE_VOCAB, unk_token="<unk>", pad_token="<pad>")
-
-        text = tokenizer.encode("sequence builders")
-        text_2 = tokenizer.encode("multi-sequence build")
-
-        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
-        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
-
-        assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
-        assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + [
-            tokenizer.sep_token_id
-        ] + text_2 + [tokenizer.sep_token_id]
-
-    @slow
-    def test_tokenizer_integration(self):
-        expected_encoding = {'input_ids': [[0, 11062, 82772, 7, 15, 82772, 538, 51529, 237, 17198, 1290, 206, 9, 215175, 1314, 136, 17198, 1290, 206, 9, 56359, 42, 122009, 9, 16466, 16, 87344, 4537, 9, 4717, 78381, 6, 159958, 7, 15, 24480, 618, 4, 527, 22693, 9, 304, 4, 2777, 24480, 9874, 4, 43523, 594, 4, 803, 18392, 33189, 18, 4, 43523, 24447, 5, 5, 5, 16, 100, 24955, 83658, 9626, 144057, 15, 839, 22335, 16, 136, 24955, 83658, 83479, 15, 39102, 724, 16, 678, 645, 6460, 1328, 4589, 42, 122009, 115774, 23, 3559, 1328, 46876, 7, 136, 53894, 1940, 42227, 41159, 17721, 823, 425, 4, 27512, 98722, 206, 136, 5531, 4970, 919, 17336, 5, 2], [0, 20080, 618, 83, 82775, 47, 479, 9, 1517, 73, 53894, 333, 80581, 110117, 18811, 5256, 1295, 51, 152526, 297, 7986, 390, 124416, 538, 35431, 214, 98, 15044, 25737, 136, 7108, 43701, 23, 756, 135355, 7, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 581, 63773, 119455, 6, 147797, 88203, 7, 645, 70, 21, 3285, 10269, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}  # fmt: skip
-
-        self.tokenizer_integration_test_util(
-            expected_encoding=expected_encoding,
-            model_name="susnato/ernie-m-base_pytorch",
-            sequences=[
-                "Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides "
-                "general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural "
-                "Language Understanding (NLU) and Natural Language Generation (NLG) with over32+ pretrained "
-                "models in100+ languages and deep interoperability between Jax, PyTorch and TensorFlow.",
-                "BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly "
-                "conditioning on both left and right context in all layers.",
-                "The quick brown fox jumps over the lazy dog.",
-            ],
-        )
--- a/tests/models/gptsan_japanese/init.py
+++ b/tests/models/gptsan_japanese/init.py
--- a/tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py
+++ b/tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py
@@ -1,476 +0,0 @@
-# coding=utf-8
-# Copyright 2023 Toshiyuki Sakamoto(tanreinama) and HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-import numpy as np
-
-from transformers import (
-    GPTSanJapaneseConfig,
-    GPTSanJapaneseForConditionalGeneration,
-    GPTSanJapaneseModel,
-    GPTSanJapaneseTokenizer,
-    is_torch_available,
-)
-from transformers.generation import GenerationConfig
-from transformers.testing_utils import require_torch, slow, tooslow, torch_device
-
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-class GPTSanJapaneseTester:
-    def __init__(
-        self,
-        parent,
-        vocab_size=99,
-        batch_size=13,
-        num_contexts=7,
-        # For common tests
-        is_training=True,
-        hidden_size=32,
-        ext_size=42,
-        num_hidden_layers=2,
-        num_ext_layers=2,
-        num_attention_heads=4,
-        num_experts=2,
-        d_ff=32,
-        d_ext=80,
-        d_spout=33,
-        dropout_rate=0.0,
-        layer_norm_epsilon=1e-6,
-        expert_capacity=100,
-        router_jitter_noise=0.0,
-    ):
-        self.vocab_size = vocab_size
-        self.parent = parent
-        self.batch_size = batch_size
-        self.num_contexts = num_contexts
-        # For common tests
-        self.seq_length = self.num_contexts
-        self.is_training = is_training
-        self.hidden_size = hidden_size
-        self.num_ext_layers = num_ext_layers
-        self.ext_size = ext_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.num_experts = num_experts
-        self.d_ff = d_ff
-        self.d_ext = d_ext
-        self.d_spout = d_spout
-        self.dropout_rate = dropout_rate
-        self.layer_norm_epsilon = layer_norm_epsilon
-        self.expert_capacity = expert_capacity
-        self.router_jitter_noise = router_jitter_noise
-
-    def get_large_model_config(self):
-        return GPTSanJapaneseConfig.from_pretrained("Tanrei/GPTSAN-japanese")
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        config = self.get_config()
-
-        return (config, input_ids)
-
-    def prepare_config_and_inputs_for_common(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        config = self.get_config()
-
-        return (config, {"input_ids": input_ids})
-
-    def get_config(self):
-        return GPTSanJapaneseConfig(
-            vocab_size=self.vocab_size,
-            num_contexts=self.seq_length,
-            d_model=self.hidden_size,
-            d_ff=self.d_ff,
-            d_ext=self.d_ext,
-            d_spout=self.d_spout,
-            num_switch_layers=self.num_hidden_layers - self.num_ext_layers,
-            num_ext_layers=self.num_ext_layers,
-            num_heads=self.num_attention_heads,
-            num_experts=self.num_experts,
-            expert_capacity=self.expert_capacity,
-            dropout_rate=self.dropout_rate,
-            layer_norm_epsilon=self.layer_norm_epsilon,
-            router_jitter_noise=self.router_jitter_noise,
-        )
-
-    def create_and_check_model(
-        self,
-        config,
-        input_ids,
-    ):
-        model = GPTSanJapaneseForConditionalGeneration(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_ids=input_ids,
-        )
-        self.parent.assertIsNotNone(result)
-
-
-@require_torch
-class GPTSanJapaneseTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (GPTSanJapaneseModel,) if is_torch_available() else ()
-    pipeline_model_mapping = (
-        {
-            "conversational": GPTSanJapaneseForConditionalGeneration,
-            "feature-extraction": GPTSanJapaneseForConditionalGeneration,
-            "summarization": GPTSanJapaneseForConditionalGeneration,
-            "text2text-generation": GPTSanJapaneseForConditionalGeneration,
-            "translation": GPTSanJapaneseForConditionalGeneration,
-        }
-        if is_torch_available()
-        else {}
-    )
-    fx_compatible = False
-    is_encoder_decoder = False
-    test_pruning = False
-    test_headmasking = False
-    test_save_load_fast_init_to_base = False
-    test_training = False
-    # The small GPTSAN_JAPANESE model needs higher percentages for CPU/MP tests
-    model_split_percents = [0.5, 0.8, 0.9]
-
-    # TODO: Fix the failed tests when this model gets more usage
-    def is_pipeline_test_to_skip(
-        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
-    ):
-        if pipeline_test_casse_name == "SummarizationPipelineTests":
-            # TODO: fix `_reorder_cache` is not implemented for this model
-            return True
-        elif pipeline_test_casse_name == "Text2TextGenerationPipelineTests":
-            # TODO: check this.
-            return True
-
-        return False
-
-    def setUp(self):
-        self.model_tester = GPTSanJapaneseTester(self)
-        self.config_tester = ConfigTester(self, config_class=GPTSanJapaneseConfig, d_model=37)
-
-    def test_config(self):
-        GPTSanJapaneseConfig()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    @unittest.skip(
-        reason="skip for now as the computed `max_memory` by `model_split_percents` in the test method will be changed inside `from_pretrained`"
-    )
-    def test_model_parallelism(self):
-        super().test_model_parallelism()
-
-    @unittest.skip(reason="Gptsan does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="Gptsan does not use inputs_embeds")
-    def test_inputs_embeds_matches_input_ids(self):
-        pass
-
-
-@require_torch
-class GPTSanJapaneseForConditionalGenerationTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
-    all_model_classes = (GPTSanJapaneseForConditionalGeneration,) if is_torch_available() else ()
-    fx_compatible = False
-    is_encoder_decoder = False
-    test_pruning = False
-    test_headmasking = False
-    # The small GPTSAN_JAPANESE model needs higher percentages for CPU/MP tests
-    model_split_percents = [0.5, 0.8, 0.9]
-
-    def setUp(self):
-        self.model_tester = GPTSanJapaneseTester(self)
-        self.config_tester = ConfigTester(self, config_class=GPTSanJapaneseConfig, d_model=37)
-
-    def test_config(self):
-        GPTSanJapaneseConfig()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    @unittest.skip(
-        reason="skip for now as the computed `max_memory` by `model_split_percents` in the test method will be changed inside `from_pretrained`"
-    )
-    def test_model_parallelism(self):
-        super().test_model_parallelism()
-
-    @unittest.skip(reason="Gptsan does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="Gptsan does not use inputs_embeds")
-    def test_inputs_embeds_matches_input_ids(self):
-        pass
-
-    @slow
-    def test_logits(self):
-        model = GPTSanJapaneseForConditionalGeneration.from_pretrained("Tanrei/GPTSAN-japanese")
-        tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
-        input_ids = tokenizer.encode("武田信玄は", return_tensors="pt")
-        outputs = model(input_ids)
-        output_logits = outputs.logits.detach().cpu().numpy()
-        # Output of original model created with mesh-tensoflow
-        # fmt: off
-        target = [
-            [-12.037839889526367, -12.433061599731445, -14.333840370178223, -12.450345993041992, -11.1661376953125,
-            -11.930137634277344, -10.659740447998047, -12.909574508666992, -13.241043090820312, -13.398579597473145,
-            -11.107524871826172, -12.3685941696167, -22.97943115234375, -10.481067657470703, -12.484030723571777,
-            -12.807360649108887, -14.769700050354004, -12.233579635620117, -13.428145408630371, -22.624177932739258],
-            [-7.511149883270264, -8.281851768493652, -7.943127155303955, -7.55021333694458, -6.49869966506958,
-            -7.586796283721924, -6.978085994720459, -7.839145183563232, -8.21964168548584, -8.695091247558594,
-            -6.706910610198975, -6.6585798263549805, -19.565698623657227, -5.353842735290527, -8.350686073303223,
-            -8.039388656616211, -10.856569290161133, -7.75154447555542, -8.819022178649902, -19.51532745361328],
-            [-9.73066234588623, -10.223922729492188, -9.932981491088867, -11.857836723327637, -7.662626266479492,
-            -11.13529109954834, -7.765097618103027, -11.472923278808594, -9.543149948120117, -11.905633926391602,
-            -9.366164207458496, -11.5734281539917, -23.699003219604492, -9.429590225219727, -10.42839241027832,
-            -10.585240364074707, -10.94771957397461, -11.095416069030762, -10.390240669250488, -23.769372940063477],
-            [-9.728265762329102, -9.859712600708008, -10.09729290008545, -9.678522109985352, -6.879519939422607,
-            -9.68487548828125, -4.2803425788879395, -10.018914222717285, -9.308445930480957, -10.63394546508789,
-            -8.083646774291992, -9.06301498413086, -21.904266357421875, -8.90160846710205, -8.841876029968262,
-            -11.856719970703125, -12.079398155212402, -11.233753204345703, -10.177338600158691, -21.87256622314453],
-            [-9.669764518737793, -9.614198684692383, -9.814510345458984, -9.996501922607422, -11.375690460205078,
-            -10.113405227661133, -10.546867370605469, -10.04369068145752, -10.907809257507324, -10.504216194152832,
-            -11.129199028015137, -10.151124000549316, -21.96586799621582, -9.086349487304688, -11.730339050292969,
-            -10.460667610168457, -10.298049926757812, -10.784148216247559, -10.840693473815918, -22.03152847290039],
-        ]
-        # fmt: on
-        target = np.array(target).flatten()
-        predict = output_logits[0, :, :20].flatten()
-
-        def check(a, b, epsilon=5e-4):
-            return abs(a - b) < epsilon * max(abs(a), abs(b))
-
-        self.assertTrue(np.all([check(target[i], predict[i]) for i in range(len(target))]))
-
-    @slow
-    def test_batch_generation(self):
-        model = GPTSanJapaneseForConditionalGeneration.from_pretrained("Tanrei/GPTSAN-japanese")
-        tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
-        model.to(torch_device)
-
-        # set deterministically
-        generation_config = GenerationConfig.from_pretrained("Tanrei/GPTSAN-japanese")
-        generation_config.top_k = 1
-
-        # use different length sentences to test batching
-        sentences = [
-            "甲斐なら武田と言うほど",
-            "織田信長は、",
-        ]
-
-        tokenizer.padding_side = "left"
-        inputs = tokenizer(sentences, return_tensors="pt", padding=True)
-        input_ids = inputs["input_ids"].to(torch_device)
-
-        self.assertNotEqual(inputs["attention_mask"][0].numpy().tolist(), inputs["attention_mask"][1].numpy().tolist())
-
-        outputs = model.generate(
-            input_ids=input_ids,
-            attention_mask=inputs["attention_mask"].to(torch_device),
-            max_new_tokens=3,
-            generation_config=generation_config,
-        )
-
-        inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
-        output_non_padded = model.generate(
-            input_ids=inputs_non_padded, max_new_tokens=3, generation_config=generation_config
-        )
-
-        inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
-        output_padded = model.generate(input_ids=inputs_padded, max_new_tokens=3, generation_config=generation_config)
-
-        self.assertNotEqual(inputs_non_padded.shape, inputs_padded.shape)
-
-        batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-        non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
-        padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)
-
-        expected_output_sentence = [
-            "甲斐なら武田と言うほど甲斐の武田",
-            "織田信長は、このような",
-        ]
-        self.assertListEqual(expected_output_sentence, batch_out_sentence)
-        self.assertListEqual(batch_out_sentence, [non_padded_sentence, padded_sentence])
-
-    @tooslow
-    def test_sample(self):
-        model = GPTSanJapaneseForConditionalGeneration.from_pretrained("Tanrei/GPTSAN-japanese")
-        tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
-        # Output of original model created with mesh-tensoflow
-        target = [
-            ("武田信玄は", 35675),
-            ("武田信玄は、", 45),
-            ("武田信玄は、この", 29),
-            ("武田信玄は、このよう", 30642),
-            ("武田信玄は、このような", 35680),
-            ("武田信玄は、このような「", 8640),
-            ("武田信玄は、このような「武田", 31617),
-            ("武田信玄は、このような「武田家", 30646),
-            ("武田信玄は、このような「武田家の", 31617),
-            ("武田信玄は、このような「武田家の家", 31381),
-        ]
-        for input, output in target:
-            input_ids = tokenizer.encode(input, return_tensors="pt")
-            outputs = model(input_ids)
-            output_logits = outputs.logits.detach().cpu().numpy()[0]
-            output_id = np.argmax(output_logits[-1])
-            self.assertEqual(output_id, output)
-
-    @slow
-    def test_spout_generation(self):
-        model = GPTSanJapaneseForConditionalGeneration.from_pretrained("Tanrei/GPTSAN-japanese")
-        tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
-        model.to(torch_device)
-
-        # set deterministically
-        generation_config = GenerationConfig.from_pretrained("Tanrei/GPTSAN-japanese")
-        generation_config.top_k = 1
-
-        input_text = "武田信玄は、"
-        input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(torch_device)
-        input_ids_batch = tokenizer([input_text, input_text], return_tensors="pt").input_ids.to(torch_device)
-
-        # spout from uniform and one-hot
-
-        spouts = [
-            [0.87882208, 0.38426396, 0.33220248, 0.43890406, 0.16562252,
-            0.04803985, 0.211572  , 0.23188473, 0.37153068, 0.7836377 ,
-            0.02160172, 0.38761719, 0.75290772, 0.90198857, 0.34365777,
-            0.64168169, 0.44318471, 0.14575746, 0.92562881, 0.40812148,
-            0.29019122, 0.88861599, 0.65524846, 0.43563456, 0.38177187,
-            0.70832965, 0.81527892, 0.68832812, 0.38833192, 0.4561522 ,
-            0.14828817, 0.47248213, 0.54357335, 0.82009566, 0.1338884 ,
-            0.02755417, 0.19764677, 0.2422084 , 0.04757674, 0.65409606,
-            0.0824589 , 0.03304383, 0.94387689, 0.98764509, 0.82433901,
-            0.27646741, 0.64907493, 0.76009406, 0.30087915, 0.17904689,
-            0.41601714, 0.67046398, 0.10422822, 0.08447374, 0.07354344,
-            0.61423565, 0.70284866, 0.7532333 , 0.1972038 , 0.29575659,
-            0.90583886, 0.29265307, 0.50000175, 0.70407655, 0.889363  ,
-            0.81904418, 0.66829128, 0.64468815, 0.56563723, 0.85601875,
-            0.94924672, 0.00166762, 0.25220643, 0.74540219, 0.67993247,
-            0.1549675 , 0.39385352, 0.92153607, 0.63745931, 0.27759043,
-            0.84702295, 0.65904271, 0.58676614, 0.8666936 , 0.39607438,
-            0.79954983, 0.42220697, 0.39650381, 0.7849864 , 0.56150201,
-            0.15678925, 0.14746032, 0.34542114, 0.47026783, 0.11956489,
-            0.25421435, 0.33788901, 0.68934842, 0.36424685, 0.71737898,
-            0.38983449, 0.94393779, 0.39575588, 0.36616553, 0.87104665,
-            0.64630203, 0.22516905, 0.88270804, 0.15031338, 0.75144345,
-            0.46459025, 0.85396454, 0.86355643, 0.65139851, 0.70266061,
-            0.30241389, 0.81056497, 0.88865969, 0.38773807, 0.70635849,
-            0.90718459, 0.43245789, 0.28000654, 0.45935562, 0.08773519,
-            0.9552151 , 0.93901511, 0.22489288], # uniform
-            [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
-             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
-             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
-             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
-             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
-             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
-             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
-             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
-             0., 0., 0., 0., 0., 0., 0., 0.],
-        ]  # fmt: skip
-
-        output1 = model.generate(
-            input_ids=input_ids,
-            spout=spouts[0],
-            max_new_tokens=20,
-            generation_config=generation_config,
-        )
-
-        output2 = model.generate(
-            input_ids=input_ids,
-            spout=spouts[1],
-            max_new_tokens=20,
-            generation_config=generation_config,
-        )
-
-        output3 = model.generate(
-            input_ids=input_ids_batch,
-            spout=spouts,
-            max_new_tokens=20,
-            generation_config=generation_config,
-        )
-
-        out1_sentence = tokenizer.decode(output1[0])
-        out2_sentence = tokenizer.decode(output2[0])
-        batch_out_sentence = tokenizer.batch_decode(output3)
-
-        expected_output_sentence = [
-            "武田信玄は、武田氏の滅亡後、武田氏の居城であった甲斐武田氏の居城である",
-            "武田信玄は、武田家の滅亡を防ぐため、武田家の家臣である武田信虎を討",
-        ]
-        self.assertListEqual(expected_output_sentence, batch_out_sentence)
-        self.assertListEqual(batch_out_sentence, [out1_sentence, out2_sentence])
-
-    @slow
-    def test_prefix_lm_generation(self):
-        model = GPTSanJapaneseForConditionalGeneration.from_pretrained("Tanrei/GPTSAN-japanese")
-        tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
-        model.to(torch_device)
-
-        # set deterministically
-        generation_config = GenerationConfig.from_pretrained("Tanrei/GPTSAN-japanese")
-        generation_config.top_k = 1
-
-        prefix_text_1 = "武田信玄"
-        prefix_text_2 = "織田信長"
-        input_text_1 = "は、"
-        input_text_2 = "が、"
-        input_tok_1 = tokenizer(input_text_1, prefix_text=prefix_text_1, return_tensors="pt")
-        input_tok_2 = tokenizer(input_text_2, prefix_text=prefix_text_2, return_tensors="pt")
-        input_tok_3 = tokenizer([[prefix_text_1, input_text_1], [prefix_text_2, input_text_2]], return_tensors="pt")
-
-        output1 = model.generate(
-            input_ids=input_tok_1.input_ids.to(torch_device),
-            token_type_ids=input_tok_1.token_type_ids.to(torch_device),
-            max_new_tokens=20,
-            generation_config=generation_config,
-        )
-
-        output2 = model.generate(
-            input_ids=input_tok_2.input_ids.to(torch_device),
-            token_type_ids=input_tok_2.token_type_ids.to(torch_device),
-            max_new_tokens=20,
-            generation_config=generation_config,
-        )
-
-        output3 = model.generate(
-            input_ids=input_tok_3.input_ids.to(torch_device),
-            token_type_ids=input_tok_3.token_type_ids.to(torch_device),
-            attention_mask=input_tok_3.attention_mask.to(torch_device),
-            max_new_tokens=20,
-            generation_config=generation_config,
-        )
-
-        out1_sentence = tokenizer.decode(output1[0])
-        out2_sentence = tokenizer.decode(output2[0])
-        batch_out_sentence = tokenizer.batch_decode(output3)
-
-        expected_output_sentence = [
-            "武田信玄は、武田氏の祖である武田信虎を、その子・武田信友を擁して",
-            "織田信長が、織田信長の妻・お市の方を妻として迎えたという逸話が残",
-        ]
-        self.assertListEqual(expected_output_sentence, batch_out_sentence)
-        self.assertListEqual(batch_out_sentence, [out1_sentence, out2_sentence])
--- a/tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py
+++ b/tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py
@@ -1,218 +0,0 @@
-# coding=utf-8
-# Copyright 2023 Toshiyuki Sakamoto(tanreinama) and HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import json
-import os
-import unittest
-
-from transformers.models.gptsan_japanese.tokenization_gptsan_japanese import (
-    VOCAB_FILES_NAMES,
-    GPTSanJapaneseTokenizer,
-)
-from transformers.testing_utils import require_jinja, require_tokenizers, slow
-
-from ...test_tokenization_common import TokenizerTesterMixin
-
-
-@require_tokenizers
-class GPTSanJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    from_pretrained_id = "Tanrei/GPTSAN-japanese"
-    tokenizer_class = GPTSanJapaneseTokenizer
-    test_rust_tokenizer = False
-    from_pretrained_kwargs = {"do_clean_text": False, "add_prefix_space": False}
-
-    def setUp(self):
-        super().setUp()
-
-        vocab_tokens = ["こん", "こんに", "にちは", "ばんは", "世界,㔺界", "、", "。", "<BR>", "<SP>", "<TAB>", "<URL>", "<EMAIL>", "<TEL>", "<DATE>", "<PRICE>", "<BLOCK>", "<KIGOU>", "<U2000U2BFF>", "<|emoji1|>", "<unk>", "<|bagoftoken|>", "<|endoftext|>"]  # fmt: skip
-        emoji_tokens = {"emoji": {"\ud83d\ude00": "<|emoji1|>"}, "emoji_inv": {"<|emoji1|>": "\ud83d\ude00"}}  # 😀
-        self.special_tokens_map = {"unk_token": "<unk>"}
-
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.emoji_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["emoji_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
-            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-        with open(self.emoji_file, "w") as emoji_writer:
-            emoji_writer.write(json.dumps(emoji_tokens))
-
-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return GPTSanJapaneseTokenizer.from_pretrained(self.tmpdirname, **kwargs)
-
-    # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.get_input_output_texts
-    def get_input_output_texts(self, tokenizer):
-        input_text = "こんにちは、世界。 \nこんばんは、㔺界。😀"
-        output_text = "こんにちは、世界。 \nこんばんは、世界。😀"
-        return input_text, output_text
-
-    # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.get_clean_sequence
-    def get_clean_sequence(self, tokenizer):
-        input_text, output_text = self.get_input_output_texts(tokenizer)
-        ids = tokenizer.encode(output_text, add_special_tokens=False)
-        text = tokenizer.decode(ids, clean_up_tokenization_spaces=False)
-        return text, ids
-
-    # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.test_pretokenized_inputs
-    def test_pretokenized_inputs(self):
-        pass  # TODO add if relevant
-
-    # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.test_maximum_encoding_length_pair_input
-    def test_maximum_encoding_length_pair_input(self):
-        pass  # TODO add if relevant
-
-    # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.test_maximum_encoding_length_single_input
-    def test_maximum_encoding_length_single_input(self):
-        pass  # TODO add if relevant
-
-    # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.test_full_tokenizer
-    def test_full_tokenizer(self):
-        tokenizer = self.get_tokenizer()
-
-        # Testing tokenization
-        input_text = "こんにちは、世界。　こんばんは、㔺界。"
-        expected_token = ["こん", "にちは", "、", "世界", "。", "<SP>", "こん", "ばんは", "、", "㔺界", "。"]
-        tokens = tokenizer.tokenize(input_text)
-        self.assertListEqual(tokens, expected_token)
-
-        # Testing conversion to ids without special tokens
-        expected_ids = [0, 2, 5, 4, 6, 8, 0, 3, 5, 4, 6]
-        input_ids = tokenizer.convert_tokens_to_ids(tokens)
-        self.assertListEqual(input_ids, expected_ids)
-
-        # Testing conversion to ids with special tokens
-        input_tokens = tokens + [tokenizer.unk_token]
-        expected_ids = [0, 2, 5, 4, 6, 8, 0, 3, 5, 4, 6, 19]
-        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
-        self.assertListEqual(input_ids, expected_ids)
-
-    def test_token_bagging(self):
-        tokenizer = self.get_tokenizer()
-
-        # Testing tokenization
-        input_text = "こんにちは、<|bagoftoken|>世界。こんばんは、<|bagoftoken|>㔺界。"
-        expected_text = "こんにちは、、、、世界。こんばんは、、、、世界。"
-        tokens = tokenizer.encode(input_text)
-        output_text = tokenizer.decode(tokens)
-        self.assertEqual(output_text, expected_text)
-
-    @slow
-    def test_prefix_input(self):
-        tokenizer = self.tokenizer_class.from_pretrained("Tanrei/GPTSAN-japanese")
-
-        # Testing tokenization
-        prefix_text = "こんにちは、世界。"
-        input_text = "こんばんは、㔺界。😀"
-        expected_text = "こんにちは、世界。こんばんは、世界。😀"
-        tokens_1 = tokenizer.encode(prefix_text + input_text)
-        tokens_2 = tokenizer.encode("", prefix_text=prefix_text + input_text)
-        tokens_3 = tokenizer.encode(input_text, prefix_text=prefix_text)
-        output_text_1 = tokenizer.decode(tokens_1)
-        output_text_2 = tokenizer.decode(tokens_2)
-        output_text_3 = tokenizer.decode(tokens_3)
-        self.assertEqual(output_text_1, expected_text)
-        self.assertEqual(output_text_2, expected_text)
-        self.assertEqual(output_text_3, expected_text)
-
-    @slow
-    def test_token_type_ids(self):
-        tokenizer = self.tokenizer_class.from_pretrained("Tanrei/GPTSAN-japanese")
-
-        # Testing tokenization
-        prefix_text = "こんにちは、世界。"
-        input_text = "こんばんは、㔺界。😀"
-
-        len_prefix = len(tokenizer.encode(prefix_text)) - 2
-        len_text = len(tokenizer.encode(input_text)) - 2
-
-        expected_mask_1 = [1] + [0] * (len_prefix + len_text + 1)
-        expected_mask_2 = [1] * (len_prefix + len_text + 1) + [0]
-        expected_mask_3 = [1] + [1] * (len_prefix) + [0] * (len_text + 1)
-
-        type_id_1 = tokenizer(prefix_text + input_text).token_type_ids
-        type_id_2 = tokenizer("", prefix_text=prefix_text + input_text).token_type_ids
-        type_id_3 = tokenizer(input_text, prefix_text=prefix_text).token_type_ids
-        self.assertListEqual(type_id_1, expected_mask_1)
-        self.assertListEqual(type_id_2, expected_mask_2)
-        self.assertListEqual(type_id_3, expected_mask_3)
-
-    @slow
-    def test_prefix_tokens(self):
-        tokenizer = self.tokenizer_class.from_pretrained("Tanrei/GPTSAN-japanese")
-
-        x_token_1 = tokenizer.encode("あンいワ")
-        x_token_2 = tokenizer.encode("", prefix_text="あンいワ")
-        x_token_3 = tokenizer.encode("いワ", prefix_text="あン")
-
-        self.assertEqual(tokenizer.decode(x_token_1), tokenizer.decode(x_token_2))
-        self.assertEqual(tokenizer.decode(x_token_1), tokenizer.decode(x_token_3))
-        self.assertNotEqual(x_token_1, x_token_2)
-        self.assertNotEqual(x_token_1, x_token_3)
-        self.assertEqual(x_token_1[1], x_token_2[-1])  # SEG token
-        self.assertEqual(x_token_1[1], x_token_3[3])  # SEG token
-
-    @slow
-    def test_batch_encode(self):
-        tokenizer = self.tokenizer_class.from_pretrained("Tanrei/GPTSAN-japanese")
-
-        input_pairs = [["武田信玄", "は、"], ["織田信長", "の配下の、"]]
-        x_token = tokenizer(input_pairs, padding=True)
-        x_token_2 = tokenizer.batch_encode_plus(input_pairs, padding=True)
-
-        # fmt: off
-        expected_outputs = [[35993, 8640, 25948, 35998, 30647, 35675, 35999, 35999], [35993, 10382, 9868, 35998, 30646, 9459, 30646, 35675]]
-        expected_typeids = [[1, 1, 1, 0, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0, 0]]
-        expected_attmask = [[1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1]]
-        # fmt: on
-        self.assertListEqual(x_token.input_ids, expected_outputs)
-        self.assertListEqual(x_token.token_type_ids, expected_typeids)
-        self.assertListEqual(x_token.attention_mask, expected_attmask)
-        self.assertListEqual(x_token_2.input_ids, expected_outputs)
-        self.assertListEqual(x_token_2.token_type_ids, expected_typeids)
-        self.assertListEqual(x_token_2.attention_mask, expected_attmask)
-
-    # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.test_conversion_reversible
-    def test_conversion_reversible(self):
-        # Intentionally convert some words to accommodate character fluctuations unique to Japanese
-        pass
-
-    # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.test_padding_different_model_input_name
-    def test_padding_different_model_input_name(self):
-        # tokenizer has no padding token
-        pass
-
-    @require_jinja
-    def test_tokenization_for_chat(self):
-        tokenizer = self.tokenizer_class.from_pretrained("Tanrei/GPTSAN-japanese")
-        # This is in English, but it's just here to make sure the chat control tokens are being added properly
-        test_chats = [
-            [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
-            [
-                {"role": "system", "content": "You are a helpful chatbot."},
-                {"role": "user", "content": "Hello!"},
-                {"role": "assistant", "content": "Nice to meet you."},
-            ],
-            [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}],
-        ]
-        tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
-        # fmt: off
-        expected_tokens = [
-            [35993, 35998, 35637, 35659, 35665, 35716, 35645, 35662, 35649, 35716, 35645, 35716, 35652, 35649, 35656, 35660, 35650, 35665, 35656, 35716, 35647, 35652, 35645, 35664, 35646, 35659, 35664, 35595, 35716, 35999, 35993, 35998, 35620, 35649, 35656, 35656, 35659, 35582, 35716, 35999],
-            [35993, 35998, 35637, 35659, 35665, 35716, 35645, 35662, 35649, 35716, 35645, 35716, 35652, 35649, 35656, 35660, 35650, 35665, 35656, 35716, 35647, 35652, 35645, 35664, 35646, 35659, 35664, 35595, 35716, 35999, 35993, 35998, 35620, 35649, 35656, 35656, 35659, 35582, 35716, 35999, 35993, 35998, 35626, 35653, 35647, 35649, 35716, 35664, 35659, 35716, 35657, 35649, 35649, 35664, 35716, 35669, 35659, 35665, 35595, 35716, 35999],
-            [35993, 35998, 35626, 35653, 35647, 35649, 35716, 35664, 35659, 35716, 35657, 35649, 35649, 35664, 35716, 35669, 35659, 35665, 35595, 35716, 35999, 35993, 35998, 35620, 35649, 35656, 35656, 35659, 35582, 35716, 35999]
-        ]
-        # fmt: on
-        for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
-            self.assertListEqual(tokenized_chat, expected_tokens)
--- a/tests/models/graphormer/init.py
+++ b/tests/models/graphormer/init.py
--- a/tests/models/graphormer/test_modeling_graphormer.py
+++ b/tests/models/graphormer/test_modeling_graphormer.py
--- a/tests/models/jukebox/init.py
+++ b/tests/models/jukebox/init.py
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -1,407 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-from unittest import skip
-
-from transformers import is_torch_available
-from transformers.testing_utils import (
-    require_torch,
-    require_torch_accelerator,
-    require_torch_fp16,
-    slow,
-    torch_device,
-)
-from transformers.trainer_utils import set_seed
-
-
-if is_torch_available():
-    import torch
-
-    from transformers import JukeboxModel, JukeboxPrior, JukeboxTokenizer
-
-
-@require_torch
-class Jukebox1bModelTester(unittest.TestCase):
-    all_model_classes = (JukeboxModel,) if is_torch_available() else ()
-    model_id = "openai/jukebox-1b-lyrics"
-    metas = {
-        "artist": "Zac Brown Band",
-        "genres": "Country",
-        "lyrics": """I met a traveller from an antique land,
-    Who said "Two vast and trunkless legs of stone
-    Stand in the desert. . . . Near them, on the sand,
-    Half sunk a shattered visage lies, whose frown,
-    And wrinkled lip, and sneer of cold command,
-    Tell that its sculptor well those passions read
-    Which yet survive, stamped on these lifeless things,
-    The hand that mocked them, and the heart that fed;
-    And on the pedestal, these words appear:
-    My name is Ozymandias, King of Kings;
-    Look on my Works, ye Mighty, and despair!
-    Nothing beside remains. Round the decay
-    Of that colossal Wreck, boundless and bare
-    The lone and level sands stretch far away
-    """,
-    }
-    # fmt: off
-    EXPECTED_OUTPUT_2 = [
-        1864, 1536, 1213, 1870, 1357, 1536, 519, 880, 1323, 789, 1082, 534,
-        1000, 1445, 1105, 1130, 967, 515, 1434, 1620, 534, 1495, 283, 1445,
-        333, 1307, 539, 1631, 1528, 375, 1434, 673, 627, 710, 778, 1883,
-        1405, 1276, 1455, 1228
-    ]
-
-    EXPECTED_OUTPUT_2_PT_2 = [
-        1489, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653
-    ]
-
-    EXPECTED_OUTPUT_1 = [
-        1125, 1751, 697, 1776, 1141, 1476, 391, 697, 1125, 684, 867, 416,
-        844, 1372, 1274, 717, 1274, 844, 1299, 1419, 697, 1370, 317, 1125,
-        191, 1440, 1370, 1440, 1370, 282, 1621, 1370, 368, 349, 867, 1872,
-        1262, 869, 1728, 747
-    ]
-    EXPECTED_OUTPUT_1_PT_2 = [
-        416, 416, 1125, 1125, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416
-    ]
-
-    EXPECTED_OUTPUT_0 = [
-        1755, 842, 307, 1843, 1022, 1395, 234, 1554, 806, 739, 1022, 442,
-        616, 556, 268, 1499, 933, 457, 1440, 1837, 755, 985, 308, 902,
-        293, 1443, 1671, 1141, 1533, 555, 1562, 1061, 287, 417, 1022, 2008,
-        1186, 1015, 1777, 268
-    ]
-    EXPECTED_OUTPUT_0_PT_2 = [
-        854, 842, 1353, 114, 1353, 842, 185, 842, 185, 114, 591, 842,
-        185, 417, 185, 842, 307, 842, 591, 842, 185, 842, 307, 842,
-        591, 842, 1353, 842, 185, 842, 591, 842, 591, 114, 591, 842,
-        185, 842, 591, 89
-    ]
-
-    EXPECTED_Y_COND = [1058304, 0, 786432, 7169, 507, 76, 27, 40, 30, 76]
-
-    EXPECTED_PRIMED_0 = [
-        390, 1160, 1002, 1907, 1788, 1788, 1788, 1907, 1002, 1002, 1854, 1002,
-        1002, 1002, 1002, 1002, 1002, 1160, 1160, 1606, 596, 596, 1160, 1002,
-        1516, 596, 1002, 1002, 1002, 1907, 1788, 1788, 1788, 1854, 1788, 1907,
-        1907, 1788, 596, 1626
-    ]
-    EXPECTED_PRIMED_1 = [
-        1236, 1668, 1484, 1920, 1848, 1409, 139, 864, 1828, 1272, 1599, 824,
-        1672, 139, 555, 1484, 824, 1920, 555, 596, 1579, 1599, 1231, 1599,
-        1637, 1407, 212, 824, 1599, 116, 1433, 824, 258, 1599, 1433, 1895,
-        1063, 1433, 1433, 1599
-    ]
-    EXPECTED_PRIMED_2 = [
-        1684, 1873, 1119, 1189, 395, 611, 1901, 972, 890, 1337, 1392, 1927,
-        96, 972, 672, 780, 1119, 890, 158, 771, 1073, 1927, 353, 1331,
-        1269, 1459, 1333, 1645, 812, 1577, 1337, 606, 353, 981, 1466, 619,
-        197, 391, 302, 1930
-    ]
-    EXPECTED_VQVAE_ENCODE = [
-        390, 1160, 1002, 1907, 1788, 1788, 1788, 1907, 1002, 1002, 1854, 1002,
-        1002, 1002, 1002, 1002, 1002, 1160, 1160, 1606, 596, 596, 1160, 1002,
-        1516, 596, 1002, 1002, 1002, 1907, 1788, 1788, 1788, 1854, 1788, 1907,
-        1907, 1788, 596, 1626
-    ]
-    EXPECTED_VQVAE_DECODE = [
-        -0.0492, -0.0524, -0.0565, -0.0640, -0.0686, -0.0684, -0.0677, -0.0664,
-        -0.0605, -0.0490, -0.0330, -0.0168, -0.0083, -0.0075, -0.0051, 0.0025,
-        0.0136, 0.0261, 0.0386, 0.0497, 0.0580, 0.0599, 0.0583, 0.0614,
-        0.0740, 0.0889, 0.1023, 0.1162, 0.1211, 0.1212, 0.1251, 0.1336,
-        0.1502, 0.1686, 0.1883, 0.2148, 0.2363, 0.2458, 0.2507, 0.2531
-    ]
-    EXPECTED_AUDIO_COND = [
-        0.0256, -0.0544, 0.1600, -0.0032, 0.1066, 0.0825, -0.0013, 0.3440,
-        0.0210, 0.0412, -0.1777, -0.0892, -0.0164, 0.0285, -0.0613, -0.0617,
-        -0.0137, -0.0201, -0.0175, 0.0215, -0.0627, 0.0520, -0.0730, 0.0970,
-        -0.0100, 0.0442, -0.0586, 0.0207, -0.0015, -0.0082
-    ]
-    EXPECTED_META_COND = [
-        0.0415, 0.0877, 0.0022, -0.0055, 0.0751, 0.0334, 0.0324, -0.0068,
-        0.0011, 0.0017, -0.0676, 0.0655, -0.0143, 0.0399, 0.0303, 0.0743,
-        -0.0168, -0.0394, -0.1113, 0.0124, 0.0442, 0.0267, -0.0003, -0.1536,
-        -0.0116, -0.1837, -0.0180, -0.1026, -0.0777, -0.0456
-    ]
-    EXPECTED_LYRIC_COND = [
-        76, 27, 40, 30, 76, 46, 44, 47, 40, 37, 38, 31, 45, 45, 76, 38, 31, 33,
-        45, 76, 41, 32, 76, 45, 46, 41, 40, 31, 78, 76
-    ]
-    # fmt: on
-
-    def prepare_inputs(self):
-        tokenizer = JukeboxTokenizer.from_pretrained(self.model_id)
-        tokens = tokenizer(**self.metas)["input_ids"]
-        return tokens
-
-    @slow
-    def test_sampling(self):
-        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
-        labels = self.prepare_inputs()
-
-        set_seed(0)
-        zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
-        zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[0].raw_to_tokens, save_results=False)
-        self.assertIn(zs[0][0].detach().cpu().tolist(), [self.EXPECTED_OUTPUT_2, self.EXPECTED_OUTPUT_2_PT_2])
-
-        set_seed(0)
-        zs = model._sample(zs, labels, [1], sample_length=40 * model.priors[1].raw_to_tokens, save_results=False)
-        self.assertIn(zs[1][0].detach().cpu().tolist(), [self.EXPECTED_OUTPUT_1, self.EXPECTED_OUTPUT_1_PT_2])
-
-        set_seed(0)
-        zs = model._sample(zs, labels, [2], sample_length=40 * model.priors[2].raw_to_tokens, save_results=False)
-        self.assertIn(zs[2][0].detach().cpu().tolist(), [self.EXPECTED_OUTPUT_0, self.EXPECTED_OUTPUT_0_PT_2])
-
-    @slow
-    def test_conditioning(self):
-        torch.backends.cuda.matmul.allow_tf32 = False
-        torch.backends.cudnn.allow_tf32 = False
-        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
-
-        labels = self.prepare_inputs()
-        set_seed(0)
-        zs = [torch.zeros(1, 0, dtype=torch.long) for _ in range(3)]
-
-        top_prior = model.priors[0]
-        start = 0
-        music_token_conds = top_prior.get_music_tokens_conds(zs, start=start, end=start + top_prior.n_ctx)
-        metadata = top_prior.get_metadata(labels[0].clone(), start, 1058304, 0)
-
-        self.assertIsNone(music_token_conds)
-        self.assertListEqual(metadata.numpy()[0][:10].tolist(), self.EXPECTED_Y_COND)
-
-        audio_conditioning, metadata_conditioning, lyric_tokens = top_prior.get_cond(music_token_conds, metadata)
-        torch.testing.assert_close(
-            audio_conditioning[0][0][:30].detach(), torch.tensor(self.EXPECTED_AUDIO_COND), atol=1e-4, rtol=1e-4
-        )
-        torch.testing.assert_close(
-            metadata_conditioning[0][0][:30].detach(), torch.tensor(self.EXPECTED_META_COND), atol=1e-4, rtol=1e-4
-        )
-        torch.testing.assert_close(
-            lyric_tokens[0, :30].detach(), torch.tensor(self.EXPECTED_LYRIC_COND), atol=1e-4, rtol=1e-4
-        )
-
-    @slow
-    def test_primed_sampling(self):
-        torch.backends.cuda.matmul.allow_tf32 = False
-        torch.backends.cudnn.allow_tf32 = False
-
-        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
-        set_seed(0)
-        waveform = torch.rand((1, 5120, 1))
-        tokens = list(self.prepare_inputs())
-
-        zs = [model.vqvae.encode(waveform, start_level=2, bs_chunks=waveform.shape[0])[0], None, None]
-        zs = model._sample(
-            zs, tokens, sample_levels=[0], save_results=False, sample_length=40 * model.priors[0].raw_to_tokens
-        )
-        torch.testing.assert_close(zs[0][0][:40], torch.tensor(self.EXPECTED_PRIMED_0))
-
-        upper_2 = torch.cat((zs[0], torch.zeros(1, 2048 - zs[0].shape[-1])), dim=-1).long()
-        zs = [upper_2, model.vqvae.encode(waveform, start_level=1, bs_chunks=waveform.shape[0])[0], None]
-        zs = model._sample(
-            zs, tokens, sample_levels=[1], save_results=False, sample_length=40 * model.priors[1].raw_to_tokens
-        )
-        torch.testing.assert_close(zs[1][0][:40], torch.tensor(self.EXPECTED_PRIMED_1))
-
-        upper_1 = torch.cat((zs[1], torch.zeros(1, 2048 - zs[1].shape[-1])), dim=-1).long()
-        zs = [upper_2, upper_1, model.vqvae.encode(waveform, start_level=0, bs_chunks=waveform.shape[0])[0]]
-        zs = model._sample(
-            zs, tokens, sample_levels=[2], save_results=False, sample_length=40 * model.priors[2].raw_to_tokens
-        )
-        torch.testing.assert_close(zs[2][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_2))
-
-    @slow
-    def test_vqvae(self):
-        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
-        set_seed(0)
-        x = torch.rand((1, 5120, 1))
-        with torch.no_grad():
-            zs = model.vqvae.encode(x, start_level=2, bs_chunks=x.shape[0])
-        torch.testing.assert_close(zs[0][0], torch.tensor(self.EXPECTED_VQVAE_ENCODE))
-
-        with torch.no_grad():
-            x = model.vqvae.decode(zs, start_level=2, bs_chunks=x.shape[0])
-        torch.testing.assert_close(x[0, :40, 0], torch.tensor(self.EXPECTED_VQVAE_DECODE), atol=1e-4, rtol=1e-4)
-
-
-@require_torch
-class Jukebox5bModelTester(unittest.TestCase):
-    all_model_classes = (JukeboxModel,) if is_torch_available() else ()
-    model_id = "openai/jukebox-5b-lyrics"
-    metas = {
-        "artist": "Zac Brown Band",
-        "genres": "Country",
-        "lyrics": """I met a traveller from an antique land,
-    Who said "Two vast and trunkless legs of stone
-    Stand in the desert. . . . Near them, on the sand,
-    Half sunk a shattered visage lies, whose frown,
-    And wrinkled lip, and sneer of cold command,
-    Tell that its sculptor well those passions read
-    Which yet survive, stamped on these lifeless things,
-    The hand that mocked them, and the heart that fed;
-    And on the pedestal, these words appear:
-    My name is Ozymandias, King of Kings;
-    Look on my Works, ye Mighty, and despair!
-    Nothing beside remains. Round the decay
-    Of that colossal Wreck, boundless and bare
-    The lone and level sands stretch far away
-    """,
-    }
-
-    # fmt: off
-    EXPECTED_OUTPUT_2 = [
-        1489, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        1489, 1489, 1489, 1489, 1150, 1853, 1509, 1150, 1357, 1509, 6, 1272
-    ]
-    EXPECTED_OUTPUT_2_PT_2 = [
-        1489, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653
-    ]
-
-    EXPECTED_OUTPUT_1 = [
-        1125, 416, 1125, 1125, 1125, 1125, 1125, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416
-    ]
-    EXPECTED_OUTPUT_1_PT_2 = [
-        416, 416, 1125, 1125, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416
-    ]
-
-    EXPECTED_OUTPUT_0 = [
-        1755, 1061, 234, 1755, 1061, 1755, 185, 290, 307, 307, 616, 616,
-        616, 616, 616, 616, 307, 290, 417, 1755, 234, 1755, 185, 290,
-        290, 290, 307, 616, 616, 616, 616, 616, 290, 234, 234, 1755,
-        234, 234, 1755, 234, 185, 185, 307, 616, 616, 616, 616, 290,
-        1755, 1755, 1755, 234, 234, 1755, 1572, 290, 307, 616, 34, 616
-    ]
-    EXPECTED_OUTPUT_0_PT_2 = [
-        854, 842, 1353, 114, 1353, 842, 185, 842, 185, 114, 591, 842, 185,
-        417, 185, 842, 307, 842, 591, 842, 185, 842, 185, 842, 591, 842,
-        1353, 842, 185, 842, 591, 842, 591, 114, 591, 842, 185, 842, 591,
-        89, 591, 842, 591, 842, 591, 417, 1372, 842, 1372, 842, 34, 842,
-        185, 89, 591, 842, 185, 842, 591, 632
-    ]
-
-    EXPECTED_GPU_OUTPUTS_2 = [
-        1489, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653
-    ]
-    EXPECTED_GPU_OUTPUTS_2_PT_2 = [
-        1489, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 1853, 1177, 1536, 1228,
-        710, 475, 1489, 1229, 1224, 231, 1224, 252, 1434, 653, 475,
-        1106, 1877, 1599, 1228, 1600, 1683, 1182, 1853, 475, 1864,
-        252, 1229, 1434, 2001
-    ]
-
-    EXPECTED_GPU_OUTPUTS_1 = [
-        1125, 1125, 416, 1125, 1125, 416, 1125, 1125, 416, 416, 1125, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416
-    ]
-    EXPECTED_GPU_OUTPUTS_0 = [
-        491, 1755, 34, 1613, 1755, 417, 992, 1613, 222, 842, 1353, 1613,
-        844, 632, 185, 1613, 844, 632, 185, 1613, 185, 842, 677, 1613,
-        185, 114, 1353, 1613, 307, 89, 844, 1613, 307, 1332, 234, 1979,
-        307, 89, 1353, 616, 34, 842, 185, 842, 34, 842, 185, 842,
-        307, 114, 185, 89, 34, 1268, 185, 89, 34, 842, 185, 89
-    ]
-    # fmt: on
-
-    def prepare_inputs(self, model_id):
-        tokenizer = JukeboxTokenizer.from_pretrained(model_id)
-        tokens = tokenizer(**self.metas)["input_ids"]
-        return tokens
-
-    @slow
-    def test_sampling(self):
-        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
-        labels = self.prepare_inputs(self.model_id)
-
-        set_seed(0)
-        zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
-        zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[0].raw_to_tokens, save_results=False)
-        self.assertIn(zs[0][0].detach().cpu().tolist(), [self.EXPECTED_OUTPUT_2, self.EXPECTED_OUTPUT_2_PT_2])
-
-        set_seed(0)
-        zs = model._sample(zs, labels, [1], sample_length=60 * model.priors[1].raw_to_tokens, save_results=False)
-        self.assertIn(zs[1][0].detach().cpu().tolist(), [self.EXPECTED_OUTPUT_1, self.EXPECTED_OUTPUT_1_PT_2])
-
-        set_seed(0)
-        zs = model._sample(zs, labels, [2], sample_length=60 * model.priors[2].raw_to_tokens, save_results=False)
-        self.assertIn(zs[2][0].detach().cpu().tolist(), [self.EXPECTED_OUTPUT_0, self.EXPECTED_OUTPUT_0_PT_2])
-
-    @slow
-    @require_torch_accelerator
-    @skip("Not enough GPU memory on CI runners")
-    def test_slow_sampling(self):
-        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
-        labels = [i.to(torch_device) for i in self.prepare_inputs(self.model_id)]
-
-        set_seed(0)
-        model.priors[0].to(torch_device)
-        zs = [torch.zeros(1, 0, dtype=torch.long).to(torch_device) for _ in range(3)]
-        zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[0].raw_to_tokens, save_results=False)
-        torch.testing.assert_close(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
-        model.priors[0].cpu()
-
-        set_seed(0)
-        model.priors[1].to(torch_device)
-        zs = model._sample(zs, labels, [1], sample_length=60 * model.priors[1].raw_to_tokens, save_results=False)
-        torch.testing.assert_close(zs[1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_1))
-        model.priors[1].cpu()
-
-        set_seed(0)
-        model.priors[2].to(torch_device)
-        zs = model._sample(zs, labels, [2], sample_length=60 * model.priors[2].raw_to_tokens, save_results=False)
-        torch.testing.assert_close(zs[2][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
-
-    @slow
-    @require_torch_accelerator
-    @require_torch_fp16
-    def test_fp16_slow_sampling(self):
-        prior_id = "ArthurZ/jukebox_prior_0"
-        model = JukeboxPrior.from_pretrained(prior_id, min_duration=0).eval().half().to(torch_device)
-
-        labels = self.prepare_inputs(prior_id)[0].to(torch_device)
-        metadata = model.get_metadata(labels, 0, 7680, 0)
-        set_seed(0)
-        outputs = model.sample(1, metadata=metadata, sample_tokens=60)
-        self.assertIn(outputs[0].cpu().tolist(), [self.EXPECTED_GPU_OUTPUTS_2, self.EXPECTED_GPU_OUTPUTS_2_PT_2])
--- a/tests/models/jukebox/test_tokenization_jukebox.py
+++ b/tests/models/jukebox/test_tokenization_jukebox.py
@@ -1,209 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-from transformers import JukeboxTokenizer
-from transformers.testing_utils import require_torch
-
-
-class JukeboxTokenizationTest(unittest.TestCase):
-    tokenizer_class = JukeboxTokenizer
-    metas = {
-        "artist": "Zac Brown Band",
-        "genres": "Country",
-        "lyrics": """I met a traveller from an antique land,
-        Who said "Two vast and trunkless legs of stone
-        Stand in the desert. . . . Near them, on the sand,
-        Half sunk a shattered visage lies, whose frown,
-        And wrinkled lip, and sneer of cold command,
-        Tell that its sculptor well those passions read
-        Which yet survive, stamped on these lifeless things,
-        The hand that mocked them, and the heart that fed;
-        And on the pedestal, these words appear:
-        My name is Ozymandias, King of Kings;
-        Look on my Works, ye Mighty, and despair!
-        Nothing beside remains. Round the decay
-        Of that colossal Wreck, boundless and bare
-        The lone and level sands stretch far away
-        """,
-    }
-
-    @require_torch
-    def test_1b_lyrics_tokenizer(self):
-        """
-        how to run the same test with openAI
-        ...
-        """
-        import torch
-
-        tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
-        tokens = tokenizer(**self.metas)["input_ids"]
-        # fmt: off
-        EXPECTED_OUTPUT = [
-            torch.tensor([[
-                0, 0, 0, 7169, 507, 9, 76, 39, 31, 46, 76, 27,
-                76, 46, 44, 27, 48, 31, 38, 38, 31, 44, 76, 32,
-                44, 41, 39, 76, 27, 40, 76, 27, 40, 46, 35, 43,
-                47, 31, 76, 38, 27, 40, 30, 64, 78, 76, 76, 76,
-                76, 76, 76, 76, 76, 23, 34, 41, 76, 45, 27, 35,
-                30, 76, 71, 20, 49, 41, 76, 48, 27, 45, 46, 76,
-                27, 40, 30, 76, 46, 44, 47, 40, 37, 38, 31, 45,
-                45, 76, 38, 31, 33, 45, 76, 41, 32, 76, 45, 46,
-                41, 40, 31, 78, 76, 76, 76, 76, 76, 76, 76, 76,
-                19, 46, 27, 40, 30, 76, 35, 40, 76, 46, 34, 31,
-                76, 30, 31, 45, 31, 44, 46, 63, 76, 63, 76, 63,
-                76, 63, 76, 14, 31, 27, 44, 76, 46, 34, 31, 39,
-                64, 76, 41, 40, 76, 46, 34, 31, 76, 45, 27, 40,
-                30, 64, 78, 76, 76, 76, 76, 76, 76, 76, 76, 8,
-                27, 38, 32, 76, 45, 47, 40, 37, 76, 27, 76, 45,
-                34, 27, 46, 46, 31, 44, 31, 30, 76, 48, 35, 45,
-                27, 33, 31, 76, 38, 35, 31, 45, 64, 76, 49, 34,
-                41, 45, 31, 76, 32, 44, 41, 49, 40, 64, 78, 76,
-                76, 76, 76, 76, 76, 76, 76, 1, 40, 30, 76, 49,
-                44, 35, 40, 37, 38, 31, 30, 76, 38, 35, 42, 64,
-                76, 27, 40, 30, 76, 45, 40, 31, 31, 44, 76, 41,
-                32, 76, 29, 41, 38, 30, 76, 29, 41, 39, 39, 27,
-                40, 30, 64, 78, 76, 76, 76, 76, 76, 76, 76, 76,
-                20, 31, 38, 38, 76, 46, 34, 27, 46, 76, 35, 46,
-                45, 76, 45, 29, 47, 38, 42, 46, 41, 44, 76, 49,
-                31, 38, 38, 76, 46, 34, 41, 45, 31, 76, 42, 27,
-                45, 45, 35, 41, 40, 45, 76, 44, 31, 27, 30, 78,
-                76, 76, 76, 76, 76, 76, 76, 76, 23, 34, 35, 29,
-                34, 76, 51, 31, 46, 76, 45, 47, 44, 48, 35, 48,
-                31, 64, 76, 45, 46, 27, 39, 42, 31, 30, 76, 41,
-                40, 76, 46, 34, 31, 45, 31, 76, 38, 35, 32, 31,
-                38, 31, 45, 45, 76, 46, 34, 35, 40, 33, 45, 64,
-                78, 76, 76, 76, 76, 76, 76, 76, 76, 20, 34, 31,
-                76, 34, 27, 40, 30, 76, 46, 34, 27, 46, 76, 39,
-                41, 29, 37, 31, 30, 76, 46, 34, 31, 39, 64, 76,
-                27, 40, 30, 76, 46, 34, 31, 76, 34, 31, 27, 44,
-                46, 76, 46, 34, 27, 46, 76, 32, 31, 30, 66, 78,
-                76, 76, 76, 76, 76, 76, 76, 76, 1, 40, 30, 76,
-                41, 40, 76, 46, 34, 31, 76, 42, 31, 30, 31, 45,
-                46, 27, 38, 64, 76, 46, 34, 31, 45, 31, 76, 49,
-                41, 44, 30, 45, 76, 27, 42, 42, 31, 27, 44, 65,
-                78, 76, 76, 76, 76, 76, 76, 76, 76, 13, 51, 76,
-                40, 27, 39, 31, 76, 35, 45, 76, 15, 52, 51, 39,
-                27, 40, 30, 35, 27, 45, 64, 76, 11, 35, 40, 33,
-                76, 41, 32, 76, 11, 35, 40, 33, 45, 66, 78, 76,
-                76, 76, 76, 76, 76, 76, 76, 12, 41, 41, 37, 76,
-                41, 40, 76, 39, 51, 76, 23, 41, 44, 37, 45, 64,
-                76, 51, 31, 76, 13, 35, 33, 34, 46, 51, 64, 76,
-                27, 40, 30, 76, 30, 31, 45, 42, 27, 35, 44, 67,
-                78, 76, 76, 76, 76, 76, 76, 76, 76, 14, 41, 46,
-                34, 35, 40, 33, 76, 28, 31, 45, 35, 30, 31, 76,
-                44, 31, 39, 27, 35, 40, 45, 63, 76, 18, 41, 47,
-                40, 30, 76, 46, 34, 31, 76, 30, 31, 29, 27, 51,
-                78, 76, 76, 76, 76, 76, 76, 76, 76, 15, 32, 76,
-                46, 34, 27, 46, 76, 29, 41, 38, 41, 45, 45, 27,
-                38, 76, 23, 44, 31, 29, 37, 64, 76, 28, 41, 47,
-                40, 30, 38, 31, 45, 45, 76, 27, 40, 30, 76, 28,
-                27, 44, 31, 78, 76, 76, 76, 76, 76, 76, 76, 76,
-                20, 34, 31, 76, 38, 41, 40, 31, 76, 27, 40, 30,
-                76, 38, 31, 48, 31, 38, 76, 45, 27, 40, 30, 45,
-                76, 45, 46, 44, 31, 46, 29, 34, 76, 32, 27, 44,
-                76, 27, 49, 27, 51, 78, 76, 76, 76, 76, 76, 76,
-                76, 76]]),
-            torch.tensor([[0, 0, 0, 1069, 11]]),
-            torch.tensor([[0, 0, 0, 1069, 11]]),
-        ]
-        # fmt: on
-        self.assertTrue(torch.allclose(tokens[0], EXPECTED_OUTPUT[0]))
-        self.assertTrue(torch.allclose(tokens[1], EXPECTED_OUTPUT[1]))
-        self.assertTrue(torch.allclose(tokens[2], EXPECTED_OUTPUT[2]))
-
-    @require_torch
-    def test_5b_lyrics_tokenizer(self):
-        """
-        The outputs are similar that open AI but do not have the same format as this one is adapted to the HF integration.
-        """
-        import torch
-
-        tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-5b-lyrics")
-        tokens = tokenizer(**self.metas)["input_ids"]
-        # fmt: off
-        EXPECTED_OUTPUT = [
-            torch.tensor([[
-                0, 0, 0, 1069, 11, -1, -1, -1, -1, 9, 77, 39,
-                31, 46, 77, 27, 77, 46, 44, 27, 48, 31, 38, 38,
-                31, 44, 77, 32, 44, 41, 39, 77, 27, 40, 77, 27,
-                40, 46, 35, 43, 47, 31, 77, 38, 27, 40, 30, 64,
-                79, 77, 77, 77, 77, 77, 77, 77, 77, 23, 34, 41,
-                77, 45, 27, 35, 30, 77, 72, 20, 49, 41, 77, 48,
-                27, 45, 46, 77, 27, 40, 30, 77, 46, 44, 47, 40,
-                37, 38, 31, 45, 45, 77, 38, 31, 33, 45, 77, 41,
-                32, 77, 45, 46, 41, 40, 31, 79, 77, 77, 77, 77,
-                77, 77, 77, 77, 19, 46, 27, 40, 30, 77, 35, 40,
-                77, 46, 34, 31, 77, 30, 31, 45, 31, 44, 46, 63,
-                77, 63, 77, 63, 77, 63, 77, 14, 31, 27, 44, 77,
-                46, 34, 31, 39, 64, 77, 41, 40, 77, 46, 34, 31,
-                77, 45, 27, 40, 30, 64, 79, 77, 77, 77, 77, 77,
-                77, 77, 77, 8, 27, 38, 32, 77, 45, 47, 40, 37,
-                77, 27, 77, 45, 34, 27, 46, 46, 31, 44, 31, 30,
-                77, 48, 35, 45, 27, 33, 31, 77, 38, 35, 31, 45,
-                64, 77, 49, 34, 41, 45, 31, 77, 32, 44, 41, 49,
-                40, 64, 79, 77, 77, 77, 77, 77, 77, 77, 77, 1,
-                40, 30, 77, 49, 44, 35, 40, 37, 38, 31, 30, 77,
-                38, 35, 42, 64, 77, 27, 40, 30, 77, 45, 40, 31,
-                31, 44, 77, 41, 32, 77, 29, 41, 38, 30, 77, 29,
-                41, 39, 39, 27, 40, 30, 64, 79, 77, 77, 77, 77,
-                77, 77, 77, 77, 20, 31, 38, 38, 77, 46, 34, 27,
-                46, 77, 35, 46, 45, 77, 45, 29, 47, 38, 42, 46,
-                41, 44, 77, 49, 31, 38, 38, 77, 46, 34, 41, 45,
-                31, 77, 42, 27, 45, 45, 35, 41, 40, 45, 77, 44,
-                31, 27, 30, 79, 77, 77, 77, 77, 77, 77, 77, 77,
-                23, 34, 35, 29, 34, 77, 51, 31, 46, 77, 45, 47,
-                44, 48, 35, 48, 31, 64, 77, 45, 46, 27, 39, 42,
-                31, 30, 77, 41, 40, 77, 46, 34, 31, 45, 31, 77,
-                38, 35, 32, 31, 38, 31, 45, 45, 77, 46, 34, 35,
-                40, 33, 45, 64, 79, 77, 77, 77, 77, 77, 77, 77,
-                77, 20, 34, 31, 77, 34, 27, 40, 30, 77, 46, 34,
-                27, 46, 77, 39, 41, 29, 37, 31, 30, 77, 46, 34,
-                31, 39, 64, 77, 27, 40, 30, 77, 46, 34, 31, 77,
-                34, 31, 27, 44, 46, 77, 46, 34, 27, 46, 77, 32,
-                31, 30, 66, 79, 77, 77, 77, 77, 77, 77, 77, 77,
-                1, 40, 30, 77, 41, 40, 77, 46, 34, 31, 77, 42,
-                31, 30, 31, 45, 46, 27, 38, 64, 77, 46, 34, 31,
-                45, 31, 77, 49, 41, 44, 30, 45, 77, 27, 42, 42,
-                31, 27, 44, 65, 79, 77, 77, 77, 77, 77, 77, 77,
-                77, 13, 51, 77, 40, 27, 39, 31, 77, 35, 45, 77,
-                15, 52, 51, 39, 27, 40, 30, 35, 27, 45, 64, 77,
-                11, 35, 40, 33, 77, 41, 32, 77, 11, 35, 40, 33,
-                45, 66, 79, 77, 77, 77, 77, 77, 77, 77, 77, 12,
-                41, 41, 37, 77, 41, 40, 77, 39, 51, 77, 23, 41,
-                44, 37, 45, 64, 77, 51, 31, 77, 13, 35, 33, 34,
-                46, 51, 64, 77, 27, 40, 30, 77, 30, 31, 45, 42,
-                27, 35, 44, 67, 79, 77, 77, 77, 77, 77, 77, 77,
-                77, 14, 41, 46, 34, 35, 40, 33, 77, 28, 31, 45,
-                35, 30, 31, 77, 44, 31, 39, 27, 35, 40, 45, 63,
-                77, 18, 41, 47, 40, 30, 77, 46, 34, 31, 77, 30,
-                31, 29, 27, 51, 79, 77, 77, 77, 77, 77, 77, 77,
-                77, 15, 32, 77, 46, 34, 27, 46, 77, 29, 41, 38,
-                41, 45, 45, 27, 38, 77, 23, 44, 31, 29, 37, 64,
-                77, 28, 41, 47, 40, 30, 38, 31, 45, 45, 77, 27,
-                40, 30, 77, 28, 27, 44, 31, 79, 77, 77, 77, 77,
-                77, 77, 77, 77, 20, 34, 31, 77, 38, 41, 40, 31,
-                77, 27, 40, 30, 77, 38, 31, 48, 31, 38, 77, 45,
-                27, 40, 30, 45, 77, 45, 46, 44, 31, 46, 29, 34,
-                77, 32, 27, 44, 77, 27, 49, 27, 51, 79, 77, 77,
-                77, 77, 77, 77, 77, 77]]),
-            torch.tensor([[0, 0, 0, 1069, 11, -1, -1, -1, -1]]),
-            torch.tensor([[0, 0, 0, 1069, 11, -1, -1, -1, -1]]),
-        ]
-        # fmt: on
-        self.assertTrue(torch.allclose(tokens[0], EXPECTED_OUTPUT[0]))
-        self.assertTrue(torch.allclose(tokens[1], EXPECTED_OUTPUT[1]))
-        self.assertTrue(torch.allclose(tokens[2], EXPECTED_OUTPUT[2]))
--- a/tests/models/mega/init.py
+++ b/tests/models/mega/init.py
--- a/tests/models/mega/test_modeling_mega.py
+++ b/tests/models/mega/test_modeling_mega.py
@@ -1,744 +0,0 @@
-# coding=utf-8
-# Copyright 2023 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import MegaConfig, is_torch_available
-from transformers.testing_utils import (
-    TestCasePlus,
-    is_flaky,
-    require_torch,
-    require_torch_fp16,
-    slow,
-    torch_device,
-)
-
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
-    import torch
-
-    from transformers import (
-        MegaForCausalLM,
-        MegaForMaskedLM,
-        MegaForMultipleChoice,
-        MegaForQuestionAnswering,
-        MegaForSequenceClassification,
-        MegaForTokenClassification,
-        MegaModel,
-    )
-
-
-class MegaModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        intermediate_size=37,
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_positions=1024,
-        bidirectional=False,  # needed for decoding, and can't modify common generation tests; test separately by overriding
-        ema_projection_size=16,
-        shared_representation_size=64,
-        use_chunking=False,
-        chunk_size=32,
-        attention_activation="softmax",
-        use_normalized_ffn=True,
-        nffn_hidden_size=24,
-        add_token_type_embeddings=True,
-        type_vocab_size=2,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_labels=3,
-        num_choices=4,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.add_token_type_embeddings = add_token_type_embeddings
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.intermediate_size = intermediate_size
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_positions = max_positions
-        self.bidirectional = bidirectional
-        self.ema_projection_size = ema_projection_size
-        self.shared_representation_size = shared_representation_size
-        self.use_chunking = use_chunking
-        self.chunk_size = chunk_size
-        self.attention_activation = attention_activation
-        self.use_normalized_ffn = use_normalized_ffn
-        self.nffn_hidden_size = nffn_hidden_size
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.num_labels = num_labels
-        self.num_choices = num_choices
-        self.scope = scope
-        self.num_attention_heads = 1
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.add_token_type_embeddings:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = self.get_config()
-
-        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def get_config(self):
-        return MegaConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            intermediate_size=self.intermediate_size,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            type_vocab_size=self.type_vocab_size,
-            initializer_range=self.initializer_range,
-            # added args
-            add_token_type_embeddings=self.add_token_type_embeddings,
-            max_positions=self.max_positions,
-            bidirectional=self.bidirectional,
-            ema_projection_size=self.ema_projection_size,
-            shared_representation_size=self.shared_representation_size,
-            use_chunking=self.use_chunking,
-            chunk_size=self.chunk_size,
-            attention_activation=self.attention_activation,
-            use_normalized_ffn=self.use_normalized_ffn,
-            nffn_hidden_size=self.nffn_hidden_size,
-        )
-
-    def get_pipeline_config(self):
-        config = self.get_config()
-        config.vocab_size = 300
-        return config
-
-    def prepare_config_and_inputs_for_decoder(self):
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = self.prepare_config_and_inputs()
-
-        config.is_decoder = True
-        config.bidirectional = False
-        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
-        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-        return (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-    def create_and_check_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = MegaModel(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-        result = model(input_ids, token_type_ids=token_type_ids)
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
-    def create_and_check_model_as_decoder(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-        model = MegaModel(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-        )
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            encoder_hidden_states=encoder_hidden_states,
-        )
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
-    def create_and_check_for_causal_lm(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        model = MegaForCausalLM(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_decoder_model_past_large_inputs(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.is_decoder = True
-        config.bidirectional = False
-        config.add_cross_attention = True
-        model = MegaForCausalLM(config=config).to(torch_device).eval()
-
-        # make sure that ids don't start with pad token
-        mask = input_ids.ne(config.pad_token_id).long()
-        input_ids = input_ids * mask
-
-        # first forward pass
-        outputs = model(
-            input_ids,
-            attention_mask=input_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            use_cache=True,
-        )
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical multiple next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-        # make sure that ids don't start with pad token
-        mask = next_tokens.ne(config.pad_token_id).long()
-        next_tokens = next_tokens * mask
-        next_mask = ids_tensor((self.batch_size, 1), vocab_size=2)
-
-        # append to next input_ids and
-        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
-        next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=next_attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            output_hidden_states=True,
-        )["hidden_states"][0]
-        output_from_past = model(
-            next_tokens,
-            attention_mask=next_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            past_key_values=past_key_values,
-            output_hidden_states=True,
-        )["hidden_states"][0]
-
-        # select random slice
-        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
-        output_from_no_past_slice = output_from_no_past[:, -1:, random_slice_idx].detach()
-        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
-
-        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
-
-        # test that outputs are equal for slice
-        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
-
-    def create_and_check_decoder_model_with_chunking(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.use_chunking = True
-        config.output_attentions = True
-        config.attention_activation = "laplace"
-        config.chunk_size = input_ids.size(1) * 2
-
-        model = MegaForCausalLM(config).to(torch_device).eval()
-
-        input_ids = input_ids.repeat(1, 8)
-        # multiply the sequence length by 8 since we repeat the same ids 8 times in input_ids
-        input_mask = random_attention_mask([self.batch_size, self.seq_length * 8])
-
-        result = model(input_ids, attention_mask=input_mask)
-
-        # test if the sequence length of attentions is same provided chunk_size
-        self.parent.assertEqual(result["attentions"][0].shape[-1], config.chunk_size)
-
-    def create_and_check_for_masked_lm(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = MegaForMaskedLM(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_for_token_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = MegaForTokenClassification(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def create_and_check_for_multiple_choice(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_choices = self.num_choices
-        model = MegaForMultipleChoice(config=config)
-        model.to(torch_device)
-        model.eval()
-        multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        result = model(
-            multiple_choice_inputs_ids,
-            attention_mask=multiple_choice_input_mask,
-            token_type_ids=multiple_choice_token_type_ids,
-            labels=choice_labels,
-        )
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-    def create_and_check_for_question_answering(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = MegaForQuestionAnswering(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            start_positions=sequence_labels,
-            end_positions=sequence_labels,
-        )
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    # extra checks for Mega-specific model functionality
-    def create_and_check_bidirectionality(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.bidirectional = True
-        model = MegaModel(config)
-        model.to(torch_device)
-        model.eval()
-        # no mask
-        result = model(input_ids)
-        # with mask & token types
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
-        self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def check_chunking_shorter_sequence(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.use_chunking = True
-        config.chunk_size = input_ids.size(1) + 25
-        model = MegaModel(config)
-        model.to(torch_device)
-        model.eval()
-
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
-        self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def check_chunking_longer_sequence(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.use_chunking = True
-
-        # we want the chunk size to be < sequence length, and the sequence length to be a multiple of chunk size
-        config.chunk_size = input_ids.size(1) * 2
-        model = MegaModel(config)
-        model.to(torch_device)
-        model.eval()
-
-        result = model(
-            input_ids.repeat(1, 8),
-        )
-
-        self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length * 8, self.hidden_size))
-
-    def check_laplace_self_attention(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.attention_activation = "laplace"
-        model = MegaModel(config)
-        model.to(torch_device)
-        model.eval()
-
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
-        self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def check_relu2_self_attention(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.attention_activation = "relu2"
-        model = MegaModel(config)
-        model.to(torch_device)
-        model.eval()
-
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
-        self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def check_sequence_length_beyond_max_positions(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.max_positions = self.seq_length - 2
-        model = MegaModel(config)
-        model.to(torch_device)
-        model.eval()
-
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
-        self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_torch
-class MegaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            MegaForCausalLM,
-            MegaForMaskedLM,
-            MegaModel,
-            MegaForSequenceClassification,
-            MegaForTokenClassification,
-            MegaForMultipleChoice,
-            MegaForQuestionAnswering,
-        )
-        if is_torch_available()
-        else ()
-    )
-    all_generative_model_classes = (MegaForCausalLM,) if is_torch_available() else ()
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": MegaModel,
-            "fill-mask": MegaForMaskedLM,
-            "question-answering": MegaForQuestionAnswering,
-            "text-classification": MegaForSequenceClassification,
-            "text-generation": MegaForCausalLM,
-            "token-classification": MegaForTokenClassification,
-            "zero-shot": MegaForSequenceClassification,
-        }
-        if is_torch_available()
-        else {}
-    )
-
-    fx_compatible = False
-    test_head_masking = False
-    test_pruning = False
-
-    def setUp(self):
-        self.model_tester = MegaModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=MegaConfig, hidden_size=37)
-
-    # TODO: @ydshieh
-    @is_flaky(description="Sometimes gives `AssertionError` on expected outputs")
-    def test_pipeline_fill_mask(self):
-        super().test_pipeline_fill_mask()
-
-    # TODO: @ydshieh
-    @is_flaky(
-        description="Sometimes gives `RuntimeError: probability tensor contains either `inf`, `nan` or element < 0`"
-    )
-    def test_pipeline_text_generation(self):
-        super().test_pipeline_text_generation()
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_model_as_decoder(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
-
-    def test_model_as_decoder_with_default_input_mask(self):
-        # This regression test was failing with PyTorch < 1.3
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        ) = self.model_tester.prepare_config_and_inputs_for_decoder()
-
-        input_mask = None
-
-        self.model_tester.create_and_check_model_as_decoder(
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-    def test_for_causal_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_for_causal_lm(*config_and_inputs)
-
-    def test_decoder_model_past_with_large_inputs(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
-
-    def test_decoder_model_with_chunking(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_decoder_model_with_chunking(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-    def test_for_bidirectionality(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_bidirectionality(*config_and_inputs)
-
-    def test_for_chunking_shorter_sequence(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_chunking_shorter_sequence(*config_and_inputs)
-
-    def test_for_chunking_longer_sequence(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_chunking_longer_sequence(*config_and_inputs)
-
-    def test_for_laplace_attention(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_laplace_self_attention(*config_and_inputs)
-
-    def test_for_relu2_attention(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_relu2_self_attention(*config_and_inputs)
-
-    def test_for_sequence_length_beyond_max_positions(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_sequence_length_beyond_max_positions(*config_and_inputs)
-
-    @require_torch_fp16
-    def test_generate_fp16(self):
-        config, input_ids, _, attention_mask, *_ = self.model_tester.prepare_config_and_inputs_for_decoder()
-        # attention_mask = torch.LongTensor(input_ids.ne(1)).to(torch_device)
-        model = MegaForCausalLM(config).eval().to(torch_device)
-        model.half()
-        model.generate(input_ids, attention_mask=attention_mask)
-        model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
-
-    def test_sequence_classification_model(self):
-        config, input_ids, _, attention_mask, *_ = self.model_tester.prepare_config_and_inputs()
-        config.num_labels = self.model_tester.num_labels
-        sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size)
-        model = MegaForSequenceClassification(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
-        self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
-
-    def test_sequence_classification_model_for_multi_label(self):
-        config, input_ids, _, attention_mask, *_ = self.model_tester.prepare_config_and_inputs()
-        config.num_labels = self.model_tester.num_labels
-        config.problem_type = "multi_label_classification"
-        sequence_labels = ids_tensor(
-            [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size
-        ).to(torch.float)
-        model = MegaForSequenceClassification(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
-        self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "mnaylor/mega-base-wikitext"
-        model = MegaModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.")
-    def test_cpu_offload(self):
-        super().test_cpu_offload()
-
-    @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.")
-    def test_disk_offload(self):
-        super().test_disk_offload()
-
-    @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.")
-    def test_model_parallelism(self):
-        super().test_model_parallelism()
-
-    @unittest.skip(
-        reason=(
-            "Calling `self.attention_function` in `MegaMovingAverageGatedAttention.forward` changes the submodules on "
-            "device 1 to device 0 (also changes `requires_grad`). No idea how this could happen for now."
-        )
-    )
-    def test_multi_gpu_data_parallel_forward(self):
-        super().test_multi_gpu_data_parallel_forward()
-
-    @unittest.skip(reason="Tracing of the dynamically computed `MegaMultiDimensionDampedEma._kernel` doesn't work.")
-    def test_torchscript_simple(self):
-        super().test_torchscript_simple()
-
-    @unittest.skip(reason="Tracing of the dynamically computed `MegaMultiDimensionDampedEma._kernel` doesn't work.")
-    def test_torchscript_output_hidden_state(self):
-        super().test_torchscript_output_hidden_state()
-
-    @unittest.skip(reason="Tracing of the dynamically computed `MegaMultiDimensionDampedEma._kernel` doesn't work.")
-    def test_torchscript_output_attentions(self):
-        super().test_torchscript_output_attentions()
-
-
-@require_torch
-class MegaModelIntegrationTest(TestCasePlus):
-    @slow
-    def test_inference_masked_lm(self):
-        model = MegaForMaskedLM.from_pretrained("mnaylor/mega-base-wikitext")
-
-        input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        with torch.no_grad():
-            output = model(input_ids)[0]
-        expected_shape = torch.Size((1, 11, 50265))
-        self.assertEqual(output.shape, expected_shape)
-        # compare the actual values for a slice.
-        expected_slice = torch.tensor(
-            [[[67.8389, 10.1470, -32.7148], [-11.1655, 29.1152, 23.1304], [-3.8015, 66.0397, 29.6733]]]
-        )
-
-        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
-
-    @slow
-    def test_inference_no_head(self):
-        model = MegaModel.from_pretrained("mnaylor/mega-base-wikitext")
-
-        input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        with torch.no_grad():
-            output = model(input_ids)[0]
-        expected_shape = torch.Size((1, 11, 128))
-        self.assertEqual(output.shape, expected_shape)
-        # compare the actual values for a slice. taken from output[:, :3, :3]
-        expected_slice = torch.tensor(
-            [[[1.1767, -0.6349, 2.8494], [-0.5109, -0.7745, 1.9495], [-0.3287, -0.2111, 3.3367]]]
-        )
-
-        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
--- a/tests/models/nat/init.py
+++ b/tests/models/nat/init.py
--- a/tests/models/nat/test_modeling_nat.py
+++ b/tests/models/nat/test_modeling_nat.py
@@ -1,382 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch Nat model."""
-
-import collections
-import unittest
-
-from transformers import NatConfig
-from transformers.testing_utils import require_natten, require_torch, require_vision, slow, torch_device
-from transformers.utils import cached_property, is_torch_available, is_vision_available
-
-from ...test_backbone_common import BackboneTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
-    import torch
-    from torch import nn
-
-    from transformers import NatBackbone, NatForImageClassification, NatModel
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import AutoImageProcessor
-
-
-class NatModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        image_size=64,
-        patch_size=4,
-        num_channels=3,
-        embed_dim=16,
-        depths=[1, 2, 1],
-        num_heads=[2, 4, 8],
-        kernel_size=3,
-        mlp_ratio=2.0,
-        qkv_bias=True,
-        hidden_dropout_prob=0.0,
-        attention_probs_dropout_prob=0.0,
-        drop_path_rate=0.1,
-        hidden_act="gelu",
-        patch_norm=True,
-        initializer_range=0.02,
-        layer_norm_eps=1e-5,
-        is_training=True,
-        scope=None,
-        use_labels=True,
-        num_labels=10,
-        out_features=["stage1", "stage2"],
-        out_indices=[1, 2],
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.embed_dim = embed_dim
-        self.depths = depths
-        self.num_heads = num_heads
-        self.kernel_size = kernel_size
-        self.mlp_ratio = mlp_ratio
-        self.qkv_bias = qkv_bias
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.drop_path_rate = drop_path_rate
-        self.hidden_act = hidden_act
-        self.patch_norm = patch_norm
-        self.layer_norm_eps = layer_norm_eps
-        self.initializer_range = initializer_range
-        self.is_training = is_training
-        self.scope = scope
-        self.use_labels = use_labels
-        self.num_labels = num_labels
-        self.out_features = out_features
-        self.out_indices = out_indices
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        labels = None
-        if self.use_labels:
-            labels = ids_tensor([self.batch_size], self.num_labels)
-
-        config = self.get_config()
-
-        return config, pixel_values, labels
-
-    def get_config(self):
-        return NatConfig(
-            num_labels=self.num_labels,
-            image_size=self.image_size,
-            patch_size=self.patch_size,
-            num_channels=self.num_channels,
-            embed_dim=self.embed_dim,
-            depths=self.depths,
-            num_heads=self.num_heads,
-            kernel_size=self.kernel_size,
-            mlp_ratio=self.mlp_ratio,
-            qkv_bias=self.qkv_bias,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            drop_path_rate=self.drop_path_rate,
-            hidden_act=self.hidden_act,
-            patch_norm=self.patch_norm,
-            layer_norm_eps=self.layer_norm_eps,
-            initializer_range=self.initializer_range,
-            out_features=self.out_features,
-            out_indices=self.out_indices,
-        )
-
-    def create_and_check_model(self, config, pixel_values, labels):
-        model = NatModel(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(pixel_values)
-
-        expected_height = expected_width = (config.image_size // config.patch_size) // (2 ** (len(config.depths) - 1))
-        expected_dim = int(config.embed_dim * 2 ** (len(config.depths) - 1))
-
-        self.parent.assertEqual(
-            result.last_hidden_state.shape, (self.batch_size, expected_height, expected_width, expected_dim)
-        )
-
-    def create_and_check_for_image_classification(self, config, pixel_values, labels):
-        model = NatForImageClassification(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(pixel_values, labels=labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-        # test greyscale images
-        config.num_channels = 1
-        model = NatForImageClassification(config)
-        model.to(torch_device)
-        model.eval()
-
-        pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
-        result = model(pixel_values)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_backbone(self, config, pixel_values, labels):
-        model = NatBackbone(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(pixel_values)
-
-        # verify hidden states
-        self.parent.assertEqual(len(result.feature_maps), len(config.out_features))
-        self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, model.channels[0], 16, 16])
-
-        # verify channels
-        self.parent.assertEqual(len(model.channels), len(config.out_features))
-
-        # verify backbone works with out_features=None
-        config.out_features = None
-        model = NatBackbone(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(pixel_values)
-
-        # verify feature maps
-        self.parent.assertEqual(len(result.feature_maps), 1)
-        self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, model.channels[-1], 4, 4])
-
-        # verify channels
-        self.parent.assertEqual(len(model.channels), 1)
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values, labels = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_natten
-@require_torch
-class NatModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            NatModel,
-            NatForImageClassification,
-            NatBackbone,
-        )
-        if is_torch_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {"image-feature-extraction": NatModel, "image-classification": NatForImageClassification}
-        if is_torch_available()
-        else {}
-    )
-    fx_compatible = False
-
-    test_torchscript = False
-    test_pruning = False
-    test_resize_embeddings = False
-    test_head_masking = False
-
-    def setUp(self):
-        self.model_tester = NatModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=NatConfig, embed_dim=37)
-
-    def test_config(self):
-        self.create_and_test_config_common_properties()
-        self.config_tester.create_and_test_config_to_json_string()
-        self.config_tester.create_and_test_config_to_json_file()
-        self.config_tester.create_and_test_config_from_and_save_pretrained()
-        self.config_tester.create_and_test_config_with_num_labels()
-        self.config_tester.check_config_can_be_init_without_params()
-        self.config_tester.check_config_arguments_init()
-
-    def create_and_test_config_common_properties(self):
-        return
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
-    def test_backbone(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_backbone(*config_and_inputs)
-
-    @unittest.skip(reason="Nat does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="Nat does not use feedforward chunking")
-    def test_feed_forward_chunking(self):
-        pass
-
-    def test_model_common_attributes(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
-            x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, nn.Linear))
-
-    def test_attention_outputs(self):
-        self.skipTest("Nat's attention operation is handled entirely by NATTEN.")
-
-    def check_hidden_states_output(self, inputs_dict, config, model_class, image_size):
-        model = model_class(config)
-        model.to(torch_device)
-        model.eval()
-
-        with torch.no_grad():
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-        hidden_states = outputs.hidden_states
-
-        expected_num_layers = getattr(
-            self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1
-        )
-        self.assertEqual(len(hidden_states), expected_num_layers)
-
-        # Nat has a different seq_length
-        patch_size = (
-            config.patch_size
-            if isinstance(config.patch_size, collections.abc.Iterable)
-            else (config.patch_size, config.patch_size)
-        )
-
-        height = image_size[0] // patch_size[0]
-        width = image_size[1] // patch_size[1]
-
-        self.assertListEqual(
-            list(hidden_states[0].shape[-3:]),
-            [height, width, self.model_tester.embed_dim],
-        )
-
-        if model_class.__name__ != "NatBackbone":
-            reshaped_hidden_states = outputs.reshaped_hidden_states
-            self.assertEqual(len(reshaped_hidden_states), expected_num_layers)
-
-            batch_size, num_channels, height, width = reshaped_hidden_states[0].shape
-            reshaped_hidden_states = (
-                reshaped_hidden_states[0].view(batch_size, num_channels, height, width).permute(0, 2, 3, 1)
-            )
-            self.assertListEqual(
-                list(reshaped_hidden_states.shape[-3:]),
-                [height, width, self.model_tester.embed_dim],
-            )
-
-    def test_hidden_states_output(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        image_size = (
-            self.model_tester.image_size
-            if isinstance(self.model_tester.image_size, collections.abc.Iterable)
-            else (self.model_tester.image_size, self.model_tester.image_size)
-        )
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            self.check_hidden_states_output(inputs_dict, config, model_class, image_size)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            self.check_hidden_states_output(inputs_dict, config, model_class, image_size)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "shi-labs/nat-mini-in1k-224"
-        model = NatModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if "embeddings" not in name and param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
-
-@require_natten
-@require_vision
-@require_torch
-class NatModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224") if is_vision_available() else None
-
-    @slow
-    def test_inference_image_classification_head(self):
-        model = NatForImageClassification.from_pretrained("shi-labs/nat-mini-in1k-224").to(torch_device)
-        image_processor = self.default_image_processor
-
-        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
-
-        # forward pass
-        with torch.no_grad():
-            outputs = model(**inputs)
-
-        # verify the logits
-        expected_shape = torch.Size((1, 1000))
-        self.assertEqual(outputs.logits.shape, expected_shape)
-        expected_slice = torch.tensor([0.3805, -0.8676, -0.3912]).to(torch_device)
-        self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
-
-
-@require_torch
-@require_natten
-class NatBackboneTest(unittest.TestCase, BackboneTesterMixin):
-    all_model_classes = (NatBackbone,) if is_torch_available() else ()
-    config_class = NatConfig
-
-    def setUp(self):
-        self.model_tester = NatModelTester(self)
--- a/tests/models/nezha/init.py
+++ b/tests/models/nezha/init.py
--- a/tests/models/nezha/test_modeling_nezha.py
+++ b/tests/models/nezha/test_modeling_nezha.py
@@ -1,489 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import tempfile
-import unittest
-
-from transformers import NezhaConfig, is_torch_available
-from transformers.models.auto import get_values
-from transformers.testing_utils import require_torch, require_torch_gpu, slow, torch_device
-
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
-    import torch
-
-    from transformers import (
-        MODEL_FOR_PRETRAINING_MAPPING,
-        NezhaForMaskedLM,
-        NezhaForMultipleChoice,
-        NezhaForNextSentencePrediction,
-        NezhaForPreTraining,
-        NezhaForQuestionAnswering,
-        NezhaForSequenceClassification,
-        NezhaForTokenClassification,
-        NezhaModel,
-    )
-
-
-class NezhaModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=128,
-        max_relative_position=32,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_labels=3,
-        num_choices=4,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.num_labels = num_labels
-        self.num_choices = num_choices
-        self.scope = scope
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = self.get_config()
-
-        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def get_config(self):
-        """
-        Returns a tiny configuration by default.
-        """
-        return NezhaConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-        )
-
-    def prepare_config_and_inputs_for_decoder(self):
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = self.prepare_config_and_inputs()
-
-        config.is_decoder = True
-        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
-        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-        return (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-    def create_and_check_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = NezhaModel(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-        result = model(input_ids, token_type_ids=token_type_ids)
-        result = model(input_ids)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
-    def create_and_check_model_as_decoder(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-        model = NezhaModel(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-        )
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            encoder_hidden_states=encoder_hidden_states,
-        )
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
-    def create_and_check_for_masked_lm(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = NezhaForMaskedLM(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_for_next_sequence_prediction(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = NezhaForNextSentencePrediction(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            labels=sequence_labels,
-        )
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, 2))
-
-    def create_and_check_for_pretraining(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = NezhaForPreTraining(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            labels=token_labels,
-            next_sentence_label=sequence_labels,
-        )
-        self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-        self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2))
-
-    def create_and_check_for_question_answering(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = NezhaForQuestionAnswering(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            start_positions=sequence_labels,
-            end_positions=sequence_labels,
-        )
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def create_and_check_for_sequence_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = NezhaForSequenceClassification(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_for_token_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = NezhaForTokenClassification(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def create_and_check_for_multiple_choice(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_choices = self.num_choices
-        model = NezhaForMultipleChoice(config=config)
-        model.to(torch_device)
-        model.eval()
-        multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        result = model(
-            multiple_choice_inputs_ids,
-            attention_mask=multiple_choice_input_mask,
-            token_type_ids=multiple_choice_token_type_ids,
-            labels=choice_labels,
-        )
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_torch
-class NezhaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            NezhaModel,
-            NezhaForMaskedLM,
-            NezhaForMultipleChoice,
-            NezhaForNextSentencePrediction,
-            NezhaForPreTraining,
-            NezhaForQuestionAnswering,
-            NezhaForSequenceClassification,
-            NezhaForTokenClassification,
-        )
-        if is_torch_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": NezhaModel,
-            "fill-mask": NezhaForMaskedLM,
-            "question-answering": NezhaForQuestionAnswering,
-            "text-classification": NezhaForSequenceClassification,
-            "token-classification": NezhaForTokenClassification,
-            "zero-shot": NezhaForSequenceClassification,
-        }
-        if is_torch_available()
-        else {}
-    )
-    fx_compatible = True
-
-    # special case for ForPreTraining model
-    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
-        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
-
-        if return_labels:
-            if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
-                inputs_dict["labels"] = torch.zeros(
-                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
-                )
-                inputs_dict["next_sentence_label"] = torch.zeros(
-                    self.model_tester.batch_size, dtype=torch.long, device=torch_device
-                )
-        return inputs_dict
-
-    def setUp(self):
-        self.model_tester = NezhaModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=NezhaConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_model_as_decoder(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
-
-    def test_model_as_decoder_with_default_input_mask(self):
-        # This regression test was failing with PyTorch < 1.3
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        ) = self.model_tester.prepare_config_and_inputs_for_decoder()
-
-        input_mask = None
-
-        self.model_tester.create_and_check_model_as_decoder(
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
-    def test_for_next_sequence_prediction(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_next_sequence_prediction(*config_and_inputs)
-
-    def test_for_pretraining(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_pretraining(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "sijunhe/nezha-cn-base"
-        model = NezhaModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    @slow
-    @require_torch_gpu
-    def test_torchscript_device_change(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            # NezhaForMultipleChoice behaves incorrectly in JIT environments.
-            if model_class == NezhaForMultipleChoice:
-                return
-
-            config.torchscript = True
-            model = model_class(config=config)
-
-            inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-            traced_model = torch.jit.trace(
-                model, (inputs_dict["input_ids"].to("cpu"), inputs_dict["attention_mask"].to("cpu"))
-            )
-
-            with tempfile.TemporaryDirectory() as tmp:
-                torch.jit.save(traced_model, os.path.join(tmp, "bert.pt"))
-                loaded = torch.jit.load(os.path.join(tmp, "bert.pt"), map_location=torch_device)
-                loaded(inputs_dict["input_ids"].to(torch_device), inputs_dict["attention_mask"].to(torch_device))
-
-
-@require_torch
-class NezhaModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_nezha_model(self):
-        model = NezhaModel.from_pretrained("sijunhe/nezha-cn-base")
-        input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]])
-        attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1]])
-        with torch.no_grad():
-            output = model(input_ids, attention_mask=attention_mask)[0]
-        expected_shape = torch.Size((1, 6, 768))
-        self.assertEqual(output.shape, expected_shape)
-        expected_slice = torch.tensor([[[0.0685, 0.2441, 0.1102], [0.0600, 0.1906, 0.1349], [0.0221, 0.0819, 0.0586]]])
-
-        self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4))
-
-    @slow
-    def test_inference_nezha_masked_lm(self):
-        model = NezhaForMaskedLM.from_pretrained("sijunhe/nezha-cn-base")
-        input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]])
-        attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1]])
-        with torch.no_grad():
-            output = model(input_ids, attention_mask=attention_mask)[0]
-        expected_shape = torch.Size((1, 6, 21128))
-        self.assertEqual(output.shape, expected_shape)
-        expected_slice = torch.tensor(
-            [[-2.7939, -1.7902, -2.2189], [-2.8585, -1.8908, -2.3723], [-2.6499, -1.7750, -2.2558]]
-        )
-
-        self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4))
--- a/tests/models/qdqbert/init.py
+++ b/tests/models/qdqbert/init.py
--- a/tests/models/qdqbert/test_modeling_qdqbert.py
+++ b/tests/models/qdqbert/test_modeling_qdqbert.py
@@ -1,573 +0,0 @@
-# coding=utf-8
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-# Copyright 2021 NVIDIA Corporation. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch QDQBERT model."""
-
-import unittest
-
-from transformers import QDQBertConfig, is_torch_available
-from transformers.testing_utils import require_pytorch_quantization, require_torch, slow, torch_device
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
-    import torch
-
-    from transformers import (
-        QDQBertForMaskedLM,
-        QDQBertForMultipleChoice,
-        QDQBertForNextSentencePrediction,
-        QDQBertForQuestionAnswering,
-        QDQBertForSequenceClassification,
-        QDQBertForTokenClassification,
-        QDQBertLMHeadModel,
-        QDQBertModel,
-    )
-
-
-class QDQBertModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_labels=3,
-        num_choices=4,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.num_labels = num_labels
-        self.num_choices = num_choices
-        self.scope = scope
-
-    def prepare_config_and_inputs(self):
-        # Set default quantizers before creating the model.
-        import pytorch_quantization.nn as quant_nn
-        from pytorch_quantization.tensor_quant import QuantDescriptor
-
-        # The default tensor quantizer is set to use Max calibration method
-        input_desc = QuantDescriptor(num_bits=8, calib_method="max")
-        # The default tensor quantizer is set to be per-channel quantization for weights
-        weight_desc = QuantDescriptor(num_bits=8, axis=((0,)))
-        quant_nn.QuantLinear.set_default_quant_desc_input(input_desc)
-        quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc)
-        # For the test cases, since QDQBert model is tested in one run without calibration, the quantized tensors are set as fake quantized tensors which give float type tensors in the end.
-        quant_nn.TensorQuantizer.use_fb_fake_quant = True
-
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = self.get_config()
-
-        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def get_config(self):
-        return QDQBertConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-        )
-
-    def prepare_config_and_inputs_for_decoder(self):
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = self.prepare_config_and_inputs()
-
-        config.is_decoder = True
-        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
-        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-        return (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-    def create_and_check_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = QDQBertModel(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-        result = model(input_ids, token_type_ids=token_type_ids)
-        result = model(input_ids)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_model_as_decoder(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-        model = QDQBertModel(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-        )
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            encoder_hidden_states=encoder_hidden_states,
-        )
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_for_causal_lm(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        model = QDQBertLMHeadModel(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_for_masked_lm(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = QDQBertForMaskedLM(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_model_for_causal_lm_as_decoder(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-        model = QDQBertLMHeadModel(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            labels=token_labels,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-        )
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            labels=token_labels,
-            encoder_hidden_states=encoder_hidden_states,
-        )
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_decoder_model_past_large_inputs(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.is_decoder = True
-        config.add_cross_attention = True
-        model = QDQBertLMHeadModel(config=config)
-        model.to(torch_device)
-        model.eval()
-
-        # first forward pass
-        outputs = model(
-            input_ids,
-            attention_mask=input_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            use_cache=True,
-        )
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical multiple next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
-
-        # append to next input_ids and
-        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
-        next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=next_attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            output_hidden_states=True,
-        )["hidden_states"][0]
-        output_from_past = model(
-            next_tokens,
-            attention_mask=next_attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            past_key_values=past_key_values,
-            output_hidden_states=True,
-        )["hidden_states"][0]
-
-        # select random slice
-        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
-        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
-
-        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
-
-        # test that outputs are equal for slice
-        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
-
-    def create_and_check_for_next_sequence_prediction(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = QDQBertForNextSentencePrediction(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            labels=sequence_labels,
-        )
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, 2))
-
-    def create_and_check_for_question_answering(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = QDQBertForQuestionAnswering(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            start_positions=sequence_labels,
-            end_positions=sequence_labels,
-        )
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def create_and_check_for_sequence_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = QDQBertForSequenceClassification(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_for_token_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = QDQBertForTokenClassification(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def create_and_check_for_multiple_choice(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_choices = self.num_choices
-        model = QDQBertForMultipleChoice(config=config)
-        model.to(torch_device)
-        model.eval()
-        multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        result = model(
-            multiple_choice_inputs_ids,
-            attention_mask=multiple_choice_input_mask,
-            token_type_ids=multiple_choice_token_type_ids,
-            labels=choice_labels,
-        )
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_torch
-@require_pytorch_quantization
-class QDQBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            QDQBertModel,
-            QDQBertForMaskedLM,
-            QDQBertForMultipleChoice,
-            QDQBertForNextSentencePrediction,
-            QDQBertForQuestionAnswering,
-            QDQBertForSequenceClassification,
-            QDQBertForTokenClassification,
-            QDQBertLMHeadModel,
-        )
-        if is_torch_available()
-        else ()
-    )
-    all_generative_model_classes = (QDQBertLMHeadModel,) if is_torch_available() else ()
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": QDQBertModel,
-            "fill-mask": QDQBertForMaskedLM,
-            "question-answering": QDQBertForQuestionAnswering,
-            "text-classification": QDQBertForSequenceClassification,
-            "text-generation": QDQBertLMHeadModel,
-            "token-classification": QDQBertForTokenClassification,
-            "zero-shot": QDQBertForSequenceClassification,
-        }
-        if is_torch_available()
-        else {}
-    )
-
-    def setUp(self):
-        self.model_tester = QDQBertModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=QDQBertConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_model_various_embeddings(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        for type in ["absolute", "relative_key", "relative_key_query"]:
-            config_and_inputs[0].position_embedding_type = type
-            self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_model_as_decoder(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
-
-    def test_model_as_decoder_with_default_input_mask(self):
-        # This regression test was failing with PyTorch < 1.3
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        ) = self.model_tester.prepare_config_and_inputs_for_decoder()
-
-        input_mask = None
-
-        self.model_tester.create_and_check_model_as_decoder(
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-    def test_for_causal_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_for_causal_lm(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_causal_lm_decoder(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_model_for_causal_lm_as_decoder(*config_and_inputs)
-
-    def test_decoder_model_past_with_large_inputs(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
-    def test_for_next_sequence_prediction(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_next_sequence_prediction(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "google-bert/bert-base-uncased"
-        model = QDQBertModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    # Override
-    def test_feed_forward_chunking(self):
-        # feed forward chunking is not supported in QDQBert
-        pass
-
-
-@require_torch
-@require_pytorch_quantization
-class QDQBertModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_no_head_absolute_embedding(self):
-        # Set default quantizers before creating the model.
-        import pytorch_quantization.nn as quant_nn
-        from pytorch_quantization.tensor_quant import QuantDescriptor
-
-        # The default tensor quantizer is set to use Max calibration method
-        input_desc = QuantDescriptor(num_bits=8, calib_method="max")
-        # The default tensor quantizer is set to be per-channel quantization for weights
-        weight_desc = QuantDescriptor(num_bits=8, axis=((0,)))
-        quant_nn.QuantLinear.set_default_quant_desc_input(input_desc)
-        quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc)
-
-        model = QDQBertModel.from_pretrained("google-bert/bert-base-uncased")
-        input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
-        attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
-        output = model(input_ids, attention_mask=attention_mask)[0]
-        expected_shape = torch.Size((1, 11, 768))
-        self.assertEqual(output.shape, expected_shape)
-        expected_slice = torch.tensor(
-            [[[0.4571, -0.0735, 0.8594], [0.2774, -0.0278, 0.8794], [0.3548, -0.0473, 0.7593]]]
-        )
-        self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4))
--- a/tests/models/realm/init.py
+++ b/tests/models/realm/init.py
--- a/tests/models/realm/test_modeling_realm.py
+++ b/tests/models/realm/test_modeling_realm.py
@@ -1,554 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch REALM model."""
-
-import copy
-import unittest
-
-import numpy as np
-
-from transformers import RealmConfig, is_torch_available
-from transformers.testing_utils import require_torch, slow, torch_device
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
-    import torch
-
-    from transformers import (
-        RealmEmbedder,
-        RealmForOpenQA,
-        RealmKnowledgeAugEncoder,
-        RealmReader,
-        RealmRetriever,
-        RealmScorer,
-        RealmTokenizer,
-    )
-
-
-class RealmModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        retriever_proj_size=128,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        layer_norm_eps=1e-12,
-        span_hidden_size=50,
-        max_span_width=10,
-        reader_layer_norm_eps=1e-3,
-        reader_beam_size=4,
-        reader_seq_len=288 + 32,
-        num_block_records=13353718,
-        searcher_beam_size=8,
-        searcher_seq_len=64,
-        num_labels=3,
-        num_choices=4,
-        num_candidates=10,
-        scope=None,
-    ):
-        # General config
-        self.parent = parent
-        self.batch_size = batch_size
-        self.retriever_proj_size = retriever_proj_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.layer_norm_eps = layer_norm_eps
-
-        # Reader config
-        self.span_hidden_size = span_hidden_size
-        self.max_span_width = max_span_width
-        self.reader_layer_norm_eps = reader_layer_norm_eps
-        self.reader_beam_size = reader_beam_size
-        self.reader_seq_len = reader_seq_len
-
-        # Searcher config
-        self.num_block_records = num_block_records
-        self.searcher_beam_size = searcher_beam_size
-        self.searcher_seq_len = searcher_seq_len
-
-        self.num_labels = num_labels
-        self.num_choices = num_choices
-        self.num_candidates = num_candidates
-        self.scope = scope
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-        candiate_input_ids = ids_tensor([self.batch_size, self.num_candidates, self.seq_length], self.vocab_size)
-        reader_input_ids = ids_tensor([self.reader_beam_size, self.reader_seq_len], self.vocab_size)
-
-        input_mask = None
-        candiate_input_mask = None
-        reader_input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-            candiate_input_mask = random_attention_mask([self.batch_size, self.num_candidates, self.seq_length])
-            reader_input_mask = random_attention_mask([self.reader_beam_size, self.reader_seq_len])
-
-        token_type_ids = None
-        candidate_token_type_ids = None
-        reader_token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-            candidate_token_type_ids = ids_tensor(
-                [self.batch_size, self.num_candidates, self.seq_length], self.type_vocab_size
-            )
-            reader_token_type_ids = ids_tensor([self.reader_beam_size, self.reader_seq_len], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = self.get_config()
-
-        # inputs with additional num_candidates axis.
-        scorer_encoder_inputs = (candiate_input_ids, candiate_input_mask, candidate_token_type_ids)
-        # reader inputs
-        reader_inputs = (reader_input_ids, reader_input_mask, reader_token_type_ids)
-
-        return (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            scorer_encoder_inputs,
-            reader_inputs,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        )
-
-    def get_config(self):
-        return RealmConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            retriever_proj_size=self.retriever_proj_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            num_candidates=self.num_candidates,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            initializer_range=self.initializer_range,
-        )
-
-    def create_and_check_embedder(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        scorer_encoder_inputs,
-        reader_inputs,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        model = RealmEmbedder(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-        self.parent.assertEqual(result.projected_score.shape, (self.batch_size, self.retriever_proj_size))
-
-    def create_and_check_encoder(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        scorer_encoder_inputs,
-        reader_inputs,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        model = RealmKnowledgeAugEncoder(config=config)
-        model.to(torch_device)
-        model.eval()
-        relevance_score = floats_tensor([self.batch_size, self.num_candidates])
-        result = model(
-            scorer_encoder_inputs[0],
-            attention_mask=scorer_encoder_inputs[1],
-            token_type_ids=scorer_encoder_inputs[2],
-            relevance_score=relevance_score,
-            labels=token_labels,
-        )
-        self.parent.assertEqual(
-            result.logits.shape, (self.batch_size * self.num_candidates, self.seq_length, self.vocab_size)
-        )
-
-    def create_and_check_reader(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        scorer_encoder_inputs,
-        reader_inputs,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        model = RealmReader(config=config)
-        model.to(torch_device)
-        model.eval()
-        relevance_score = floats_tensor([self.reader_beam_size])
-        result = model(
-            reader_inputs[0],
-            attention_mask=reader_inputs[1],
-            token_type_ids=reader_inputs[2],
-            relevance_score=relevance_score,
-        )
-        self.parent.assertEqual(result.block_idx.shape, ())
-        self.parent.assertEqual(result.candidate.shape, ())
-        self.parent.assertEqual(result.start_pos.shape, ())
-        self.parent.assertEqual(result.end_pos.shape, ())
-
-    def create_and_check_scorer(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        scorer_encoder_inputs,
-        reader_inputs,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        model = RealmScorer(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            candidate_input_ids=scorer_encoder_inputs[0],
-            candidate_attention_mask=scorer_encoder_inputs[1],
-            candidate_token_type_ids=scorer_encoder_inputs[2],
-        )
-        self.parent.assertEqual(result.relevance_score.shape, (self.batch_size, self.num_candidates))
-        self.parent.assertEqual(result.query_score.shape, (self.batch_size, self.retriever_proj_size))
-        self.parent.assertEqual(
-            result.candidate_score.shape, (self.batch_size, self.num_candidates, self.retriever_proj_size)
-        )
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            scorer_encoder_inputs,
-            reader_inputs,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_torch
-class RealmModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            RealmEmbedder,
-            RealmKnowledgeAugEncoder,
-            # RealmScorer is excluded from common tests as it is a container model
-            # consisting of two RealmEmbedders & a simple inner product calculation.
-            # RealmScorer
-        )
-        if is_torch_available()
-        else ()
-    )
-    all_generative_model_classes = ()
-    pipeline_model_mapping = {} if is_torch_available() else {}
-
-    # disable these tests because there is no base_model in Realm
-    test_save_load_fast_init_from_base = False
-    test_save_load_fast_init_to_base = False
-
-    def setUp(self):
-        self.test_pruning = False
-        self.model_tester = RealmModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=RealmConfig)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_embedder(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_embedder(*config_and_inputs)
-
-    def test_encoder(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_encoder(*config_and_inputs)
-
-    def test_model_various_embeddings(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        for type in ["absolute", "relative_key", "relative_key_query"]:
-            config_and_inputs[0].position_embedding_type = type
-            self.model_tester.create_and_check_embedder(*config_and_inputs)
-            self.model_tester.create_and_check_encoder(*config_and_inputs)
-
-    def test_scorer(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_scorer(*config_and_inputs)
-
-    def test_training(self):
-        if not self.model_tester.is_training:
-            return
-
-        config, *inputs = self.model_tester.prepare_config_and_inputs()
-        input_ids, token_type_ids, input_mask, scorer_encoder_inputs = inputs[0:4]
-        config.return_dict = True
-
-        tokenizer = RealmTokenizer.from_pretrained("google/realm-orqa-nq-openqa")
-
-        # RealmKnowledgeAugEncoder training
-        model = RealmKnowledgeAugEncoder(config)
-        model.to(torch_device)
-        model.train()
-
-        inputs_dict = {
-            "input_ids": scorer_encoder_inputs[0].to(torch_device),
-            "attention_mask": scorer_encoder_inputs[1].to(torch_device),
-            "token_type_ids": scorer_encoder_inputs[2].to(torch_device),
-            "relevance_score": floats_tensor([self.model_tester.batch_size, self.model_tester.num_candidates]),
-        }
-        inputs_dict["labels"] = torch.zeros(
-            (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
-        )
-        inputs = inputs_dict
-        loss = model(**inputs).loss
-        loss.backward()
-
-        # RealmForOpenQA training
-        openqa_config = copy.deepcopy(config)
-        openqa_config.vocab_size = 30522  # the retrieved texts will inevitably have more than 99 vocabs.
-        openqa_config.num_block_records = 5
-        openqa_config.searcher_beam_size = 2
-
-        block_records = np.array(
-            [
-                b"This is the first record.",
-                b"This is the second record.",
-                b"This is the third record.",
-                b"This is the fourth record.",
-                b"This is the fifth record.",
-            ],
-            dtype=object,
-        )
-        retriever = RealmRetriever(block_records, tokenizer)
-        model = RealmForOpenQA(openqa_config, retriever)
-        model.to(torch_device)
-        model.train()
-
-        inputs_dict = {
-            "input_ids": input_ids[:1].to(torch_device),
-            "attention_mask": input_mask[:1].to(torch_device),
-            "token_type_ids": token_type_ids[:1].to(torch_device),
-            "answer_ids": input_ids[:1].tolist(),
-        }
-        inputs = self._prepare_for_class(inputs_dict, RealmForOpenQA)
-        loss = model(**inputs).reader_output.loss
-        loss.backward()
-
-        # Test model.block_embedding_to
-        device = torch.device("cpu")
-        model.block_embedding_to(device)
-        loss = model(**inputs).reader_output.loss
-        loss.backward()
-        self.assertEqual(model.block_emb.device.type, device.type)
-
-    @slow
-    def test_embedder_from_pretrained(self):
-        model = RealmEmbedder.from_pretrained("google/realm-cc-news-pretrained-embedder")
-        self.assertIsNotNone(model)
-
-    @slow
-    def test_encoder_from_pretrained(self):
-        model = RealmKnowledgeAugEncoder.from_pretrained("google/realm-cc-news-pretrained-encoder")
-        self.assertIsNotNone(model)
-
-    @slow
-    def test_open_qa_from_pretrained(self):
-        model = RealmForOpenQA.from_pretrained("google/realm-orqa-nq-openqa")
-        self.assertIsNotNone(model)
-
-    @slow
-    def test_reader_from_pretrained(self):
-        model = RealmReader.from_pretrained("google/realm-orqa-nq-reader")
-        self.assertIsNotNone(model)
-
-    @slow
-    def test_scorer_from_pretrained(self):
-        model = RealmScorer.from_pretrained("google/realm-cc-news-pretrained-scorer")
-        self.assertIsNotNone(model)
-
-
-@require_torch
-class RealmModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_embedder(self):
-        retriever_projected_size = 128
-
-        model = RealmEmbedder.from_pretrained("google/realm-cc-news-pretrained-embedder")
-        input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]])
-        output = model(input_ids)[0]
-
-        expected_shape = torch.Size((1, retriever_projected_size))
-        self.assertEqual(output.shape, expected_shape)
-
-        expected_slice = torch.tensor([[-0.0714, -0.0837, -0.1314]])
-        self.assertTrue(torch.allclose(output[:, :3], expected_slice, atol=1e-4))
-
-    @slow
-    def test_inference_encoder(self):
-        num_candidates = 2
-        vocab_size = 30522
-
-        model = RealmKnowledgeAugEncoder.from_pretrained(
-            "google/realm-cc-news-pretrained-encoder", num_candidates=num_candidates
-        )
-        input_ids = torch.tensor([[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]])
-        relevance_score = torch.tensor([[0.3, 0.7]], dtype=torch.float32)
-        output = model(input_ids, relevance_score=relevance_score)[0]
-
-        expected_shape = torch.Size((2, 6, vocab_size))
-        self.assertEqual(output.shape, expected_shape)
-
-        expected_slice = torch.tensor([[[-11.0888, -11.2544], [-10.2170, -10.3874]]])
-
-        self.assertTrue(torch.allclose(output[1, :2, :2], expected_slice, atol=1e-4))
-
-    @slow
-    def test_inference_open_qa(self):
-        from transformers.models.realm.retrieval_realm import RealmRetriever
-
-        tokenizer = RealmTokenizer.from_pretrained("google/realm-orqa-nq-openqa")
-        retriever = RealmRetriever.from_pretrained("google/realm-orqa-nq-openqa")
-
-        model = RealmForOpenQA.from_pretrained(
-            "google/realm-orqa-nq-openqa",
-            retriever=retriever,
-        )
-
-        question = "Who is the pioneer in modern computer science?"
-
-        question = tokenizer(
-            [question],
-            padding=True,
-            truncation=True,
-            max_length=model.config.searcher_seq_len,
-            return_tensors="pt",
-        ).to(model.device)
-
-        predicted_answer_ids = model(**question).predicted_answer_ids
-
-        predicted_answer = tokenizer.decode(predicted_answer_ids)
-        self.assertEqual(predicted_answer, "alan mathison turing")
-
-    @slow
-    def test_inference_reader(self):
-        config = RealmConfig(reader_beam_size=2, max_span_width=3)
-        model = RealmReader.from_pretrained("google/realm-orqa-nq-reader", config=config)
-
-        concat_input_ids = torch.arange(10).view((2, 5))
-        concat_token_type_ids = torch.tensor([[0, 0, 1, 1, 1], [0, 0, 1, 1, 1]], dtype=torch.int64)
-        concat_block_mask = torch.tensor([[0, 0, 1, 1, 0], [0, 0, 1, 1, 0]], dtype=torch.int64)
-        relevance_score = torch.tensor([0.3, 0.7], dtype=torch.float32)
-
-        output = model(
-            concat_input_ids,
-            token_type_ids=concat_token_type_ids,
-            relevance_score=relevance_score,
-            block_mask=concat_block_mask,
-            return_dict=True,
-        )
-
-        block_idx_expected_shape = torch.Size(())
-        start_pos_expected_shape = torch.Size((1,))
-        end_pos_expected_shape = torch.Size((1,))
-        self.assertEqual(output.block_idx.shape, block_idx_expected_shape)
-        self.assertEqual(output.start_pos.shape, start_pos_expected_shape)
-        self.assertEqual(output.end_pos.shape, end_pos_expected_shape)
-
-        expected_block_idx = torch.tensor(1)
-        expected_start_pos = torch.tensor(3)
-        expected_end_pos = torch.tensor(3)
-
-        self.assertTrue(torch.allclose(output.block_idx, expected_block_idx, atol=1e-4))
-        self.assertTrue(torch.allclose(output.start_pos, expected_start_pos, atol=1e-4))
-        self.assertTrue(torch.allclose(output.end_pos, expected_end_pos, atol=1e-4))
-
-    @slow
-    def test_inference_scorer(self):
-        num_candidates = 2
-
-        model = RealmScorer.from_pretrained("google/realm-cc-news-pretrained-scorer", num_candidates=num_candidates)
-
-        input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]])
-        candidate_input_ids = torch.tensor([[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]])
-        output = model(input_ids, candidate_input_ids=candidate_input_ids)[0]
-
-        expected_shape = torch.Size((1, 2))
-        self.assertEqual(output.shape, expected_shape)
-
-        expected_slice = torch.tensor([[0.7410, 0.7170]])
-        self.assertTrue(torch.allclose(output, expected_slice, atol=1e-4))
--- a/tests/models/realm/test_retrieval_realm.py
+++ b/tests/models/realm/test_retrieval_realm.py
@@ -1,187 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import shutil
-import tempfile
-from unittest import TestCase
-from unittest.mock import patch
-
-import numpy as np
-from datasets import Dataset
-
-from transformers.models.realm.configuration_realm import RealmConfig
-from transformers.models.realm.retrieval_realm import _REALM_BLOCK_RECORDS_FILENAME, RealmRetriever
-from transformers.models.realm.tokenization_realm import VOCAB_FILES_NAMES, RealmTokenizer
-
-
-class RealmRetrieverTest(TestCase):
-    def setUp(self):
-        self.tmpdirname = tempfile.mkdtemp()
-        self.num_block_records = 5
-
-        # Realm tok
-        vocab_tokens = [
-            "[UNK]",
-            "[CLS]",
-            "[SEP]",
-            "[PAD]",
-            "[MASK]",
-            "test",
-            "question",
-            "this",
-            "is",
-            "the",
-            "first",
-            "second",
-            "third",
-            "fourth",
-            "fifth",
-            "record",
-            "want",
-            "##want",
-            "##ed",
-            "wa",
-            "un",
-            "runn",
-            "##ing",
-            ",",
-            "low",
-            "lowest",
-        ]
-        realm_tokenizer_path = os.path.join(self.tmpdirname, "realm_tokenizer")
-        os.makedirs(realm_tokenizer_path, exist_ok=True)
-        self.vocab_file = os.path.join(realm_tokenizer_path, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
-            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-
-        realm_block_records_path = os.path.join(self.tmpdirname, "realm_block_records")
-        os.makedirs(realm_block_records_path, exist_ok=True)
-
-    def get_tokenizer(self) -> RealmTokenizer:
-        return RealmTokenizer.from_pretrained(os.path.join(self.tmpdirname, "realm_tokenizer"))
-
-    def tearDown(self):
-        shutil.rmtree(self.tmpdirname)
-
-    def get_config(self):
-        config = RealmConfig(num_block_records=self.num_block_records)
-        return config
-
-    def get_dummy_dataset(self):
-        dataset = Dataset.from_dict(
-            {
-                "id": ["0", "1"],
-                "question": ["foo", "bar"],
-                "answers": [["Foo", "Bar"], ["Bar"]],
-            }
-        )
-        return dataset
-
-    def get_dummy_block_records(self):
-        block_records = np.array(
-            [
-                b"This is the first record",
-                b"This is the second record",
-                b"This is the third record",
-                b"This is the fourth record",
-                b"This is the fifth record",
-                b"This is a longer longer longer record",
-            ],
-            dtype=object,
-        )
-        return block_records
-
-    def get_dummy_retriever(self):
-        retriever = RealmRetriever(
-            block_records=self.get_dummy_block_records(),
-            tokenizer=self.get_tokenizer(),
-        )
-        return retriever
-
-    def test_retrieve(self):
-        config = self.get_config()
-        retriever = self.get_dummy_retriever()
-        tokenizer = retriever.tokenizer
-
-        retrieved_block_ids = np.array([0, 3], dtype="long")
-        question_input_ids = tokenizer(["Test question"]).input_ids
-        answer_ids = tokenizer(
-            ["the fourth"],
-            add_special_tokens=False,
-            return_token_type_ids=False,
-            return_attention_mask=False,
-        ).input_ids
-        max_length = config.reader_seq_len
-
-        has_answers, start_pos, end_pos, concat_inputs = retriever(
-            retrieved_block_ids, question_input_ids, answer_ids=answer_ids, max_length=max_length, return_tensors="np"
-        )
-
-        self.assertEqual(len(has_answers), 2)
-        self.assertEqual(len(start_pos), 2)
-        self.assertEqual(len(end_pos), 2)
-        self.assertEqual(concat_inputs.input_ids.shape, (2, 10))
-        self.assertEqual(concat_inputs.attention_mask.shape, (2, 10))
-        self.assertEqual(concat_inputs.token_type_ids.shape, (2, 10))
-        self.assertEqual(concat_inputs.special_tokens_mask.shape, (2, 10))
-        self.assertEqual(
-            tokenizer.convert_ids_to_tokens(concat_inputs.input_ids[0]),
-            ["[CLS]", "test", "question", "[SEP]", "this", "is", "the", "first", "record", "[SEP]"],
-        )
-        self.assertEqual(
-            tokenizer.convert_ids_to_tokens(concat_inputs.input_ids[1]),
-            ["[CLS]", "test", "question", "[SEP]", "this", "is", "the", "fourth", "record", "[SEP]"],
-        )
-
-    def test_block_has_answer(self):
-        config = self.get_config()
-        retriever = self.get_dummy_retriever()
-        tokenizer = retriever.tokenizer
-
-        retrieved_block_ids = np.array([0, 3, 5], dtype="long")
-        question_input_ids = tokenizer(["Test question"]).input_ids
-        answer_ids = tokenizer(
-            ["the fourth", "longer longer"],
-            add_special_tokens=False,
-            return_token_type_ids=False,
-            return_attention_mask=False,
-        ).input_ids
-        max_length = config.reader_seq_len
-
-        has_answers, start_pos, end_pos, _ = retriever(
-            retrieved_block_ids, question_input_ids, answer_ids=answer_ids, max_length=max_length, return_tensors="np"
-        )
-
-        self.assertEqual([False, True, True], has_answers)
-        self.assertEqual([[-1, -1, -1], [6, -1, -1], [6, 7, 8]], start_pos)
-        self.assertEqual([[-1, -1, -1], [7, -1, -1], [7, 8, 9]], end_pos)
-
-    def test_save_load_pretrained(self):
-        retriever = self.get_dummy_retriever()
-        retriever.save_pretrained(os.path.join(self.tmpdirname, "realm_block_records"))
-
-        # Test local path
-        retriever = retriever.from_pretrained(os.path.join(self.tmpdirname, "realm_block_records"))
-        self.assertEqual(retriever.block_records[0], b"This is the first record")
-
-        # Test mocked remote path
-        with patch("transformers.models.realm.retrieval_realm.hf_hub_download") as mock_hf_hub_download:
-            mock_hf_hub_download.return_value = os.path.join(
-                os.path.join(self.tmpdirname, "realm_block_records"), _REALM_BLOCK_RECORDS_FILENAME
-            )
-            retriever = RealmRetriever.from_pretrained("google/realm-cc-news-pretrained-openqa")
-
-        self.assertEqual(retriever.block_records[0], b"This is the first record")
--- a/tests/models/realm/test_tokenization_realm.py
+++ b/tests/models/realm/test_tokenization_realm.py
@@ -1,322 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import unittest
-
-from transformers import RealmTokenizerFast
-from transformers.models.bert.tokenization_bert import (
-    VOCAB_FILES_NAMES,
-    BasicTokenizer,
-    WordpieceTokenizer,
-    _is_control,
-    _is_punctuation,
-    _is_whitespace,
-)
-from transformers.models.realm.tokenization_realm import RealmTokenizer
-from transformers.testing_utils import require_tokenizers, slow
-
-from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english
-
-
-@require_tokenizers
-class RealmTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    from_pretrained_id = "google/realm-cc-news-pretrained-embedder"
-    tokenizer_class = RealmTokenizer
-    rust_tokenizer_class = RealmTokenizerFast
-    test_rust_tokenizer = True
-    space_between_special_tokens = True
-    from_pretrained_filter = filter_non_english
-
-    def setUp(self):
-        super().setUp()
-
-        vocab_tokens = [
-            "[UNK]",
-            "[CLS]",
-            "[SEP]",
-            "[PAD]",
-            "[MASK]",
-            "want",
-            "##want",
-            "##ed",
-            "wa",
-            "un",
-            "runn",
-            "##ing",
-            ",",
-            "low",
-            "lowest",
-        ]
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
-            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-
-    def get_input_output_texts(self, tokenizer):
-        input_text = "UNwant\u00e9d,running"
-        output_text = "unwanted, running"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = self.tokenizer_class(self.vocab_file)
-
-        tokens = tokenizer.tokenize("UNwant\u00e9d,running")
-        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11])
-
-    def test_rust_and_python_full_tokenizers(self):
-        if not self.test_rust_tokenizer:
-            return
-
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer()
-
-        sequence = "UNwant\u00e9d,running"
-
-        tokens = tokenizer.tokenize(sequence)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(tokens, rust_tokens)
-
-        ids = tokenizer.encode(sequence, add_special_tokens=False)
-        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        rust_tokenizer = self.get_rust_tokenizer()
-        ids = tokenizer.encode(sequence)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-        # With lower casing
-        tokenizer = self.get_tokenizer(do_lower_case=True)
-        rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True)
-
-        sequence = "UNwant\u00e9d,running"
-
-        tokens = tokenizer.tokenize(sequence)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(tokens, rust_tokens)
-
-        ids = tokenizer.encode(sequence, add_special_tokens=False)
-        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        rust_tokenizer = self.get_rust_tokenizer()
-        ids = tokenizer.encode(sequence)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-    def test_chinese(self):
-        tokenizer = BasicTokenizer()
-
-        self.assertListEqual(tokenizer.tokenize("ah\u535a\u63a8zz"), ["ah", "\u535a", "\u63a8", "zz"])
-
-    def test_basic_tokenizer_lower(self):
-        tokenizer = BasicTokenizer(do_lower_case=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU?  "), ["hello", "!", "how", "are", "you", "?"]
-        )
-        self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"])
-
-    def test_basic_tokenizer_lower_strip_accents_false(self):
-        tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hällo", "!", "how", "are", "you", "?"]
-        )
-        self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["h\u00e9llo"])
-
-    def test_basic_tokenizer_lower_strip_accents_true(self):
-        tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hallo", "!", "how", "are", "you", "?"]
-        )
-        self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"])
-
-    def test_basic_tokenizer_lower_strip_accents_default(self):
-        tokenizer = BasicTokenizer(do_lower_case=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hallo", "!", "how", "are", "you", "?"]
-        )
-        self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"])
-
-    def test_basic_tokenizer_no_lower(self):
-        tokenizer = BasicTokenizer(do_lower_case=False)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU?  "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
-        )
-
-    def test_basic_tokenizer_no_lower_strip_accents_false(self):
-        tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=False)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["HäLLo", "!", "how", "Are", "yoU", "?"]
-        )
-
-    def test_basic_tokenizer_no_lower_strip_accents_true(self):
-        tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["HaLLo", "!", "how", "Are", "yoU", "?"]
-        )
-
-    def test_basic_tokenizer_respects_never_split_tokens(self):
-        tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]
-        )
-
-    def test_wordpiece_tokenizer(self):
-        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
-
-        vocab = {}
-        for i, token in enumerate(vocab_tokens):
-            vocab[token] = i
-        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
-
-        self.assertListEqual(tokenizer.tokenize(""), [])
-
-        self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"])
-
-        self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
-
-    def test_is_whitespace(self):
-        self.assertTrue(_is_whitespace(" "))
-        self.assertTrue(_is_whitespace("\t"))
-        self.assertTrue(_is_whitespace("\r"))
-        self.assertTrue(_is_whitespace("\n"))
-        self.assertTrue(_is_whitespace("\u00a0"))
-
-        self.assertFalse(_is_whitespace("A"))
-        self.assertFalse(_is_whitespace("-"))
-
-    def test_is_control(self):
-        self.assertTrue(_is_control("\u0005"))
-
-        self.assertFalse(_is_control("A"))
-        self.assertFalse(_is_control(" "))
-        self.assertFalse(_is_control("\t"))
-        self.assertFalse(_is_control("\r"))
-
-    def test_is_punctuation(self):
-        self.assertTrue(_is_punctuation("-"))
-        self.assertTrue(_is_punctuation("$"))
-        self.assertTrue(_is_punctuation("`"))
-        self.assertTrue(_is_punctuation("."))
-
-        self.assertFalse(_is_punctuation("A"))
-        self.assertFalse(_is_punctuation(" "))
-
-    def test_clean_text(self):
-        tokenizer = self.get_tokenizer()
-
-        # Example taken from the issue https://github.com/huggingface/tokenizers/issues/340
-        self.assertListEqual([tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]])
-
-        if self.test_rust_tokenizer:
-            rust_tokenizer = self.get_rust_tokenizer()
-            self.assertListEqual(
-                [rust_tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]]
-            )
-
-    @slow
-    def test_sequence_builders(self):
-        tokenizer = self.tokenizer_class.from_pretrained("google-bert/bert-base-uncased")
-
-        text = tokenizer.encode("sequence builders", add_special_tokens=False)
-        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
-
-        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
-        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
-
-        assert encoded_sentence == [101] + text + [102]
-        assert encoded_pair == [101] + text + [102] + text_2 + [102]
-
-    def test_offsets_with_special_characters(self):
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-
-                sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
-                tokens = tokenizer_r.encode_plus(
-                    sentence,
-                    return_attention_mask=False,
-                    return_token_type_ids=False,
-                    return_offsets_mapping=True,
-                    add_special_tokens=True,
-                )
-
-                do_lower_case = tokenizer_r.do_lower_case if hasattr(tokenizer_r, "do_lower_case") else False
-                expected_results = (
-                    [
-                        ((0, 0), tokenizer_r.cls_token),
-                        ((0, 1), "A"),
-                        ((1, 2), ","),
-                        ((3, 5), "na"),
-                        ((5, 6), "##ï"),
-                        ((6, 8), "##ve"),
-                        ((9, 15), tokenizer_r.mask_token),
-                        ((16, 21), "Allen"),
-                        ((21, 23), "##NL"),
-                        ((23, 24), "##P"),
-                        ((25, 33), "sentence"),
-                        ((33, 34), "."),
-                        ((0, 0), tokenizer_r.sep_token),
-                    ]
-                    if not do_lower_case
-                    else [
-                        ((0, 0), tokenizer_r.cls_token),
-                        ((0, 1), "a"),
-                        ((1, 2), ","),
-                        ((3, 8), "naive"),
-                        ((9, 15), tokenizer_r.mask_token),
-                        ((16, 21), "allen"),
-                        ((21, 23), "##nl"),
-                        ((23, 24), "##p"),
-                        ((25, 33), "sentence"),
-                        ((33, 34), "."),
-                        ((0, 0), tokenizer_r.sep_token),
-                    ]
-                )
-
-                self.assertEqual(
-                    [e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"])
-                )
-                self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])
-
-    @slow
-    def test_batch_encode_candidates(self):
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                text = [["Hello world!", "Nice to meet you!"], ["The cute cat.", "The adorable dog."]]
-
-                encoded_sentence_r = tokenizer_r.batch_encode_candidates(text, max_length=10, return_tensors="np")
-                encoded_sentence_p = tokenizer_p.batch_encode_candidates(text, max_length=10, return_tensors="np")
-
-                expected_shape = (2, 2, 10)
-
-                self.assertEqual(encoded_sentence_r["input_ids"].shape, expected_shape)
-                self.assertEqual(encoded_sentence_r["attention_mask"].shape, expected_shape)
-                self.assertEqual(encoded_sentence_r["token_type_ids"].shape, expected_shape)
-
-                self.assertEqual(encoded_sentence_p["input_ids"].shape, expected_shape)
-                self.assertEqual(encoded_sentence_p["attention_mask"].shape, expected_shape)
-                self.assertEqual(encoded_sentence_p["token_type_ids"].shape, expected_shape)
--- a/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py
+++ b/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py
@@ -23,7 +23,6 @@ from transformers.testing_utils import require_deterministic_for_xpu, require_to
 from ...test_modeling_common import floats_tensor, ids_tensor, random_attention_mask
 from ..bert.test_modeling_bert import BertModelTester
 from ..speech_to_text.test_modeling_speech_to_text import Speech2TextModelTester
-from ..speech_to_text_2.test_modeling_speech_to_text_2 import Speech2Text2StandaloneDecoderModelTester
 from ..wav2vec2.test_modeling_wav2vec2 import Wav2Vec2ModelTester


@@ -33,7 +32,6 @@ if is_torch_available():

    from transformers import (
        BertLMHeadModel,
-        Speech2Text2ForCausalLM,
        SpeechEncoderDecoderConfig,
        SpeechEncoderDecoderModel,
        Wav2Vec2Model,
@@ -583,43 +581,3 @@ class Speech2TextBertModelTest(EncoderDecoderMixin, unittest.TestCase):
    # all published pretrained models are Speech2TextModel != Speech2TextEncoder
    def test_real_model_save_load_from_pretrained(self):
        pass
-
-
-@require_torch
-class Wav2Vec2Speech2Text2(EncoderDecoderMixin, unittest.TestCase):
-    def get_encoder_decoder_model(self, config, decoder_config):
-        encoder_model = Wav2Vec2Model(config).eval()
-        decoder_model = Speech2Text2ForCausalLM(decoder_config).eval()
-        return encoder_model, decoder_model
-
-    def prepare_config_and_inputs(self):
-        model_tester_encoder = Wav2Vec2ModelTester(self, batch_size=13)
-        model_tester_decoder = Speech2Text2StandaloneDecoderModelTester(
-            self, batch_size=13, d_model=32, max_position_embeddings=512
-        )
-        encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs()
-        decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs()
-        (
-            config,
-            input_values,
-            input_mask,
-        ) = encoder_config_and_inputs
-        (decoder_config, decoder_input_ids, decoder_attention_mask, _) = decoder_config_and_inputs
-
-        # make sure that cross attention layers are added
-        decoder_config.add_cross_attention = True
-        #  disable cache for now
-        decoder_config.use_cache = False
-        return {
-            "config": config,
-            "input_values": input_values,
-            "attention_mask": input_mask,
-            "decoder_config": decoder_config,
-            "decoder_input_ids": decoder_input_ids,
-            "decoder_attention_mask": decoder_attention_mask,
-            "labels": decoder_input_ids,
-        }
-
-    # there are no published pretrained Speech2Text2ForCausalLM for now
-    def test_real_model_save_load_from_pretrained(self):
-        pass
--- a/tests/models/speech_to_text_2/init.py
+++ b/tests/models/speech_to_text_2/init.py
--- a/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py
+++ b/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py
@@ -1,216 +0,0 @@
-# coding=utf-8
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch Speech2Text model."""
-
-import unittest
-
-from transformers import Speech2Text2Config
-from transformers.testing_utils import is_torch_available, require_torch, torch_device
-
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
-    import torch
-
-    from transformers.models.speech_to_text_2.modeling_speech_to_text_2 import (
-        Speech2Text2Decoder,
-        Speech2Text2ForCausalLM,
-    )
-
-
-@require_torch
-class Speech2Text2StandaloneDecoderModelTester:
-    def __init__(
-        self,
-        parent,
-        vocab_size=99,
-        batch_size=13,
-        d_model=16,
-        decoder_seq_length=7,
-        is_training=True,
-        is_decoder=True,
-        use_attention_mask=True,
-        use_cache=False,
-        use_labels=True,
-        decoder_start_token_id=2,
-        decoder_ffn_dim=32,
-        decoder_layers=2,
-        decoder_attention_heads=4,
-        max_position_embeddings=30,
-        pad_token_id=0,
-        bos_token_id=1,
-        eos_token_id=2,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.decoder_seq_length = decoder_seq_length
-        # For common tests
-        self.seq_length = self.decoder_seq_length
-        self.is_training = is_training
-        self.use_attention_mask = use_attention_mask
-        self.use_labels = use_labels
-
-        self.vocab_size = vocab_size
-        self.d_model = d_model
-        self.hidden_size = d_model
-        self.num_hidden_layers = decoder_layers
-        self.decoder_layers = decoder_layers
-        self.decoder_ffn_dim = decoder_ffn_dim
-        self.decoder_attention_heads = decoder_attention_heads
-        self.num_attention_heads = decoder_attention_heads
-        self.eos_token_id = eos_token_id
-        self.bos_token_id = bos_token_id
-        self.pad_token_id = pad_token_id
-        self.decoder_start_token_id = decoder_start_token_id
-        self.use_cache = use_cache
-        self.max_position_embeddings = max_position_embeddings
-
-        self.scope = None
-        self.decoder_key_length = decoder_seq_length
-        self.base_model_out_len = 2
-        self.decoder_attention_idx = 1
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
-
-        attention_mask = None
-        if self.use_attention_mask:
-            attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
-
-        lm_labels = None
-        if self.use_labels:
-            lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
-
-        config = Speech2Text2Config(
-            vocab_size=self.vocab_size,
-            d_model=self.d_model,
-            decoder_layers=self.decoder_layers,
-            decoder_ffn_dim=self.decoder_ffn_dim,
-            decoder_attention_heads=self.decoder_attention_heads,
-            eos_token_id=self.eos_token_id,
-            bos_token_id=self.bos_token_id,
-            use_cache=self.use_cache,
-            pad_token_id=self.pad_token_id,
-            decoder_start_token_id=self.decoder_start_token_id,
-            max_position_embeddings=self.max_position_embeddings,
-        )
-
-        return (
-            config,
-            input_ids,
-            attention_mask,
-            lm_labels,
-        )
-
-    def create_and_check_decoder_model_past(
-        self,
-        config,
-        input_ids,
-        attention_mask,
-        lm_labels,
-    ):
-        config.use_cache = True
-        model = Speech2Text2Decoder(config=config).to(torch_device).eval()
-        input_ids = input_ids[:2]
-
-        input_ids[input_ids == 0] += 1
-        # first forward pass
-        outputs = model(input_ids, use_cache=True)
-        outputs_use_cache_conf = model(input_ids)
-        outputs_no_past = model(input_ids, use_cache=False)
-
-        self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
-        self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
-
-        past_key_values = outputs["past_key_values"]
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((2, 1), config.vocab_size - 1) + 1
-
-        # append to next input_ids and
-        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
-
-        print(next_input_ids)
-
-        output_from_no_past = model(next_input_ids)["last_hidden_state"]
-        output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"]
-
-        # select random slice
-        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
-        output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach()
-        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
-
-        # test that outputs are equal for slice
-        assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            attention_mask,
-            lm_labels,
-        ) = config_and_inputs
-
-        inputs_dict = {
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-        }
-        return config, inputs_dict
-
-
-@require_torch
-class Speech2Text2StandaloneDecoderModelTest(
-    ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase
-):
-    all_model_classes = (Speech2Text2Decoder, Speech2Text2ForCausalLM) if is_torch_available() else ()
-    all_generative_model_classes = (Speech2Text2ForCausalLM,) if is_torch_available() else ()
-    pipeline_model_mapping = {"text-generation": Speech2Text2ForCausalLM} if is_torch_available() else {}
-    fx_compatible = True
-    test_pruning = False
-
-    def setUp(
-        self,
-    ):
-        self.model_tester = Speech2Text2StandaloneDecoderModelTester(self, is_training=False)
-        self.config_tester = ConfigTester(self, config_class=Speech2Text2Config)
-
-    # not implemented currently
-    def test_inputs_embeds(self):
-        pass
-
-    # speech2text2 has no base model
-    def test_save_load_fast_init_from_base(self):
-        pass
-
-    # speech2text2 has no base model
-    def test_save_load_fast_init_to_base(self):
-        pass
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_decoder_model_past(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_decoder_model_past(*config_and_inputs)
-
-    # decoder cannot keep gradients
-    def test_retain_grad_hidden_states_attentions(self):
-        return
--- a/tests/models/speech_to_text_2/test_tokenization_speech_to_text_2.py
+++ b/tests/models/speech_to_text_2/test_tokenization_speech_to_text_2.py
@@ -1,98 +0,0 @@
-# Copyright 2021 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-import json
-import os
-import tempfile
-import unittest
-
-from transformers.models.speech_to_text_2 import Speech2Text2Tokenizer
-from transformers.models.speech_to_text_2.tokenization_speech_to_text_2 import VOCAB_FILES_NAMES
-
-from ...test_tokenization_common import TokenizerTesterMixin
-
-
-class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
-    from_pretrained_id = "facebook/s2t-wav2vec2-large-en-de"
-    tokenizer_class = Speech2Text2Tokenizer
-    test_rust_tokenizer = False
-
-    def setUp(self):
-        super().setUp()
-
-        vocab = "<s> <pad> </s> <unk> here@@ a couple of@@ words for the he@@ re@@ vocab".split(" ")
-        merges = ["he re</w> 123", "here a 1456"]
-        vocab_tokens = dict(zip(vocab, range(len(vocab))))
-
-        self.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
-
-        self.tmpdirname = tempfile.mkdtemp()
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(vocab_tokens) + "\n")
-
-        with open(self.merges_file, "w") as fp:
-            fp.write("\n".join(merges))
-
-    def test_get_vocab(self):
-        vocab_keys = list(self.get_tokenizer().get_vocab().keys())
-
-        self.assertEqual(vocab_keys[0], "<s>")
-        self.assertEqual(vocab_keys[1], "<pad>")
-        self.assertEqual(vocab_keys[-1], "vocab")
-        self.assertEqual(len(vocab_keys), 14)
-
-    def test_vocab_size(self):
-        self.assertEqual(self.get_tokenizer().vocab_size, 14)
-
-    def test_tokenizer_decode(self):
-        tokenizer = Speech2Text2Tokenizer.from_pretrained(self.tmpdirname)
-
-        # make sure @@ is correctly concatenated
-        token_ids = [4, 6, 8, 7, 10]  # ["here@@", "couple", "words", "of@@", "the"]
-        output_string = tokenizer.decode(token_ids)
-
-        self.assertTrue(output_string == "herecouple words ofthe")
-
-    def test_load_no_merges_file(self):
-        tokenizer = Speech2Text2Tokenizer.from_pretrained(self.tmpdirname)
-
-        with tempfile.TemporaryDirectory() as tmp_dirname:
-            tokenizer.save_pretrained(tmp_dirname)
-            os.remove(os.path.join(tmp_dirname, "merges.txt"))
-
-            # load tokenizer without merges file should not throw an error
-            tokenizer = Speech2Text2Tokenizer.from_pretrained(tmp_dirname)
-
-        with tempfile.TemporaryDirectory() as tmp_dirname:
-            # save tokenizer and load again
-            tokenizer.save_pretrained(tmp_dirname)
-            tokenizer = Speech2Text2Tokenizer.from_pretrained(tmp_dirname)
-
-        self.assertIsNotNone(tokenizer)
-
-    # overwrite since merges_file is optional
-    def test_tokenizer_slow_store_full_signature(self):
-        if not self.test_slow_tokenizer:
-            return
-
-        signature = inspect.signature(self.tokenizer_class.__init__)
-        tokenizer = self.get_tokenizer()
-
-        for parameter_name, parameter in signature.parameters.items():
-            if parameter.default != inspect.Parameter.empty and parameter_name != "merges_file":
-                self.assertIn(parameter_name, tokenizer.init_kwargs)
--- a/tests/models/tvlt/init.py
+++ b/tests/models/tvlt/init.py
--- a/tests/models/tvlt/test_feature_extraction_tvlt.py
+++ b/tests/models/tvlt/test_feature_extraction_tvlt.py
@@ -1,182 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TVLT feature extraction."""
-
-import itertools
-import random
-import unittest
-
-import numpy as np
-
-from transformers import TvltFeatureExtractor, is_datasets_available
-from transformers.testing_utils import require_torch, require_torchaudio
-from transformers.utils.import_utils import is_torch_available
-
-from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
-
-
-if is_torch_available():
-    import torch
-
-if is_datasets_available():
-    from datasets import load_dataset
-
-global_rng = random.Random()
-
-
-# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list
-def floats_list(shape, scale=1.0, rng=None, name=None):
-    """Creates a random float32 tensor"""
-    if rng is None:
-        rng = global_rng
-
-    values = []
-    for batch_idx in range(shape[0]):
-        values.append([])
-        for _ in range(shape[1]):
-            values[-1].append(rng.random() * scale)
-
-    return values
-
-
-class TvltFeatureExtractionTester(unittest.TestCase):
-    def __init__(
-        self,
-        parent,
-        batch_size=7,
-        min_seq_length=400,
-        max_seq_length=2000,
-        spectrogram_length=2048,
-        feature_size=128,
-        num_audio_channels=1,
-        hop_length=512,
-        chunk_length=30,
-        sampling_rate=44100,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.min_seq_length = min_seq_length
-        self.max_seq_length = max_seq_length
-        self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1)
-        self.spectrogram_length = spectrogram_length
-        self.feature_size = feature_size
-        self.num_audio_channels = num_audio_channels
-        self.hop_length = hop_length
-        self.chunk_length = chunk_length
-        self.sampling_rate = sampling_rate
-
-    def prepare_feat_extract_dict(self):
-        return {
-            "spectrogram_length": self.spectrogram_length,
-            "feature_size": self.feature_size,
-            "num_audio_channels": self.num_audio_channels,
-            "hop_length": self.hop_length,
-            "chunk_length": self.chunk_length,
-            "sampling_rate": self.sampling_rate,
-        }
-
-    def prepare_inputs_for_common(self, equal_length=False, numpify=False):
-        def _flatten(list_of_lists):
-            return list(itertools.chain(*list_of_lists))
-
-        if equal_length:
-            speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)]
-        else:
-            # make sure that inputs increase in size
-            speech_inputs = [
-                floats_list((x, self.feature_size))
-                for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
-            ]
-        if numpify:
-            speech_inputs = [np.asarray(x) for x in speech_inputs]
-        return speech_inputs
-
-
-@require_torch
-@require_torchaudio
-class TvltFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
-    feature_extraction_class = TvltFeatureExtractor
-
-    def setUp(self):
-        self.feat_extract_tester = TvltFeatureExtractionTester(self)
-
-    def test_feat_extract_properties(self):
-        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
-        self.assertTrue(hasattr(feature_extractor, "spectrogram_length"))
-        self.assertTrue(hasattr(feature_extractor, "feature_size"))
-        self.assertTrue(hasattr(feature_extractor, "num_audio_channels"))
-        self.assertTrue(hasattr(feature_extractor, "hop_length"))
-        self.assertTrue(hasattr(feature_extractor, "chunk_length"))
-        self.assertTrue(hasattr(feature_extractor, "sampling_rate"))
-
-    def test_call(self):
-        # Initialize feature_extractor
-        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
-
-        # create three inputs of length 800, 1000, and 1200
-        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
-        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
-
-        # Test not batched input
-        encoded_audios = feature_extractor(np_speech_inputs[0], return_tensors="np", sampling_rate=44100).audio_values
-
-        self.assertTrue(encoded_audios.ndim == 4)
-        self.assertTrue(encoded_audios.shape[-1] == feature_extractor.feature_size)
-        self.assertTrue(encoded_audios.shape[-2] <= feature_extractor.spectrogram_length)
-        self.assertTrue(encoded_audios.shape[-3] == feature_extractor.num_channels)
-
-        # Test batched
-        encoded_audios = feature_extractor(np_speech_inputs, return_tensors="np", sampling_rate=44100).audio_values
-
-        self.assertTrue(encoded_audios.ndim == 4)
-        self.assertTrue(encoded_audios.shape[-1] == feature_extractor.feature_size)
-        self.assertTrue(encoded_audios.shape[-2] <= feature_extractor.spectrogram_length)
-        self.assertTrue(encoded_audios.shape[-3] == feature_extractor.num_channels)
-
-        # Test audio masking
-        encoded_audios = feature_extractor(
-            np_speech_inputs, return_tensors="np", sampling_rate=44100, mask_audio=True
-        ).audio_values
-
-        self.assertTrue(encoded_audios.ndim == 4)
-        self.assertTrue(encoded_audios.shape[-1] == feature_extractor.feature_size)
-        self.assertTrue(encoded_audios.shape[-2] <= feature_extractor.spectrogram_length)
-        self.assertTrue(encoded_audios.shape[-3] == feature_extractor.num_channels)
-
-        # Test 2-D numpy arrays are batched.
-        speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)]
-        np_speech_inputs = np.asarray(speech_inputs)
-        encoded_audios = feature_extractor(np_speech_inputs, return_tensors="np", sampling_rate=44100).audio_values
-        self.assertTrue(encoded_audios.ndim == 4)
-        self.assertTrue(encoded_audios.shape[-1] == feature_extractor.feature_size)
-        self.assertTrue(encoded_audios.shape[-2] <= feature_extractor.spectrogram_length)
-        self.assertTrue(encoded_audios.shape[-3] == feature_extractor.num_channels)
-
-    def _load_datasamples(self, num_samples):
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
-
-        return [x["array"] for x in speech_samples]
-
-    def test_integration(self):
-        input_speech = self._load_datasamples(1)
-        feature_extractor = TvltFeatureExtractor()
-        audio_values = feature_extractor(input_speech, return_tensors="pt").audio_values
-
-        self.assertEqual(audio_values.shape, (1, 1, 192, 128))
-
-        expected_slice = torch.tensor([[-0.3032, -0.2708], [-0.4434, -0.4007]])
-        self.assertTrue(torch.allclose(audio_values[0, 0, :2, :2], expected_slice, atol=1e-4))
--- a/tests/models/tvlt/test_image_processor_tvlt.py
+++ b/tests/models/tvlt/test_image_processor_tvlt.py
@@ -1,294 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TVLT image processor."""
-
-import unittest
-
-import numpy as np
-
-from transformers.testing_utils import require_torch, require_vision
-from transformers.utils import is_torch_available, is_vision_available
-
-from ...test_image_processing_common import ImageProcessingTestMixin
-
-
-if is_torch_available():
-    import torch
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import TvltImageProcessor
-
-
-def prepare_video(image_processor_tester, width=10, height=10, numpify=False, torchify=False):
-    """This function prepares a video as a list of PIL images/NumPy arrays/PyTorch tensors."""
-
-    video = []
-    for i in range(image_processor_tester.num_frames):
-        video.append(np.random.randint(255, size=(image_processor_tester.num_channels, width, height), dtype=np.uint8))
-
-    if not numpify and not torchify:
-        # PIL expects the channel dimension as last dimension
-        video = [Image.fromarray(np.moveaxis(frame, 0, -1)) for frame in video]
-
-    if torchify:
-        video = [torch.from_numpy(frame) for frame in video]
-
-    return video
-
-
-def prepare_video_inputs(image_processor_tester, equal_resolution=False, numpify=False, torchify=False):
-    """This function prepares a batch of videos: a list of list of PIL images, or a list of list of numpy arrays if
-    one specifies numpify=True, or a list of list of PyTorch tensors if one specifies torchify=True.
-    One can specify whether the videos are of the same resolution or not.
-    """
-
-    assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
-
-    video_inputs = []
-    for i in range(image_processor_tester.batch_size):
-        if equal_resolution:
-            width = height = image_processor_tester.max_resolution
-        else:
-            width, height = np.random.choice(
-                np.arange(image_processor_tester.min_resolution, image_processor_tester.max_resolution), 2
-            )
-            video = prepare_video(
-                image_processor_tester=image_processor_tester,
-                width=width,
-                height=height,
-                numpify=numpify,
-                torchify=torchify,
-            )
-        video_inputs.append(video)
-
-    return video_inputs
-
-
-class TvltImageProcessorTester(unittest.TestCase):
-    def __init__(
-        self,
-        parent,
-        batch_size=7,
-        num_channels=3,
-        num_frames=4,
-        image_size=18,
-        min_resolution=30,
-        max_resolution=400,
-        do_resize=True,
-        size=None,
-        do_normalize=True,
-        image_mean=[0.5, 0.5, 0.5],
-        image_std=[0.5, 0.5, 0.5],
-        do_center_crop=True,
-        crop_size=None,
-    ):
-        size = size if size is not None else {"shortest_edge": 18}
-        crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
-
-        self.parent = parent
-        self.batch_size = batch_size
-        self.num_channels = num_channels
-        self.num_frames = num_frames
-        self.image_size = image_size
-        self.min_resolution = min_resolution
-        self.max_resolution = max_resolution
-        self.do_resize = do_resize
-        self.size = size
-        self.do_normalize = do_normalize
-        self.image_mean = image_mean
-        self.image_std = image_std
-        self.do_center_crop = do_center_crop
-        self.crop_size = crop_size
-
-    def prepare_image_processor_dict(self):
-        return {
-            "image_mean": self.image_mean,
-            "image_std": self.image_std,
-            "do_normalize": self.do_normalize,
-            "do_resize": self.do_resize,
-            "size": self.size,
-            "do_center_crop": self.do_center_crop,
-            "crop_size": self.crop_size,
-        }
-
-
-@require_torch
-@require_vision
-class TvltImageProcessorTest(ImageProcessingTestMixin, unittest.TestCase):
-    image_processing_class = TvltImageProcessor if is_vision_available() else None
-
-    def setUp(self):
-        self.image_processor_tester = TvltImageProcessorTester(self)
-
-    @property
-    def image_processor_dict(self):
-        return self.image_processor_tester.prepare_image_processor_dict()
-
-    def test_image_processor_properties(self):
-        image_processor = self.image_processing_class(**self.image_processor_dict)
-        self.assertTrue(hasattr(image_processor, "image_mean"))
-        self.assertTrue(hasattr(image_processor, "image_std"))
-        self.assertTrue(hasattr(image_processor, "do_normalize"))
-        self.assertTrue(hasattr(image_processor, "do_resize"))
-        self.assertTrue(hasattr(image_processor, "do_center_crop"))
-        self.assertTrue(hasattr(image_processor, "size"))
-
-    def test_call_pil(self):
-        # Initialize image_processor
-        image_processor = self.image_processing_class(**self.image_processor_dict)
-        # create random PIL videos
-        video_inputs = prepare_video_inputs(self.image_processor_tester, equal_resolution=False)
-        for video in video_inputs:
-            self.assertIsInstance(video, list)
-            self.assertIsInstance(video[0], Image.Image)
-
-        # Test not batched input
-        encoded_videos = image_processor(video_inputs[0], return_tensors="pt").pixel_values
-        self.assertEqual(
-            encoded_videos.shape,
-            (
-                1,
-                self.image_processor_tester.num_frames,
-                self.image_processor_tester.num_channels,
-                self.image_processor_tester.crop_size["height"],
-                self.image_processor_tester.crop_size["width"],
-            ),
-        )
-
-        # Test batched
-        encoded_videos = image_processor(video_inputs, return_tensors="pt").pixel_values
-        self.assertEqual(
-            encoded_videos.shape,
-            (
-                self.image_processor_tester.batch_size,
-                self.image_processor_tester.num_frames,
-                self.image_processor_tester.num_channels,
-                self.image_processor_tester.crop_size["height"],
-                self.image_processor_tester.crop_size["width"],
-            ),
-        )
-
-    def test_call_numpy(self):
-        # Initialize image_processor
-        image_processor = self.image_processing_class(**self.image_processor_dict)
-        # create random numpy tensors
-        video_inputs = prepare_video_inputs(self.image_processor_tester, equal_resolution=False, numpify=True)
-        for video in video_inputs:
-            self.assertIsInstance(video, list)
-            self.assertIsInstance(video[0], np.ndarray)
-
-        # Test not batched input
-        encoded_videos = image_processor(video_inputs[0], return_tensors="pt").pixel_values
-        self.assertEqual(
-            encoded_videos.shape,
-            (
-                1,
-                self.image_processor_tester.num_frames,
-                self.image_processor_tester.num_channels,
-                self.image_processor_tester.crop_size["height"],
-                self.image_processor_tester.crop_size["width"],
-            ),
-        )
-
-        # Test batched
-        encoded_videos = image_processor(video_inputs, return_tensors="pt").pixel_values
-        self.assertEqual(
-            encoded_videos.shape,
-            (
-                self.image_processor_tester.batch_size,
-                self.image_processor_tester.num_frames,
-                self.image_processor_tester.num_channels,
-                self.image_processor_tester.crop_size["height"],
-                self.image_processor_tester.crop_size["width"],
-            ),
-        )
-
-    def test_call_numpy_4_channels(self):
-        # Initialize image_processor
-        image_processor = self.image_processing_class(**self.image_processor_dict)
-        # create random numpy tensors
-        self.image_processor_tester.num_channels = 4
-        video_inputs = prepare_video_inputs(self.image_processor_tester, equal_resolution=False, numpify=True)
-        for video in video_inputs:
-            self.assertIsInstance(video, list)
-            self.assertIsInstance(video[0], np.ndarray)
-
-        # Test not batched input
-        encoded_videos = image_processor(
-            video_inputs[0], return_tensors="pt", input_data_format="channels_first", image_mean=0, image_std=1
-        ).pixel_values
-        self.assertEqual(
-            encoded_videos.shape,
-            (
-                1,
-                self.image_processor_tester.num_frames,
-                self.image_processor_tester.num_channels,
-                self.image_processor_tester.crop_size["height"],
-                self.image_processor_tester.crop_size["width"],
-            ),
-        )
-
-        # Test batched
-        encoded_videos = image_processor(
-            video_inputs, return_tensors="pt", input_data_format="channels_first", image_mean=0, image_std=1
-        ).pixel_values
-        self.assertEqual(
-            encoded_videos.shape,
-            (
-                self.image_processor_tester.batch_size,
-                self.image_processor_tester.num_frames,
-                self.image_processor_tester.num_channels,
-                self.image_processor_tester.crop_size["height"],
-                self.image_processor_tester.crop_size["width"],
-            ),
-        )
-        self.image_processor_tester.num_channels = 3
-
-    def test_call_pytorch(self):
-        # Initialize image_processor
-        image_processor = self.image_processing_class(**self.image_processor_dict)
-        # create random PyTorch tensors
-        video_inputs = prepare_video_inputs(self.image_processor_tester, equal_resolution=False, torchify=True)
-        for video in video_inputs:
-            self.assertIsInstance(video, list)
-            self.assertIsInstance(video[0], torch.Tensor)
-
-        # Test not batched input
-        encoded_videos = image_processor(video_inputs[0], return_tensors="pt").pixel_values
-        self.assertEqual(
-            encoded_videos.shape,
-            (
-                1,
-                self.image_processor_tester.num_frames,
-                self.image_processor_tester.num_channels,
-                self.image_processor_tester.crop_size["height"],
-                self.image_processor_tester.crop_size["width"],
-            ),
-        )
-
-        # Test batched
-        encoded_videos = image_processor(video_inputs, return_tensors="pt").pixel_values
-        self.assertEqual(
-            encoded_videos.shape,
-            (
-                self.image_processor_tester.batch_size,
-                self.image_processor_tester.num_frames,
-                self.image_processor_tester.num_channels,
-                self.image_processor_tester.crop_size["height"],
-                self.image_processor_tester.crop_size["width"],
-            ),
-        )
--- a/tests/models/tvlt/test_modeling_tvlt.py
+++ b/tests/models/tvlt/test_modeling_tvlt.py
@@ -1,625 +0,0 @@
-# coding=utf-8
-# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch TVLT model."""
-
-import copy
-import inspect
-import unittest
-
-import numpy as np
-from huggingface_hub import hf_hub_download
-
-from transformers import (
-    TvltConfig,
-    is_datasets_available,
-    is_speech_available,
-    is_torch_available,
-    is_vision_available,
-)
-from transformers.testing_utils import require_torch, require_vision, slow, torch_device
-from transformers.utils import cached_property
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, floats_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
-    import torch
-    import torch.nn as nn
-
-    from transformers import TvltForAudioVisualClassification, TvltForPreTraining, TvltModel
-
-
-if is_datasets_available():
-    from datasets import load_dataset
-
-if is_vision_available():
-    from transformers import TvltImageProcessor
-
-if is_speech_available():
-    from transformers import TvltFeatureExtractor
-
-
-class TvltModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=2,
-        image_size=32,
-        spectrogram_length=32,
-        frequency_length=16,
-        image_patch_size=[2, 2],
-        audio_patch_size=[2, 2],
-        num_image_channels=3,
-        num_audio_channels=1,
-        num_frames=2,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=128,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.0,
-        attention_probs_dropout_prob=0.0,
-        initializer_range=0.02,
-        layer_norm_eps=1e-12,
-        qkv_bias=True,
-        use_mean_pooling=True,
-        decoder_num_attention_heads=4,
-        decoder_hidden_size=32,
-        decoder_num_hidden_layers=2,
-        decoder_intermediate_size=128,
-        image_mask_ratio=0.75,
-        audio_mask_ratio=0.15,
-        audio_mask_type="frame-level",
-        task_matching=True,
-        task_mae=True,
-        num_labels=1,
-        is_training=True,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.spectrogram_length = spectrogram_length
-        self.frequency_length = frequency_length
-        self.image_patch_size = image_patch_size
-        self.audio_patch_size = audio_patch_size
-        self.num_image_channels = num_image_channels
-        self.num_audio_channels = num_audio_channels
-        self.num_frames = num_frames
-
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.initializer_range = initializer_range
-        self.layer_norm_eps = layer_norm_eps
-        self.qkv_bias = qkv_bias
-        self.use_mean_pooling = use_mean_pooling
-
-        self.decoder_num_attention_heads = decoder_num_attention_heads
-        self.decoder_hidden_size = decoder_hidden_size
-        self.decoder_num_hidden_layers = decoder_num_hidden_layers
-        self.decoder_intermediate_size = decoder_intermediate_size
-        self.image_mask_ratio = image_mask_ratio
-        self.audio_mask_ratio = audio_mask_ratio
-
-        self.task_matching = task_matching
-        self.task_mae = task_mae
-        self.num_labels = num_labels
-
-        self.expected_pixel_seq_len = (self.image_size // self.image_patch_size[0]) ** 2 * self.num_frames
-        self.expected_audio_seq_len = (self.spectrogram_length // self.audio_patch_size[0]) * (
-            self.frequency_length // self.audio_patch_size[1]
-        )
-        # we set the expected sequence length (which is used in several tests)
-        # this is equal to the seq length of number of image/video patches + number of audio patches
-        self.expected_seq_len = self.expected_pixel_seq_len + self.expected_audio_seq_len + 1
-
-        self.image_mae_output_dim = image_patch_size[0] ** 2 * num_image_channels
-        self.audio_mae_output_dim = audio_patch_size[0] * audio_patch_size[1] * num_audio_channels
-        self.is_training = is_training
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor(
-            [self.batch_size, self.num_frames, self.num_image_channels, self.image_size, self.image_size]
-        )
-        audio_values = floats_tensor(
-            [self.batch_size, self.num_audio_channels, self.spectrogram_length, self.frequency_length]
-        )
-
-        pixel_mask = floats_tensor([self.batch_size, self.expected_pixel_seq_len])
-        audio_mask = floats_tensor([self.batch_size, self.expected_audio_seq_len])
-
-        config = self.get_config()
-
-        return (config, pixel_values, audio_values, pixel_mask, audio_mask)
-
-    def prepare_config_and_inputs_for_pretraining(self):
-        pixel_values = floats_tensor(
-            [self.batch_size, self.num_frames, self.num_image_channels, self.image_size, self.image_size]
-        )
-        audio_values = floats_tensor(
-            [self.batch_size, self.num_audio_channels, self.spectrogram_length, self.frequency_length]
-        )
-
-        pixel_mask = floats_tensor([self.batch_size, self.expected_pixel_seq_len])
-        audio_mask = floats_tensor([self.batch_size, self.expected_audio_seq_len])
-
-        pixel_values_mixed = floats_tensor(
-            [self.batch_size, self.num_frames, self.num_image_channels, self.image_size, self.image_size]
-        )
-        pixel_mask_mixed = floats_tensor([self.batch_size, self.expected_pixel_seq_len])
-        labels = floats_tensor([self.batch_size])
-        config = self.get_config()
-
-        return (
-            config,
-            pixel_values,
-            audio_values,
-            pixel_mask,
-            audio_mask,
-            pixel_values_mixed,
-            pixel_mask_mixed,
-            labels,
-        )
-
-    def get_config(self):
-        return TvltConfig(
-            image_size=self.image_size,
-            spectrogram_length=self.spectrogram_length,
-            frequency_length=self.frequency_length,
-            image_patch_size=self.image_patch_size,
-            audio_patch_size=self.audio_patch_size,
-            num_image_channels=self.num_image_channels,
-            num_audio_channels=self.num_audio_channels,
-            num_frames=self.num_frames,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            initializer_range=self.initializer_range,
-            layer_norm_eps=self.layer_norm_eps,
-            qkv_bias=self.qkv_bias,
-            use_mean_pooling=self.use_mean_pooling,
-            decoder_num_attention_heads=self.decoder_num_attention_heads,
-            decoder_hidden_size=self.decoder_hidden_size,
-            decoder_num_hidden_layers=self.decoder_num_hidden_layers,
-            decoder_intermediate_size=self.decoder_intermediate_size,
-            image_mask_ratio=self.image_mask_ratio,
-            audio_mask_ratio=self.audio_mask_ratio,
-            task_matching=self.task_matching,
-            task_mae=self.task_mae,
-            num_labels=self.num_labels,
-        )
-
-    def create_and_check_model(self, config, pixel_values, audio_values, pixel_mask, audio_mask):
-        model = TvltModel(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(pixel_values, audio_values, pixel_mask=pixel_mask, audio_mask=audio_mask)
-        result = model(pixel_values, audio_values)
-        self.parent.assertEqual(
-            result.last_hidden_state.shape, (self.batch_size, self.expected_seq_len, self.hidden_size)
-        )
-
-    def create_and_check_for_audiovisual_classification(
-        self, config, pixel_values, audio_values, pixel_mask, audio_mask
-    ):
-        model = TvltForAudioVisualClassification(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(pixel_values, audio_values, pixel_mask=pixel_mask, audio_mask=audio_mask)
-        result = model(pixel_values, audio_values)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_for_pretraining(
-        self,
-        config,
-        pixel_values,
-        audio_values,
-        pixel_mask,
-        audio_mask,
-        pixel_values_mixed,
-        pixel_mask_mixed,
-        labels,
-    ):
-        model = TvltForPreTraining(config=config)
-        model.to(torch_device)
-        model.train()
-        result = model(
-            pixel_values,
-            audio_values,
-            pixel_mask,
-            audio_mask,
-            pixel_values_mixed=pixel_values_mixed,
-            pixel_mask_mixed=pixel_mask_mixed,
-            labels=labels,
-        )
-        self.parent.assertEqual(
-            result.pixel_logits.shape, (self.batch_size, self.expected_pixel_seq_len, self.image_mae_output_dim)
-        )
-        self.parent.assertEqual(
-            result.audio_logits.shape, (self.batch_size, self.expected_audio_seq_len, self.audio_mae_output_dim)
-        )
-        self.parent.assertEqual(result.matching_logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_for_pretraining_inference(
-        self,
-        config,
-        pixel_values,
-        audio_values,
-        pixel_mask,
-        audio_mask,
-        pixel_values_mixed,
-        pixel_mask_mixed,
-        labels,
-    ):
-        model = TvltForPreTraining(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            pixel_values,
-            audio_values,
-            pixel_mask,
-            audio_mask,
-            pixel_values_mixed=pixel_values_mixed,
-            pixel_mask_mixed=pixel_mask_mixed,
-            labels=labels,
-        )
-        if result.pixel_logits is not None:
-            self.parent.assertEqual(
-                result.pixel_logits.shape, (self.batch_size, self.expected_pixel_seq_len, self.image_mae_output_dim)
-            )
-        if result.audio_logits is not None:
-            self.parent.assertEqual(
-                result.audio_logits.shape, (self.batch_size, self.expected_audio_seq_len, self.audio_mae_output_dim)
-            )
-        self.parent.assertEqual(result.matching_logits.shape, (self.batch_size, self.num_labels))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (config, pixel_values, audio_values, pixel_mask, audio_mask) = config_and_inputs
-        inputs_dict = {
-            "pixel_values": pixel_values,
-            "audio_values": audio_values,
-            "pixel_mask": pixel_mask,
-            "audio_mask": audio_mask,
-        }
-        return config, inputs_dict
-
-    def prepare_pixel_values(self):
-        return floats_tensor(
-            [self.batch_size, self.num_frames, self.num_image_channels, self.image_size, self.image_size]
-        )
-
-    def prepare_audio_values(self):
-        return floats_tensor(
-            [self.batch_size, self.num_audio_channels, self.spectrogram_length, self.frequency_length]
-        )
-
-
-@require_torch
-class TvltModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (TvltModel, TvltForPreTraining, TvltForAudioVisualClassification) if is_torch_available() else ()
-    )
-    pipeline_model_mapping = {"feature-extraction": TvltModel} if is_torch_available() else {}
-
-    fx_compatible = False
-    test_pruning = False
-    test_headmasking = False
-    test_torchscript = False
-    test_resize_embeddings = False
-    main_input_name = "pixel_values"
-
-    # TvltForAudioVisualClassification and TvltForPreTraining require special treatment
-    def _prepare_for_class(self, inputs_dict, model_class, return_labels=True):
-        inputs_dict = copy.deepcopy(inputs_dict)
-
-        if return_labels:
-            if model_class.__name__ == "TvltForAudioVisualClassification":
-                inputs_dict["labels"] = torch.zeros(
-                    (self.model_tester.batch_size,), dtype=torch.long, device=torch_device
-                )
-            elif model_class.__name__ == "TvltForPreTraining":
-                inputs_dict["labels"] = torch.zeros(
-                    (self.model_tester.batch_size,), dtype=torch.float, device=torch_device
-                )
-                inputs_dict["pixel_values_mixed"] = torch.zeros(
-                    (
-                        self.model_tester.batch_size,
-                        self.model_tester.num_frames,
-                        self.model_tester.num_image_channels,
-                        self.model_tester.image_size,
-                        self.model_tester.image_size,
-                    ),
-                    dtype=torch.float,
-                    device=torch_device,
-                )
-                inputs_dict["pixel_mask_mixed"] = torch.zeros(
-                    (self.model_tester.batch_size, self.model_tester.expected_pixel_seq_len),
-                    dtype=torch.float,
-                    device=torch_device,
-                )
-
-        return inputs_dict
-
-    def setUp(self):
-        self.model_tester = TvltModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=TvltConfig, has_text_modality=False, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    @unittest.skip(reason="TVLT does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    def test_model_common_attributes(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            input_embeddings = model.get_input_embeddings()
-            self.assertIsInstance(input_embeddings, (tuple))
-            for embedding in input_embeddings:
-                self.assertIsInstance(embedding, (nn.Module))
-            x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, nn.Linear))
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values", "audio_values"]
-            self.assertListEqual(arg_names[:2], expected_arg_names)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_audiovisual_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_audiovisual_classification(*config_and_inputs)
-
-    def test_for_pretraining(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_pretraining()
-        self.model_tester.create_and_check_for_pretraining(*config_and_inputs)
-        self.model_tester.create_and_check_for_pretraining_inference(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "ZinengTang/tvlt-base"
-        model = TvltModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    def test_training(self):
-        if not self.model_tester.is_training:
-            return
-
-        for model_class in self.all_model_classes[1:]:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            config.return_dict = True
-
-            model = model_class(config)
-            model.to(torch_device)
-            model.train()
-            inputs = self._prepare_for_class(inputs_dict, model_class)
-            for k, v in inputs.items():
-                print(k, v.shape)
-            loss = model(**inputs).loss
-            loss.backward()
-
-    def test_training_gradient_checkpointing(self):
-        if not self.model_tester.is_training:
-            return
-
-        for model_class in self.all_model_classes[1:]:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            config.use_cache = False
-            config.return_dict = True
-
-            model = model_class(config)
-            model.to(torch_device)
-            model.gradient_checkpointing_enable()
-            model.train()
-            inputs = self._prepare_for_class(inputs_dict, model_class)
-            loss = model(**inputs).loss
-            loss.backward()
-
-    def test_attention_outputs(self):
-        if not self.has_attentions:
-            pass
-
-        else:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            config.return_dict = True
-
-            for model_class in self.all_model_classes[2:]:
-                seq_len = self.model_tester.expected_seq_len
-
-                inputs_dict["output_attentions"] = True
-                inputs_dict["output_hidden_states"] = False
-                config.return_dict = True
-                model = model_class(config)
-                model.to(torch_device)
-                model.eval()
-                with torch.no_grad():
-                    outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-                attentions = outputs.attentions
-                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-                # check that output_attentions also work using config
-                del inputs_dict["output_attentions"]
-                config.output_attentions = True
-                model = model_class(config)
-                model.to(torch_device)
-                model.eval()
-                with torch.no_grad():
-                    outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-                attentions = outputs.attentions
-                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-                self.assertListEqual(
-                    list(attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads, seq_len, seq_len],
-                )
-                out_len = len(outputs)
-
-                # Check attention is always last and order is fine
-                inputs_dict["output_attentions"] = True
-                inputs_dict["output_hidden_states"] = True
-                model = model_class(config)
-                model.to(torch_device)
-                model.eval()
-                with torch.no_grad():
-                    outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-                self.assertEqual(out_len + 1, len(outputs))
-
-                self_attentions = outputs.attentions
-
-                self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
-                self.assertListEqual(
-                    list(self_attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads, seq_len, seq_len],
-                )
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-
-            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            hidden_states = outputs.hidden_states
-            expected_num_layers = self.model_tester.num_hidden_layers + 1
-            self.assertEqual(len(hidden_states), expected_num_layers)
-
-            seq_length = self.model_tester.expected_seq_len
-
-            self.assertListEqual(
-                list(hidden_states[0].shape[-2:]),
-                [seq_length, self.model_tester.hidden_size],
-            )
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes[2:]:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-
-# We will verify our results on a video of eating spaghetti
-# Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227]
-def prepare_video(num_frames=8):
-    file = hf_hub_download(
-        repo_id="hf-internal-testing/spaghetti-video", filename="eating_spaghetti.npy", repo_type="dataset"
-    )
-    video = np.load(file)[:num_frames]
-    return list(video)
-
-
-def prepare_audio(num_samples=1):
-    ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-    # automatic decoding with librispeech
-    speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
-    return [x["array"] for x in speech_samples]
-
-
-@require_torch
-@require_vision
-class TvltModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_processors(self):
-        # logits were tested with a different mean and std, so we use the same here
-        return (
-            TvltImageProcessor() if is_vision_available() else None,
-            TvltFeatureExtractor(),
-        )
-
-    def test_inference_for_base_model(self):
-        model = TvltModel.from_pretrained("ZinengTang/tvlt-base").to(torch_device)
-
-        image_processor, audio_feature_extractor = self.default_processors
-        video = prepare_video()
-        audio = prepare_audio()
-        video_inputs = image_processor(video, return_tensors="pt").to(torch_device)
-        audio_inputs = audio_feature_extractor(audio, return_tensors="pt").to(torch_device)
-        inputs = {}
-        inputs.update(video_inputs)
-        inputs.update(audio_inputs)
-
-        # forward pass
-        with torch.no_grad():
-            outputs = model(**inputs)
-
-        # verify the logits
-        expected_last_hidden_state_slice = torch.tensor([[-0.0186, -0.0691], [0.0242, -0.0398]], device=torch_device)
-        self.assertTrue(
-            torch.allclose(outputs.last_hidden_state[:, :2, :2], expected_last_hidden_state_slice, atol=1e-4)
-        )
-
-    def test_inference_for_pretraining(self):
-        model = TvltForPreTraining.from_pretrained("ZinengTang/tvlt-base").to(torch_device)
-
-        image_processor, audio_feature_extractor = self.default_processors
-        video = prepare_video()
-        video_mixed = prepare_video()
-        audio = prepare_audio()
-        video_inputs = image_processor(video, return_tensors="pt", mask_pixel=True).to(torch_device)
-        video_mixed_inputs = image_processor(video_mixed, is_mixed=True, return_tensors="pt").to(torch_device)
-        audio_inputs = audio_feature_extractor(audio, return_tensors="pt", mask_audio=True).to(torch_device)
-        labels = torch.tensor([[0.0]], device=torch_device)
-        inputs = {}
-        inputs.update(video_inputs)
-        inputs.update(video_mixed_inputs)
-        inputs.update(audio_inputs)
-        inputs.update({"labels": labels})
-
-        # forward pass
-        with torch.no_grad():
-            outputs = model(**inputs)
-
-        # verify the logits
-        expected_pixel_logits_shape = torch.Size([1, 1568, 768])
-        expected_audio_logits_shape = torch.Size([1, 96, 256])
-        expected_matching_logits_shape = torch.Size([1, 1])
-
-        if outputs.pixel_logits is not None:
-            self.assertEqual(outputs.pixel_logits.shape, expected_pixel_logits_shape)
-        if outputs.audio_logits is not None:
-            self.assertEqual(outputs.audio_logits.shape, expected_audio_logits_shape)
-        self.assertTrue(outputs.matching_logits.shape, expected_matching_logits_shape)
--- a/tests/models/tvlt/test_processor_tvlt.py
+++ b/tests/models/tvlt/test_processor_tvlt.py
@@ -1,116 +0,0 @@
-# Copyright 2023 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import shutil
-import tempfile
-import unittest
-
-import numpy as np
-import pytest
-
-from transformers import is_speech_available, is_vision_available
-from transformers.testing_utils import require_torch
-
-
-if is_vision_available():
-    from transformers import TvltImageProcessor
-
-if is_speech_available():
-    from transformers import TvltFeatureExtractor
-
-from transformers import TvltProcessor
-
-
-@require_torch
-class TvltProcessorTest(unittest.TestCase):
-    def setUp(self):
-        self.checkpoint = "ZinengTang/tvlt-base"
-        self.tmpdirname = tempfile.mkdtemp()
-
-    def get_image_processor(self, **kwargs):
-        return TvltImageProcessor.from_pretrained(self.checkpoint, **kwargs)
-
-    def get_feature_extractor(self, **kwargs):
-        return TvltFeatureExtractor.from_pretrained(self.checkpoint, **kwargs)
-
-    def tearDown(self):
-        shutil.rmtree(self.tmpdirname)
-
-    def test_save_load_pretrained_default(self):
-        image_processor = self.get_image_processor()
-        feature_extractor = self.get_feature_extractor()
-
-        processor = TvltProcessor(image_processor=image_processor, feature_extractor=feature_extractor)
-        processor.save_pretrained(self.tmpdirname)
-        processor = TvltProcessor.from_pretrained(self.tmpdirname)
-
-        self.assertIsInstance(processor.feature_extractor, TvltFeatureExtractor)
-        self.assertIsInstance(processor.image_processor, TvltImageProcessor)
-
-    def test_feature_extractor(self):
-        image_processor = self.get_image_processor()
-        feature_extractor = self.get_feature_extractor()
-
-        processor = TvltProcessor(image_processor=image_processor, feature_extractor=feature_extractor)
-
-        audio = np.ones([12000])
-
-        audio_dict = feature_extractor(audio, return_tensors="np")
-        input_processor = processor(audio=audio, return_tensors="np")
-
-        for key in audio_dict.keys():
-            self.assertAlmostEqual(audio_dict[key].sum(), input_processor[key].sum(), delta=1e-2)
-
-    def test_image_processor(self):
-        image_processor = self.get_image_processor()
-        feature_extractor = self.get_feature_extractor()
-
-        processor = TvltProcessor(image_processor=image_processor, feature_extractor=feature_extractor)
-
-        images = np.ones([3, 224, 224])
-
-        image_dict = image_processor(images, return_tensors="np")
-        input_processor = processor(images=images, return_tensors="np")
-
-        for key in image_dict.keys():
-            self.assertAlmostEqual(image_dict[key].sum(), input_processor[key].sum(), delta=1e-2)
-
-    def test_processor(self):
-        image_processor = self.get_image_processor()
-        feature_extractor = self.get_feature_extractor()
-
-        processor = TvltProcessor(image_processor=image_processor, feature_extractor=feature_extractor)
-
-        audio = np.ones([12000])
-        images = np.ones([3, 224, 224])
-
-        inputs = processor(audio=audio, images=images)
-
-        self.assertListEqual(list(inputs.keys()), ["audio_values", "audio_mask", "pixel_values", "pixel_mask"])
-
-        # test if it raises when no input is passed
-        with pytest.raises(ValueError):
-            processor()
-
-    def test_model_input_names(self):
-        image_processor = self.get_image_processor()
-        feature_extractor = self.get_feature_extractor()
-
-        processor = TvltProcessor(image_processor=image_processor, feature_extractor=feature_extractor)
-
-        self.assertListEqual(
-            processor.model_input_names,
-            image_processor.model_input_names + feature_extractor.model_input_names,
-            msg="`processor` and `image_processor`+`feature_extractor` model input names do not match",
-        )
--- a/tests/models/vit_hybrid/init.py
+++ b/tests/models/vit_hybrid/init.py
--- a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py
+++ b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py
@@ -1,281 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch ViT Hybrid model."""
-
-import unittest
-
-from transformers import ViTHybridConfig
-from transformers.testing_utils import is_flaky, require_accelerate, require_torch, require_vision, slow, torch_device
-from transformers.utils import cached_property, is_torch_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
-    import torch
-    from torch import nn
-
-    from transformers import ViTHybridForImageClassification, ViTHybridImageProcessor, ViTHybridModel
-
-
-if is_vision_available():
-    from PIL import Image
-
-
-class ViTHybridModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        image_size=64,
-        patch_size=2,
-        num_channels=3,
-        is_training=True,
-        use_labels=True,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        type_sequence_label_size=10,
-        initializer_range=0.02,
-        backbone_featmap_shape=[1, 16, 4, 4],
-        scope=None,
-        attn_implementation="eager",
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.scope = scope
-        self.backbone_featmap_shape = backbone_featmap_shape
-        self.attn_implementation = attn_implementation
-
-        # in ViT hybrid, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
-        # the number of patches is based on the feature map of the backbone, which by default uses an output stride
-        # of 32, which means that the feature map has a spatial resolution of 1/32 of the input image size
-        num_patches = (self.image_size // 32) ** 2
-        self.seq_length = num_patches + 1
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        labels = None
-        if self.use_labels:
-            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-
-        config = self.get_config()
-
-        return config, pixel_values, labels
-
-    def get_config(self):
-        backbone_config = {
-            "global_padding": "same",
-            "layer_type": "bottleneck",
-            "depths": [3, 4, 9],
-            "out_features": ["stage1", "stage2", "stage3"],
-            "embedding_dynamic_padding": True,
-            "hidden_sizes": [4, 8, 16, 32],
-            "num_groups": 2,
-        }
-
-        return ViTHybridConfig(
-            image_size=self.image_size,
-            patch_size=self.patch_size,
-            num_channels=self.num_channels,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-            backbone_featmap_shape=self.backbone_featmap_shape,
-            backbone_config=backbone_config,
-            backbone=None,
-            attn_implementation=self.attn_implementation,
-        )
-
-    def create_and_check_model(self, config, pixel_values, labels):
-        model = ViTHybridModel(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(pixel_values)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_for_image_classification(self, config, pixel_values, labels):
-        config.num_labels = self.type_sequence_label_size
-        model = ViTHybridForImageClassification(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(pixel_values, labels=labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values, labels = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_torch
-class ViTHybridModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    """
-    Here we also overwrite some of the tests of test_modeling_common.py, as ViT does not use input_ids, inputs_embeds,
-    attention_mask and seq_length.
-    """
-
-    all_model_classes = (ViTHybridModel, ViTHybridForImageClassification) if is_torch_available() else ()
-    pipeline_model_mapping = (
-        {"image-feature-extraction": ViTHybridModel, "image-classification": ViTHybridForImageClassification}
-        if is_torch_available()
-        else {}
-    )
-    test_pruning = False
-    test_resize_embeddings = False
-    test_head_masking = False
-    model_split_percents = [0.5, 0.9]
-
-    def setUp(self):
-        self.model_tester = ViTHybridModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=ViTHybridConfig, has_text_modality=False, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    @unittest.skip(reason="ViT does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    def test_model_common_attributes(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
-            x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, nn.Linear))
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            # Skip the check for the backbone
-            for name, module in model.named_modules():
-                if module.__class__.__name__ == "ViTHybridPatchEmbeddings":
-                    backbone_params = [f"{name}.{key}" for key in module.state_dict().keys()]
-                    break
-
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if name in backbone_params:
-                        continue
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "google/vit-hybrid-base-bit-384"
-        model = ViTHybridModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    @is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516")
-    def test_batching_equivalence(self):
-        super().test_batching_equivalence()
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_torch
-@require_vision
-class ViTModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return (
-            ViTHybridImageProcessor.from_pretrained("google/vit-hybrid-base-bit-384")
-            if is_vision_available()
-            else None
-        )
-
-    @slow
-    def test_inference_image_classification_head(self):
-        model = ViTHybridForImageClassification.from_pretrained("google/vit-hybrid-base-bit-384").to(torch_device)
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
-
-        # forward pass
-        with torch.no_grad():
-            outputs = model(**inputs)
-
-        # verify the logits
-        expected_shape = torch.Size((1, 1000))
-        self.assertEqual(outputs.logits.shape, expected_shape)
-
-        expected_slice = torch.tensor([-1.9090, -0.4993, -0.2389]).to(torch_device)
-
-        self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
-
-    @slow
-    @require_accelerate
-    def test_accelerate_inference(self):
-        image_processor = ViTHybridImageProcessor.from_pretrained("google/vit-hybrid-base-bit-384")
-        model = ViTHybridForImageClassification.from_pretrained("google/vit-hybrid-base-bit-384", device_map="auto")
-
-        image = prepare_img()
-
-        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
-        outputs = model(**inputs)
-        logits = outputs.logits
-        # model predicts one of the 1000 ImageNet classes
-        predicted_class_idx = logits.argmax(-1).item()
-
-        self.assertTrue(model.config.id2label[predicted_class_idx], "tabby, tabby cat")
--- a/tests/models/xlm_prophetnet/init.py
+++ b/tests/models/xlm_prophetnet/init.py
--- a/tests/models/xlm_prophetnet/test_modeling_xlm_prophetnet.py
+++ b/tests/models/xlm_prophetnet/test_modeling_xlm_prophetnet.py
@@ -1,150 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The HuggingFace Inc. team, The Microsoft Research team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import is_torch_available
-from transformers.testing_utils import require_torch, slow, torch_device
-
-
-if is_torch_available():
-    import torch
-
-    from transformers import XLMProphetNetForConditionalGeneration, XLMProphetNetTokenizer
-
-
-@require_torch
-class XLMProphetNetModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_pretrained_checkpoint_hidden_states(self):
-        model = XLMProphetNetForConditionalGeneration.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
-        model.to(torch_device)
-
-        # encoder-decoder outputs
-        encoder_ids = torch.tensor([[17, 96208, 103471, 2]]).to(torch_device)
-        decoder_prev_ids = torch.tensor(
-            [[2, 250, 9953, 34, 69489, 1620, 32, 118424, 624, 210, 105, 2913, 1032, 351]]
-        ).to(torch_device)
-        output = model(
-            input_ids=encoder_ids, attention_mask=None, encoder_outputs=None, decoder_input_ids=decoder_prev_ids
-        )
-        output_predited_logis = output[0]
-        expected_shape = torch.Size((1, 14, 250012))
-        self.assertEqual(output_predited_logis.shape, expected_shape)
-        expected_slice = torch.tensor(
-            [[[-6.3986, -8.2391, 12.5189], [-6.3289, -8.0864, 12.6211], [-6.2418, -8.0445, 12.7968]]]
-        ).to(torch_device)
-        self.assertTrue(torch.allclose(output_predited_logis[:, :3, :3], expected_slice, atol=1e-4))
-
-        # encoder outputs
-        encoder_outputs = model.prophetnet.encoder(encoder_ids)[0]
-        expected_encoder_outputs_slice = torch.tensor(
-            [[[-1.4260, -0.7628, 0.8453], [-1.4719, -0.1391, 0.7807], [-1.7678, 0.0114, 0.4646]]]
-        ).to(torch_device)
-        expected_shape_encoder = torch.Size((1, 4, 1024))
-        self.assertEqual(encoder_outputs.shape, expected_shape_encoder)
-        self.assertTrue(torch.allclose(encoder_outputs[:, :3, :3], expected_encoder_outputs_slice, atol=1e-4))
-
-        # decoder outputs
-        decoder_outputs = model.prophetnet.decoder(
-            decoder_prev_ids,
-            encoder_hidden_states=encoder_outputs,
-        )
-        predicting_streams = decoder_outputs[1].view(1, model.config.ngram, 14, -1)
-        predicting_streams_logits = model.lm_head(predicting_streams)
-        next_first_stream_logits = predicting_streams_logits[:, 0]
-        self.assertTrue(torch.allclose(next_first_stream_logits[:, :3, :3], expected_slice, atol=1e-4))
-
-    @slow
-    def test_ntg_hidden_states(self):
-        model = XLMProphetNetForConditionalGeneration.from_pretrained(
-            "microsoft/xprophetnet-large-wiki100-cased-xglue-ntg"
-        )
-        model.to(torch_device)
-
-        encoder_ids = torch.tensor([[17, 96208, 103471, 2]]).to(torch_device)
-        decoder_prev_ids = torch.tensor(
-            [[2, 250, 9953, 34, 69489, 1620, 32, 118424, 624, 210, 105, 2913, 1032, 351]]
-        ).to(torch_device)
-        output = model(
-            input_ids=encoder_ids, attention_mask=None, encoder_outputs=None, decoder_input_ids=decoder_prev_ids
-        )
-        output_predited_logis = output[0]
-        expected_shape = torch.Size((1, 14, 250012))
-        self.assertEqual(output_predited_logis.shape, expected_shape)
-        # compare the actual values for a slice.
-        expected_slice = torch.tensor(
-            [[[-9.2253, -9.7173, -6.3529], [-7.6701, -9.0145, -1.9382], [-8.0195, -7.0004, -0.1523]]]
-        ).to(torch_device)
-
-        self.assertTrue(torch.allclose(output_predited_logis[:, :3, :3], expected_slice, atol=1e-4))
-
-    @slow
-    def test_xprophetnet_ntg_inference(self):
-        model = XLMProphetNetForConditionalGeneration.from_pretrained(
-            "microsoft/xprophetnet-large-wiki100-cased-xglue-ntg"
-        )
-        model.to(torch_device)
-        model.config.max_length = 512
-
-        tokenizer = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased-xglue-ntg")
-
-        EN_SENTENCE = (
-            "Microsoft Corporation intends to officially end free support for the Windows 7 operating system after"
-            " January 14, 2020, according to the official portal of the organization. From that day, users of this"
-            " system will not be able to receive security updates, which could make their computers vulnerable to"
-            " cyber attacks."
-        )
-        RU_SENTENCE = (
-            "орпорация Microsoft намерена официально прекратить бесплатную поддержку операционной системы Windows 7"
-            " после 14 января 2020 года, сообщается на официальном портале организации . С указанного дня пользователи"
-            " этой системы не смогут получать обновления безопасности, из-за чего их компьютеры могут стать уязвимыми"
-            " к кибератакам."
-        )
-        ZH_SENTENCE = "根据该组织的官方门户网站，微软公司打算在2020年1月14日之后正式终止对Windows 7操作系统的免费支持。从那时起，该系统的用户将无法接收安全更新，这可能会使他们的计算机容易受到网络攻击。"
-
-        input_ids = tokenizer(
-            [EN_SENTENCE, RU_SENTENCE, ZH_SENTENCE], padding=True, max_length=255, return_tensors="pt"
-        ).input_ids
-        input_ids = input_ids.to(torch_device)
-
-        summary_ids = model.generate(
-            input_ids, num_beams=10, length_penalty=1.0, no_repeat_ngram_size=3, early_stopping=True
-        )
-        generated_titles = [tokenizer.decode(g, skip_special_tokens=True) for g in summary_ids]
-        EXPECTED_TITLE_EN = "Microsoft to end Windows 7 free support after January 14, 2020"
-        EXPECTED_TITLE_RU = "Microsoft намерена прекратить бесплатную поддержку Windows 7 после 14 января 2020 года"
-        EXPECTED_TITLE_ZH = "微软打算终止对Windows 7操作系统的免费支持"
-        self.assertListEqual(
-            [EXPECTED_TITLE_EN, EXPECTED_TITLE_RU, EXPECTED_TITLE_ZH],
-            generated_titles,
-        )
-
-        summary_ids_beam1 = model.generate(
-            input_ids, num_beams=1, length_penalty=1.0, no_repeat_ngram_size=3, early_stopping=True
-        )
-        generated_titles_beam1_tok = [
-            tokenizer.convert_ids_to_tokens(g, skip_special_tokens=True) for g in summary_ids_beam1
-        ]
-        EXPECTED_TITLE_EN_BEAM1_TOK = "▁Microsoft ▁to ▁end ▁free ▁support ▁for ▁Windows ▁7".split(" ")
-        EXPECTED_TITLE_RU_BEAM1_TOK = "▁Microsoft ▁намерен а ▁прекрати ть ▁бес плат ную ▁поддержку ▁Windows ▁7 ▁после ▁14 ▁января ▁2020 ▁года".split(
-            " "
-        )
-        EXPECTED_TITLE_ZH_BEAM1_TOK = "微软 公司 打算 终止 对 Windows ▁7 操作 系统的 免费 支持".split(" ")
-        self.assertListEqual(
-            [EXPECTED_TITLE_EN_BEAM1_TOK, EXPECTED_TITLE_RU_BEAM1_TOK, EXPECTED_TITLE_ZH_BEAM1_TOK],
-            generated_titles_beam1_tok,
-        )
--- a/tests/models/xlm_prophetnet/test_tokenization_xlm_prophetnet.py
+++ b/tests/models/xlm_prophetnet/test_tokenization_xlm_prophetnet.py
@@ -1,154 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The HuggingFace Inc. team, The Microsoft Research team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-from transformers.models.xlm_prophetnet.tokenization_xlm_prophetnet import SPIECE_UNDERLINE, XLMProphetNetTokenizer
-from transformers.testing_utils import get_tests_dir, require_sentencepiece, slow
-from transformers.utils import cached_property
-
-from ...test_tokenization_common import TokenizerTesterMixin
-
-
-SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
-
-
-@require_sentencepiece
-class XLMProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    from_pretrained_id = "microsoft/xprophetnet-large-wiki100-cased"
-    tokenizer_class = XLMProphetNetTokenizer
-    test_rust_tokenizer = False
-    test_sentencepiece = True
-
-    def setUp(self):
-        super().setUp()
-
-        # We have a SentencePiece fixture for testing
-        tokenizer = XLMProphetNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
-        tokenizer.save_pretrained(self.tmpdirname)
-
-    def test_convert_token_and_id(self):
-        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
-        token = "[PAD]"
-        token_id = 0
-
-        self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
-        self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
-
-    def test_get_vocab(self):
-        vocab_keys = list(self.get_tokenizer().get_vocab().keys())
-
-        self.assertEqual(vocab_keys[0], "[PAD]")
-        self.assertEqual(vocab_keys[1], "[CLS]")
-        self.assertEqual(vocab_keys[-1], "j")
-        self.assertEqual(len(vocab_keys), 1_012)
-
-    def test_vocab_size(self):
-        self.assertEqual(self.get_tokenizer().vocab_size, 1_012)
-
-    def test_full_tokenizer(self):
-        tokenizer = XLMProphetNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
-
-        tokens = tokenizer.tokenize("This is a test")
-        self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
-
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(tokens),
-            [value + tokenizer.fairseq_offset for value in [285, 46, 10, 170, 382]],
-        )
-
-        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
-        self.assertListEqual(
-            tokens,
-            [
-                SPIECE_UNDERLINE + "I",
-                SPIECE_UNDERLINE + "was",
-                SPIECE_UNDERLINE + "b",
-                "or",
-                "n",
-                SPIECE_UNDERLINE + "in",
-                SPIECE_UNDERLINE + "",
-                "9",
-                "2",
-                "0",
-                "0",
-                "0",
-                ",",
-                SPIECE_UNDERLINE + "and",
-                SPIECE_UNDERLINE + "this",
-                SPIECE_UNDERLINE + "is",
-                SPIECE_UNDERLINE + "f",
-                "al",
-                "s",
-                "é",
-                ".",
-            ],
-        )
-        ids = tokenizer.convert_tokens_to_ids(tokens)
-        self.assertListEqual(
-            ids,
-            [
-                value + tokenizer.fairseq_offset
-                for value in [8, 21, 84, 55, 24, 19, 7, -9, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, -9, 4]
-            ],
-        )
-
-        back_tokens = tokenizer.convert_ids_to_tokens(ids)
-        self.assertListEqual(
-            back_tokens,
-            [
-                SPIECE_UNDERLINE + "I",
-                SPIECE_UNDERLINE + "was",
-                SPIECE_UNDERLINE + "b",
-                "or",
-                "n",
-                SPIECE_UNDERLINE + "in",
-                SPIECE_UNDERLINE + "",
-                "[UNK]",
-                "2",
-                "0",
-                "0",
-                "0",
-                ",",
-                SPIECE_UNDERLINE + "and",
-                SPIECE_UNDERLINE + "this",
-                SPIECE_UNDERLINE + "is",
-                SPIECE_UNDERLINE + "f",
-                "al",
-                "s",
-                "[UNK]",
-                ".",
-            ],
-        )
-
-    @cached_property
-    def big_tokenizer(self):
-        return XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
-
-    @slow
-    def test_tokenization_base_easy_symbols(self):
-        symbols = "Hello World!"
-        original_tokenizer_encodings = [35389, 6672, 49, 2]
-        self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols))
-
-    @slow
-    def test_tokenizer_integration(self):
-        expected_encoding = {'input_ids': [[11073, 82783, 18, 26, 82783, 549, 51540, 248, 17209, 1301, 217, 20, 215186, 1325, 147, 17209, 1301, 217, 20, 56370, 53, 122020, 20, 16477, 27, 87355, 4548, 20, 4728, 78392, 17, 159969, 18, 26, 24491, 629, 15, 538, 22704, 5439, 15, 2788, 24491, 9885, 15, 43534, 605, 15, 814, 18403, 33200, 29, 15, 43534, 24458, 12410, 111, 24966, 83669, 9637, 144068, 26, 850, 22346, 27, 147, 24966, 83669, 83490, 26, 39113, 735, 27, 689, 656, 2800, 1339, 4600, 53, 122020, 115785, 34, 816, 1339, 46887, 18, 147, 53905, 1951, 42238, 41170, 17732, 834, 436, 15, 27523, 98733, 217, 147, 5542, 4981, 930, 17347, 16, 2], [20091, 629, 94, 82786, 58, 490, 20, 1528, 84, 53905, 344, 80592, 110128, 18822, 5267, 1306, 62, 152537, 308, 7997, 401, 124427, 549, 35442, 225, 109, 15055, 25748, 147, 7119, 43712, 34, 767, 135366, 18, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [592, 63784, 119466, 17, 147808, 88214, 18, 656, 81, 32, 3296, 10280, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}  # fmt: skip
-
-        self.tokenizer_integration_test_util(
-            expected_encoding=expected_encoding,
-            model_name="microsoft/xprophetnet-large-wiki100-cased",
-            revision="1acad1643ddd54a44df6a1b797ada8373685d90e",
-        )