Add Fast Yolos Processor (#37292)

* Add Fast Yolos Processor * Update modular file * Fix copies --------- Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
2025-04-15 17:53:08 +05:30
parent ecaeee66bc
commit f6c79f767c
6 changed files with 1373 additions and 268 deletions
--- a/tests/models/yolos/test_image_processing_yolos.py
+++ b/tests/models/yolos/test_image_processing_yolos.py
@@ -21,7 +21,7 @@ import numpy as np
 from parameterized import parameterized

 from transformers.testing_utils import require_torch, require_vision, slow
-from transformers.utils import is_torch_available, is_vision_available
+from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available

 from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs

@@ -34,6 +34,9 @@ if is_vision_available():

    from transformers import YolosImageProcessor

+    if is_torchvision_available():
+        from transformers import YolosImageProcessorFast
+

 class YolosImageProcessingTester:
    def __init__(
@@ -143,6 +146,7 @@ class YolosImageProcessingTester:
@require_vision
 class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = YolosImageProcessor if is_vision_available() else None
+    fast_image_processing_class = YolosImageProcessorFast if is_torchvision_available() else None

    def setUp(self):
        super().setUp()
@@ -153,23 +157,25 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
        return self.image_processor_tester.prepare_image_processor_dict()

    def test_image_processor_properties(self):
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        self.assertTrue(hasattr(image_processing, "image_mean"))
-        self.assertTrue(hasattr(image_processing, "image_std"))
-        self.assertTrue(hasattr(image_processing, "do_normalize"))
-        self.assertTrue(hasattr(image_processing, "do_resize"))
-        self.assertTrue(hasattr(image_processing, "size"))
+        for image_processing_class in self.image_processor_list:
+            image_processing = image_processing_class(**self.image_processor_dict)
+            self.assertTrue(hasattr(image_processing, "image_mean"))
+            self.assertTrue(hasattr(image_processing, "image_std"))
+            self.assertTrue(hasattr(image_processing, "do_normalize"))
+            self.assertTrue(hasattr(image_processing, "do_resize"))
+            self.assertTrue(hasattr(image_processing, "size"))

    def test_image_processor_from_dict_with_kwargs(self):
-        image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
-        self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333})
-        self.assertEqual(image_processor.do_pad, True)
+        for image_processing_class in self.image_processor_list:
+            image_processor = image_processing_class.from_dict(self.image_processor_dict)
+            self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333})
+            self.assertEqual(image_processor.do_pad, True)

-        image_processor = self.image_processing_class.from_dict(
-            self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False
-        )
-        self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84})
-        self.assertEqual(image_processor.do_pad, False)
+            image_processor = image_processing_class.from_dict(
+                self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False
+            )
+            self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84})
+            self.assertEqual(image_processor.do_pad, False)

    def test_equivalence_padding(self):
        # Initialize image_processings
@@ -199,21 +205,22 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
        ]
    )
    def test_resize_max_size_respected(self, image_size, longest_edge, shortest_edge):
-        image_processor = self.image_processing_class(**self.image_processor_dict)
+        for image_processing_class in self.image_processor_list:
+            image_processor = image_processing_class(**self.image_processor_dict)

-        # create torch tensors as image
-        image = torch.randint(0, 256, image_size, dtype=torch.uint8)
-        processed_image = image_processor(
-            image,
-            size={"longest_edge": longest_edge, "shortest_edge": shortest_edge},
-            do_pad=False,
-            return_tensors="pt",
-        )["pixel_values"]
+            # create torch tensors as image
+            image = torch.randint(0, 256, image_size, dtype=torch.uint8)
+            processed_image = image_processor(
+                image,
+                size={"longest_edge": longest_edge, "shortest_edge": shortest_edge},
+                do_pad=False,
+                return_tensors="pt",
+            )["pixel_values"]

-        shape = list(processed_image.shape[-2:])
-        max_size, min_size = max(shape), min(shape)
-        self.assertTrue(max_size <= 1333, f"Expected max_size <= 1333, got image shape {shape}")
-        self.assertTrue(min_size <= 800, f"Expected min_size <= 800, got image shape {shape}")
+            shape = list(processed_image.shape[-2:])
+            max_size, min_size = max(shape), min(shape)
+            self.assertTrue(max_size <= 1333, f"Expected max_size <= 1333, got image shape {shape}")
+            self.assertTrue(min_size <= 800, f"Expected min_size <= 800, got image shape {shape}")

    @slow
    def test_call_pytorch_with_coco_detection_annotations(self):
@@ -224,40 +231,41 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix

        target = {"image_id": 39769, "annotations": target}

-        # encode them
-        image_processing = YolosImageProcessor.from_pretrained("hustvl/yolos-small")
-        encoding = image_processing(images=image, annotations=target, return_tensors="pt")
+        for image_processing_class in self.image_processor_list:
+            # encode them
+            image_processing = image_processing_class.from_pretrained("hustvl/yolos-small")
+            encoding = image_processing(images=image, annotations=target, return_tensors="pt")

-        # verify pixel values
-        expected_shape = torch.Size([1, 3, 800, 1056])
-        self.assertEqual(encoding["pixel_values"].shape, expected_shape)
+            # verify pixel values
+            expected_shape = torch.Size([1, 3, 800, 1056])
+            self.assertEqual(encoding["pixel_values"].shape, expected_shape)

-        expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
-        torch.testing.assert_close(encoding["pixel_values"][0, 0, 0, :3], expected_slice, rtol=1e-4, atol=1e-4)
+            expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
+            torch.testing.assert_close(encoding["pixel_values"][0, 0, 0, :3], expected_slice, rtol=1e-4, atol=1e-4)

-        # verify area
-        expected_area = torch.tensor([5832.7256, 11144.6689, 484763.2500, 829269.8125, 146579.4531, 164177.6250])
-        torch.testing.assert_close(encoding["labels"][0]["area"], expected_area)
-        # verify boxes
-        expected_boxes_shape = torch.Size([6, 4])
-        self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
-        expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215])
-        torch.testing.assert_close(encoding["labels"][0]["boxes"][0], expected_boxes_slice, rtol=1e-3, atol=1e-3)
-        # verify image_id
-        expected_image_id = torch.tensor([39769])
-        torch.testing.assert_close(encoding["labels"][0]["image_id"], expected_image_id)
-        # verify is_crowd
-        expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
-        torch.testing.assert_close(encoding["labels"][0]["iscrowd"], expected_is_crowd)
-        # verify class_labels
-        expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17])
-        torch.testing.assert_close(encoding["labels"][0]["class_labels"], expected_class_labels)
-        # verify orig_size
-        expected_orig_size = torch.tensor([480, 640])
-        torch.testing.assert_close(encoding["labels"][0]["orig_size"], expected_orig_size)
-        # verify size
-        expected_size = torch.tensor([800, 1056])
-        torch.testing.assert_close(encoding["labels"][0]["size"], expected_size)
+            # verify area
+            expected_area = torch.tensor([5832.7256, 11144.6689, 484763.2500, 829269.8125, 146579.4531, 164177.6250])
+            torch.testing.assert_close(encoding["labels"][0]["area"], expected_area)
+            # verify boxes
+            expected_boxes_shape = torch.Size([6, 4])
+            self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
+            expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215])
+            torch.testing.assert_close(encoding["labels"][0]["boxes"][0], expected_boxes_slice, rtol=1e-3, atol=1e-3)
+            # verify image_id
+            expected_image_id = torch.tensor([39769])
+            torch.testing.assert_close(encoding["labels"][0]["image_id"], expected_image_id)
+            # verify is_crowd
+            expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
+            torch.testing.assert_close(encoding["labels"][0]["iscrowd"], expected_is_crowd)
+            # verify class_labels
+            expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17])
+            torch.testing.assert_close(encoding["labels"][0]["class_labels"], expected_class_labels)
+            # verify orig_size
+            expected_orig_size = torch.tensor([480, 640])
+            torch.testing.assert_close(encoding["labels"][0]["orig_size"], expected_orig_size)
+            # verify size
+            expected_size = torch.tensor([800, 1056])
+            torch.testing.assert_close(encoding["labels"][0]["size"], expected_size)

    @slow
    def test_call_pytorch_with_coco_panoptic_annotations(self):
@@ -270,43 +278,45 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix

        masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")

-        # encode them
-        image_processing = YolosImageProcessor(format="coco_panoptic")
-        encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt")
+        for image_processing_class in self.image_processor_list:
+            # encode them
+            image_processing = image_processing_class(format="coco_panoptic")
+            encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt")

-        # verify pixel values
-        expected_shape = torch.Size([1, 3, 800, 1056])
-        self.assertEqual(encoding["pixel_values"].shape, expected_shape)
+            # verify pixel values
+            expected_shape = torch.Size([1, 3, 800, 1056])
+            self.assertEqual(encoding["pixel_values"].shape, expected_shape)

-        expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
-        torch.testing.assert_close(encoding["pixel_values"][0, 0, 0, :3], expected_slice, rtol=1e-4, atol=1e-4)
+            expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
+            torch.testing.assert_close(encoding["pixel_values"][0, 0, 0, :3], expected_slice, rtol=1e-4, atol=1e-4)

-        # verify area
-        expected_area = torch.tensor([146591.5000, 163974.2500, 480092.2500, 11187.0000, 5824.5000, 7562.5000])
-        torch.testing.assert_close(encoding["labels"][0]["area"], expected_area)
-        # verify boxes
-        expected_boxes_shape = torch.Size([6, 4])
-        self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
-        expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625])
-        torch.testing.assert_close(encoding["labels"][0]["boxes"][0], expected_boxes_slice, rtol=1e-3, atol=1e-3)
-        # verify image_id
-        expected_image_id = torch.tensor([39769])
-        torch.testing.assert_close(encoding["labels"][0]["image_id"], expected_image_id)
-        # verify is_crowd
-        expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
-        torch.testing.assert_close(encoding["labels"][0]["iscrowd"], expected_is_crowd)
-        # verify class_labels
-        expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93])
-        torch.testing.assert_close(encoding["labels"][0]["class_labels"], expected_class_labels)
-        # verify masks
-        expected_masks_sum = 815161
-        self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum)
-        # verify orig_size
-        expected_orig_size = torch.tensor([480, 640])
-        torch.testing.assert_close(encoding["labels"][0]["orig_size"], expected_orig_size)
-        # verify size
-        expected_size = torch.tensor([800, 1056])
-        torch.testing.assert_close(encoding["labels"][0]["size"], expected_size)
+            # verify area
+            expected_area = torch.tensor([146591.5000, 163974.2500, 480092.2500, 11187.0000, 5824.5000, 7562.5000])
+            torch.testing.assert_close(encoding["labels"][0]["area"], expected_area)
+            # verify boxes
+            expected_boxes_shape = torch.Size([6, 4])
+            self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
+            expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625])
+            torch.testing.assert_close(encoding["labels"][0]["boxes"][0], expected_boxes_slice, rtol=1e-3, atol=1e-3)
+            # verify image_id
+            expected_image_id = torch.tensor([39769])
+            torch.testing.assert_close(encoding["labels"][0]["image_id"], expected_image_id)
+            # verify is_crowd
+            expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
+            torch.testing.assert_close(encoding["labels"][0]["iscrowd"], expected_is_crowd)
+            # verify class_labels
+            expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93])
+            torch.testing.assert_close(encoding["labels"][0]["class_labels"], expected_class_labels)
+            # verify masks
+            expected_masks_sum = 815161
+            relative_error = torch.abs(encoding["labels"][0]["masks"].sum() - expected_masks_sum) / expected_masks_sum
+            self.assertTrue(relative_error < 1e-3)
+            # verify orig_size
+            expected_orig_size = torch.tensor([480, 640])
+            torch.testing.assert_close(encoding["labels"][0]["orig_size"], expected_orig_size)
+            # verify size
+            expected_size = torch.tensor([800, 1056])
+            torch.testing.assert_close(encoding["labels"][0]["size"], expected_size)

    # Output size is slight different from DETR as yolos takes mod of 16
    @slow
@@ -336,96 +346,97 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
        images = [image_0, image_1]
        annotations = [annotations_0, annotations_1]

-        image_processing = YolosImageProcessor()
-        encoding = image_processing(
-            images=images,
-            annotations=annotations,
-            return_segmentation_masks=True,
-            return_tensors="pt",  # do_convert_annotations=True
-        )
+        for image_processing_class in self.image_processor_list:
+            image_processing = image_processing_class()
+            encoding = image_processing(
+                images=images,
+                annotations=annotations,
+                return_segmentation_masks=True,
+                return_tensors="pt",  # do_convert_annotations=True
+            )

-        # Check the pixel values have been padded
-        postprocessed_height, postprocessed_width = 800, 1056
-        expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
-        self.assertEqual(encoding["pixel_values"].shape, expected_shape)
+            # Check the pixel values have been padded
+            postprocessed_height, postprocessed_width = 800, 1056
+            expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
+            self.assertEqual(encoding["pixel_values"].shape, expected_shape)

-        # Check the bounding boxes have been adjusted for padded images
-        self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
-        self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
-        expected_boxes_0 = torch.tensor(
-            [
-                [0.6879, 0.4609, 0.0755, 0.3691],
-                [0.2118, 0.3359, 0.2601, 0.1566],
-                [0.5011, 0.5000, 0.9979, 1.0000],
-                [0.5010, 0.5020, 0.9979, 0.9959],
-                [0.3284, 0.5944, 0.5884, 0.8112],
-                [0.8394, 0.5445, 0.3213, 0.9110],
-            ]
-        )
-        expected_boxes_1 = torch.tensor(
-            [
-                [0.4169, 0.2765, 0.0458, 0.2215],
-                [0.1284, 0.2016, 0.1576, 0.0940],
-                [0.3792, 0.4933, 0.7559, 0.9865],
-                [0.3794, 0.5002, 0.7563, 0.9955],
-                [0.1990, 0.5456, 0.3566, 0.8646],
-                [0.5845, 0.4115, 0.3462, 0.7161],
-            ]
-        )
-        torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3, atol=1e-3)
-        torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3, atol=1e-3)
+            # Check the bounding boxes have been adjusted for padded images
+            self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
+            self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
+            expected_boxes_0 = torch.tensor(
+                [
+                    [0.6879, 0.4609, 0.0755, 0.3691],
+                    [0.2118, 0.3359, 0.2601, 0.1566],
+                    [0.5011, 0.5000, 0.9979, 1.0000],
+                    [0.5010, 0.5020, 0.9979, 0.9959],
+                    [0.3284, 0.5944, 0.5884, 0.8112],
+                    [0.8394, 0.5445, 0.3213, 0.9110],
+                ]
+            )
+            expected_boxes_1 = torch.tensor(
+                [
+                    [0.4169, 0.2765, 0.0458, 0.2215],
+                    [0.1284, 0.2016, 0.1576, 0.0940],
+                    [0.3792, 0.4933, 0.7559, 0.9865],
+                    [0.3794, 0.5002, 0.7563, 0.9955],
+                    [0.1990, 0.5456, 0.3566, 0.8646],
+                    [0.5845, 0.4115, 0.3462, 0.7161],
+                ]
+            )
+            torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3, atol=1e-3)
+            torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3, atol=1e-3)

-        # Check the masks have also been padded
-        self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1056]))
-        self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1056]))
+            # Check the masks have also been padded
+            self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1056]))
+            self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1056]))

-        # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
-        # format and not in the range [0, 1]
-        encoding = image_processing(
-            images=images,
-            annotations=annotations,
-            return_segmentation_masks=True,
-            do_convert_annotations=False,
-            return_tensors="pt",
-        )
-        self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
-        self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
-        # Convert to absolute coordinates
-        unnormalized_boxes_0 = torch.vstack(
-            [
-                expected_boxes_0[:, 0] * postprocessed_width,
-                expected_boxes_0[:, 1] * postprocessed_height,
-                expected_boxes_0[:, 2] * postprocessed_width,
-                expected_boxes_0[:, 3] * postprocessed_height,
-            ]
-        ).T
-        unnormalized_boxes_1 = torch.vstack(
-            [
-                expected_boxes_1[:, 0] * postprocessed_width,
-                expected_boxes_1[:, 1] * postprocessed_height,
-                expected_boxes_1[:, 2] * postprocessed_width,
-                expected_boxes_1[:, 3] * postprocessed_height,
-            ]
-        ).T
-        # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
-        expected_boxes_0 = torch.vstack(
-            [
-                unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
-                unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
-                unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
-                unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
-            ]
-        ).T
-        expected_boxes_1 = torch.vstack(
-            [
-                unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
-                unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
-                unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
-                unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
-            ]
-        ).T
-        torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1, atol=1)
-        torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1, atol=1)
+            # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
+            # format and not in the range [0, 1]
+            encoding = image_processing(
+                images=images,
+                annotations=annotations,
+                return_segmentation_masks=True,
+                do_convert_annotations=False,
+                return_tensors="pt",
+            )
+            self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
+            self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
+            # Convert to absolute coordinates
+            unnormalized_boxes_0 = torch.vstack(
+                [
+                    expected_boxes_0[:, 0] * postprocessed_width,
+                    expected_boxes_0[:, 1] * postprocessed_height,
+                    expected_boxes_0[:, 2] * postprocessed_width,
+                    expected_boxes_0[:, 3] * postprocessed_height,
+                ]
+            ).T
+            unnormalized_boxes_1 = torch.vstack(
+                [
+                    expected_boxes_1[:, 0] * postprocessed_width,
+                    expected_boxes_1[:, 1] * postprocessed_height,
+                    expected_boxes_1[:, 2] * postprocessed_width,
+                    expected_boxes_1[:, 3] * postprocessed_height,
+                ]
+            ).T
+            # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
+            expected_boxes_0 = torch.vstack(
+                [
+                    unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
+                    unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
+                    unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
+                    unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
+                ]
+            ).T
+            expected_boxes_1 = torch.vstack(
+                [
+                    unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
+                    unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
+                    unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
+                    unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
+                ]
+            ).T
+            torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1, atol=1)
+            torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1, atol=1)

    # Output size is slight different from DETR as yolos takes mod of 16
    def test_batched_coco_panoptic_annotations(self):
@@ -457,98 +468,100 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
        annotations = [annotation_0, annotation_1]

        # encode them
-        image_processing = YolosImageProcessor(format="coco_panoptic")
-        encoding = image_processing(
-            images=images,
-            annotations=annotations,
-            masks_path=masks_path,
-            return_tensors="pt",
-            return_segmentation_masks=True,
-        )
+        for image_processing_class in self.image_processor_list:
+            # Build the processor from the class under test so the fast variant is exercised as well.
+            image_processing = image_processing_class(format="coco_panoptic")
+            encoding = image_processing(
+                images=images,
+                annotations=annotations,
+                masks_path=masks_path,
+                return_tensors="pt",
+                return_segmentation_masks=True,
+            )

-        # Check the pixel values have been padded
-        postprocessed_height, postprocessed_width = 800, 1056
-        expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
-        self.assertEqual(encoding["pixel_values"].shape, expected_shape)
+            # Check the pixel values have been padded
+            postprocessed_height, postprocessed_width = 800, 1056
+            expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
+            self.assertEqual(encoding["pixel_values"].shape, expected_shape)

-        # Check the bounding boxes have been adjusted for padded images
-        self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
-        self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
-        expected_boxes_0 = torch.tensor(
-            [
-                [0.2625, 0.5437, 0.4688, 0.8625],
-                [0.7719, 0.4104, 0.4531, 0.7125],
-                [0.5000, 0.4927, 0.9969, 0.9854],
-                [0.1688, 0.2000, 0.2063, 0.0917],
-                [0.5492, 0.2760, 0.0578, 0.2187],
-                [0.4992, 0.4990, 0.9984, 0.9979],
-            ]
-        )
-        expected_boxes_1 = torch.tensor(
-            [
-                [0.1591, 0.3262, 0.2841, 0.5175],
-                [0.4678, 0.2463, 0.2746, 0.4275],
-                [0.3030, 0.2956, 0.6042, 0.5913],
-                [0.1023, 0.1200, 0.1250, 0.0550],
-                [0.3329, 0.1656, 0.0350, 0.1312],
-                [0.3026, 0.2994, 0.6051, 0.5987],
-            ]
-        )
-        torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3, atol=1e-3)
-        torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3, atol=1e-3)
+            # Check the bounding boxes have been adjusted for padded images
+            self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
+            self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
+            expected_boxes_0 = torch.tensor(
+                [
+                    [0.2625, 0.5437, 0.4688, 0.8625],
+                    [0.7719, 0.4104, 0.4531, 0.7125],
+                    [0.5000, 0.4927, 0.9969, 0.9854],
+                    [0.1688, 0.2000, 0.2063, 0.0917],
+                    [0.5492, 0.2760, 0.0578, 0.2187],
+                    [0.4992, 0.4990, 0.9984, 0.9979],
+                ]
+            )
+            expected_boxes_1 = torch.tensor(
+                [
+                    [0.1591, 0.3262, 0.2841, 0.5175],
+                    [0.4678, 0.2463, 0.2746, 0.4275],
+                    [0.3030, 0.2956, 0.6042, 0.5913],
+                    [0.1023, 0.1200, 0.1250, 0.0550],
+                    [0.3329, 0.1656, 0.0350, 0.1312],
+                    [0.3026, 0.2994, 0.6051, 0.5987],
+                ]
+            )
+            torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3, atol=1e-3)
+            torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3, atol=1e-3)

-        # Check the masks have also been padded
-        self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1056]))
-        self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1056]))
+            # Check the masks have also been padded
+            self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1056]))
+            self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1056]))

-        # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
-        # format and not in the range [0, 1]
-        encoding = image_processing(
-            images=images,
-            annotations=annotations,
-            masks_path=masks_path,
-            return_segmentation_masks=True,
-            do_convert_annotations=False,
-            return_tensors="pt",
-        )
-        self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
-        self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
-        # Convert to absolute coordinates
-        unnormalized_boxes_0 = torch.vstack(
-            [
-                expected_boxes_0[:, 0] * postprocessed_width,
-                expected_boxes_0[:, 1] * postprocessed_height,
-                expected_boxes_0[:, 2] * postprocessed_width,
-                expected_boxes_0[:, 3] * postprocessed_height,
-            ]
-        ).T
-        unnormalized_boxes_1 = torch.vstack(
-            [
-                expected_boxes_1[:, 0] * postprocessed_width,
-                expected_boxes_1[:, 1] * postprocessed_height,
-                expected_boxes_1[:, 2] * postprocessed_width,
-                expected_boxes_1[:, 3] * postprocessed_height,
-            ]
-        ).T
-        # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
-        expected_boxes_0 = torch.vstack(
-            [
-                unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
-                unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
-                unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
-                unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
-            ]
-        ).T
-        expected_boxes_1 = torch.vstack(
-            [
-                unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
-                unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
-                unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
-                unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
-            ]
-        ).T
-        torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1, rtol=1)
-        torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1, rtol=1)
+            # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
+            # format and not in the range [0, 1]
+            encoding = image_processing(
+                images=images,
+                annotations=annotations,
+                masks_path=masks_path,
+                return_segmentation_masks=True,
+                do_convert_annotations=False,
+                return_tensors="pt",
+            )
+            self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
+            self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
+            # Convert to absolute coordinates
+            unnormalized_boxes_0 = torch.vstack(
+                [
+                    expected_boxes_0[:, 0] * postprocessed_width,
+                    expected_boxes_0[:, 1] * postprocessed_height,
+                    expected_boxes_0[:, 2] * postprocessed_width,
+                    expected_boxes_0[:, 3] * postprocessed_height,
+                ]
+            ).T
+            unnormalized_boxes_1 = torch.vstack(
+                [
+                    expected_boxes_1[:, 0] * postprocessed_width,
+                    expected_boxes_1[:, 1] * postprocessed_height,
+                    expected_boxes_1[:, 2] * postprocessed_width,
+                    expected_boxes_1[:, 3] * postprocessed_height,
+                ]
+            ).T
+            # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
+            expected_boxes_0 = torch.vstack(
+                [
+                    unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
+                    unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
+                    unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
+                    unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
+                ]
+            ).T
+            expected_boxes_1 = torch.vstack(
+                [
+                    unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
+                    unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
+                    unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
+                    unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
+                ]
+            ).T
+            torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1, rtol=1)
+            torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1, rtol=1)

    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->Yolos
    def test_max_width_max_height_resizing_and_pad_strategy(self):