From 7e35f37071655744b94c583edbb2f7fc209e1b7d Mon Sep 17 00:00:00 2001
From: rjenc29 <rjenc29@gmail.com>
Date: Mon, 11 Dec 2023 15:51:51 +0000
Subject: [PATCH] Fix a couple of typos and add an illustrative test (#26941)

* fix a typo and add an illustrative test

* appease black

* reduce code duplication and add Annotion type back with a pending deprecation warning

* remove unused code

* change warning type

* black formatting fix

* change enum deprecation approach to support 3.8 and earlier

* add stacklevel

* fix black issue

* fix ruff issues

* fix ruff issues

* move tests to own mixin

* include yolos

* fix black formatting issue

* fix black formatting issue

* use logger instead of warnings and include target version for deprecation
---
 src/transformers/image_utils.py               | 57 +++++++++++++
 .../image_processing_conditional_detr.py      | 49 ++++-------
 .../image_processing_deformable_detr.py       | 50 ++++--------
 .../models/deta/image_processing_deta.py      | 52 ++++--------
 .../models/detr/image_processing_detr.py      | 50 ++++--------
 .../models/yolos/image_processing_yolos.py    | 54 ++++---------
 .../test_image_processing_conditional_detr.py |  4 +-
 .../test_image_processing_deformable_detr.py  |  4 +-
 .../models/deta/test_image_processing_deta.py |  4 +-
 .../models/detr/test_image_processing_detr.py | 61 +++++++++++++-
 .../yolos/test_image_processing_yolos.py      |  4 +-
 tests/test_image_processing_common.py         | 81 +++++++++++++++++++
 12 files changed, 279 insertions(+), 191 deletions(-)

diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py
index bfb88d03d3..5d280bf5e2 100644
--- a/src/transformers/image_utils.py
+++ b/src/transformers/image_utils.py
@@ -15,6 +15,7 @@
 
 import base64
 import os
+from enum import EnumMeta
 from io import BytesIO
 from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union
 
@@ -29,6 +30,7 @@ from .utils import (
     is_torch_available,
     is_torch_tensor,
     is_vision_available,
+    logging,
     requires_backends,
     to_numpy,
 )
@@ -56,6 +58,9 @@ if TYPE_CHECKING:
         import torch
 
 
+logger = logging.get_logger(__name__)
+
+
 ImageInput = Union[
     "PIL.Image.Image", np.ndarray, "torch.Tensor", List["PIL.Image.Image"], List[np.ndarray], List["torch.Tensor"]
 ]  # noqa
@@ -66,6 +71,28 @@ class ChannelDimension(ExplicitEnum):
     LAST = "channels_last"
 
 
+class AnnotationFormat(ExplicitEnum):
+    COCO_DETECTION = "coco_detection"
+    COCO_PANOPTIC = "coco_panoptic"
+
+
+class DeprecatedEnumMeta(EnumMeta):
+    def __init__(cls, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        logger.warning_once(
+            f"`{cls.__name__}` is deprecated and will be removed in v4.38. "
+            f"Please use `transformers.image_utils.AnnotationFormat` instead."
+        )
+
+
+class AnnotionFormat(ExplicitEnum, metaclass=DeprecatedEnumMeta):
+    COCO_DETECTION = AnnotationFormat.COCO_DETECTION.value
+    COCO_PANOPTIC = AnnotationFormat.COCO_PANOPTIC.value
+
+
+AnnotationType = Dict[str, Union[int, str, List[Dict]]]
+
+
 def is_pil_image(img):
     return is_vision_available() and isinstance(img, PIL.Image.Image)
 
@@ -664,3 +691,33 @@ class ImageFeatureExtractionMixin:
         return image.rotate(
             angle, resample=resample, expand=expand, center=center, translate=translate, fillcolor=fillcolor
         )
+
+
+def promote_annotation_format(annotation_format: Union[AnnotionFormat, AnnotationFormat]) -> AnnotationFormat:
+    # can be removed when `AnnotionFormat` is fully deprecated
+    return AnnotationFormat(annotation_format.value)
+
+
+def validate_annotations(
+    annotation_format: AnnotationFormat,
+    supported_annotation_formats: Tuple[AnnotationFormat, ...],
+    annotations: List[Dict],
+) -> None:
+    if promote_annotation_format(annotation_format) not in supported_annotation_formats:
+        raise ValueError(f"Unsupported annotation format: {format} must be one of {supported_annotation_formats}")
+
+    if promote_annotation_format(annotation_format) is AnnotationFormat.COCO_DETECTION:
+        if not valid_coco_detection_annotations(annotations):
+            raise ValueError(
+                "Invalid COCO detection annotations. Annotations must a dict (single image) or list of dicts "
+                "(batch of images) with the following keys: `image_id` and `annotations`, with the latter "
+                "being a list of annotations in the COCO format."
+            )
+
+    if promote_annotation_format(annotation_format) is AnnotationFormat.COCO_PANOPTIC:
+        if not valid_coco_panoptic_annotations(annotations):
+            raise ValueError(
+                "Invalid COCO panoptic annotations. Annotations must a dict (single image) or list of dicts "
+                "(batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with "
+                "the latter being a list of annotations in the COCO format."
+            )
diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py
index 919e8b9da8..3ec0696430 100644
--- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py
+++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py
@@ -37,6 +37,9 @@ from ...image_transforms import (
 from ...image_utils import (
     IMAGENET_DEFAULT_MEAN,
     IMAGENET_DEFAULT_STD,
+    AnnotationFormat,
+    AnnotationType,
+    AnnotionFormat,  # noqa: F401
     ChannelDimension,
     ImageInput,
     PILImageResampling,
@@ -45,12 +48,10 @@ from ...image_utils import (
     is_scaled_image,
     make_list_of_images,
     to_numpy_array,
-    valid_coco_detection_annotations,
-    valid_coco_panoptic_annotations,
     valid_images,
+    validate_annotations,
 )
 from ...utils import (
-    ExplicitEnum,
     TensorType,
     is_flax_available,
     is_jax_tensor,
@@ -80,15 +81,8 @@ if is_scipy_available():
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
-AnnotationType = Dict[str, Union[int, str, List[Dict]]]
 
-
-class AnnotionFormat(ExplicitEnum):
-    COCO_DETECTION = "coco_detection"
-    COCO_PANOPTIC = "coco_panoptic"
-
-
-SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION, AnnotionFormat.COCO_PANOPTIC)
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
 
 
 # Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio
@@ -802,7 +796,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
     # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__
     def __init__(
         self,
-        format: Union[str, AnnotionFormat] = AnnotionFormat.COCO_DETECTION,
+        format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
         do_resize: bool = True,
         size: Dict[str, int] = None,
         resample: PILImageResampling = PILImageResampling.BILINEAR,
@@ -861,7 +855,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
         self,
         image: np.ndarray,
         target: Dict,
-        format: Optional[AnnotionFormat] = None,
+        format: Optional[AnnotationFormat] = None,
         return_segmentation_masks: bool = None,
         masks_path: Optional[Union[str, pathlib.Path]] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -871,12 +865,12 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
         """
         format = format if format is not None else self.format
 
-        if format == AnnotionFormat.COCO_DETECTION:
+        if format == AnnotationFormat.COCO_DETECTION:
             return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
             target = prepare_coco_detection_annotation(
                 image, target, return_segmentation_masks, input_data_format=input_data_format
             )
-        elif format == AnnotionFormat.COCO_PANOPTIC:
+        elif format == AnnotationFormat.COCO_PANOPTIC:
             return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
             target = prepare_coco_panoptic_annotation(
                 image,
@@ -1118,7 +1112,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
         image_mean: Optional[Union[float, List[float]]] = None,
         image_std: Optional[Union[float, List[float]]] = None,
         do_pad: Optional[bool] = None,
-        format: Optional[Union[str, AnnotionFormat]] = None,
+        format: Optional[Union[str, AnnotationFormat]] = None,
         return_tensors: Optional[Union[TensorType, str]] = None,
         data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -1164,7 +1158,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
                 Standard deviation to use when normalizing the image.
             do_pad (`bool`, *optional*, defaults to self.do_pad):
                 Whether to pad the image.
-            format (`str` or `AnnotionFormat`, *optional*, defaults to self.format):
+            format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                 Format of the annotations.
             return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
                 Type of tensors to return. If `None`, will return the list of images.
@@ -1231,28 +1225,13 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
                 "torch.Tensor, tf.Tensor or jax.ndarray."
             )
 
-        format = AnnotionFormat(format)
+        format = AnnotationFormat(format)
         if annotations is not None:
-            if format == AnnotionFormat.COCO_DETECTION and not valid_coco_detection_annotations(annotations):
-                raise ValueError(
-                    "Invalid COCO detection annotations. Annotations must a dict (single image) of list of dicts "
-                    "(batch of images) with the following keys: `image_id` and `annotations`, with the latter "
-                    "being a list of annotations in the COCO format."
-                )
-            elif format == AnnotionFormat.COCO_PANOPTIC and not valid_coco_panoptic_annotations(annotations):
-                raise ValueError(
-                    "Invalid COCO panoptic annotations. Annotations must a dict (single image) of list of dicts "
-                    "(batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with "
-                    "the latter being a list of annotations in the COCO format."
-                )
-            elif format not in SUPPORTED_ANNOTATION_FORMATS:
-                raise ValueError(
-                    f"Unsupported annotation format: {format} must be one of {SUPPORTED_ANNOTATION_FORMATS}"
-                )
+            validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
 
         if (
             masks_path is not None
-            and format == AnnotionFormat.COCO_PANOPTIC
+            and format == AnnotationFormat.COCO_PANOPTIC
             and not isinstance(masks_path, (pathlib.Path, str))
         ):
             raise ValueError(
diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py
index 69a4cb0c67..00cf8eaecf 100644
--- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py
+++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py
@@ -37,6 +37,9 @@ from ...image_transforms import (
 from ...image_utils import (
     IMAGENET_DEFAULT_MEAN,
     IMAGENET_DEFAULT_STD,
+    AnnotationFormat,
+    AnnotationType,
+    AnnotionFormat,  # noqa: F401
     ChannelDimension,
     ImageInput,
     PILImageResampling,
@@ -45,12 +48,10 @@ from ...image_utils import (
     is_scaled_image,
     make_list_of_images,
     to_numpy_array,
-    valid_coco_detection_annotations,
-    valid_coco_panoptic_annotations,
     valid_images,
+    validate_annotations,
 )
 from ...utils import (
-    ExplicitEnum,
     TensorType,
     is_flax_available,
     is_jax_tensor,
@@ -79,15 +80,7 @@ if is_scipy_available():
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
-AnnotationType = Dict[str, Union[int, str, List[Dict]]]
-
-
-class AnnotionFormat(ExplicitEnum):
-    COCO_DETECTION = "coco_detection"
-    COCO_PANOPTIC = "coco_panoptic"
-
-
-SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION, AnnotionFormat.COCO_PANOPTIC)
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
 
 
 # Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio
@@ -801,7 +794,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
     # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__
     def __init__(
         self,
-        format: Union[str, AnnotionFormat] = AnnotionFormat.COCO_DETECTION,
+        format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
         do_resize: bool = True,
         size: Dict[str, int] = None,
         resample: PILImageResampling = PILImageResampling.BILINEAR,
@@ -860,7 +853,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
         self,
         image: np.ndarray,
         target: Dict,
-        format: Optional[AnnotionFormat] = None,
+        format: Optional[AnnotationFormat] = None,
         return_segmentation_masks: bool = None,
         masks_path: Optional[Union[str, pathlib.Path]] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -870,12 +863,12 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
         """
         format = format if format is not None else self.format
 
-        if format == AnnotionFormat.COCO_DETECTION:
+        if format == AnnotationFormat.COCO_DETECTION:
             return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
             target = prepare_coco_detection_annotation(
                 image, target, return_segmentation_masks, input_data_format=input_data_format
             )
-        elif format == AnnotionFormat.COCO_PANOPTIC:
+        elif format == AnnotationFormat.COCO_PANOPTIC:
             return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
             target = prepare_coco_panoptic_annotation(
                 image,
@@ -1117,7 +1110,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
         image_mean: Optional[Union[float, List[float]]] = None,
         image_std: Optional[Union[float, List[float]]] = None,
         do_pad: Optional[bool] = None,
-        format: Optional[Union[str, AnnotionFormat]] = None,
+        format: Optional[Union[str, AnnotationFormat]] = None,
         return_tensors: Optional[Union[TensorType, str]] = None,
         data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -1163,7 +1156,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
                 Standard deviation to use when normalizing the image.
             do_pad (`bool`, *optional*, defaults to self.do_pad):
                 Whether to pad the image.
-            format (`str` or `AnnotionFormat`, *optional*, defaults to self.format):
+            format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                 Format of the annotations.
             return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
                 Type of tensors to return. If `None`, will return the list of images.
@@ -1230,28 +1223,13 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
                 "torch.Tensor, tf.Tensor or jax.ndarray."
             )
 
-        format = AnnotionFormat(format)
+        format = AnnotationFormat(format)
         if annotations is not None:
-            if format == AnnotionFormat.COCO_DETECTION and not valid_coco_detection_annotations(annotations):
-                raise ValueError(
-                    "Invalid COCO detection annotations. Annotations must a dict (single image) of list of dicts "
-                    "(batch of images) with the following keys: `image_id` and `annotations`, with the latter "
-                    "being a list of annotations in the COCO format."
-                )
-            elif format == AnnotionFormat.COCO_PANOPTIC and not valid_coco_panoptic_annotations(annotations):
-                raise ValueError(
-                    "Invalid COCO panoptic annotations. Annotations must a dict (single image) of list of dicts "
-                    "(batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with "
-                    "the latter being a list of annotations in the COCO format."
-                )
-            elif format not in SUPPORTED_ANNOTATION_FORMATS:
-                raise ValueError(
-                    f"Unsupported annotation format: {format} must be one of {SUPPORTED_ANNOTATION_FORMATS}"
-                )
+            validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
 
         if (
             masks_path is not None
-            and format == AnnotionFormat.COCO_PANOPTIC
+            and format == AnnotationFormat.COCO_PANOPTIC
             and not isinstance(masks_path, (pathlib.Path, str))
         ):
             raise ValueError(
diff --git a/src/transformers/models/deta/image_processing_deta.py b/src/transformers/models/deta/image_processing_deta.py
index 619139dae5..1e3ece8e32 100644
--- a/src/transformers/models/deta/image_processing_deta.py
+++ b/src/transformers/models/deta/image_processing_deta.py
@@ -34,6 +34,8 @@ from ...image_transforms import (
 from ...image_utils import (
     IMAGENET_DEFAULT_MEAN,
     IMAGENET_DEFAULT_STD,
+    AnnotationFormat,
+    AnnotionFormat,  # noqa: F401
     ChannelDimension,
     ImageInput,
     PILImageResampling,
@@ -42,9 +44,8 @@ from ...image_utils import (
     is_batched,
     is_scaled_image,
     to_numpy_array,
-    valid_coco_detection_annotations,
-    valid_coco_panoptic_annotations,
     valid_images,
+    validate_annotations,
 )
 from ...utils import (
     is_flax_available,
@@ -57,7 +58,7 @@ from ...utils import (
     is_vision_available,
     logging,
 )
-from ...utils.generic import ExplicitEnum, TensorType
+from ...utils.generic import TensorType
 
 
 if is_torch_available():
@@ -73,13 +74,7 @@ if is_vision_available():
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
-
-class AnnotionFormat(ExplicitEnum):
-    COCO_DETECTION = "coco_detection"
-    COCO_PANOPTIC = "coco_panoptic"
-
-
-SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION, AnnotionFormat.COCO_PANOPTIC)
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
 
 
 # Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio
@@ -507,7 +502,7 @@ class DetaImageProcessor(BaseImageProcessor):
 
     def __init__(
         self,
-        format: Union[str, AnnotionFormat] = AnnotionFormat.COCO_DETECTION,
+        format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
         do_resize: bool = True,
         size: Dict[str, int] = None,
         resample: PILImageResampling = PILImageResampling.BILINEAR,
@@ -542,7 +537,7 @@ class DetaImageProcessor(BaseImageProcessor):
         self,
         image: np.ndarray,
         target: Dict,
-        format: Optional[AnnotionFormat] = None,
+        format: Optional[AnnotationFormat] = None,
         return_segmentation_masks: bool = None,
         masks_path: Optional[Union[str, pathlib.Path]] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -552,12 +547,12 @@ class DetaImageProcessor(BaseImageProcessor):
         """
         format = format if format is not None else self.format
 
-        if format == AnnotionFormat.COCO_DETECTION:
+        if format == AnnotationFormat.COCO_DETECTION:
             return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
             target = prepare_coco_detection_annotation(
                 image, target, return_segmentation_masks, input_data_format=input_data_format
             )
-        elif format == AnnotionFormat.COCO_PANOPTIC:
+        elif format == AnnotationFormat.COCO_PANOPTIC:
             return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
             target = prepare_coco_panoptic_annotation(
                 image,
@@ -789,7 +784,7 @@ class DetaImageProcessor(BaseImageProcessor):
         image_mean: Optional[Union[float, List[float]]] = None,
         image_std: Optional[Union[float, List[float]]] = None,
         do_pad: Optional[bool] = None,
-        format: Optional[Union[str, AnnotionFormat]] = None,
+        format: Optional[Union[str, AnnotationFormat]] = None,
         return_tensors: Optional[Union[TensorType, str]] = None,
         data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -803,12 +798,12 @@ class DetaImageProcessor(BaseImageProcessor):
                 Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging
                 from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`.
             annotations (`List[Dict]` or `List[List[Dict]]`, *optional*):
-                List of annotations associated with the image or batch of images. If annotionation is for object
+                List of annotations associated with the image or batch of images. If annotation is for object
                 detection, the annotations should be a dictionary with the following keys:
                 - "image_id" (`int`): The image id.
                 - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a
                   dictionary. An image can have no annotations, in which case the list should be empty.
-                If annotionation is for segmentation, the annotations should be a dictionary with the following keys:
+                If annotation is for segmentation, the annotations should be a dictionary with the following keys:
                 - "image_id" (`int`): The image id.
                 - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary.
                   An image can have no segments, in which case the list should be empty.
@@ -835,7 +830,7 @@ class DetaImageProcessor(BaseImageProcessor):
                 Standard deviation to use when normalizing the image.
             do_pad (`bool`, *optional*, defaults to self.do_pad):
                 Whether to pad the image.
-            format (`str` or `AnnotionFormat`, *optional*, defaults to self.format):
+            format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                 Format of the annotations.
             return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
                 Type of tensors to return. If `None`, will return the list of images.
@@ -894,28 +889,13 @@ class DetaImageProcessor(BaseImageProcessor):
                 "torch.Tensor, tf.Tensor or jax.ndarray."
             )
 
-        format = AnnotionFormat(format)
+        format = AnnotationFormat(format)
         if annotations is not None:
-            if format == AnnotionFormat.COCO_DETECTION and not valid_coco_detection_annotations(annotations):
-                raise ValueError(
-                    "Invalid COCO detection annotations. Annotations must a dict (single image) of list of dicts "
-                    "(batch of images) with the following keys: `image_id` and `annotations`, with the latter "
-                    "being a list of annotations in the COCO format."
-                )
-            elif format == AnnotionFormat.COCO_PANOPTIC and not valid_coco_panoptic_annotations(annotations):
-                raise ValueError(
-                    "Invalid COCO panoptic annotations. Annotations must a dict (single image) of list of dicts "
-                    "(batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with "
-                    "the latter being a list of annotations in the COCO format."
-                )
-            elif format not in SUPPORTED_ANNOTATION_FORMATS:
-                raise ValueError(
-                    f"Unsupported annotation format: {format} must be one of {SUPPORTED_ANNOTATION_FORMATS}"
-                )
+            validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
 
         if (
             masks_path is not None
-            and format == AnnotionFormat.COCO_PANOPTIC
+            and format == AnnotationFormat.COCO_PANOPTIC
             and not isinstance(masks_path, (pathlib.Path, str))
         ):
             raise ValueError(
diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py
index c4e4b97c63..8b64b9c4d9 100644
--- a/src/transformers/models/detr/image_processing_detr.py
+++ b/src/transformers/models/detr/image_processing_detr.py
@@ -36,6 +36,9 @@ from ...image_transforms import (
 from ...image_utils import (
     IMAGENET_DEFAULT_MEAN,
     IMAGENET_DEFAULT_STD,
+    AnnotationFormat,
+    AnnotationType,
+    AnnotionFormat,  # noqa: F401
     ChannelDimension,
     ImageInput,
     PILImageResampling,
@@ -44,12 +47,10 @@ from ...image_utils import (
     is_scaled_image,
     make_list_of_images,
     to_numpy_array,
-    valid_coco_detection_annotations,
-    valid_coco_panoptic_annotations,
     valid_images,
+    validate_annotations,
 )
 from ...utils import (
-    ExplicitEnum,
     TensorType,
     is_flax_available,
     is_jax_tensor,
@@ -79,15 +80,7 @@ if is_scipy_available():
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
-AnnotationType = Dict[str, Union[int, str, List[Dict]]]
-
-
-class AnnotionFormat(ExplicitEnum):
-    COCO_DETECTION = "coco_detection"
-    COCO_PANOPTIC = "coco_panoptic"
-
-
-SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION, AnnotionFormat.COCO_PANOPTIC)
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
 
 
 def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]:
@@ -785,7 +778,7 @@ class DetrImageProcessor(BaseImageProcessor):
 
     def __init__(
         self,
-        format: Union[str, AnnotionFormat] = AnnotionFormat.COCO_DETECTION,
+        format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
         do_resize: bool = True,
         size: Dict[str, int] = None,
         resample: PILImageResampling = PILImageResampling.BILINEAR,
@@ -842,7 +835,7 @@ class DetrImageProcessor(BaseImageProcessor):
         self,
         image: np.ndarray,
         target: Dict,
-        format: Optional[AnnotionFormat] = None,
+        format: Optional[AnnotationFormat] = None,
         return_segmentation_masks: bool = None,
         masks_path: Optional[Union[str, pathlib.Path]] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -852,12 +845,12 @@ class DetrImageProcessor(BaseImageProcessor):
         """
         format = format if format is not None else self.format
 
-        if format == AnnotionFormat.COCO_DETECTION:
+        if format == AnnotationFormat.COCO_DETECTION:
             return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
             target = prepare_coco_detection_annotation(
                 image, target, return_segmentation_masks, input_data_format=input_data_format
             )
-        elif format == AnnotionFormat.COCO_PANOPTIC:
+        elif format == AnnotationFormat.COCO_PANOPTIC:
             return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
             target = prepare_coco_panoptic_annotation(
                 image,
@@ -1089,7 +1082,7 @@ class DetrImageProcessor(BaseImageProcessor):
         image_mean: Optional[Union[float, List[float]]] = None,
         image_std: Optional[Union[float, List[float]]] = None,
         do_pad: Optional[bool] = None,
-        format: Optional[Union[str, AnnotionFormat]] = None,
+        format: Optional[Union[str, AnnotationFormat]] = None,
         return_tensors: Optional[Union[TensorType, str]] = None,
         data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -1135,7 +1128,7 @@ class DetrImageProcessor(BaseImageProcessor):
                 Standard deviation to use when normalizing the image.
             do_pad (`bool`, *optional*, defaults to self.do_pad):
                 Whether to pad the image.
-            format (`str` or `AnnotionFormat`, *optional*, defaults to self.format):
+            format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                 Format of the annotations.
             return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
                 Type of tensors to return. If `None`, will return the list of images.
@@ -1202,28 +1195,13 @@ class DetrImageProcessor(BaseImageProcessor):
                 "torch.Tensor, tf.Tensor or jax.ndarray."
             )
 
-        format = AnnotionFormat(format)
+        format = AnnotationFormat(format)
         if annotations is not None:
-            if format == AnnotionFormat.COCO_DETECTION and not valid_coco_detection_annotations(annotations):
-                raise ValueError(
-                    "Invalid COCO detection annotations. Annotations must a dict (single image) of list of dicts "
-                    "(batch of images) with the following keys: `image_id` and `annotations`, with the latter "
-                    "being a list of annotations in the COCO format."
-                )
-            elif format == AnnotionFormat.COCO_PANOPTIC and not valid_coco_panoptic_annotations(annotations):
-                raise ValueError(
-                    "Invalid COCO panoptic annotations. Annotations must a dict (single image) of list of dicts "
-                    "(batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with "
-                    "the latter being a list of annotations in the COCO format."
-                )
-            elif format not in SUPPORTED_ANNOTATION_FORMATS:
-                raise ValueError(
-                    f"Unsupported annotation format: {format} must be one of {SUPPORTED_ANNOTATION_FORMATS}"
-                )
+            validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
 
         if (
             masks_path is not None
-            and format == AnnotionFormat.COCO_PANOPTIC
+            and format == AnnotationFormat.COCO_PANOPTIC
             and not isinstance(masks_path, (pathlib.Path, str))
         ):
             raise ValueError(
diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py
index 074b1df4d7..4b59fd5ef0 100644
--- a/src/transformers/models/yolos/image_processing_yolos.py
+++ b/src/transformers/models/yolos/image_processing_yolos.py
@@ -35,6 +35,9 @@ from ...image_transforms import (
 from ...image_utils import (
     IMAGENET_DEFAULT_MEAN,
     IMAGENET_DEFAULT_STD,
+    AnnotationFormat,
+    AnnotationType,
+    AnnotionFormat,  # noqa: F401
     ChannelDimension,
     ImageInput,
     PILImageResampling,
@@ -43,12 +46,10 @@ from ...image_utils import (
     is_scaled_image,
     make_list_of_images,
     to_numpy_array,
-    valid_coco_detection_annotations,
-    valid_coco_panoptic_annotations,
     valid_images,
+    validate_annotations,
 )
 from ...utils import (
-    ExplicitEnum,
     TensorType,
     is_flax_available,
     is_jax_tensor,
@@ -77,15 +78,7 @@ if is_scipy_available():
 
 logger = logging.get_logger(__name__)
 
-AnnotationType = Dict[str, Union[int, str, List[Dict]]]
-
-
-class AnnotionFormat(ExplicitEnum):
-    COCO_DETECTION = "coco_detection"
-    COCO_PANOPTIC = "coco_panoptic"
-
-
-SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION, AnnotionFormat.COCO_PANOPTIC)
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
 
 
 # Copied from transformers.models.detr.image_processing_detr.get_max_height_width
@@ -712,7 +705,7 @@ class YolosImageProcessor(BaseImageProcessor):
 
     def __init__(
         self,
-        format: Union[str, AnnotionFormat] = AnnotionFormat.COCO_DETECTION,
+        format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
         do_resize: bool = True,
         size: Dict[str, int] = None,
         resample: PILImageResampling = PILImageResampling.BILINEAR,
@@ -771,7 +764,7 @@ class YolosImageProcessor(BaseImageProcessor):
         self,
         image: np.ndarray,
         target: Dict,
-        format: Optional[AnnotionFormat] = None,
+        format: Optional[AnnotationFormat] = None,
         return_segmentation_masks: bool = None,
         masks_path: Optional[Union[str, pathlib.Path]] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -781,12 +774,12 @@ class YolosImageProcessor(BaseImageProcessor):
         """
         format = format if format is not None else self.format
 
-        if format == AnnotionFormat.COCO_DETECTION:
+        if format == AnnotationFormat.COCO_DETECTION:
             return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
             target = prepare_coco_detection_annotation(
                 image, target, return_segmentation_masks, input_data_format=input_data_format
             )
-        elif format == AnnotionFormat.COCO_PANOPTIC:
+        elif format == AnnotationFormat.COCO_PANOPTIC:
             return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
             target = prepare_coco_panoptic_annotation(
                 image,
@@ -1026,7 +1019,7 @@ class YolosImageProcessor(BaseImageProcessor):
         image_mean: Optional[Union[float, List[float]]] = None,
         image_std: Optional[Union[float, List[float]]] = None,
         do_pad: Optional[bool] = None,
-        format: Optional[Union[str, AnnotionFormat]] = None,
+        format: Optional[Union[str, AnnotationFormat]] = None,
         return_tensors: Optional[Union[TensorType, str]] = None,
         data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -1040,12 +1033,12 @@ class YolosImageProcessor(BaseImageProcessor):
                 Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging
                 from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`.
             annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
-                List of annotations associated with the image or batch of images. If annotionation is for object
+                List of annotations associated with the image or batch of images. If annotation is for object
                 detection, the annotations should be a dictionary with the following keys:
                 - "image_id" (`int`): The image id.
                 - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a
                   dictionary. An image can have no annotations, in which case the list should be empty.
-                If annotionation is for segmentation, the annotations should be a dictionary with the following keys:
+                If annotation is for segmentation, the annotations should be a dictionary with the following keys:
                 - "image_id" (`int`): The image id.
                 - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary.
                   An image can have no segments, in which case the list should be empty.
@@ -1072,7 +1065,7 @@ class YolosImageProcessor(BaseImageProcessor):
                 Standard deviation to use when normalizing the image.
             do_pad (`bool`, *optional*, defaults to self.do_pad):
                 Whether to pad the image.
-            format (`str` or `AnnotionFormat`, *optional*, defaults to self.format):
+            format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                 Format of the annotations.
             return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
                 Type of tensors to return. If `None`, will return the list of images.
@@ -1136,28 +1129,13 @@ class YolosImageProcessor(BaseImageProcessor):
                 "torch.Tensor, tf.Tensor or jax.ndarray."
             )
 
-        format = AnnotionFormat(format)
+        format = AnnotationFormat(format)
         if annotations is not None:
-            if format == AnnotionFormat.COCO_DETECTION and not valid_coco_detection_annotations(annotations):
-                raise ValueError(
-                    "Invalid COCO detection annotations. Annotations must a dict (single image) of list of dicts "
-                    "(batch of images) with the following keys: `image_id` and `annotations`, with the latter "
-                    "being a list of annotations in the COCO format."
-                )
-            elif format == AnnotionFormat.COCO_PANOPTIC and not valid_coco_panoptic_annotations(annotations):
-                raise ValueError(
-                    "Invalid COCO panoptic annotations. Annotations must a dict (single image) of list of dicts "
-                    "(batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with "
-                    "the latter being a list of annotations in the COCO format."
-                )
-            elif format not in SUPPORTED_ANNOTATION_FORMATS:
-                raise ValueError(
-                    f"Unsupported annotation format: {format} must be one of {SUPPORTED_ANNOTATION_FORMATS}"
-                )
+            validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
 
         if (
             masks_path is not None
-            and format == AnnotionFormat.COCO_PANOPTIC
+            and format == AnnotationFormat.COCO_PANOPTIC
             and not isinstance(masks_path, (pathlib.Path, str))
         ):
             raise ValueError(
diff --git a/tests/models/conditional_detr/test_image_processing_conditional_detr.py b/tests/models/conditional_detr/test_image_processing_conditional_detr.py
index cc3fd501a6..4b18a6ecd7 100644
--- a/tests/models/conditional_detr/test_image_processing_conditional_detr.py
+++ b/tests/models/conditional_detr/test_image_processing_conditional_detr.py
@@ -21,7 +21,7 @@ import unittest
 from transformers.testing_utils import require_torch, require_vision, slow
 from transformers.utils import is_torch_available, is_vision_available
 
-from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
+from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs
 
 
 if is_torch_available():
@@ -127,7 +127,7 @@ class ConditionalDetrImageProcessingTester(unittest.TestCase):
 
 @require_torch
 @require_vision
-class ConditionalDetrImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = ConditionalDetrImageProcessor if is_vision_available() else None
 
     def setUp(self):
diff --git a/tests/models/deformable_detr/test_image_processing_deformable_detr.py b/tests/models/deformable_detr/test_image_processing_deformable_detr.py
index 4fd2de49f7..ec65f7b9a5 100644
--- a/tests/models/deformable_detr/test_image_processing_deformable_detr.py
+++ b/tests/models/deformable_detr/test_image_processing_deformable_detr.py
@@ -21,7 +21,7 @@ import unittest
 from transformers.testing_utils import require_torch, require_vision, slow
 from transformers.utils import is_torch_available, is_vision_available
 
-from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
+from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs
 
 
 if is_torch_available():
@@ -127,7 +127,7 @@ class DeformableDetrImageProcessingTester(unittest.TestCase):
 
 @require_torch
 @require_vision
-class DeformableDetrImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = DeformableDetrImageProcessor if is_vision_available() else None
 
     def setUp(self):
diff --git a/tests/models/deta/test_image_processing_deta.py b/tests/models/deta/test_image_processing_deta.py
index 7cde8474bf..1e48147607 100644
--- a/tests/models/deta/test_image_processing_deta.py
+++ b/tests/models/deta/test_image_processing_deta.py
@@ -21,7 +21,7 @@ import unittest
 from transformers.testing_utils import require_torch, require_vision, slow
 from transformers.utils import is_torch_available, is_vision_available
 
-from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
+from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs
 
 
 if is_torch_available():
@@ -127,7 +127,7 @@ class DetaImageProcessingTester(unittest.TestCase):
 
 @require_torch
 @require_vision
-class DetaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+class DetaImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = DetaImageProcessor if is_vision_available() else None
 
     def setUp(self):
diff --git a/tests/models/detr/test_image_processing_detr.py b/tests/models/detr/test_image_processing_detr.py
index 2a095b259e..7a5cb9efed 100644
--- a/tests/models/detr/test_image_processing_detr.py
+++ b/tests/models/detr/test_image_processing_detr.py
@@ -21,7 +21,7 @@ import unittest
 from transformers.testing_utils import require_torch, require_vision, slow
 from transformers.utils import is_torch_available, is_vision_available
 
-from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
+from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs
 
 
 if is_torch_available():
@@ -127,7 +127,7 @@ class DetrImageProcessingTester(unittest.TestCase):
 
 @require_torch
 @require_vision
-class DetrImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = DetrImageProcessor if is_vision_available() else None
 
     def setUp(self):
@@ -159,6 +159,63 @@ class DetrImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
         self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84})
         self.assertEqual(image_processor.do_pad, False)
 
+    def test_should_raise_if_annotation_format_invalid(self):
+        image_processor_dict = self.image_processor_tester.prepare_image_processor_dict()
+
+        with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
+            detection_target = json.loads(f.read())
+
+        annotations = {"image_id": 39769, "annotations": detection_target}
+
+        params = {
+            "images": Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
+            "annotations": annotations,
+            "return_tensors": "pt",
+        }
+
+        image_processor_params = {**image_processor_dict, **{"format": "_INVALID_FORMAT_"}}
+        image_processor = self.image_processing_class(**image_processor_params)
+
+        with self.assertRaises(ValueError) as e:
+            image_processor(**params)
+
+        self.assertTrue(str(e.exception).startswith("_INVALID_FORMAT_ is not a valid AnnotationFormat"))
+
+    def test_valid_coco_detection_annotations(self):
+        # prepare image and target
+        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+        with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
+            target = json.loads(f.read())
+
+        params = {"image_id": 39769, "annotations": target}
+
+        # encode them
+        image_processing = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
+
+        # legal encodings (single image)
+        _ = image_processing(images=image, annotations=params, return_tensors="pt")
+        _ = image_processing(images=image, annotations=[params], return_tensors="pt")
+
+        # legal encodings (batch of one image)
+        _ = image_processing(images=[image], annotations=params, return_tensors="pt")
+        _ = image_processing(images=[image], annotations=[params], return_tensors="pt")
+
+        # legal encoding (batch of more than one image)
+        n = 5
+        _ = image_processing(images=[image] * n, annotations=[params] * n, return_tensors="pt")
+
+        # example of an illegal encoding (missing the 'image_id' key)
+        with self.assertRaises(ValueError) as e:
+            image_processing(images=image, annotations={"annotations": target}, return_tensors="pt")
+
+        self.assertTrue(str(e.exception).startswith("Invalid COCO detection annotations"))
+
+        # example of an illegal encoding (unequal lengths of images and annotations)
+        with self.assertRaises(ValueError) as e:
+            image_processing(images=[image] * n, annotations=[params] * (n - 1), return_tensors="pt")
+
+        self.assertTrue(str(e.exception) == "The number of images (5) and annotations (4) do not match.")
+
     @slow
     def test_call_pytorch_with_coco_detection_annotations(self):
         # prepare image and target
diff --git a/tests/models/yolos/test_image_processing_yolos.py b/tests/models/yolos/test_image_processing_yolos.py
index 003a006110..1039e4c91b 100644
--- a/tests/models/yolos/test_image_processing_yolos.py
+++ b/tests/models/yolos/test_image_processing_yolos.py
@@ -21,7 +21,7 @@ import unittest
 from transformers.testing_utils import require_torch, require_vision, slow
 from transformers.utils import is_torch_available, is_vision_available
 
-from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
+from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs
 
 
 if is_torch_available():
@@ -127,7 +127,7 @@ class YolosImageProcessingTester(unittest.TestCase):
 
 @require_torch
 @require_vision
-class YolosImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = YolosImageProcessor if is_vision_available() else None
 
     def setUp(self):
diff --git a/tests/test_image_processing_common.py b/tests/test_image_processing_common.py
index cb78b33375..dcbee270f9 100644
--- a/tests/test_image_processing_common.py
+++ b/tests/test_image_processing_common.py
@@ -15,8 +15,11 @@
 
 import json
 import os
+import pathlib
 import tempfile
 
+from transformers import BatchFeature
+from transformers.image_utils import AnnotationFormat, AnnotionFormat
 from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_vision
 from transformers.utils import is_torch_available, is_vision_available
 
@@ -285,3 +288,81 @@ class ImageProcessingTestMixin:
         self.assertEqual(
             tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
         )
+
+
+class AnnotationFormatTestMixin:
+    # this mixin adds a test to assert that usages of the
+    # to-be-deprecated `AnnotionFormat` continue to be
+    # supported for the time being
+
+    def test_processor_can_use_legacy_annotation_format(self):
+        image_processor_dict = self.image_processor_tester.prepare_image_processor_dict()
+        fixtures_path = pathlib.Path(__file__).parent / "fixtures" / "tests_samples" / "COCO"
+
+        with open(fixtures_path / "coco_annotations.txt", "r") as f:
+            detection_target = json.loads(f.read())
+
+        detection_annotations = {"image_id": 39769, "annotations": detection_target}
+
+        detection_params = {
+            "images": Image.open(fixtures_path / "000000039769.png"),
+            "annotations": detection_annotations,
+            "return_tensors": "pt",
+        }
+
+        with open(fixtures_path / "coco_panoptic_annotations.txt", "r") as f:
+            panoptic_target = json.loads(f.read())
+
+        panoptic_annotations = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": panoptic_target}
+
+        masks_path = pathlib.Path(fixtures_path / "coco_panoptic")
+
+        panoptic_params = {
+            "images": Image.open(fixtures_path / "000000039769.png"),
+            "annotations": panoptic_annotations,
+            "return_tensors": "pt",
+            "masks_path": masks_path,
+        }
+
+        test_cases = [
+            ("coco_detection", detection_params),
+            ("coco_panoptic", panoptic_params),
+            (AnnotionFormat.COCO_DETECTION, detection_params),
+            (AnnotionFormat.COCO_PANOPTIC, panoptic_params),
+            (AnnotationFormat.COCO_DETECTION, detection_params),
+            (AnnotationFormat.COCO_PANOPTIC, panoptic_params),
+        ]
+
+        def _compare(a, b) -> None:
+            if isinstance(a, (dict, BatchFeature)):
+                self.assertEqual(a.keys(), b.keys())
+                for k, v in a.items():
+                    _compare(v, b[k])
+            elif isinstance(a, list):
+                self.assertEqual(len(a), len(b))
+                for idx in range(len(a)):
+                    _compare(a[idx], b[idx])
+            elif isinstance(a, torch.Tensor):
+                self.assertTrue(torch.allclose(a, b, atol=1e-3))
+            elif isinstance(a, str):
+                self.assertEqual(a, b)
+
+        for annotation_format, params in test_cases:
+            with self.subTest(annotation_format):
+                image_processor_params = {**image_processor_dict, **{"format": annotation_format}}
+                image_processor_first = self.image_processing_class(**image_processor_params)
+
+                with tempfile.TemporaryDirectory() as tmpdirname:
+                    image_processor_first.save_pretrained(tmpdirname)
+                    image_processor_second = self.image_processing_class.from_pretrained(tmpdirname)
+
+                # check the 'format' key exists and that the dicts of the
+                # first and second processors are equal
+                self.assertIn("format", image_processor_first.to_dict().keys())
+                self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict())
+
+                # perform encoding using both processors and compare
+                # the resulting BatchFeatures
+                first_encoding = image_processor_first(**params)
+                second_encoding = image_processor_second(**params)
+                _compare(first_encoding, second_encoding)