[DETR] Update the processing to adapt masks & bboxes to reflect padding (#28363)

* Update the processing so bbox coords are adjusted for padding * Just pad masks * Tidy up, add tests * Better tests * Fix yolos and mark as slow for pycocotools * Fix yolos - return_tensors * Clarify padding and normalization behaviour
2024-02-13 18:27:06 +00:00
parent 3de6a6b493
commit bd4b83e1ba
15 changed files with 1820 additions and 138 deletions
--- a/src/transformers/models/bridgetower/image_processing_bridgetower.py
+++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py
@@ -280,7 +280,7 @@ class BridgeTowerImageProcessor(BaseImageProcessor):
            **kwargs,
        )
-    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
+    # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
    def _pad_image(
        self,
        image: np.ndarray,
@@ -308,7 +308,7 @@ class BridgeTowerImageProcessor(BaseImageProcessor):
        )
        return padded_image
-    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
+    # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
    def pad(
        self,
        images: List[np.ndarray],
--- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py
+++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py
@@ -785,9 +785,14 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
            Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
            for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_convert_annotations (`bool`, *optional*, defaults to `True`):
            Controls whether to convert the annotations to the format expected by the DETR model. Converts the
            bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
            Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
        do_pad (`bool`, *optional*, defaults to `True`):
-            Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
+            Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
-            overridden by the `do_pad` parameter in the `preprocess` method.
+            method. If `True` will pad the images in the batch to the largest height and width in the batch.
            Padding will be applied to the bottom and right of the image with zeros.
    """
    model_input_names = ["pixel_values", "pixel_mask"]
@@ -804,6 +809,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
        do_normalize: bool = True,
        image_mean: Union[float, List[float]] = None,
        image_std: Union[float, List[float]] = None,
        do_convert_annotations: Optional[bool] = None,
        do_pad: bool = True,
        **kwargs,
    ) -> None:
@@ -822,6 +828,10 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
        size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
        size = get_size_dict(size, max_size=max_size, default_to_square=False)
        # Backwards compatibility
        if do_convert_annotations is None:
            do_convert_annotations = do_normalize
        super().__init__(**kwargs)
        self.format = format
        self.do_resize = do_resize
@@ -830,6 +840,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.do_convert_annotations = do_convert_annotations
        self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
        self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
        self.do_pad = do_pad
@@ -1007,18 +1018,64 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
    def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
        """
        Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
-        `[center_x, center_y, width, height]` format.
+        `[center_x, center_y, width, height]` format and from absolute to relative pixel values.
        """
        return normalize_annotation(annotation, image_size=image_size)
    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
    def _update_annotation_for_padded_image(
        self,
        annotation: Dict,
        input_image_size: Tuple[int, int],
        output_image_size: Tuple[int, int],
        padding,
        update_bboxes,
    ) -> Dict:
        """
        Update the annotation for a padded image.

        A new annotation dict is built; neither the input dict nor the arrays it holds are modified.

        Args:
            annotation (`Dict`):
                Annotation of the image before padding.
            input_image_size (`Tuple[int, int]`):
                `(height, width)` of the image before padding.
            output_image_size (`Tuple[int, int]`):
                `(height, width)` of the image after padding. Stored under the `"size"` key of the result.
            padding:
                Padding specification forwarded unchanged to `pad` when padding the `"masks"` entry.
            update_bboxes (`bool`):
                Whether to rescale the `"boxes"` entry by the input/output size ratio. When falsy the boxes are
                passed through untouched.

        Returns:
            `Dict`: The annotation with padded `"masks"`, optionally rescaled `"boxes"`, `"size"` set to
            `output_image_size` and every other entry passed through as-is.
        """
        # NOTE(review): this method sits under a `# Copied from ...DetrImageProcessor` marker; the DETR original
        # presumably needs the same change to stay in sync - TODO confirm.
        new_annotation = {"size": output_image_size}
        for key, value in annotation.items():
            if key == "masks":
                padded_masks = pad(
                    value,
                    padding,
                    mode=PaddingMode.CONSTANT,
                    constant_values=0,
                    input_data_format=ChannelDimension.FIRST,
                )
                new_annotation["masks"] = safe_squeeze(padded_masks, 1)
            elif key == "boxes" and update_bboxes:
                width_ratio = input_image_size[1] / output_image_size[1]
                height_ratio = input_image_size[0] / output_image_size[0]
                # Scale a copy: multiplying `value` in place would silently alter the caller's annotation and
                # rescale the boxes again on every repeated call. The in-place multiply on the copy keeps the
                # original dtype of the boxes.
                boxes = np.array(value)
                boxes *= np.asarray([width_ratio, height_ratio, width_ratio, height_ratio])
                new_annotation["boxes"] = boxes
            elif key != "size":
                # "size" was already set to the padded size above; everything else is forwarded unchanged.
                new_annotation[key] = value
        return new_annotation
    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
    def _pad_image(
        self,
        image: np.ndarray,
        output_size: Tuple[int, int],
        annotation: Optional[Dict[str, Any]] = None,
        constant_values: Union[float, Iterable[float]] = 0,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        update_bboxes: bool = True,
    ) -> np.ndarray:
        """
        Pad an image with zeros to the given size.
@@ -1037,25 +1094,33 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
            data_format=data_format,
            input_data_format=input_data_format,
        )
-        return padded_image
+        if annotation is not None:
            annotation = self._update_annotation_for_padded_image(
                annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
            )
        return padded_image, annotation
    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
    def pad(
        self,
        images: List[np.ndarray],
        annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
        constant_values: Union[float, Iterable[float]] = 0,
        return_pixel_mask: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        update_bboxes: bool = True,
    ) -> BatchFeature:
        """
        Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
        in the batch and optionally returns their corresponding pixel mask.
        Args:
-            image (`np.ndarray`):
+            images (List[`np.ndarray`]):
-                Image to pad.
+                Images to pad.
            annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
                Annotations to transform according to the padding that is applied to the images.
            constant_values (`float` or `Iterable[float]`, *optional*):
                The value to use for the padding if `mode` is `"constant"`.
            return_pixel_mask (`bool`, *optional*, defaults to `True`):
@@ -1071,19 +1136,29 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
            update_bboxes (`bool`, *optional*, defaults to `True`):
                Whether to update the bounding boxes in the annotations to match the padded images. If the
                bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
                format, the bounding boxes will not be updated.
        """
        pad_size = get_max_height_width(images, input_data_format=input_data_format)
-        padded_images = [
+        annotation_list = annotations if annotations is not None else [None] * len(images)
-            self._pad_image(
+        padded_images = []
        padded_annotations = []
        for image, annotation in zip(images, annotation_list):
            padded_image, padded_annotation = self._pad_image(
                image,
                pad_size,
                annotation,
                constant_values=constant_values,
                data_format=data_format,
                input_data_format=input_data_format,
                update_bboxes=update_bboxes,
            )
-            for image in images
+            padded_images.append(padded_image)
-        ]
+            padded_annotations.append(padded_annotation)
        data = {"pixel_values": padded_images}
        if return_pixel_mask:
@@ -1093,7 +1168,14 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
            ]
            data["pixel_mask"] = masks
-        return BatchFeature(data=data, tensor_type=return_tensors)
+        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
        if annotations is not None:
            encoded_inputs["labels"] = [
                BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
            ]
        return encoded_inputs
    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess
    def preprocess(
@@ -1108,6 +1190,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[Union[int, float]] = None,
        do_normalize: Optional[bool] = None,
        do_convert_annotations: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_pad: Optional[bool] = None,
@@ -1151,12 +1234,17 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
                Rescale factor to use when rescaling the image.
            do_normalize (`bool`, *optional*, defaults to self.do_normalize):
                Whether to normalize the image.
            do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
                Whether to convert the annotations to the format expected by the model. Converts the bounding
                boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
                and in relative coordinates.
            image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
                Mean to use when normalizing the image.
            image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
                Standard deviation to use when normalizing the image.
            do_pad (`bool`, *optional*, defaults to self.do_pad):
-                Whether to pad the image.
+                Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
                and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
            format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                Format of the annotations.
            return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
@@ -1197,6 +1285,9 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
        do_normalize = self.do_normalize if do_normalize is None else do_normalize
        image_mean = self.image_mean if image_mean is None else image_mean
        image_std = self.image_std if image_std is None else image_std
        do_convert_annotations = (
            self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
        )
        do_pad = self.do_pad if do_pad is None else do_pad
        format = self.format if format is None else format
@@ -1300,29 +1391,34 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
            images = [
                self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
            ]
-            if annotations is not None:
+
-                annotations = [
+        if do_convert_annotations and annotations is not None:
-                    self.normalize_annotation(annotation, get_image_size(image, input_data_format))
+            annotations = [
-                    for annotation, image in zip(annotations, images)
+                self.normalize_annotation(annotation, get_image_size(image, input_data_format))
-                ]
+                for annotation, image in zip(annotations, images)
            ]
        if do_pad:
            # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
-            data = self.pad(
+            encoded_inputs = self.pad(
-                images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format
+                images,
                annotations=annotations,
                return_pixel_mask=True,
                data_format=data_format,
                input_data_format=input_data_format,
                return_tensors=return_tensors,
                update_bboxes=do_convert_annotations,
            )
        else:
            images = [
                to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
                for image in images
            ]
-            data = {"pixel_values": images}
+            encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
-
+            if annotations is not None:
-        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
+                encoded_inputs["labels"] = [
-        if annotations is not None:
+                    BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
-            encoded_inputs["labels"] = [
+                ]
                BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
            ]
        return encoded_inputs
--- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py
+++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py
@@ -783,9 +783,14 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
            Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
            for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_convert_annotations (`bool`, *optional*, defaults to `True`):
            Controls whether to convert the annotations to the format expected by the DETR model. Converts the
            bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
            Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
        do_pad (`bool`, *optional*, defaults to `True`):
-            Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
+            Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
-            overridden by the `do_pad` parameter in the `preprocess` method.
+            method. If `True` will pad the images in the batch to the largest height and width in the batch.
            Padding will be applied to the bottom and right of the image with zeros.
    """
    model_input_names = ["pixel_values", "pixel_mask"]
@@ -802,6 +807,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
        do_normalize: bool = True,
        image_mean: Union[float, List[float]] = None,
        image_std: Union[float, List[float]] = None,
        do_convert_annotations: Optional[bool] = None,
        do_pad: bool = True,
        **kwargs,
    ) -> None:
@@ -820,6 +826,10 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
        size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
        size = get_size_dict(size, max_size=max_size, default_to_square=False)
        # Backwards compatibility
        if do_convert_annotations is None:
            do_convert_annotations = do_normalize
        super().__init__(**kwargs)
        self.format = format
        self.do_resize = do_resize
@@ -828,6 +838,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.do_convert_annotations = do_convert_annotations
        self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
        self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
        self.do_pad = do_pad
@@ -1005,18 +1016,64 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
    def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
        """
        Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
-        `[center_x, center_y, width, height]` format.
+        `[center_x, center_y, width, height]` format and from absolute to relative pixel values.
        """
        return normalize_annotation(annotation, image_size=image_size)
    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
    def _update_annotation_for_padded_image(
        self,
        annotation: Dict,
        input_image_size: Tuple[int, int],
        output_image_size: Tuple[int, int],
        padding,
        update_bboxes,
    ) -> Dict:
        """
        Update the annotation for a padded image.

        A new annotation dict is built; neither the input dict nor the arrays it holds are modified.

        Args:
            annotation (`Dict`):
                Annotation of the image before padding.
            input_image_size (`Tuple[int, int]`):
                `(height, width)` of the image before padding.
            output_image_size (`Tuple[int, int]`):
                `(height, width)` of the image after padding. Stored under the `"size"` key of the result.
            padding:
                Padding specification forwarded unchanged to `pad` when padding the `"masks"` entry.
            update_bboxes (`bool`):
                Whether to rescale the `"boxes"` entry by the input/output size ratio. When falsy the boxes are
                passed through untouched.

        Returns:
            `Dict`: The annotation with padded `"masks"`, optionally rescaled `"boxes"`, `"size"` set to
            `output_image_size` and every other entry passed through as-is.
        """
        # NOTE(review): this method sits under a `# Copied from ...DetrImageProcessor` marker; the DETR original
        # presumably needs the same change to stay in sync - TODO confirm.
        new_annotation = {"size": output_image_size}
        for key, value in annotation.items():
            if key == "masks":
                padded_masks = pad(
                    value,
                    padding,
                    mode=PaddingMode.CONSTANT,
                    constant_values=0,
                    input_data_format=ChannelDimension.FIRST,
                )
                new_annotation["masks"] = safe_squeeze(padded_masks, 1)
            elif key == "boxes" and update_bboxes:
                width_ratio = input_image_size[1] / output_image_size[1]
                height_ratio = input_image_size[0] / output_image_size[0]
                # Scale a copy: multiplying `value` in place would silently alter the caller's annotation and
                # rescale the boxes again on every repeated call. The in-place multiply on the copy keeps the
                # original dtype of the boxes.
                boxes = np.array(value)
                boxes *= np.asarray([width_ratio, height_ratio, width_ratio, height_ratio])
                new_annotation["boxes"] = boxes
            elif key != "size":
                # "size" was already set to the padded size above; everything else is forwarded unchanged.
                new_annotation[key] = value
        return new_annotation
    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
    def _pad_image(
        self,
        image: np.ndarray,
        output_size: Tuple[int, int],
        annotation: Optional[Dict[str, Any]] = None,
        constant_values: Union[float, Iterable[float]] = 0,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        update_bboxes: bool = True,
    ) -> np.ndarray:
        """
        Pad an image with zeros to the given size.
@@ -1035,25 +1092,33 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
            data_format=data_format,
            input_data_format=input_data_format,
        )
-        return padded_image
+        if annotation is not None:
            annotation = self._update_annotation_for_padded_image(
                annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
            )
        return padded_image, annotation
    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
    def pad(
        self,
        images: List[np.ndarray],
        annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
        constant_values: Union[float, Iterable[float]] = 0,
        return_pixel_mask: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        update_bboxes: bool = True,
    ) -> BatchFeature:
        """
        Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
        in the batch and optionally returns their corresponding pixel mask.
        Args:
-            image (`np.ndarray`):
+            images (List[`np.ndarray`]):
-                Image to pad.
+                Images to pad.
            annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
                Annotations to transform according to the padding that is applied to the images.
            constant_values (`float` or `Iterable[float]`, *optional*):
                The value to use for the padding if `mode` is `"constant"`.
            return_pixel_mask (`bool`, *optional*, defaults to `True`):
@@ -1069,19 +1134,29 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
            update_bboxes (`bool`, *optional*, defaults to `True`):
                Whether to update the bounding boxes in the annotations to match the padded images. If the
                bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
                format, the bounding boxes will not be updated.
        """
        pad_size = get_max_height_width(images, input_data_format=input_data_format)
-        padded_images = [
+        annotation_list = annotations if annotations is not None else [None] * len(images)
-            self._pad_image(
+        padded_images = []
        padded_annotations = []
        for image, annotation in zip(images, annotation_list):
            padded_image, padded_annotation = self._pad_image(
                image,
                pad_size,
                annotation,
                constant_values=constant_values,
                data_format=data_format,
                input_data_format=input_data_format,
                update_bboxes=update_bboxes,
            )
-            for image in images
+            padded_images.append(padded_image)
-        ]
+            padded_annotations.append(padded_annotation)
        data = {"pixel_values": padded_images}
        if return_pixel_mask:
@@ -1091,7 +1166,14 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
            ]
            data["pixel_mask"] = masks
-        return BatchFeature(data=data, tensor_type=return_tensors)
+        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
        if annotations is not None:
            encoded_inputs["labels"] = [
                BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
            ]
        return encoded_inputs
    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess
    def preprocess(
@@ -1106,6 +1188,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[Union[int, float]] = None,
        do_normalize: Optional[bool] = None,
        do_convert_annotations: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_pad: Optional[bool] = None,
@@ -1149,12 +1232,17 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
                Rescale factor to use when rescaling the image.
            do_normalize (`bool`, *optional*, defaults to self.do_normalize):
                Whether to normalize the image.
            do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
                Whether to convert the annotations to the format expected by the model. Converts the bounding
                boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
                and in relative coordinates.
            image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
                Mean to use when normalizing the image.
            image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
                Standard deviation to use when normalizing the image.
            do_pad (`bool`, *optional*, defaults to self.do_pad):
-                Whether to pad the image.
+                Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
                and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
            format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                Format of the annotations.
            return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
@@ -1195,6 +1283,9 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
        do_normalize = self.do_normalize if do_normalize is None else do_normalize
        image_mean = self.image_mean if image_mean is None else image_mean
        image_std = self.image_std if image_std is None else image_std
        do_convert_annotations = (
            self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
        )
        do_pad = self.do_pad if do_pad is None else do_pad
        format = self.format if format is None else format
@@ -1298,29 +1389,34 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
            images = [
                self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
            ]
-            if annotations is not None:
+
-                annotations = [
+        if do_convert_annotations and annotations is not None:
-                    self.normalize_annotation(annotation, get_image_size(image, input_data_format))
+            annotations = [
-                    for annotation, image in zip(annotations, images)
+                self.normalize_annotation(annotation, get_image_size(image, input_data_format))
-                ]
+                for annotation, image in zip(annotations, images)
            ]
        if do_pad:
            # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
-            data = self.pad(
+            encoded_inputs = self.pad(
-                images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format
+                images,
                annotations=annotations,
                return_pixel_mask=True,
                data_format=data_format,
                input_data_format=input_data_format,
                return_tensors=return_tensors,
                update_bboxes=do_convert_annotations,
            )
        else:
            images = [
                to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
                for image in images
            ]
-            data = {"pixel_values": images}
+            encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
-
+            if annotations is not None:
-        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
+                encoded_inputs["labels"] = [
-        if annotations is not None:
+                    BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
-            encoded_inputs["labels"] = [
+                ]
                BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
            ]
        return encoded_inputs
--- a/src/transformers/models/deta/image_processing_deta.py
+++ b/src/transformers/models/deta/image_processing_deta.py
@@ -35,6 +35,7 @@ from ...image_utils import (
    IMAGENET_DEFAULT_MEAN,
    IMAGENET_DEFAULT_STD,
    AnnotationFormat,
    AnnotationType,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
@@ -492,9 +493,14 @@ class DetaImageProcessor(BaseImageProcessor):
        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
            Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
            for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_convert_annotations (`bool`, *optional*, defaults to `True`):
            Controls whether to convert the annotations to the format expected by the DETR model. Converts the
            bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
            Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
        do_pad (`bool`, *optional*, defaults to `True`):
-            Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
+            Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
-            overridden by the `do_pad` parameter in the `preprocess` method.
+            method. If `True` will pad the images in the batch to the largest height and width in the batch.
            Padding will be applied to the bottom and right of the image with zeros.
    """
    model_input_names = ["pixel_values", "pixel_mask"]
@@ -510,6 +516,7 @@ class DetaImageProcessor(BaseImageProcessor):
        do_normalize: bool = True,
        image_mean: Union[float, List[float]] = None,
        image_std: Union[float, List[float]] = None,
        do_convert_annotations: bool = True,
        do_pad: bool = True,
        **kwargs,
    ) -> None:
@@ -519,6 +526,9 @@ class DetaImageProcessor(BaseImageProcessor):
        size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
        size = get_size_dict(size, default_to_square=False)
        if do_convert_annotations is None:
            do_convert_annotations = do_normalize
        super().__init__(**kwargs)
        self.format = format
        self.do_resize = do_resize
@@ -527,6 +537,7 @@ class DetaImageProcessor(BaseImageProcessor):
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.do_convert_annotations = do_convert_annotations
        self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
        self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
        self.do_pad = do_pad
@@ -680,18 +691,64 @@ class DetaImageProcessor(BaseImageProcessor):
    def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
        """
        Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
-        `[center_x, center_y, width, height]` format.
+        `[center_x, center_y, width, height]` format and from absolute to relative pixel values.
        """
        return normalize_annotation(annotation, image_size=image_size)
    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
    def _update_annotation_for_padded_image(
        self,
        annotation: Dict,
        input_image_size: Tuple[int, int],
        output_image_size: Tuple[int, int],
        padding,
        update_bboxes,
    ) -> Dict:
        """
        Update the annotation for a padded image.
        """
        new_annotation = {}
        new_annotation["size"] = output_image_size
        for key, value in annotation.items():
            if key == "masks":
                masks = value
                masks = pad(
                    masks,
                    padding,
                    mode=PaddingMode.CONSTANT,
                    constant_values=0,
                    input_data_format=ChannelDimension.FIRST,
                )
                masks = safe_squeeze(masks, 1)
                new_annotation["masks"] = masks
            elif key == "boxes" and update_bboxes:
                boxes = value
                boxes *= np.asarray(
                    [
                        input_image_size[1] / output_image_size[1],
                        input_image_size[0] / output_image_size[0],
                        input_image_size[1] / output_image_size[1],
                        input_image_size[0] / output_image_size[0],
                    ]
                )
                new_annotation["boxes"] = boxes
            elif key == "size":
                new_annotation["size"] = output_image_size
            else:
                new_annotation[key] = value
        return new_annotation
    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
    def _pad_image(
        self,
        image: np.ndarray,
        output_size: Tuple[int, int],
        annotation: Optional[Dict[str, Any]] = None,
        constant_values: Union[float, Iterable[float]] = 0,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        update_bboxes: bool = True,
    ) -> np.ndarray:
        """
        Pad an image with zeros to the given size.
@@ -710,25 +767,33 @@ class DetaImageProcessor(BaseImageProcessor):
            data_format=data_format,
            input_data_format=input_data_format,
        )
-        return padded_image
+        if annotation is not None:
            annotation = self._update_annotation_for_padded_image(
                annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
            )
        return padded_image, annotation
    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
    def pad(
        self,
        images: List[np.ndarray],
        annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
        constant_values: Union[float, Iterable[float]] = 0,
        return_pixel_mask: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        update_bboxes: bool = True,
    ) -> BatchFeature:
        """
        Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
        in the batch and optionally returns their corresponding pixel mask.
        Args:
-            image (`np.ndarray`):
+            images (List[`np.ndarray`]):
-                Image to pad.
+                Images to pad.
            annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
                Annotations to transform according to the padding that is applied to the images.
            constant_values (`float` or `Iterable[float]`, *optional*):
                The value to use for the padding if `mode` is `"constant"`.
            return_pixel_mask (`bool`, *optional*, defaults to `True`):
@@ -744,19 +809,29 @@ class DetaImageProcessor(BaseImageProcessor):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
            update_bboxes (`bool`, *optional*, defaults to `True`):
                Whether to update the bounding boxes in the annotations to match the padded images. If the
                bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
                format, the bounding boxes will not be updated.
        """
        pad_size = get_max_height_width(images, input_data_format=input_data_format)
-        padded_images = [
+        annotation_list = annotations if annotations is not None else [None] * len(images)
-            self._pad_image(
+        padded_images = []
        padded_annotations = []
        for image, annotation in zip(images, annotation_list):
            padded_image, padded_annotation = self._pad_image(
                image,
                pad_size,
                annotation,
                constant_values=constant_values,
                data_format=data_format,
                input_data_format=input_data_format,
                update_bboxes=update_bboxes,
            )
-            for image in images
+            padded_images.append(padded_image)
-        ]
+            padded_annotations.append(padded_annotation)
        data = {"pixel_values": padded_images}
        if return_pixel_mask:
@@ -766,7 +841,14 @@ class DetaImageProcessor(BaseImageProcessor):
            ]
            data["pixel_mask"] = masks
-        return BatchFeature(data=data, tensor_type=return_tensors)
+        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
        if annotations is not None:
            encoded_inputs["labels"] = [
                BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
            ]
        return encoded_inputs
    def preprocess(
        self,
@@ -782,6 +864,7 @@ class DetaImageProcessor(BaseImageProcessor):
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_annotations: Optional[bool] = None,
        do_pad: Optional[bool] = None,
        format: Optional[Union[str, AnnotationFormat]] = None,
        return_tensors: Optional[Union[TensorType, str]] = None,
@@ -827,8 +910,13 @@ class DetaImageProcessor(BaseImageProcessor):
                Mean to use when normalizing the image.
            image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
                Standard deviation to use when normalizing the image.
            do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
                Whether to convert the annotations to the format expected by the model. Converts the bounding
                boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
                and in relative coordinates.
            do_pad (`bool`, *optional*, defaults to self.do_pad):
-                Whether to pad the image.
+                Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
                and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
            format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                Format of the annotations.
            return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
@@ -861,6 +949,9 @@ class DetaImageProcessor(BaseImageProcessor):
        do_normalize = self.do_normalize if do_normalize is None else do_normalize
        image_mean = self.image_mean if image_mean is None else image_mean
        image_std = self.image_std if image_std is None else image_std
        do_convert_annotations = (
            self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
        )
        do_pad = self.do_pad if do_pad is None else do_pad
        format = self.format if format is None else format
@@ -964,29 +1055,34 @@ class DetaImageProcessor(BaseImageProcessor):
            images = [
                self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
            ]
-            if annotations is not None:
+
-                annotations = [
+        if do_convert_annotations and annotations is not None:
-                    self.normalize_annotation(annotation, get_image_size(image, input_data_format))
+            annotations = [
-                    for annotation, image in zip(annotations, images)
+                self.normalize_annotation(annotation, get_image_size(image, input_data_format))
-                ]
+                for annotation, image in zip(annotations, images)
            ]
        if do_pad:
            # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
-            data = self.pad(
+            encoded_inputs = self.pad(
-                images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format
+                images,
                annotations=annotations,
                return_pixel_mask=True,
                data_format=data_format,
                input_data_format=input_data_format,
                return_tensors=return_tensors,
                update_bboxes=do_convert_annotations,
            )
        else:
            images = [
                to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
                for image in images
            ]
-            data = {"pixel_values": images}
+            encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
-
+            if annotations is not None:
-        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
+                encoded_inputs["labels"] = [
-        if annotations is not None:
+                    BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
-            encoded_inputs["labels"] = [
+                ]
                BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
            ]
        return encoded_inputs
--- a/src/transformers/models/detr/image_processing_detr.py
+++ b/src/transformers/models/detr/image_processing_detr.py
@@ -760,7 +760,7 @@ class DetrImageProcessor(BaseImageProcessor):
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
            `preprocess` method.
-        do_normalize:
+        do_normalize (`bool`, *optional*, defaults to True):
            Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
            `preprocess` method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
@@ -769,9 +769,14 @@ class DetrImageProcessor(BaseImageProcessor):
        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
            Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
            for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_convert_annotations (`bool`, *optional*, defaults to `True`):
            Controls whether to convert the annotations to the format expected by the DETR model. Converts the
            bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
            Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
        do_pad (`bool`, *optional*, defaults to `True`):
-            Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
+            Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
-            overridden by the `do_pad` parameter in the `preprocess` method.
+            method. If `True` will pad the images in the batch to the largest height and width in the batch.
            Padding will be applied to the bottom and right of the image with zeros.
    """
    model_input_names = ["pixel_values", "pixel_mask"]
@@ -787,6 +792,7 @@ class DetrImageProcessor(BaseImageProcessor):
        do_normalize: bool = True,
        image_mean: Union[float, List[float]] = None,
        image_std: Union[float, List[float]] = None,
        do_convert_annotations: Optional[bool] = None,
        do_pad: bool = True,
        **kwargs,
    ) -> None:
@@ -805,6 +811,10 @@ class DetrImageProcessor(BaseImageProcessor):
        size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
        size = get_size_dict(size, max_size=max_size, default_to_square=False)
        # Backwards compatibility
        if do_convert_annotations is None:
            do_convert_annotations = do_normalize
        super().__init__(**kwargs)
        self.format = format
        self.do_resize = do_resize
@@ -813,6 +823,7 @@ class DetrImageProcessor(BaseImageProcessor):
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.do_convert_annotations = do_convert_annotations
        self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
        self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
        self.do_pad = do_pad
@@ -981,17 +992,62 @@ class DetrImageProcessor(BaseImageProcessor):
    def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
        """
        Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
-        `[center_x, center_y, width, height]` format.
+        `[center_x, center_y, width, height]` format and from absolute to relative pixel values.
        """
        return normalize_annotation(annotation, image_size=image_size)
    def _update_annotation_for_padded_image(
        self,
        annotation: Dict,
        input_image_size: Tuple[int, int],
        output_image_size: Tuple[int, int],
        padding,
        update_bboxes,
    ) -> Dict:
        """
        Update the annotation for a padded image.
        """
        new_annotation = {}
        new_annotation["size"] = output_image_size
        for key, value in annotation.items():
            if key == "masks":
                masks = value
                masks = pad(
                    masks,
                    padding,
                    mode=PaddingMode.CONSTANT,
                    constant_values=0,
                    input_data_format=ChannelDimension.FIRST,
                )
                masks = safe_squeeze(masks, 1)
                new_annotation["masks"] = masks
            elif key == "boxes" and update_bboxes:
                boxes = value
                boxes *= np.asarray(
                    [
                        input_image_size[1] / output_image_size[1],
                        input_image_size[0] / output_image_size[0],
                        input_image_size[1] / output_image_size[1],
                        input_image_size[0] / output_image_size[0],
                    ]
                )
                new_annotation["boxes"] = boxes
            elif key == "size":
                new_annotation["size"] = output_image_size
            else:
                new_annotation[key] = value
        return new_annotation
    def _pad_image(
        self,
        image: np.ndarray,
        output_size: Tuple[int, int],
        annotation: Optional[Dict[str, Any]] = None,
        constant_values: Union[float, Iterable[float]] = 0,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        update_bboxes: bool = True,
    ) -> np.ndarray:
        """
        Pad an image with zeros to the given size.
@@ -1010,24 +1066,32 @@ class DetrImageProcessor(BaseImageProcessor):
            data_format=data_format,
            input_data_format=input_data_format,
        )
-        return padded_image
+        if annotation is not None:
            annotation = self._update_annotation_for_padded_image(
                annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
            )
        return padded_image, annotation
    def pad(
        self,
        images: List[np.ndarray],
        annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
        constant_values: Union[float, Iterable[float]] = 0,
        return_pixel_mask: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        update_bboxes: bool = True,
    ) -> BatchFeature:
        """
        Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
        in the batch and optionally returns their corresponding pixel mask.
        Args:
-            image (`np.ndarray`):
+            images (List[`np.ndarray`]):
-                Image to pad.
+                Images to pad.
            annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
                Annotations to transform according to the padding that is applied to the images.
            constant_values (`float` or `Iterable[float]`, *optional*):
                The value to use for the padding if `mode` is `"constant"`.
            return_pixel_mask (`bool`, *optional*, defaults to `True`):
@@ -1043,19 +1107,29 @@ class DetrImageProcessor(BaseImageProcessor):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
            update_bboxes (`bool`, *optional*, defaults to `True`):
                Whether to update the bounding boxes in the annotations to match the padded images. If the
                bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
                format, the bounding boxes will not be updated.
        """
        pad_size = get_max_height_width(images, input_data_format=input_data_format)
-        padded_images = [
+        annotation_list = annotations if annotations is not None else [None] * len(images)
-            self._pad_image(
+        padded_images = []
        padded_annotations = []
        for image, annotation in zip(images, annotation_list):
            padded_image, padded_annotation = self._pad_image(
                image,
                pad_size,
                annotation,
                constant_values=constant_values,
                data_format=data_format,
                input_data_format=input_data_format,
                update_bboxes=update_bboxes,
            )
-            for image in images
+            padded_images.append(padded_image)
-        ]
+            padded_annotations.append(padded_annotation)
        data = {"pixel_values": padded_images}
        if return_pixel_mask:
@@ -1065,7 +1139,14 @@ class DetrImageProcessor(BaseImageProcessor):
            ]
            data["pixel_mask"] = masks
-        return BatchFeature(data=data, tensor_type=return_tensors)
+        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
        if annotations is not None:
            encoded_inputs["labels"] = [
                BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
            ]
        return encoded_inputs
    def preprocess(
        self,
@@ -1079,6 +1160,7 @@ class DetrImageProcessor(BaseImageProcessor):
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[Union[int, float]] = None,
        do_normalize: Optional[bool] = None,
        do_convert_annotations: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_pad: Optional[bool] = None,
@@ -1122,12 +1204,17 @@ class DetrImageProcessor(BaseImageProcessor):
                Rescale factor to use when rescaling the image.
            do_normalize (`bool`, *optional*, defaults to self.do_normalize):
                Whether to normalize the image.
            do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
                Whether to convert the annotations to the format expected by the model. Converts the bounding
                boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
                and in relative coordinates.
            image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
                Mean to use when normalizing the image.
            image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
                Standard deviation to use when normalizing the image.
            do_pad (`bool`, *optional*, defaults to self.do_pad):
-                Whether to pad the image.
+                Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
                and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
            format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                Format of the annotations.
            return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
@@ -1168,6 +1255,9 @@ class DetrImageProcessor(BaseImageProcessor):
        do_normalize = self.do_normalize if do_normalize is None else do_normalize
        image_mean = self.image_mean if image_mean is None else image_mean
        image_std = self.image_std if image_std is None else image_std
        do_convert_annotations = (
            self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
        )
        do_pad = self.do_pad if do_pad is None else do_pad
        format = self.format if format is None else format
@@ -1271,29 +1361,34 @@ class DetrImageProcessor(BaseImageProcessor):
            images = [
                self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
            ]
-            if annotations is not None:
+
-                annotations = [
+        if do_convert_annotations and annotations is not None:
-                    self.normalize_annotation(annotation, get_image_size(image, input_data_format))
+            annotations = [
-                    for annotation, image in zip(annotations, images)
+                self.normalize_annotation(annotation, get_image_size(image, input_data_format))
-                ]
+                for annotation, image in zip(annotations, images)
            ]
        if do_pad:
            # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
-            data = self.pad(
+            encoded_inputs = self.pad(
-                images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format
+                images,
                annotations=annotations,
                return_pixel_mask=True,
                data_format=data_format,
                input_data_format=input_data_format,
                return_tensors=return_tensors,
                update_bboxes=do_convert_annotations,
            )
        else:
            images = [
                to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
                for image in images
            ]
-            data = {"pixel_values": images}
+            encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
-
+            if annotations is not None:
-        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
+                encoded_inputs["labels"] = [
-        if annotations is not None:
+                    BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
-            encoded_inputs["labels"] = [
+                ]
                BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
            ]
        return encoded_inputs
--- a/src/transformers/models/mask2former/image_processing_mask2former.py
+++ b/src/transformers/models/mask2former/image_processing_mask2former.py
@@ -771,7 +771,7 @@ class Mask2FormerImageProcessor(BaseImageProcessor):
        )
        return encoded_inputs
-    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
+    # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
    def _pad_image(
        self,
        image: np.ndarray,
@@ -799,7 +799,7 @@ class Mask2FormerImageProcessor(BaseImageProcessor):
        )
        return padded_image
-    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
+    # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
    def pad(
        self,
        images: List[np.ndarray],
--- a/src/transformers/models/maskformer/image_processing_maskformer.py
+++ b/src/transformers/models/maskformer/image_processing_maskformer.py
@@ -788,7 +788,7 @@ class MaskFormerImageProcessor(BaseImageProcessor):
        )
        return encoded_inputs
-    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
+    # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
    def _pad_image(
        self,
        image: np.ndarray,
@@ -816,7 +816,7 @@ class MaskFormerImageProcessor(BaseImageProcessor):
        )
        return padded_image
-    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
+    # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
    def pad(
        self,
        images: List[np.ndarray],
--- a/src/transformers/models/oneformer/image_processing_oneformer.py
+++ b/src/transformers/models/oneformer/image_processing_oneformer.py
@@ -770,7 +770,7 @@ class OneFormerImageProcessor(BaseImageProcessor):
        )
        return encoded_inputs
-    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
+    # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
    def _pad_image(
        self,
        image: np.ndarray,
@@ -798,7 +798,7 @@ class OneFormerImageProcessor(BaseImageProcessor):
        )
        return padded_image
-    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
+    # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
    def pad(
        self,
        images: List[np.ndarray],
--- a/src/transformers/models/vilt/image_processing_vilt.py
+++ b/src/transformers/models/vilt/image_processing_vilt.py
@@ -251,7 +251,6 @@ class ViltImageProcessor(BaseImageProcessor):
            **kwargs,
        )
    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
    def _pad_image(
        self,
        image: np.ndarray,
@@ -279,7 +278,6 @@ class ViltImageProcessor(BaseImageProcessor):
        )
        return padded_image
    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
    def pad(
        self,
        images: List[np.ndarray],
--- a/src/transformers/models/yolos/image_processing_yolos.py
+++ b/src/transformers/models/yolos/image_processing_yolos.py
@@ -696,8 +696,9 @@ class YolosImageProcessor(BaseImageProcessor):
            Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
            for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_pad (`bool`, *optional*, defaults to `True`):
-            Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
+            Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
-            overridden by the `do_pad` parameter in the `preprocess` method.
+            method. If `True` will pad the images in the batch to the largest height and width in the batch.
            Padding will be applied to the bottom and right of the image with zeros.
    """
    model_input_names = ["pixel_values", "pixel_mask"]
@@ -713,6 +714,7 @@ class YolosImageProcessor(BaseImageProcessor):
        do_normalize: bool = True,
        image_mean: Union[float, List[float]] = None,
        image_std: Union[float, List[float]] = None,
        do_convert_annotations: Optional[bool] = None,
        do_pad: bool = True,
        **kwargs,
    ) -> None:
@@ -731,6 +733,10 @@ class YolosImageProcessor(BaseImageProcessor):
        size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
        size = get_size_dict(size, max_size=max_size, default_to_square=False)
        # Backwards compatibility
        if do_convert_annotations is None:
            do_convert_annotations = do_normalize
        super().__init__(**kwargs)
        self.format = format
        self.do_resize = do_resize
@@ -739,6 +745,7 @@ class YolosImageProcessor(BaseImageProcessor):
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.do_convert_annotations = do_convert_annotations
        self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
        self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
        self.do_pad = do_pad
@@ -916,18 +923,64 @@ class YolosImageProcessor(BaseImageProcessor):
    def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
        """
        Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
-        `[center_x, center_y, width, height]` format.
+        `[center_x, center_y, width, height]` format and from absolute to relative pixel values.
        """
        return normalize_annotation(annotation, image_size=image_size)
    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
    def _update_annotation_for_padded_image(
        self,
        annotation: Dict,
        input_image_size: Tuple[int, int],
        output_image_size: Tuple[int, int],
        padding,
        update_bboxes,
    ) -> Dict:
        """
        Update the annotation for a padded image.
        """
        new_annotation = {}
        new_annotation["size"] = output_image_size
        for key, value in annotation.items():
            if key == "masks":
                masks = value
                masks = pad(
                    masks,
                    padding,
                    mode=PaddingMode.CONSTANT,
                    constant_values=0,
                    input_data_format=ChannelDimension.FIRST,
                )
                masks = safe_squeeze(masks, 1)
                new_annotation["masks"] = masks
            elif key == "boxes" and update_bboxes:
                boxes = value
                boxes *= np.asarray(
                    [
                        input_image_size[1] / output_image_size[1],
                        input_image_size[0] / output_image_size[0],
                        input_image_size[1] / output_image_size[1],
                        input_image_size[0] / output_image_size[0],
                    ]
                )
                new_annotation["boxes"] = boxes
            elif key == "size":
                new_annotation["size"] = output_image_size
            else:
                new_annotation[key] = value
        return new_annotation
    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
    def _pad_image(
        self,
        image: np.ndarray,
        output_size: Tuple[int, int],
        annotation: Optional[Dict[str, Any]] = None,
        constant_values: Union[float, Iterable[float]] = 0,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        update_bboxes: bool = True,
    ) -> np.ndarray:
        """
        Pad an image with zeros to the given size.
@@ -946,16 +999,22 @@ class YolosImageProcessor(BaseImageProcessor):
            data_format=data_format,
            input_data_format=input_data_format,
        )
-        return padded_image
+        if annotation is not None:
            annotation = self._update_annotation_for_padded_image(
                annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
            )
        return padded_image, annotation
    def pad(
        self,
        images: List[np.ndarray],
        annotations: Optional[List[Dict[str, Any]]] = None,
        constant_values: Union[float, Iterable[float]] = 0,
        return_pixel_mask: bool = False,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        update_bboxes: bool = True,
    ) -> BatchFeature:
        """
        Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
@@ -964,6 +1023,9 @@ class YolosImageProcessor(BaseImageProcessor):
        Args:
            image (`np.ndarray`):
                Image to pad.
            annotations (`List[Dict[str, any]]`, *optional*):
                Annotations to pad along with the images. If provided, the bounding boxes will be updated to match the
                padded images.
            constant_values (`float` or `Iterable[float]`, *optional*):
                The value to use for the padding if `mode` is `"constant"`.
            return_pixel_mask (`bool`, *optional*, defaults to `True`):
@@ -979,19 +1041,29 @@ class YolosImageProcessor(BaseImageProcessor):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
            update_bboxes (`bool`, *optional*, defaults to `True`):
                Whether to update the bounding boxes in the annotations to match the padded images. If the
                bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
                format, the bounding boxes will not be updated.
        """
        pad_size = get_max_height_width(images, input_data_format=input_data_format)
-        padded_images = [
+        annotation_list = annotations if annotations is not None else [None] * len(images)
-            self._pad_image(
+        padded_images = []
        padded_annotations = []
        for image, annotation in zip(images, annotation_list):
            padded_image, padded_annotation = self._pad_image(
                image,
                pad_size,
                annotation,
                constant_values=constant_values,
                data_format=data_format,
                input_data_format=input_data_format,
                update_bboxes=update_bboxes,
            )
-            for image in images
+            padded_images.append(padded_image)
-        ]
+            padded_annotations.append(padded_annotation)
        data = {"pixel_values": padded_images}
        if return_pixel_mask:
@@ -1017,6 +1089,7 @@ class YolosImageProcessor(BaseImageProcessor):
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_annotations: Optional[bool] = None,
        do_pad: Optional[bool] = None,
        format: Optional[Union[str, AnnotationFormat]] = None,
        return_tensors: Optional[Union[TensorType, str]] = None,
@@ -1062,8 +1135,13 @@ class YolosImageProcessor(BaseImageProcessor):
                Mean to use when normalizing the image.
            image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
                Standard deviation to use when normalizing the image.
            do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
                Whether to convert the annotations to the format expected by the model. Converts the bounding
                boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
                and in relative coordinates.
            do_pad (`bool`, *optional*, defaults to self.do_pad):
-                Whether to pad the image.
+                Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
                and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
            format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                Format of the annotations.
            return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
@@ -1101,6 +1179,9 @@ class YolosImageProcessor(BaseImageProcessor):
        do_normalize = self.do_normalize if do_normalize is None else do_normalize
        image_mean = self.image_mean if image_mean is None else image_mean
        image_std = self.image_std if image_std is None else image_std
        do_convert_annotations = (
            self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
        )
        do_pad = self.do_pad if do_pad is None else do_pad
        format = self.format if format is None else format
@@ -1204,26 +1285,34 @@ class YolosImageProcessor(BaseImageProcessor):
            images = [
                self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
            ]
-            if annotations is not None:
+
-                annotations = [
+        if do_convert_annotations and annotations is not None:
-                    self.normalize_annotation(annotation, get_image_size(image))
+            annotations = [
-                    for annotation, image in zip(annotations, images)
+                self.normalize_annotation(annotation, get_image_size(image))
-                ]
+                for annotation, image in zip(annotations, images)
            ]
        if do_pad:
-            data = self.pad(images, data_format=data_format, input_data_format=input_data_format)
+            # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
            encoded_inputs = self.pad(
                images,
                annotations=annotations,
                return_pixel_mask=True,
                data_format=data_format,
                input_data_format=input_data_format,
                update_bboxes=do_convert_annotations,
                return_tensors=return_tensors,
            )
        else:
            images = [
                to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
                for image in images
            ]
-            data = {"pixel_values": images}
+            encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
-
+            if annotations is not None:
-        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
+                encoded_inputs["labels"] = [
-        if annotations is not None:
+                    BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
-            encoded_inputs["labels"] = [
+                ]
                BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
            ]
        return encoded_inputs
--- a/tests/models/conditional_detr/test_image_processing_conditional_detr.py
+++ b/tests/models/conditional_detr/test_image_processing_conditional_detr.py
@@ -248,3 +248,246 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess
        # verify size
        expected_size = torch.tensor([800, 1066])
        self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
    @slow
    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->ConditionalDetr, facebook/detr-resnet-50 ->microsoft/conditional-detr-resnet-50
    def test_batched_coco_detection_annotations(self):
        # Batch made of one COCO sample and an 800x800 resize of the same picture
        sample_path = "./tests/fixtures/tests_samples/COCO/000000039769.png"
        image_0 = Image.open(sample_path)
        image_1 = Image.open(sample_path).resize((800, 800))

        with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
            target = json.load(f)

        # NOTE(review): both dicts wrap the very same `target` objects, so the bbox rewrite below is also
        # visible through `annotations_0`. The golden values further down presumably rely on that - confirm
        # before decoupling the two annotations.
        annotations_0 = {"image_id": 39769, "annotations": target}
        annotations_1 = {"image_id": 39769, "annotations": target}

        # Rescale every bbox by the width / height ratio between the resized and the original image
        w_0, h_0 = image_0.size
        w_1, h_1 = image_1.size
        for obj in annotations_1["annotations"]:
            bbox = obj["bbox"]
            obj["bbox"] = [
                bbox[0] * w_1 / w_0,
                bbox[1] * h_1 / h_0,
                bbox[2] * w_1 / w_0,
                bbox[3] * h_1 / h_0,
            ]

        batch_images = [image_0, image_1]
        batch_annotations = [annotations_0, annotations_1]

        image_processing = ConditionalDetrImageProcessor()
        # do_convert_annotations is left at its default for this first pass
        encoding = image_processing(
            images=batch_images,
            annotations=batch_annotations,
            return_segmentation_masks=True,
            return_tensors="pt",
        )

        # The pixel values must have been padded to a common size
        padded_height, padded_width = 800, 1066
        self.assertEqual(encoding["pixel_values"].shape, torch.Size([2, 3, padded_height, padded_width]))

        # The bounding boxes must have been adjusted for the padded images
        expected_relative_boxes = [
            torch.tensor(
                [
                    [0.6879, 0.4609, 0.0755, 0.3691],
                    [0.2118, 0.3359, 0.2601, 0.1566],
                    [0.5011, 0.5000, 0.9979, 1.0000],
                    [0.5010, 0.5020, 0.9979, 0.9959],
                    [0.3284, 0.5944, 0.5884, 0.8112],
                    [0.8394, 0.5445, 0.3213, 0.9110],
                ]
            ),
            torch.tensor(
                [
                    [0.4130, 0.2765, 0.0453, 0.2215],
                    [0.1272, 0.2016, 0.1561, 0.0940],
                    [0.3757, 0.4933, 0.7488, 0.9865],
                    [0.3759, 0.5002, 0.7492, 0.9955],
                    [0.1971, 0.5456, 0.3532, 0.8646],
                    [0.5790, 0.4115, 0.3430, 0.7161],
                ]
            ),
        ]
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["boxes"].shape, torch.Size([6, 4]))
        for idx in (0, 1):
            self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected_relative_boxes[idx], rtol=1e-3))

        # The masks must have been padded as well
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["masks"].shape, torch.Size([6, padded_height, padded_width]))

        # With do_convert_annotations=False the boxes stay in absolute corner format instead of relative
        # (centre_x, centre_y, width, height) values in [0, 1]
        encoding = image_processing(
            images=batch_images,
            annotations=batch_annotations,
            return_segmentation_masks=True,
            do_convert_annotations=False,
            return_tensors="pt",
        )
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["boxes"].shape, torch.Size([6, 4]))

        def to_absolute_corners(relative_boxes):
            # Relative (centre_x, centre_y, width, height) -> absolute (x_min, y_min, x_max, y_max)
            image_extent = torch.tensor(
                [padded_width, padded_height, padded_width, padded_height], dtype=relative_boxes.dtype
            )
            absolute = relative_boxes * image_extent
            centre, half_extent = absolute[:, :2], absolute[:, 2:] / 2
            return torch.cat([centre - half_extent, centre + half_extent], dim=1)

        # NOTE(review): rtol=1 tolerates a deviation as large as the expected value itself, so this is only a
        # coarse check of the unconverted boxes.
        for idx in (0, 1):
            expected_corners = to_absolute_corners(expected_relative_boxes[idx])
            self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected_corners, rtol=1))
    @slow
    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->ConditionalDetr
    def test_batched_coco_panoptic_annotations(self):
        # Batch made of one COCO sample and an 800x800 resize of the same picture
        sample_path = "./tests/fixtures/tests_samples/COCO/000000039769.png"
        image_0 = Image.open(sample_path)
        image_1 = Image.open(sample_path).resize((800, 800))

        with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
            target = json.load(f)

        # NOTE(review): both dicts wrap the very same `target` objects, so the bbox rewrite below is also
        # visible through `annotation_0`; whether the golden values depend on that cannot be told from here.
        annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
        annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}

        # Rescale every bbox by the width / height ratio between the resized and the original image
        w_0, h_0 = image_0.size
        w_1, h_1 = image_1.size
        for segment in annotation_1["segments_info"]:
            bbox = segment["bbox"]
            segment["bbox"] = [
                bbox[0] * w_1 / w_0,
                bbox[1] * h_1 / h_0,
                bbox[2] * w_1 / w_0,
                bbox[3] * h_1 / h_0,
            ]

        masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
        batch_images = [image_0, image_1]
        batch_annotations = [annotation_0, annotation_1]

        # Encode the batch with the panoptic annotation format
        image_processing = ConditionalDetrImageProcessor(format="coco_panoptic")
        encoding = image_processing(
            images=batch_images,
            annotations=batch_annotations,
            masks_path=masks_path,
            return_tensors="pt",
            return_segmentation_masks=True,
        )

        # The pixel values must have been padded to a common size
        padded_height, padded_width = 800, 1066
        self.assertEqual(encoding["pixel_values"].shape, torch.Size([2, 3, padded_height, padded_width]))

        # The bounding boxes must have been adjusted for the padded images
        expected_relative_boxes = [
            torch.tensor(
                [
                    [0.2625, 0.5437, 0.4688, 0.8625],
                    [0.7719, 0.4104, 0.4531, 0.7125],
                    [0.5000, 0.4927, 0.9969, 0.9854],
                    [0.1688, 0.2000, 0.2063, 0.0917],
                    [0.5492, 0.2760, 0.0578, 0.2187],
                    [0.4992, 0.4990, 0.9984, 0.9979],
                ]
            ),
            torch.tensor(
                [
                    [0.1576, 0.3262, 0.2814, 0.5175],
                    [0.4634, 0.2463, 0.2720, 0.4275],
                    [0.3002, 0.2956, 0.5985, 0.5913],
                    [0.1013, 0.1200, 0.1238, 0.0550],
                    [0.3297, 0.1656, 0.0347, 0.1312],
                    [0.2997, 0.2994, 0.5994, 0.5987],
                ]
            ),
        ]
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["boxes"].shape, torch.Size([6, 4]))
        for idx in (0, 1):
            self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected_relative_boxes[idx], rtol=1e-3))

        # The masks must have been padded as well
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["masks"].shape, torch.Size([6, padded_height, padded_width]))

        # With do_convert_annotations=False the boxes stay in absolute corner format instead of relative
        # (centre_x, centre_y, width, height) values in [0, 1]
        encoding = image_processing(
            images=batch_images,
            annotations=batch_annotations,
            masks_path=masks_path,
            return_segmentation_masks=True,
            do_convert_annotations=False,
            return_tensors="pt",
        )
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["boxes"].shape, torch.Size([6, 4]))

        def to_absolute_corners(relative_boxes):
            # Relative (centre_x, centre_y, width, height) -> absolute (x_min, y_min, x_max, y_max)
            image_extent = torch.tensor(
                [padded_width, padded_height, padded_width, padded_height], dtype=relative_boxes.dtype
            )
            absolute = relative_boxes * image_extent
            centre, half_extent = absolute[:, :2], absolute[:, 2:] / 2
            return torch.cat([centre - half_extent, centre + half_extent], dim=1)

        # NOTE(review): rtol=1 tolerates a deviation as large as the expected value itself, so this is only a
        # coarse check of the unconverted boxes.
        for idx in (0, 1):
            expected_corners = to_absolute_corners(expected_relative_boxes[idx])
            self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected_corners, rtol=1))
--- a/tests/models/deformable_detr/test_image_processing_deformable_detr.py
+++ b/tests/models/deformable_detr/test_image_processing_deformable_detr.py
@@ -250,3 +250,246 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
        # verify size
        expected_size = torch.tensor([800, 1066])
        self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
    @slow
    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->DeformableDetr
    def test_batched_coco_detection_annotations(self):
        # Batch made of one COCO sample and an 800x800 resize of the same picture
        sample_path = "./tests/fixtures/tests_samples/COCO/000000039769.png"
        image_0 = Image.open(sample_path)
        image_1 = Image.open(sample_path).resize((800, 800))

        with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
            target = json.load(f)

        # NOTE(review): both dicts wrap the very same `target` objects, so the bbox rewrite below is also
        # visible through `annotations_0`. The golden values further down presumably rely on that - confirm
        # before decoupling the two annotations.
        annotations_0 = {"image_id": 39769, "annotations": target}
        annotations_1 = {"image_id": 39769, "annotations": target}

        # Rescale every bbox by the width / height ratio between the resized and the original image
        w_0, h_0 = image_0.size
        w_1, h_1 = image_1.size
        for obj in annotations_1["annotations"]:
            bbox = obj["bbox"]
            obj["bbox"] = [
                bbox[0] * w_1 / w_0,
                bbox[1] * h_1 / h_0,
                bbox[2] * w_1 / w_0,
                bbox[3] * h_1 / h_0,
            ]

        batch_images = [image_0, image_1]
        batch_annotations = [annotations_0, annotations_1]

        image_processing = DeformableDetrImageProcessor()
        # do_convert_annotations is left at its default for this first pass
        encoding = image_processing(
            images=batch_images,
            annotations=batch_annotations,
            return_segmentation_masks=True,
            return_tensors="pt",
        )

        # The pixel values must have been padded to a common size
        padded_height, padded_width = 800, 1066
        self.assertEqual(encoding["pixel_values"].shape, torch.Size([2, 3, padded_height, padded_width]))

        # The bounding boxes must have been adjusted for the padded images
        expected_relative_boxes = [
            torch.tensor(
                [
                    [0.6879, 0.4609, 0.0755, 0.3691],
                    [0.2118, 0.3359, 0.2601, 0.1566],
                    [0.5011, 0.5000, 0.9979, 1.0000],
                    [0.5010, 0.5020, 0.9979, 0.9959],
                    [0.3284, 0.5944, 0.5884, 0.8112],
                    [0.8394, 0.5445, 0.3213, 0.9110],
                ]
            ),
            torch.tensor(
                [
                    [0.4130, 0.2765, 0.0453, 0.2215],
                    [0.1272, 0.2016, 0.1561, 0.0940],
                    [0.3757, 0.4933, 0.7488, 0.9865],
                    [0.3759, 0.5002, 0.7492, 0.9955],
                    [0.1971, 0.5456, 0.3532, 0.8646],
                    [0.5790, 0.4115, 0.3430, 0.7161],
                ]
            ),
        ]
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["boxes"].shape, torch.Size([6, 4]))
        for idx in (0, 1):
            self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected_relative_boxes[idx], rtol=1e-3))

        # The masks must have been padded as well
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["masks"].shape, torch.Size([6, padded_height, padded_width]))

        # With do_convert_annotations=False the boxes stay in absolute corner format instead of relative
        # (centre_x, centre_y, width, height) values in [0, 1]
        encoding = image_processing(
            images=batch_images,
            annotations=batch_annotations,
            return_segmentation_masks=True,
            do_convert_annotations=False,
            return_tensors="pt",
        )
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["boxes"].shape, torch.Size([6, 4]))

        def to_absolute_corners(relative_boxes):
            # Relative (centre_x, centre_y, width, height) -> absolute (x_min, y_min, x_max, y_max)
            image_extent = torch.tensor(
                [padded_width, padded_height, padded_width, padded_height], dtype=relative_boxes.dtype
            )
            absolute = relative_boxes * image_extent
            centre, half_extent = absolute[:, :2], absolute[:, 2:] / 2
            return torch.cat([centre - half_extent, centre + half_extent], dim=1)

        # NOTE(review): rtol=1 tolerates a deviation as large as the expected value itself, so this is only a
        # coarse check of the unconverted boxes.
        for idx in (0, 1):
            expected_corners = to_absolute_corners(expected_relative_boxes[idx])
            self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected_corners, rtol=1))
    @slow
    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->DeformableDetr
    def test_batched_coco_panoptic_annotations(self):
        # Batch made of one COCO sample and an 800x800 resize of the same picture
        sample_path = "./tests/fixtures/tests_samples/COCO/000000039769.png"
        image_0 = Image.open(sample_path)
        image_1 = Image.open(sample_path).resize((800, 800))

        with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
            target = json.load(f)

        # NOTE(review): both dicts wrap the very same `target` objects, so the bbox rewrite below is also
        # visible through `annotation_0`; whether the golden values depend on that cannot be told from here.
        annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
        annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}

        # Rescale every bbox by the width / height ratio between the resized and the original image
        w_0, h_0 = image_0.size
        w_1, h_1 = image_1.size
        for segment in annotation_1["segments_info"]:
            bbox = segment["bbox"]
            segment["bbox"] = [
                bbox[0] * w_1 / w_0,
                bbox[1] * h_1 / h_0,
                bbox[2] * w_1 / w_0,
                bbox[3] * h_1 / h_0,
            ]

        masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
        batch_images = [image_0, image_1]
        batch_annotations = [annotation_0, annotation_1]

        # Encode the batch with the panoptic annotation format
        image_processing = DeformableDetrImageProcessor(format="coco_panoptic")
        encoding = image_processing(
            images=batch_images,
            annotations=batch_annotations,
            masks_path=masks_path,
            return_tensors="pt",
            return_segmentation_masks=True,
        )

        # The pixel values must have been padded to a common size
        padded_height, padded_width = 800, 1066
        self.assertEqual(encoding["pixel_values"].shape, torch.Size([2, 3, padded_height, padded_width]))

        # The bounding boxes must have been adjusted for the padded images
        expected_relative_boxes = [
            torch.tensor(
                [
                    [0.2625, 0.5437, 0.4688, 0.8625],
                    [0.7719, 0.4104, 0.4531, 0.7125],
                    [0.5000, 0.4927, 0.9969, 0.9854],
                    [0.1688, 0.2000, 0.2063, 0.0917],
                    [0.5492, 0.2760, 0.0578, 0.2187],
                    [0.4992, 0.4990, 0.9984, 0.9979],
                ]
            ),
            torch.tensor(
                [
                    [0.1576, 0.3262, 0.2814, 0.5175],
                    [0.4634, 0.2463, 0.2720, 0.4275],
                    [0.3002, 0.2956, 0.5985, 0.5913],
                    [0.1013, 0.1200, 0.1238, 0.0550],
                    [0.3297, 0.1656, 0.0347, 0.1312],
                    [0.2997, 0.2994, 0.5994, 0.5987],
                ]
            ),
        ]
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["boxes"].shape, torch.Size([6, 4]))
        for idx in (0, 1):
            self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected_relative_boxes[idx], rtol=1e-3))

        # The masks must have been padded as well
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["masks"].shape, torch.Size([6, padded_height, padded_width]))

        # With do_convert_annotations=False the boxes stay in absolute corner format instead of relative
        # (centre_x, centre_y, width, height) values in [0, 1]
        encoding = image_processing(
            images=batch_images,
            annotations=batch_annotations,
            masks_path=masks_path,
            return_segmentation_masks=True,
            do_convert_annotations=False,
            return_tensors="pt",
        )
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["boxes"].shape, torch.Size([6, 4]))

        def to_absolute_corners(relative_boxes):
            # Relative (centre_x, centre_y, width, height) -> absolute (x_min, y_min, x_max, y_max)
            image_extent = torch.tensor(
                [padded_width, padded_height, padded_width, padded_height], dtype=relative_boxes.dtype
            )
            absolute = relative_boxes * image_extent
            centre, half_extent = absolute[:, :2], absolute[:, 2:] / 2
            return torch.cat([centre - half_extent, centre + half_extent], dim=1)

        # NOTE(review): rtol=1 tolerates a deviation as large as the expected value itself, so this is only a
        # coarse check of the unconverted boxes.
        for idx in (0, 1):
            expected_corners = to_absolute_corners(expected_relative_boxes[idx])
            self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected_corners, rtol=1))
--- a/tests/models/deta/test_image_processing_deta.py
+++ b/tests/models/deta/test_image_processing_deta.py
@@ -244,3 +244,246 @@ class DetaImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
        # verify size
        expected_size = torch.tensor([800, 1066])
        self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
    @slow
    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->Deta
    def test_batched_coco_detection_annotations(self):
        # Batch the COCO fixture image with an 800x800 resized copy of it and verify that the pixel values,
        # bounding boxes and segmentation masks returned by the processor all reflect the padded batch size.
        image_path = "./tests/fixtures/tests_samples/COCO/000000039769.png"
        image_0 = Image.open(image_path)
        image_1 = Image.open(image_path).resize((800, 800))

        with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
            target = json.load(f)

        # NOTE(review): both dicts wrap the same `target` list, so the in-place bbox rescaling below is also
        # visible through annotations_0 - confirm the hard-coded expected boxes were generated with that in mind.
        annotations_0 = {"image_id": 39769, "annotations": target}
        annotations_1 = {"image_id": 39769, "annotations": target}

        # Rescale the bounding boxes by the same factors the second image was resized with
        w_0, h_0 = image_0.size
        w_1, h_1 = image_1.size
        for entry in annotations_1["annotations"]:
            coords = entry["bbox"]
            entry["bbox"] = [
                coords[0] * w_1 / w_0,
                coords[1] * h_1 / h_0,
                coords[2] * w_1 / w_0,
                coords[3] * h_1 / h_0,
            ]

        image_processing = DetaImageProcessor()
        shared_kwargs = {
            "images": [image_0, image_1],
            "annotations": [annotations_0, annotations_1],
            "return_segmentation_masks": True,
            "return_tensors": "pt",
        }
        # do_convert_annotations is not passed on this first call, so the processor default applies
        encoding = image_processing(**shared_kwargs)

        # The pixel values of both images must share one padded shape
        padded_height, padded_width = 800, 1066
        self.assertEqual(encoding["pixel_values"].shape, torch.Size([2, 3, padded_height, padded_width]))

        # The bounding boxes must have been adjusted for the padded images
        box_shape = torch.Size([6, 4])
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["boxes"].shape, box_shape)

        expected_boxes_0 = torch.tensor(
            [
                [0.6879, 0.4609, 0.0755, 0.3691],
                [0.2118, 0.3359, 0.2601, 0.1566],
                [0.5011, 0.5000, 0.9979, 1.0000],
                [0.5010, 0.5020, 0.9979, 0.9959],
                [0.3284, 0.5944, 0.5884, 0.8112],
                [0.8394, 0.5445, 0.3213, 0.9110],
            ]
        )
        expected_boxes_1 = torch.tensor(
            [
                [0.4130, 0.2765, 0.0453, 0.2215],
                [0.1272, 0.2016, 0.1561, 0.0940],
                [0.3757, 0.4933, 0.7488, 0.9865],
                [0.3759, 0.5002, 0.7492, 0.9955],
                [0.1971, 0.5456, 0.3532, 0.8646],
                [0.5790, 0.4115, 0.3430, 0.7161],
            ]
        )
        for idx, expected in enumerate((expected_boxes_0, expected_boxes_1)):
            self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected, rtol=1e-3))

        # The masks must have been padded to the same spatial size as the pixel values
        mask_shape = torch.Size([6, 800, 1066])
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["masks"].shape, mask_shape)

        # With do_convert_annotations=False the boxes are expected to stay in absolute
        # (x_min, y_min, x_max, y_max) coordinates rather than normalized (centre_x, centre_y, width, height)
        encoding = image_processing(**shared_kwargs, do_convert_annotations=False)
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["boxes"].shape, box_shape)

        def to_absolute_corners(normalized_boxes):
            # Scale the normalized centre/size columns to pixels, then turn them into corner coordinates
            centre_x = normalized_boxes[:, 0] * padded_width
            centre_y = normalized_boxes[:, 1] * padded_height
            half_width = normalized_boxes[:, 2] * padded_width / 2
            half_height = normalized_boxes[:, 3] * padded_height / 2
            return torch.stack(
                [centre_x - half_width, centre_y - half_height, centre_x + half_width, centre_y + half_height],
                dim=1,
            )

        expected_boxes_0 = to_absolute_corners(expected_boxes_0)
        expected_boxes_1 = to_absolute_corners(expected_boxes_1)
        # rtol=1 lets each value deviate by up to the magnitude of its expected value: a coarse check only
        for idx, expected in enumerate((expected_boxes_0, expected_boxes_1)):
            self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected, rtol=1))
    @slow
    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->Deta
    def test_batched_coco_panoptic_annotations(self):
        # Batch the COCO fixture image with an 800x800 resized copy of it, using panoptic annotations, and verify
        # that the pixel values, bounding boxes and segmentation masks all reflect the padded batch size.
        image_path = "./tests/fixtures/tests_samples/COCO/000000039769.png"
        image_0 = Image.open(image_path)
        image_1 = Image.open(image_path).resize((800, 800))

        with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
            target = json.load(f)

        # NOTE(review): both dicts wrap the same `target` list, so the in-place bbox rescaling below is also
        # visible through annotation_0 - confirm the hard-coded expected boxes were generated with that in mind.
        annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
        annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}

        # Rescale the bounding boxes by the same factors the second image was resized with
        w_0, h_0 = image_0.size
        w_1, h_1 = image_1.size
        for segment in annotation_1["segments_info"]:
            coords = segment["bbox"]
            segment["bbox"] = [
                coords[0] * w_1 / w_0,
                coords[1] * h_1 / h_0,
                coords[2] * w_1 / w_0,
                coords[3] * h_1 / h_0,
            ]

        image_processing = DetaImageProcessor(format="coco_panoptic")
        shared_kwargs = {
            "images": [image_0, image_1],
            "annotations": [annotation_0, annotation_1],
            "masks_path": pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic"),
            "return_segmentation_masks": True,
            "return_tensors": "pt",
        }
        # do_convert_annotations is not passed on this first call, so the processor default applies
        encoding = image_processing(**shared_kwargs)

        # The pixel values of both images must share one padded shape
        padded_height, padded_width = 800, 1066
        self.assertEqual(encoding["pixel_values"].shape, torch.Size([2, 3, padded_height, padded_width]))

        # The bounding boxes must have been adjusted for the padded images
        box_shape = torch.Size([6, 4])
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["boxes"].shape, box_shape)

        expected_boxes_0 = torch.tensor(
            [
                [0.2625, 0.5437, 0.4688, 0.8625],
                [0.7719, 0.4104, 0.4531, 0.7125],
                [0.5000, 0.4927, 0.9969, 0.9854],
                [0.1688, 0.2000, 0.2063, 0.0917],
                [0.5492, 0.2760, 0.0578, 0.2187],
                [0.4992, 0.4990, 0.9984, 0.9979],
            ]
        )
        expected_boxes_1 = torch.tensor(
            [
                [0.1576, 0.3262, 0.2814, 0.5175],
                [0.4634, 0.2463, 0.2720, 0.4275],
                [0.3002, 0.2956, 0.5985, 0.5913],
                [0.1013, 0.1200, 0.1238, 0.0550],
                [0.3297, 0.1656, 0.0347, 0.1312],
                [0.2997, 0.2994, 0.5994, 0.5987],
            ]
        )
        for idx, expected in enumerate((expected_boxes_0, expected_boxes_1)):
            self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected, rtol=1e-3))

        # The masks must have been padded to the same spatial size as the pixel values
        mask_shape = torch.Size([6, 800, 1066])
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["masks"].shape, mask_shape)

        # With do_convert_annotations=False the boxes are expected to stay in absolute
        # (x_min, y_min, x_max, y_max) coordinates rather than normalized (centre_x, centre_y, width, height)
        encoding = image_processing(**shared_kwargs, do_convert_annotations=False)
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["boxes"].shape, box_shape)

        def to_absolute_corners(normalized_boxes):
            # Scale the normalized centre/size columns to pixels, then turn them into corner coordinates
            centre_x = normalized_boxes[:, 0] * padded_width
            centre_y = normalized_boxes[:, 1] * padded_height
            half_width = normalized_boxes[:, 2] * padded_width / 2
            half_height = normalized_boxes[:, 3] * padded_height / 2
            return torch.stack(
                [centre_x - half_width, centre_y - half_height, centre_x + half_width, centre_y + half_height],
                dim=1,
            )

        expected_boxes_0 = to_absolute_corners(expected_boxes_0)
        expected_boxes_1 = to_absolute_corners(expected_boxes_1)
        # rtol=1 lets each value deviate by up to the magnitude of its expected value: a coarse check only
        for idx, expected in enumerate((expected_boxes_0, expected_boxes_1)):
            self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected, rtol=1))
--- a/tests/models/detr/test_image_processing_detr.py
+++ b/tests/models/detr/test_image_processing_detr.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import json
 import pathlib
 import unittest
@@ -308,3 +307,244 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
        # verify size
        expected_size = torch.tensor([800, 1066])
        self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
    @slow
    def test_batched_coco_detection_annotations(self):
        # Batch the COCO fixture image with an 800x800 resized copy of it and verify that the pixel values,
        # bounding boxes and segmentation masks returned by the processor all reflect the padded batch size.
        image_path = "./tests/fixtures/tests_samples/COCO/000000039769.png"
        image_0 = Image.open(image_path)
        image_1 = Image.open(image_path).resize((800, 800))

        with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
            target = json.load(f)

        # NOTE(review): both dicts wrap the same `target` list, so the in-place bbox rescaling below is also
        # visible through annotations_0 - confirm the hard-coded expected boxes were generated with that in mind.
        annotations_0 = {"image_id": 39769, "annotations": target}
        annotations_1 = {"image_id": 39769, "annotations": target}

        # Rescale the bounding boxes by the same factors the second image was resized with
        w_0, h_0 = image_0.size
        w_1, h_1 = image_1.size
        for entry in annotations_1["annotations"]:
            coords = entry["bbox"]
            entry["bbox"] = [
                coords[0] * w_1 / w_0,
                coords[1] * h_1 / h_0,
                coords[2] * w_1 / w_0,
                coords[3] * h_1 / h_0,
            ]

        image_processing = DetrImageProcessor()
        shared_kwargs = {
            "images": [image_0, image_1],
            "annotations": [annotations_0, annotations_1],
            "return_segmentation_masks": True,
            "return_tensors": "pt",
        }
        # do_convert_annotations is not passed on this first call, so the processor default applies
        encoding = image_processing(**shared_kwargs)

        # The pixel values of both images must share one padded shape
        padded_height, padded_width = 800, 1066
        self.assertEqual(encoding["pixel_values"].shape, torch.Size([2, 3, padded_height, padded_width]))

        # The bounding boxes must have been adjusted for the padded images
        box_shape = torch.Size([6, 4])
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["boxes"].shape, box_shape)

        expected_boxes_0 = torch.tensor(
            [
                [0.6879, 0.4609, 0.0755, 0.3691],
                [0.2118, 0.3359, 0.2601, 0.1566],
                [0.5011, 0.5000, 0.9979, 1.0000],
                [0.5010, 0.5020, 0.9979, 0.9959],
                [0.3284, 0.5944, 0.5884, 0.8112],
                [0.8394, 0.5445, 0.3213, 0.9110],
            ]
        )
        expected_boxes_1 = torch.tensor(
            [
                [0.4130, 0.2765, 0.0453, 0.2215],
                [0.1272, 0.2016, 0.1561, 0.0940],
                [0.3757, 0.4933, 0.7488, 0.9865],
                [0.3759, 0.5002, 0.7492, 0.9955],
                [0.1971, 0.5456, 0.3532, 0.8646],
                [0.5790, 0.4115, 0.3430, 0.7161],
            ]
        )
        for idx, expected in enumerate((expected_boxes_0, expected_boxes_1)):
            self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected, rtol=1e-3))

        # The masks must have been padded to the same spatial size as the pixel values
        mask_shape = torch.Size([6, 800, 1066])
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["masks"].shape, mask_shape)

        # With do_convert_annotations=False the boxes are expected to stay in absolute
        # (x_min, y_min, x_max, y_max) coordinates rather than normalized (centre_x, centre_y, width, height)
        encoding = image_processing(**shared_kwargs, do_convert_annotations=False)
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["boxes"].shape, box_shape)

        def to_absolute_corners(normalized_boxes):
            # Scale the normalized centre/size columns to pixels, then turn them into corner coordinates
            centre_x = normalized_boxes[:, 0] * padded_width
            centre_y = normalized_boxes[:, 1] * padded_height
            half_width = normalized_boxes[:, 2] * padded_width / 2
            half_height = normalized_boxes[:, 3] * padded_height / 2
            return torch.stack(
                [centre_x - half_width, centre_y - half_height, centre_x + half_width, centre_y + half_height],
                dim=1,
            )

        expected_boxes_0 = to_absolute_corners(expected_boxes_0)
        expected_boxes_1 = to_absolute_corners(expected_boxes_1)
        # rtol=1 lets each value deviate by up to the magnitude of its expected value: a coarse check only
        for idx, expected in enumerate((expected_boxes_0, expected_boxes_1)):
            self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected, rtol=1))
    @slow
    def test_batched_coco_panoptic_annotations(self):
        # Batch the COCO fixture image with an 800x800 resized copy of it, using panoptic annotations, and verify
        # that the pixel values, bounding boxes and segmentation masks all reflect the padded batch size.
        image_path = "./tests/fixtures/tests_samples/COCO/000000039769.png"
        image_0 = Image.open(image_path)
        image_1 = Image.open(image_path).resize((800, 800))

        with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
            target = json.load(f)

        # NOTE(review): both dicts wrap the same `target` list, so the in-place bbox rescaling below is also
        # visible through annotation_0 - confirm the hard-coded expected boxes were generated with that in mind.
        annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
        annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}

        # Rescale the bounding boxes by the same factors the second image was resized with
        w_0, h_0 = image_0.size
        w_1, h_1 = image_1.size
        for segment in annotation_1["segments_info"]:
            coords = segment["bbox"]
            segment["bbox"] = [
                coords[0] * w_1 / w_0,
                coords[1] * h_1 / h_0,
                coords[2] * w_1 / w_0,
                coords[3] * h_1 / h_0,
            ]

        image_processing = DetrImageProcessor(format="coco_panoptic")
        shared_kwargs = {
            "images": [image_0, image_1],
            "annotations": [annotation_0, annotation_1],
            "masks_path": pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic"),
            "return_segmentation_masks": True,
            "return_tensors": "pt",
        }
        # do_convert_annotations is not passed on this first call, so the processor default applies
        encoding = image_processing(**shared_kwargs)

        # The pixel values of both images must share one padded shape
        padded_height, padded_width = 800, 1066
        self.assertEqual(encoding["pixel_values"].shape, torch.Size([2, 3, padded_height, padded_width]))

        # The bounding boxes must have been adjusted for the padded images
        box_shape = torch.Size([6, 4])
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["boxes"].shape, box_shape)

        expected_boxes_0 = torch.tensor(
            [
                [0.2625, 0.5437, 0.4688, 0.8625],
                [0.7719, 0.4104, 0.4531, 0.7125],
                [0.5000, 0.4927, 0.9969, 0.9854],
                [0.1688, 0.2000, 0.2063, 0.0917],
                [0.5492, 0.2760, 0.0578, 0.2187],
                [0.4992, 0.4990, 0.9984, 0.9979],
            ]
        )
        expected_boxes_1 = torch.tensor(
            [
                [0.1576, 0.3262, 0.2814, 0.5175],
                [0.4634, 0.2463, 0.2720, 0.4275],
                [0.3002, 0.2956, 0.5985, 0.5913],
                [0.1013, 0.1200, 0.1238, 0.0550],
                [0.3297, 0.1656, 0.0347, 0.1312],
                [0.2997, 0.2994, 0.5994, 0.5987],
            ]
        )
        for idx, expected in enumerate((expected_boxes_0, expected_boxes_1)):
            self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected, rtol=1e-3))

        # The masks must have been padded to the same spatial size as the pixel values
        mask_shape = torch.Size([6, 800, 1066])
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["masks"].shape, mask_shape)

        # With do_convert_annotations=False the boxes are expected to stay in absolute
        # (x_min, y_min, x_max, y_max) coordinates rather than normalized (centre_x, centre_y, width, height)
        encoding = image_processing(**shared_kwargs, do_convert_annotations=False)
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["boxes"].shape, box_shape)

        def to_absolute_corners(normalized_boxes):
            # Scale the normalized centre/size columns to pixels, then turn them into corner coordinates
            centre_x = normalized_boxes[:, 0] * padded_width
            centre_y = normalized_boxes[:, 1] * padded_height
            half_width = normalized_boxes[:, 2] * padded_width / 2
            half_height = normalized_boxes[:, 3] * padded_height / 2
            return torch.stack(
                [centre_x - half_width, centre_y - half_height, centre_x + half_width, centre_y + half_height],
                dim=1,
            )

        expected_boxes_0 = to_absolute_corners(expected_boxes_0)
        expected_boxes_1 = to_absolute_corners(expected_boxes_1)
        # rtol=1 lets each value deviate by up to the magnitude of its expected value: a coarse check only
        for idx, expected in enumerate((expected_boxes_0, expected_boxes_1)):
            self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected, rtol=1))
--- a/tests/models/yolos/test_image_processing_yolos.py
+++ b/tests/models/yolos/test_image_processing_yolos.py
@@ -287,3 +287,246 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
        # verify size
        expected_size = torch.tensor([800, 1056])
        self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
    @slow
    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->Yolos
    def test_batched_coco_detection_annotations(self):
        # Batch the COCO fixture image with an 800x800 resized copy of it and verify that the pixel values,
        # bounding boxes and segmentation masks returned by the processor all reflect the padded batch size.
        image_path = "./tests/fixtures/tests_samples/COCO/000000039769.png"
        image_0 = Image.open(image_path)
        image_1 = Image.open(image_path).resize((800, 800))

        with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
            target = json.load(f)

        # NOTE(review): both dicts wrap the same `target` list, so the in-place bbox rescaling below is also
        # visible through annotations_0 - confirm the hard-coded expected boxes were generated with that in mind.
        annotations_0 = {"image_id": 39769, "annotations": target}
        annotations_1 = {"image_id": 39769, "annotations": target}

        # Rescale the bounding boxes by the same factors the second image was resized with
        w_0, h_0 = image_0.size
        w_1, h_1 = image_1.size
        for entry in annotations_1["annotations"]:
            coords = entry["bbox"]
            entry["bbox"] = [
                coords[0] * w_1 / w_0,
                coords[1] * h_1 / h_0,
                coords[2] * w_1 / w_0,
                coords[3] * h_1 / h_0,
            ]

        image_processing = YolosImageProcessor()
        shared_kwargs = {
            "images": [image_0, image_1],
            "annotations": [annotations_0, annotations_1],
            "return_segmentation_masks": True,
            "return_tensors": "pt",
        }
        # do_convert_annotations is not passed on this first call, so the processor default applies
        encoding = image_processing(**shared_kwargs)

        # The pixel values of both images must share one padded shape
        padded_height, padded_width = 800, 1066
        self.assertEqual(encoding["pixel_values"].shape, torch.Size([2, 3, padded_height, padded_width]))

        # The bounding boxes must have been adjusted for the padded images
        box_shape = torch.Size([6, 4])
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["boxes"].shape, box_shape)

        expected_boxes_0 = torch.tensor(
            [
                [0.6879, 0.4609, 0.0755, 0.3691],
                [0.2118, 0.3359, 0.2601, 0.1566],
                [0.5011, 0.5000, 0.9979, 1.0000],
                [0.5010, 0.5020, 0.9979, 0.9959],
                [0.3284, 0.5944, 0.5884, 0.8112],
                [0.8394, 0.5445, 0.3213, 0.9110],
            ]
        )
        expected_boxes_1 = torch.tensor(
            [
                [0.4130, 0.2765, 0.0453, 0.2215],
                [0.1272, 0.2016, 0.1561, 0.0940],
                [0.3757, 0.4933, 0.7488, 0.9865],
                [0.3759, 0.5002, 0.7492, 0.9955],
                [0.1971, 0.5456, 0.3532, 0.8646],
                [0.5790, 0.4115, 0.3430, 0.7161],
            ]
        )
        for idx, expected in enumerate((expected_boxes_0, expected_boxes_1)):
            self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected, rtol=1e-3))

        # The masks must have been padded to the same spatial size as the pixel values
        mask_shape = torch.Size([6, 800, 1066])
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["masks"].shape, mask_shape)

        # With do_convert_annotations=False the boxes are expected to stay in absolute
        # (x_min, y_min, x_max, y_max) coordinates rather than normalized (centre_x, centre_y, width, height)
        encoding = image_processing(**shared_kwargs, do_convert_annotations=False)
        for idx in (0, 1):
            self.assertEqual(encoding["labels"][idx]["boxes"].shape, box_shape)

        def to_absolute_corners(normalized_boxes):
            # Scale the normalized centre/size columns to pixels, then turn them into corner coordinates
            centre_x = normalized_boxes[:, 0] * padded_width
            centre_y = normalized_boxes[:, 1] * padded_height
            half_width = normalized_boxes[:, 2] * padded_width / 2
            half_height = normalized_boxes[:, 3] * padded_height / 2
            return torch.stack(
                [centre_x - half_width, centre_y - half_height, centre_x + half_width, centre_y + half_height],
                dim=1,
            )

        expected_boxes_0 = to_absolute_corners(expected_boxes_0)
        expected_boxes_1 = to_absolute_corners(expected_boxes_1)
        # rtol=1 lets each value deviate by up to the magnitude of its expected value: a coarse check only
        for idx, expected in enumerate((expected_boxes_0, expected_boxes_1)):
            self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected, rtol=1))
    @slow
    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->Yolos
    def test_batched_coco_panoptic_annotations(self):
        # prepare image, target and masks_path
        image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
        image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
        with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
            target = json.loads(f.read())
        annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
        annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
        w_0, h_0 = image_0.size
        w_1, h_1 = image_1.size
        for i in range(len(annotation_1["segments_info"])):
            coords = annotation_1["segments_info"][i]["bbox"]
            new_bbox = [
                coords[0] * w_1 / w_0,
                coords[1] * h_1 / h_0,
                coords[2] * w_1 / w_0,
                coords[3] * h_1 / h_0,
            ]
            annotation_1["segments_info"][i]["bbox"] = new_bbox
        masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
        images = [image_0, image_1]
        annotations = [annotation_0, annotation_1]
        # encode them
        image_processing = YolosImageProcessor(format="coco_panoptic")
        encoding = image_processing(
            images=images,
            annotations=annotations,
            masks_path=masks_path,
            return_tensors="pt",
            return_segmentation_masks=True,
        )
        # Check the pixel values have been padded
        postprocessed_height, postprocessed_width = 800, 1066
        expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
        self.assertEqual(encoding["pixel_values"].shape, expected_shape)
        # Check the bounding boxes have been adjusted for padded images
        self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
        self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
        expected_boxes_0 = torch.tensor(
            [
                [0.2625, 0.5437, 0.4688, 0.8625],
                [0.7719, 0.4104, 0.4531, 0.7125],
                [0.5000, 0.4927, 0.9969, 0.9854],
                [0.1688, 0.2000, 0.2063, 0.0917],
                [0.5492, 0.2760, 0.0578, 0.2187],
                [0.4992, 0.4990, 0.9984, 0.9979],
            ]
        )
        expected_boxes_1 = torch.tensor(
            [
                [0.1576, 0.3262, 0.2814, 0.5175],
                [0.4634, 0.2463, 0.2720, 0.4275],
                [0.3002, 0.2956, 0.5985, 0.5913],
                [0.1013, 0.1200, 0.1238, 0.0550],
                [0.3297, 0.1656, 0.0347, 0.1312],
                [0.2997, 0.2994, 0.5994, 0.5987],
            ]
        )
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
        # Check the masks have also been padded
        self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
        self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
        # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
        # format and not in the range [0, 1]
        encoding = image_processing(
            images=images,
            annotations=annotations,
            masks_path=masks_path,
            return_segmentation_masks=True,
            do_convert_annotations=False,
            return_tensors="pt",
        )
        self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
        self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
        # Convert to absolute coordinates
        unnormalized_boxes_0 = torch.vstack(
            [
                expected_boxes_0[:, 0] * postprocessed_width,
                expected_boxes_0[:, 1] * postprocessed_height,
                expected_boxes_0[:, 2] * postprocessed_width,
                expected_boxes_0[:, 3] * postprocessed_height,
            ]
        ).T
        unnormalized_boxes_1 = torch.vstack(
            [
                expected_boxes_1[:, 0] * postprocessed_width,
                expected_boxes_1[:, 1] * postprocessed_height,
                expected_boxes_1[:, 2] * postprocessed_width,
                expected_boxes_1[:, 3] * postprocessed_height,
            ]
        ).T
        # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
        expected_boxes_0 = torch.vstack(
            [
                unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
                unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
                unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
                unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
            ]
        ).T
        expected_boxes_1 = torch.vstack(
            [
                unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
                unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
                unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
                unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
            ]
        ).T
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))