Add Fast Yolos Processor (#37292)

* Add Fast Yolos Processor * Update modular file * Fix copies --------- Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
2025-04-15 17:53:08 +05:30
parent ecaeee66bc
commit f6c79f767c
6 changed files with 1373 additions and 268 deletions
--- a/docs/source/en/model_doc/yolos.md
+++ b/docs/source/en/model_doc/yolos.md
@@ -92,6 +92,11 @@ Use [`YolosImageProcessor`] for preparing images (and optional targets) for the

 [[autodoc]] YolosImageProcessor
    - preprocess
+
+## YolosImageProcessorFast
+
+[[autodoc]] YolosImageProcessorFast
+    - preprocess
    - pad
    - post_process_object_detection

--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -167,7 +167,7 @@ else:
            ("vit_msn", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("vitmatte", ("VitMatteImageProcessor",)),
            ("xclip", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
-            ("yolos", ("YolosImageProcessor",)),
+            ("yolos", ("YolosImageProcessor", "YolosImageProcessorFast")),
            ("zoedepth", ("ZoeDepthImageProcessor",)),
        ]
    )
--- a/src/transformers/models/yolos/init.py
+++ b/src/transformers/models/yolos/init.py
@@ -21,6 +21,7 @@ if TYPE_CHECKING:
    from .configuration_yolos import *
    from .feature_extraction_yolos import *
    from .image_processing_yolos import *
+    from .image_processing_yolos_fast import *
    from .modeling_yolos import *
 else:
    import sys
--- a/src/transformers/models/yolos/image_processing_yolos_fast.py
+++ b/src/transformers/models/yolos/image_processing_yolos_fast.py
@@ -0,0 +1,893 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/yolos/modular_yolos.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_yolos.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+import pathlib
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from ...image_processing_utils import BatchFeature, get_size_dict
+from ...image_processing_utils_fast import (
+    BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
+    BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
+    BaseImageProcessorFast,
+    DefaultFastImageProcessorKwargs,
+    SizeDict,
+    get_image_size_for_max_height_width,
+    get_max_height_width,
+    safe_squeeze,
+)
+from ...image_transforms import center_to_corners_format, corners_to_center_format
+from ...image_utils import (
+    IMAGENET_DEFAULT_MEAN,
+    IMAGENET_DEFAULT_STD,
+    AnnotationFormat,
+    AnnotationType,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    validate_annotations,
+)
+from ...processing_utils import Unpack
+from ...utils import (
+    TensorType,
+    add_start_docstrings,
+    is_torch_available,
+    is_torchvision_available,
+    is_torchvision_v2_available,
+    logging,
+)
+from ...utils.import_utils import requires
+
+
+if is_torch_available():
+    import torch
+
+
+if is_torchvision_v2_available():
+    from torchvision.io import read_image
+    from torchvision.transforms.v2 import functional as F
+elif is_torchvision_available():
+    from torchvision.io import read_image
+    from torchvision.transforms import functional as F
+
+
+logger = logging.get_logger(__name__)
+
+
+class YolosFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    # Keyword arguments the fast YOLOS processor accepts in addition to those
+    # declared by `DefaultFastImageProcessorKwargs`.
+
+    # Annotation data format; the processor supports COCO detection and COCO panoptic.
+    format: Optional[Union[str, AnnotationFormat]]
+    # Whether annotations are converted to the format expected by the model.
+    do_convert_annotations: Optional[bool]
+    # Whether images are padded (bottom/right) and a pixel mask is produced.
+    do_pad: Optional[bool]
+    # Explicit `{"height": ..., "width": ...}` to pad to, instead of the batch maximum.
+    pad_size: Optional[Dict[str, int]]
+    # Whether segmentation masks are returned with the annotations.
+    return_segmentation_masks: Optional[bool]
+
+
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
+
+
+# inspired by https://github.com/facebookresearch/yolos/blob/master/datasets/coco.py#L33
+def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: torch.device) -> torch.Tensor:
+    """
+    Rasterize COCO polygon segmentations into one mask per object.
+
+    Args:
+        segmentations (`List[List[float]]`):
+            One entry per object; each entry holds the object's polygons as lists of x-y coordinates.
+        height (`int`):
+            Height of the masks.
+        width (`int`):
+            Width of the masks.
+        device (`torch.device`):
+            Device on which the mask tensors are created.
+
+    Returns:
+        `torch.Tensor`: The per-object masks stacked along a new first dimension, or an empty `uint8` tensor of
+        shape `(0, height, width)` when `segmentations` is empty.
+
+    Raises:
+        ImportError: If `pycocotools` cannot be imported.
+    """
+    try:
+        from pycocotools import mask as coco_mask
+    except ImportError:
+        raise ImportError("Pycocotools is not installed in your environment.")
+
+    def _rasterize(polygons):
+        decoded = coco_mask.decode(coco_mask.frPyObjects(polygons, height, width))
+        if len(decoded.shape) < 3:
+            # Give a single decoded mask a trailing axis so the reduction below is uniform.
+            decoded = decoded[..., None]
+        decoded = torch.as_tensor(decoded, dtype=torch.uint8, device=device)
+        # A pixel belongs to the object if any of its polygons covers it.
+        return torch.any(decoded, axis=2)
+
+    per_object_masks = [_rasterize(polygons) for polygons in segmentations]
+    if not per_object_masks:
+        return torch.zeros((0, height, width), dtype=torch.uint8, device=device)
+    return torch.stack(per_object_masks, axis=0)
+
+
+# inspired by https://github.com/facebookresearch/yolos/blob/master/datasets/coco.py#L50
+def prepare_coco_detection_annotation(
+    image,
+    target,
+    return_segmentation_masks: bool = False,
+    input_data_format: Optional[Union[ChannelDimension, str]] = None,
+):
+    """
+    Convert the target in COCO format into the format expected by YOLOS.
+
+    Args:
+        image (`torch.Tensor`):
+            Image the annotations belong to; its last two dimensions are read as `(height, width)` and its
+            device is used for every tensor created here.
+        target (`Dict`):
+            COCO detection target with an `"image_id"` entry and an `"annotations"` list.
+        return_segmentation_masks (`bool`, *optional*, defaults to `False`):
+            Whether to rasterize the `"segmentation"` polygons into a `"masks"` entry.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            Not read by this function.
+
+    Returns:
+        `Dict`: Target with `image_id`, `class_labels`, `boxes` (corner format, clipped to the image), `area`,
+        `iscrowd` and `orig_size`, plus `keypoints` / `masks` when available / requested. Boxes with
+        non-positive width or height are dropped from every per-object entry.
+    """
+    image_height, image_width = image.size()[-2:]
+
+    image_id = target["image_id"]
+    image_id = torch.as_tensor([image_id], dtype=torch.int64, device=image.device)
+
+    # Drop crowd annotations up front so that every per-object list built below (including the segmentation
+    # masks) has the same length as the `keep` mask. Filtering only some of the lists makes `masks[keep]`
+    # fail with a shape mismatch as soon as one crowd annotation is present.
+    annotations = [obj for obj in target["annotations"] if "iscrowd" not in obj or obj["iscrowd"] == 0]
+
+    classes = [obj["category_id"] for obj in annotations]
+    area = [obj["area"] for obj in annotations]
+    boxes = [obj["bbox"] for obj in annotations]
+    keypoints = [obj["keypoints"] for obj in annotations if "keypoints" in obj]
+
+    classes = torch.as_tensor(classes, dtype=torch.int64, device=image.device)
+    area = torch.as_tensor(area, dtype=torch.float32, device=image.device)
+    iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=image.device)
+    # guard against no boxes via resizing
+    boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device).reshape(-1, 4)
+    # (x, y, w, h) -> (x0, y0, x1, y1), then clip the corners to the image.
+    boxes[:, 2:] += boxes[:, :2]
+    boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
+    boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
+
+    # Only boxes that still have a positive extent after clipping are kept.
+    keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+
+    new_target = {
+        "image_id": image_id,
+        "class_labels": classes[keep],
+        "boxes": boxes[keep],
+        "area": area[keep],
+        "iscrowd": iscrowd[keep],
+        "orig_size": torch.as_tensor([int(image_height), int(image_width)], dtype=torch.int64, device=image.device),
+    }
+
+    if keypoints:
+        keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=image.device)
+        # Apply the keep mask here to filter the relevant annotations
+        keypoints = keypoints[keep]
+        num_keypoints = keypoints.shape[0]
+        keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
+        new_target["keypoints"] = keypoints
+
+    if return_segmentation_masks:
+        segmentation_masks = [obj["segmentation"] for obj in annotations]
+        masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width, device=image.device)
+        new_target["masks"] = masks[keep]
+
+    return new_target
+
+
+def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
+    """
+    Compute the bounding box enclosing each mask of a mask stack.
+
+    Args:
+        masks: tensor of shape `[number_masks, height, width]`. It is used as the condition of `torch.where`,
+            so boolean masks are expected.
+
+    Returns:
+        boxes: tensor of shape `[number_masks, 4]` with `(x_min, y_min, x_max, y_max)` per mask, or an empty
+        `[0, 4]` tensor when `masks` has no elements. A mask without any set pixel yields the `1e8` sentinel
+        as its minima.
+    """
+    if masks.numel() == 0:
+        return torch.zeros((0, 4), device=masks.device)
+
+    height, width = masks.shape[-2:]
+    rows = torch.arange(0, height, dtype=torch.float32, device=masks.device)
+    cols = torch.arange(0, width, dtype=torch.float32, device=masks.device)
+    # see https://github.com/pytorch/pytorch/issues/50276
+    grid_y, grid_x = torch.meshgrid(rows, cols, indexing="ij")
+
+    def _extent(coordinate_grid):
+        """Return (min, max) coordinate covered by each mask along one axis."""
+        coords = coordinate_grid.unsqueeze(0)
+        covered = masks * coords
+        upper = covered.view(covered.shape[0], -1).max(-1)[0]
+        # Pixels outside the mask get a huge value so they never win the minimum.
+        outside = torch.tensor(1e8, device=masks.device)
+        lower = torch.where(masks, coords, outside).view(masks.shape[0], -1).min(-1)[0]
+        return lower, upper
+
+    x_min, x_max = _extent(grid_x)
+    y_min, y_max = _extent(grid_y)
+
+    return torch.stack([x_min, y_min, x_max, y_max], 1)
+
+
+# 2 functions below adapted from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py
+# Copyright (c) 2018, Alexander Kirillov
+# All rights reserved.
+def rgb_to_id(color):
+    """
+    Pack an RGB color into a single integer id (`R + 256 * G + 256**2 * B`).
+
+    A 3-dimensional tensor is treated as an image whose last axis holds the channels and is converted
+    element-wise (`uint8` input is widened to `int32` first); anything else is indexed as a single color and
+    returned as a Python `int`.
+    """
+    is_image_tensor = isinstance(color, torch.Tensor) and len(color.shape) == 3
+    if not is_image_tensor:
+        return int(color[0] + 256 * color[1] + 256 * 256 * color[2])
+
+    if color.dtype == torch.uint8:
+        # Widen before multiplying so the packed id does not wrap around in 8 bits.
+        color = color.to(torch.int32)
+    red = color[:, :, 0]
+    green = color[:, :, 1]
+    blue = color[:, :, 2]
+    return red + 256 * green + 256 * 256 * blue
+
+
+def prepare_coco_panoptic_annotation(
+    image: torch.Tensor,
+    target: Dict,
+    masks_path: Union[str, pathlib.Path],
+    return_masks: bool = True,
+    input_data_format: Union[ChannelDimension, str] = None,
+) -> Dict:
+    """
+    Prepare a coco panoptic annotation for YOLOS.
+
+    The panoptic image is read from `masks_path / target["file_name"]`, decoded into segment ids and split
+    into one boolean mask per entry of `target["segments_info"]`. The returned dict always contains
+    `image_id`, `size` and `orig_size`; when `segments_info` is present it also contains `boxes`,
+    `class_labels`, `iscrowd`, `area` and, if `return_masks` is set, `masks`.
+    """
+    image_height, image_width = get_image_size(image, channel_dim=input_data_format)
+    annotation_path = pathlib.Path(masks_path) / target["file_name"]
+
+    # Fall back to the "id" entry when the target has no "image_id" entry.
+    image_id = target["image_id"] if "image_id" in target else target["id"]
+    device = image.device
+    new_target = {
+        "image_id": torch.as_tensor([image_id], dtype=torch.int64, device=device),
+        "size": torch.as_tensor([image_height, image_width], dtype=torch.int64, device=device),
+        "orig_size": torch.as_tensor([image_height, image_width], dtype=torch.int64, device=device),
+    }
+
+    if "segments_info" not in target:
+        return new_target
+
+    segments = target["segments_info"]
+
+    def _collect(field, dtype):
+        """Gather one field of every segment into a tensor on the image's device."""
+        return torch.as_tensor([segment_info[field] for segment_info in segments], dtype=dtype, device=device)
+
+    # Channels-last int32 image so that the RGB triplets can be packed into segment ids.
+    panoptic_rgb = read_image(annotation_path).permute(1, 2, 0).to(dtype=torch.int32, device=device)
+    panoptic_ids = rgb_to_id(panoptic_rgb)
+
+    segment_ids = torch.as_tensor([segment_info["id"] for segment_info in segments], device=device)
+    # Broadcast comparison: one mask per segment id.
+    masks = (panoptic_ids == segment_ids[:, None, None]).to(torch.bool)
+
+    if return_masks:
+        new_target["masks"] = masks
+    new_target["boxes"] = masks_to_boxes(masks)
+    new_target["class_labels"] = _collect("category_id", torch.int64)
+    new_target["iscrowd"] = _collect("iscrowd", torch.int64)
+    new_target["area"] = _collect("area", torch.float32)
+
+    return new_target
+
+
+def get_size_with_aspect_ratio(
+    image_size: Tuple[int, int], size: int, max_size: Optional[int] = None, mod_size: int = 16
+) -> Tuple[int, int]:
+    """
+    Computes the output image size given the input image size and the desired output size, with both output
+    edges rounded down to a multiple of `mod_size`.
+
+    Args:
+        image_size (`Tuple[int, int]`):
+            The input image size as `(height, width)`.
+        size (`int`):
+            The desired size of the shortest output edge.
+        max_size (`int`, *optional*):
+            The maximum allowed size of the longest output edge. When scaling the shortest edge to `size`
+            would exceed it, `size` is reduced accordingly.
+        mod_size (`int`, *optional*, defaults to 16):
+            Both output edges are rounded down to a multiple of this value. Pass `None` to skip the rounding.
+
+    Returns:
+        `Tuple[int, int]`: The output size as `(height, width)`.
+    """
+    height, width = image_size
+    raw_size = None
+    if max_size is not None:
+        min_original_size = float(min((height, width)))
+        max_original_size = float(max((height, width)))
+        if max_original_size / min_original_size * size > max_size:
+            # Keep the unrounded value so the longest edge is derived without compounding rounding error.
+            raw_size = max_size * min_original_size / max_original_size
+            size = int(round(raw_size))
+
+    # `raw_size` is only set when `max_size` forced a smaller shortest edge.
+    scale_base = raw_size if raw_size is not None else size
+
+    if width < height:
+        ow = size
+        oh = int(scale_base * height / width)
+    elif (height <= width and height == size) or (width <= height and width == size):
+        oh, ow = height, width
+    else:
+        oh = size
+        ow = int(scale_base * width / height)
+
+    if mod_size is not None:
+        # Plain integer arithmetic: no need to allocate tensors for a scalar remainder.
+        ow -= ow % mod_size
+        oh -= oh % mod_size
+
+    return (oh, ow)
+
+
+@add_start_docstrings(
+    "Constructs a fast Yolos image processor.",
+    BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
+    """
+        format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
+            Data format of the annotations. One of "coco_detection" or "coco_panoptic".
+        do_convert_annotations (`bool`, *optional*, defaults to `True`):
+            Controls whether to convert the annotations to the format expected by the YOLOS model. Converts the
+            bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
+            Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
+        do_pad (`bool`, *optional*, defaults to `True`):
+            Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
+            method. If `True`, padding will be applied to the bottom and right of the image with zeros.
+            If `pad_size` is provided, the image will be padded to the specified dimensions.
+            Otherwise, the image will be padded to the maximum height and width of the batch.
+        pad_size (`Dict[str, int]`, *optional*):
+            The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
+            provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+            height and width in the batch.
+        return_segmentation_masks (`bool`, *optional*, defaults to `False`):
+            Whether to return segmentation masks.
+    """,
+)
+@requires(backends=("torchvision", "torch"))
+class YolosImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BILINEAR
+    image_mean = IMAGENET_DEFAULT_MEAN
+    image_std = IMAGENET_DEFAULT_STD
+    format = AnnotationFormat.COCO_DETECTION
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    do_pad = True
+    size = {"shortest_edge": 800, "longest_edge": 1333}
+    default_to_square = False
+    model_input_names = ["pixel_values", "pixel_mask"]
+    valid_kwargs = YolosFastImageProcessorKwargs
+
+    def __init__(self, **kwargs: Unpack[YolosFastImageProcessorKwargs]) -> None:
+        """
+        Create the processor, translating deprecated keyword arguments before delegating to the base class.
+
+        `pad_and_return_pixel_mask` is mapped onto `do_pad`, and `max_size` is folded into the `size`
+        dictionary. `size` is resolved here and stored on the instance; it is not forwarded to
+        `super().__init__`.
+        """
+        # Legacy alias for `do_pad`.
+        if "pad_and_return_pixel_mask" in kwargs:
+            kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
+
+        size = kwargs.pop("size", None)
+        if "max_size" in kwargs:
+            logger.warning_once(
+                "The `max_size` parameter is deprecated and will be removed in v4.26. "
+                "Please specify in `size['longest_edge'] instead`.",
+            )
+            max_size = kwargs.pop("max_size")
+        else:
+            # Only an explicitly passed `size` gets the 1333 cap; the default size dict below already
+            # carries its own `longest_edge`.
+            max_size = None if size is None else 1333
+
+        size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
+        self.size = get_size_dict(size, max_size=max_size, default_to_square=False)
+
+        # Backwards compatibility
+        # When `do_convert_annotations` is neither passed nor already set on the instance/class, it follows
+        # `do_normalize` (the passed value if any, otherwise the class default).
+        do_convert_annotations = kwargs.get("do_convert_annotations", None)
+        do_normalize = kwargs.get("do_normalize", None)
+        if do_convert_annotations is None and getattr(self, "do_convert_annotations", None) is None:
+            self.do_convert_annotations = do_normalize if do_normalize is not None else self.do_normalize
+
+        super().__init__(**kwargs)
+
+    @classmethod
+    def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
+        """
+        Build the processor from a config dict, making sure the deprecated `max_size` and
+        `pad_and_return_pixel_mask` keyword arguments override the dict entries, e.g. for
+        `YolosImageProcessorFast.from_pretrained(checkpoint, size=600, max_size=800)`.
+
+        The input dict is copied; the caller's dict is left untouched.
+        """
+        merged_dict = image_processor_dict.copy()
+        # Move the legacy keys from kwargs into the dict so that `__init__` receives them.
+        for legacy_key in ("max_size", "pad_and_return_pixel_mask"):
+            if legacy_key in kwargs:
+                merged_dict[legacy_key] = kwargs.pop(legacy_key)
+        return super().from_dict(merged_dict, **kwargs)
+
+    def prepare_annotation(
+        self,
+        image: torch.Tensor,
+        target: Dict,
+        format: Optional[AnnotationFormat] = None,
+        return_segmentation_masks: Optional[bool] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> Dict:
+        """
+        Convert a raw COCO annotation into the target format fed to the YOLOS model.
+
+        `format` falls back to `self.format`. When `return_segmentation_masks` is `None` it defaults to
+        `False` for COCO detection and to `True` for COCO panoptic.
+
+        Raises:
+            ValueError: If `format` is neither COCO detection nor COCO panoptic.
+        """
+        if format is None:
+            format = self.format
+
+        if format == AnnotationFormat.COCO_DETECTION:
+            if return_segmentation_masks is None:
+                return_segmentation_masks = False
+            return prepare_coco_detection_annotation(
+                image, target, return_segmentation_masks, input_data_format=input_data_format
+            )
+
+        if format == AnnotationFormat.COCO_PANOPTIC:
+            if return_segmentation_masks is None:
+                return_segmentation_masks = True
+            return prepare_coco_panoptic_annotation(
+                image,
+                target,
+                masks_path=masks_path,
+                return_masks=return_segmentation_masks,
+                input_data_format=input_data_format,
+            )
+
+        raise ValueError(f"Format {format} is not supported.")
+
+    def resize(
+        self,
+        image: torch.Tensor,
+        size: SizeDict,
+        interpolation: "F.InterpolationMode" = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        Resize `image` to the dimensions described by `size`.
+
+        Args:
+            image (`torch.Tensor`):
+                Image to resize; its last two dimensions are read as `(height, width)`.
+            size (`SizeDict`):
+                Target size description. The first matching option is used:
+                    - `shortest_edge` and `longest_edge`: aspect-ratio preserving size computed by
+                        `get_size_with_aspect_ratio`.
+                    - `max_height` and `max_width`: size computed by `get_image_size_for_max_height_width`.
+                    - `height` and `width`: exact output size, aspect ratio is NOT kept.
+            interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
+                Resampling filter to use if resizing the image.
+            **kwargs:
+                Forwarded to `F.resize`.
+
+        Raises:
+            ValueError: If `size` matches none of the supported key combinations.
+        """
+        if interpolation is None:
+            interpolation = F.InterpolationMode.BILINEAR
+
+        current_hw = image.size()[-2:]
+        if size.shortest_edge and size.longest_edge:
+            # Resize the image so that the shortest edge or the longest edge is of the given size
+            # while maintaining the aspect ratio of the original image.
+            target_hw = get_size_with_aspect_ratio(current_hw, size["shortest_edge"], size["longest_edge"])
+        elif size.max_height and size.max_width:
+            target_hw = get_image_size_for_max_height_width(current_hw, size["max_height"], size["max_width"])
+        elif size.height and size.width:
+            target_hw = (size["height"], size["width"])
+        else:
+            raise ValueError(
+                "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
+                f" {size.keys()}."
+            )
+
+        return F.resize(image, size=target_hw, interpolation=interpolation, **kwargs)
+
+    def resize_annotation(
+        self,
+        annotation: Dict[str, Any],
+        orig_size: Tuple[int, int],
+        target_size: Tuple[int, int],
+        threshold: float = 0.5,
+        interpolation: "F.InterpolationMode" = None,
+    ):
+        """
+        Rescale an annotation so that it matches an image resized from `orig_size` to `target_size`.
+
+        Args:
+            annotation (`Dict[str, Any]`):
+                The annotation dictionary.
+            orig_size (`Tuple[int, int]`):
+                The `(height, width)` of the image before resizing.
+            target_size (`Tuple[int, int]`):
+                The `(height, width)` of the image after the preprocessing `resize` step.
+            threshold (`float`, *optional*, defaults to 0.5):
+                The threshold used to binarize the segmentation masks.
+            interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.NEAREST`):
+                The resampling filter to use when resizing the masks.
+
+        Returns:
+            `Dict[str, Any]`: New annotation whose `size` is `target_size`, with `boxes`, `area` and `masks`
+            rescaled and every other entry passed through unchanged.
+        """
+        if interpolation is None:
+            interpolation = F.InterpolationMode.NEAREST
+        ratio_height, ratio_width = (new_dim / old_dim for new_dim, old_dim in zip(target_size, orig_size))
+
+        resized = {"size": target_size}
+        for key, value in annotation.items():
+            if key == "size":
+                # Already set above; an incoming "size" entry is always replaced by the target size.
+                continue
+            if key == "boxes":
+                scale = torch.as_tensor(
+                    [ratio_width, ratio_height, ratio_width, ratio_height], dtype=torch.float32, device=value.device
+                )
+                resized["boxes"] = value * scale
+            elif key == "area":
+                resized["area"] = value * (ratio_width * ratio_height)
+            elif key == "masks":
+                # Resize each mask with a singleton leading axis, then re-binarize.
+                resized_masks = [F.resize(mask, target_size, interpolation=interpolation) for mask in value[:, None]]
+                stacked = torch.stack(resized_masks).to(torch.float32)
+                resized["masks"] = stacked[:, 0] > threshold
+            else:
+                resized[key] = value
+
+        return resized
+
+    def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
+        """
+        Return a copy of `annotation` whose `boxes` are converted from corner to center format and divided by
+        the image width/height. All other entries are passed through unchanged.
+
+        Args:
+            annotation (`Dict`):
+                The annotation dictionary.
+            image_size (`Tuple[int, int]`):
+                The `(height, width)` the boxes are divided by.
+        """
+        image_height, image_width = image_size
+        # A shallow copy keeps the original key order; only "boxes" is replaced below.
+        normalized = dict(annotation)
+        if "boxes" in annotation:
+            centered = corners_to_center_format(annotation["boxes"])
+            centered /= torch.as_tensor(
+                [image_width, image_height, image_width, image_height], dtype=torch.float32, device=centered.device
+            )
+            normalized["boxes"] = centered
+        return normalized
+
+    def _update_annotation_for_padded_image(
+        self,
+        annotation: Dict,
+        input_image_size: Tuple[int, int],
+        output_image_size: Tuple[int, int],
+        padding,
+        update_bboxes,
+    ) -> Dict:
+        """
+        Update the annotation for a padded image.
+
+        Args:
+            annotation (`Dict`):
+                The annotation dictionary. It is not modified; a new dictionary is returned.
+            input_image_size (`Tuple[int, int]`):
+                The `(height, width)` of the image before padding.
+            output_image_size (`Tuple[int, int]`):
+                The `(height, width)` of the image after padding.
+            padding:
+                Padding specification forwarded to `F.pad` for the masks.
+            update_bboxes (`bool`):
+                Whether to rescale `boxes` by the input/output size ratio. When `False`, boxes are passed
+                through unchanged.
+
+        Returns:
+            `Dict`: New annotation with `size` set to `output_image_size`, padded `masks` and, when requested,
+            rescaled `boxes`.
+        """
+        new_annotation = {}
+        new_annotation["size"] = output_image_size
+        ratio_height, ratio_width = (
+            input_dim / output_dim for output_dim, input_dim in zip(output_image_size, input_image_size)
+        )
+
+        for key, value in annotation.items():
+            if key == "masks":
+                masks = F.pad(
+                    value,
+                    padding,
+                    fill=0,
+                )
+                masks = safe_squeeze(masks, 1)
+                new_annotation["masks"] = masks
+            elif key == "boxes" and update_bboxes:
+                # Scale a copy: multiplying `value` in place would silently alter the tensor held by the
+                # caller's annotation.
+                boxes = value.clone()
+                boxes *= torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height], device=boxes.device)
+                new_annotation["boxes"] = boxes
+            elif key == "size":
+                new_annotation["size"] = output_image_size
+            else:
+                new_annotation[key] = value
+        return new_annotation
+
+    def pad(
+        self,
+        image: torch.Tensor,
+        padded_size: Tuple[int, int],
+        annotation: Optional[Dict[str, Any]] = None,
+        update_bboxes: bool = True,
+        fill: int = 0,
+    ):
+        """
+        Pad `image` at the bottom and right up to `padded_size` and build the matching pixel mask.
+
+        Args:
+            image (`torch.Tensor`):
+                Image to pad; its last two dimensions are read as `(height, width)`.
+            padded_size (`Tuple[int, int]`):
+                The `(height, width)` to pad to.
+            annotation (`Dict[str, Any]`, *optional*):
+                Annotation to update alongside the image when padding is applied.
+            update_bboxes (`bool`, *optional*, defaults to `True`):
+                Whether the annotation's boxes are rescaled to the padded size.
+            fill (`int`, *optional*, defaults to 0):
+                Value used for the padded image pixels.
+
+        Returns:
+            Tuple of `(image, pixel_mask, annotation)` where `pixel_mask` is an `int64` tensor of shape
+            `padded_size` with 1 for valid pixels and 0 for padding.
+
+        Raises:
+            ValueError: If `padded_size` is smaller than the image in either dimension.
+        """
+        original_size = image.size()[-2:]
+        pad_bottom = padded_size[0] - original_size[0]
+        pad_right = padded_size[1] - original_size[1]
+        if min(pad_bottom, pad_right) < 0:
+            raise ValueError(
+                f"Padding dimensions are negative. Please make sure that the padded size is larger than the "
+                f"original size. Got padded size: {padded_size}, original size: {original_size}."
+            )
+
+        needs_padding = original_size != padded_size
+        if needs_padding:
+            padding = [0, 0, pad_right, pad_bottom]
+            image = F.pad(image, padding, fill=fill)
+            if annotation is not None:
+                annotation = self._update_annotation_for_padded_image(
+                    annotation, original_size, padded_size, padding, update_bboxes
+                )
+
+        # Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
+        valid_height, valid_width = original_size[0], original_size[1]
+        pixel_mask = torch.zeros(padded_size, dtype=torch.int64, device=image.device)
+        pixel_mask[:valid_height, :valid_width] = 1
+
+        return image, pixel_mask, annotation
+
+    # The docstring of `preprocess` is assembled by the decorator from the shared base text plus the
+    # YOLOS-specific argument descriptions below.
+    @add_start_docstrings(
+        BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
+        """
+        annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
+            List of annotations associated with the image or batch of images. If annotation is for object
+            detection, the annotations should be a dictionary with the following keys:
+            - "image_id" (`int`): The image id.
+            - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a
+                dictionary. An image can have no annotations, in which case the list should be empty.
+            If annotation is for segmentation, the annotations should be a dictionary with the following keys:
+            - "image_id" (`int`): The image id.
+            - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary.
+                An image can have no segments, in which case the list should be empty.
+            - "file_name" (`str`): The file name of the image.
+        format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
+            Data format of the annotations. One of "coco_detection" or "coco_panoptic".
+        do_convert_annotations (`bool`, *optional*, defaults to `True`):
+            Controls whether to convert the annotations to the format expected by the YOLOS model. Converts the
+            bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
+            Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
+        do_pad (`bool`, *optional*, defaults to `True`):
+            Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
+            method. If `True`, padding will be applied to the bottom and right of the image with zeros.
+            If `pad_size` is provided, the image will be padded to the specified dimensions.
+            Otherwise, the image will be padded to the maximum height and width of the batch.
+        pad_size (`Dict[str, int]`, *optional*):
+            The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
+            provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+            height and width in the batch.
+        return_segmentation_masks (`bool`, *optional*, defaults to `False`):
+            Whether to return segmentation masks.
+        masks_path (`str` or `pathlib.Path`, *optional*):
+            Path to the directory containing the segmentation masks.
+        """,
+    )
+    def preprocess(
+        self,
+        images: ImageInput,
+        annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        **kwargs: Unpack[YolosFastImageProcessorKwargs],
+    ) -> BatchFeature:
+        # Translate the deprecated `pad_and_return_pixel_mask` kwarg into `do_pad` before delegating.
+        if "pad_and_return_pixel_mask" in kwargs:
+            kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
+            logger.warning_once(
+                "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, "
+                "use `do_pad` instead."
+            )
+
+        if "max_size" in kwargs:
+            logger.warning_once(
+                "The `max_size` argument is deprecated and will be removed in a future version, use"
+                " `size['longest_edge']` instead."
+            )
+            # NOTE(review): the deprecated value replaces `size` as a whole (overwriting any `size` passed in
+            # the same call), whereas the warning above points to `size['longest_edge']` - confirm this
+            # mapping is intended.
+            kwargs["size"] = kwargs.pop("max_size")
+
+        # The base class handles argument resolution and dispatches to `_preprocess`.
+        return super().preprocess(images, annotations=annotations, masks_path=masks_path, **kwargs)
+
+    def _preprocess(
+        self,
+        images: List["torch.Tensor"],
+        annotations: Optional[Union[AnnotationType, List[AnnotationType]]],
+        return_segmentation_masks: bool,
+        masks_path: Optional[Union[str, pathlib.Path]],
+        do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"],
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        do_convert_annotations: bool,
+        image_mean: Optional[Union[float, List[float]]],
+        image_std: Optional[Union[float, List[float]]],
+        do_pad: bool,
+        pad_size: Optional[Dict[str, int]],
+        format: Optional[Union[str, AnnotationFormat]],
+        return_tensors: Optional[Union[str, TensorType]],
+    ) -> BatchFeature:
+        """
+        Preprocess an image or a batch of images so that it can be used by the model.
+
+        Each image goes through annotation preparation (when annotations are given), optional resizing and
+        fused rescaling/normalization; the batch is then optionally padded to a common size and stacked.
+
+        `do_center_crop` and `crop_size` are accepted but not referenced in this method body.
+
+        Returns:
+            `BatchFeature`: Holds `pixel_values` (the stacked images), `pixel_mask` (only when `do_pad` is set)
+            and `labels` (only when `annotations` is provided; one `BatchFeature` per image).
+
+        Raises:
+            ValueError: If the number of images and annotations differ, or if `masks_path` is given with the
+                COCO panoptic format but is neither a `str` nor a `pathlib.Path`.
+        """
+        # A single annotation dict is treated as a batch of one.
+        if annotations is not None and isinstance(annotations, dict):
+            annotations = [annotations]
+
+        if annotations is not None and len(images) != len(annotations):
+            raise ValueError(
+                f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match."
+            )
+
+        # Coerce to the enum so the format comparison below also works when a string was passed.
+        format = AnnotationFormat(format)
+        if annotations is not None:
+            validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
+
+        # The type of `masks_path` is only checked for the COCO panoptic format.
+        if (
+            masks_path is not None
+            and format == AnnotationFormat.COCO_PANOPTIC
+            and not isinstance(masks_path, (pathlib.Path, str))
+        ):
+            raise ValueError(
+                "The path to the directory containing the mask PNG files should be provided as a"
+                f" `pathlib.Path` or string object, but is {type(masks_path)} instead."
+            )
+
+        data = {}
+
+        processed_images = []
+        processed_annotations = []
+        pixel_masks = []  # Initialize pixel_masks here
+        # Without annotations every image is paired with None so that one loop serves both cases.
+        for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
+            # prepare (COCO annotations as a list of Dict -> YOLOS target as a single Dict per image)
+            if annotations is not None:
+                annotation = self.prepare_annotation(
+                    image,
+                    annotation,
+                    format,
+                    return_segmentation_masks=return_segmentation_masks,
+                    masks_path=masks_path,
+                    input_data_format=ChannelDimension.FIRST,
+                )
+
+            if do_resize:
+                resized_image = self.resize(image, size=size, interpolation=interpolation)
+                if annotations is not None:
+                    # The annotation is resized using the spatial sizes before and after the image resize.
+                    annotation = self.resize_annotation(
+                        annotation,
+                        orig_size=image.size()[-2:],
+                        target_size=resized_image.size()[-2:],
+                    )
+                image = resized_image
+            # Fused rescale and normalize
+            image = self.rescale_and_normalize(image, do_rescale, rescale_factor, do_normalize, image_mean, image_std)
+            # Annotation normalization is given the size of the processed (possibly resized) image.
+            if do_convert_annotations and annotations is not None:
+                annotation = self.normalize_annotation(annotation, get_image_size(image, ChannelDimension.FIRST))
+
+            processed_images.append(image)
+            processed_annotations.append(annotation)
+        images = processed_images
+        annotations = processed_annotations if annotations is not None else None
+
+        if do_pad:
+            # depends on all resized image shapes so we need another loop
+            # An explicit `pad_size` wins; otherwise the target comes from `get_max_height_width(images)`
+            # (presumably the largest height and width in the batch - confirm against the helper).
+            if pad_size is not None:
+                padded_size = (pad_size["height"], pad_size["width"])
+            else:
+                padded_size = get_max_height_width(images)
+
+            padded_images = []
+            padded_annotations = []
+            for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
+                # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
+                # Shortcut: an image already at the target size is kept as is and gets an all-ones mask.
+                if padded_size == image.size()[-2:]:
+                    padded_images.append(image)
+                    pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device))
+                    padded_annotations.append(annotation)
+                    continue
+                image, pixel_mask, annotation = self.pad(
+                    image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations
+                )
+                padded_images.append(image)
+                padded_annotations.append(annotation)
+                pixel_masks.append(pixel_mask)
+            images = padded_images
+            annotations = padded_annotations if annotations is not None else None
+            data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)})
+
+        # NOTE(review): `torch.stack` needs every image to have the same shape, so with `do_pad=False` the
+        # inputs must already share one size after resizing.
+        data.update({"pixel_values": torch.stack(images, dim=0)})
+        encoded_inputs = BatchFeature(data, tensor_type=return_tensors)
+        # Labels are attached after construction as a list with one `BatchFeature` per image.
+        if annotations is not None:
+            encoded_inputs["labels"] = [
+                BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
+            ]
+        return encoded_inputs
+
+    def post_process(self, outputs, target_sizes):
+        """
+        Converts the raw output of [`YolosForObjectDetection`] into final bounding boxes in (top_left_x,
+        top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+        Deprecated: a warning is logged on use, pointing to `post_process_object_detection`.
+
+        Args:
+            outputs ([`YolosObjectDetectionOutput`]):
+                Raw outputs of the model.
+            target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
+                Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the
+                original image size (before any data augmentation). For visualization, this should be the image size
+                after data augmentation, but before padding.
+        Returns:
+            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+            in the batch as predicted by the model.
+
+        Raises:
+            ValueError: If the number of target sizes differs from the batch dimension of the logits, or if
+                `target_sizes` does not have two columns.
+        """
+        logger.warning_once(
+            "`post_process` is deprecated and will be removed in v5 of Transformers, please use"
+            " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
+        )
+
+        out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+
+        if len(out_logits) != len(target_sizes):
+            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+        if target_sizes.shape[1] != 2:
+            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
+
+        prob = out_logits.sigmoid()
+        # All scores of an image are flattened into one row and the 100 highest are kept.
+        # NOTE(review): k is fixed at 100 here, whereas `post_process_object_detection` clamps k to the
+        # number of available scores.
+        topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
+        scores = topk_values
+        # Undo the flattening: floor-division by the last logits dimension gives the index into the box
+        # dimension, the remainder gives the label (assumes logits are (batch, num_queries, num_classes)).
+        topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
+        labels = topk_indexes % out_logits.shape[2]
+        boxes = center_to_corners_format(out_bbox)
+        # Pick, for every kept score, the four coordinates of the box it belongs to.
+        boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
+
+        # and from relative [0, 1] to absolute [0, height] coordinates
+        img_h, img_w = target_sizes.unbind(1)
+        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
+        boxes = boxes * scale_fct[:, None, :]
+
+        results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]
+
+        return results
+
+    def post_process_object_detection(
+        self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100
+    ):
+        """
+        Converts the raw output of [`YolosForObjectDetection`] into final bounding boxes in (top_left_x,
+        top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+        Args:
+            outputs ([`YolosObjectDetectionOutput`]):
+                Raw outputs of the model.
+            threshold (`float`, *optional*, defaults to 0.5):
+                Score threshold to keep object detection predictions. Only scores strictly greater than the
+                threshold are kept.
+            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
+                (height, width) of each image in the batch. If left to None, predictions will not be resized.
+            top_k (`int`, *optional*, defaults to 100):
+                Keep only top k bounding boxes before filtering by thresholding.
+
+        Returns:
+            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+            in the batch as predicted by the model.
+
+        Raises:
+            ValueError: If `target_sizes` is given and its length differs from the batch dimension of the logits.
+        """
+        out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+
+        if target_sizes is not None:
+            if len(out_logits) != len(target_sizes):
+                raise ValueError(
+                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+                )
+
+        prob = out_logits.sigmoid()
+        # Flatten all scores of an image into one row so they are ranked together.
+        prob = prob.view(out_logits.shape[0], -1)
+        # Never ask for more candidates than there are scores.
+        k_value = min(top_k, prob.size(1))
+        topk_values, topk_indexes = torch.topk(prob, k_value, dim=1)
+        scores = topk_values
+        # Undo the flattening: floor-division by the last logits dimension gives the index into the box
+        # dimension, the remainder gives the label (assumes logits are (batch, num_queries, num_classes)).
+        topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
+        labels = topk_indexes % out_logits.shape[2]
+        boxes = center_to_corners_format(out_bbox)
+        # Pick, for every kept score, the four coordinates of the box it belongs to.
+        boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
+
+        # and from relative [0, 1] to absolute [0, height] coordinates
+        if target_sizes is not None:
+            # A list input is split into separate height and width tensors; a tensor input is unbound
+            # along its second dimension.
+            if isinstance(target_sizes, List):
+                img_h = torch.Tensor([i[0] for i in target_sizes])
+                img_w = torch.Tensor([i[1] for i in target_sizes])
+            else:
+                img_h, img_w = target_sizes.unbind(1)
+            # The scale factors are moved to the device of the boxes before multiplying.
+            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
+            boxes = boxes * scale_fct[:, None, :]
+
+        results = []
+        for s, l, b in zip(scores, labels, boxes):
+            # The same score mask filters scores, labels and boxes, so the three outputs stay aligned.
+            score = s[s > threshold]
+            label = l[s > threshold]
+            box = b[s > threshold]
+            results.append({"scores": score, "labels": label, "boxes": box})
+
+        return results
+
+
+# Names exposed by a star-import of this module (the package `__init__` imports it with `*`).
+__all__ = ["YolosImageProcessorFast"]
--- a/src/transformers/models/yolos/modular_yolos.py
+++ b/src/transformers/models/yolos/modular_yolos.py
@@ -0,0 +1,193 @@
+from typing import List, Optional, Tuple, Union
+
+from transformers.models.detr.image_processing_detr_fast import DetrImageProcessorFast
+
+from ...image_transforms import center_to_corners_format
+from ...utils import (
+    TensorType,
+    is_torch_available,
+    logging,
+)
+
+
+if is_torch_available():
+    import torch
+
+
+logger = logging.get_logger(__name__)
+
+
+def get_size_with_aspect_ratio(
+    image_size: Tuple[int, int], size: int, max_size: Optional[int] = None, mod_size: int = 16
+) -> Tuple[int, int]:
+    """
+    Computes the output image size given the input image size and the desired output size, rounding each output
+    side down to a multiple of `mod_size`.
+
+    Args:
+        image_size (`Tuple[int, int]`):
+            The input image size as `(height, width)`.
+        size (`int`):
+            The desired length of the shorter output side.
+        max_size (`int`, *optional*):
+            The maximum allowed length of the longer output side. When scaling the shorter side to `size` would
+            make the longer side exceed it, `size` is reduced accordingly.
+        mod_size (`int`, *optional*, defaults to 16):
+            Each output side is rounded down to a multiple of this value. Pass `None` to skip the rounding.
+
+    Returns:
+        `Tuple[int, int]`: The output size as `(height, width)`.
+    """
+    height, width = image_size
+    raw_size = None
+    if max_size is not None:
+        min_original_size = float(min((height, width)))
+        max_original_size = float(max((height, width)))
+        if max_original_size / min_original_size * size > max_size:
+            # Keep the unrounded value: the longer side is derived from it below, so the rounding applied
+            # to `size` does not leak into the other side.
+            raw_size = max_size * min_original_size / max_original_size
+            size = int(round(raw_size))
+
+    # `raw_size` is only set when `max_size` forced a smaller short side; otherwise scale from `size`.
+    scale_from = raw_size if raw_size is not None else size
+    if width < height:
+        ow = size
+        oh = int(scale_from * height / width)
+    elif (height <= width and height == size) or (width <= height and width == size):
+        # The short side already has the requested length: keep the input size.
+        oh, ow = height, width
+    else:
+        oh = size
+        ow = int(scale_from * width / height)
+
+    if mod_size is not None:
+        # Plain integer arithmetic: both sides are Python ints here, so no tensor round-trip (and no
+        # dependency on the optional `torch` import) is needed.
+        ow -= ow % mod_size
+        oh -= oh % mod_size
+
+    return (oh, ow)
+
+
+class YolosImageProcessorFast(DetrImageProcessorFast):
+    # YOLOS builds on the DETR fast image processor and overrides the object-detection post-processing.
+    # The segmentation-related post-processing methods at the end of the class are stubs that raise
+    # `NotImplementedError`; they are deliberately kept as a bare one-line `def` plus `raise`.
+    def post_process(self, outputs, target_sizes):
+        """
+        Converts the raw output of [`YolosForObjectDetection`] into final bounding boxes in (top_left_x,
+        top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+        Deprecated: a warning is logged on use, pointing to `post_process_object_detection`.
+
+        Args:
+            outputs ([`YolosObjectDetectionOutput`]):
+                Raw outputs of the model.
+            target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
+                Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the
+                original image size (before any data augmentation). For visualization, this should be the image size
+                after data augmentation, but before padding.
+        Returns:
+            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+            in the batch as predicted by the model.
+
+        Raises:
+            ValueError: If the number of target sizes differs from the batch dimension of the logits, or if
+                `target_sizes` does not have two columns.
+        """
+        logger.warning_once(
+            "`post_process` is deprecated and will be removed in v5 of Transformers, please use"
+            " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
+        )
+
+        out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+
+        if len(out_logits) != len(target_sizes):
+            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+        if target_sizes.shape[1] != 2:
+            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
+
+        # Flatten all scores of an image into one row so they are ranked together.
+        prob = out_logits.sigmoid().view(out_logits.shape[0], -1)
+        # Keep at most 100 candidates; clamping k avoids a `torch.topk` failure when fewer scores exist.
+        topk_values, topk_indexes = torch.topk(prob, min(100, prob.size(1)), dim=1)
+        scores = topk_values
+        # Undo the flattening: floor-division by the last logits dimension gives the index into the box
+        # dimension, the remainder gives the label (assumes logits are (batch, num_queries, num_classes)).
+        topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
+        labels = topk_indexes % out_logits.shape[2]
+        boxes = center_to_corners_format(out_bbox)
+        boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
+
+        # and from relative [0, 1] to absolute [0, height] coordinates
+        img_h, img_w = target_sizes.unbind(1)
+        # Match the device of the boxes so CPU target sizes work with model outputs on another device.
+        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
+        boxes = boxes * scale_fct[:, None, :]
+
+        results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]
+
+        return results
+
+    def post_process_object_detection(
+        self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100
+    ):
+        """
+        Converts the raw output of [`YolosForObjectDetection`] into final bounding boxes in (top_left_x,
+        top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+        Args:
+            outputs ([`YolosObjectDetectionOutput`]):
+                Raw outputs of the model.
+            threshold (`float`, *optional*, defaults to 0.5):
+                Score threshold to keep object detection predictions. Only scores strictly greater than the
+                threshold are kept.
+            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
+                (height, width) of each image in the batch. If left to None, predictions will not be resized.
+            top_k (`int`, *optional*, defaults to 100):
+                Keep only top k bounding boxes before filtering by thresholding.
+
+        Returns:
+            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+            in the batch as predicted by the model.
+
+        Raises:
+            ValueError: If `target_sizes` is given and its length differs from the batch dimension of the logits.
+        """
+        out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+
+        if target_sizes is not None and len(out_logits) != len(target_sizes):
+            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+
+        # Flatten all scores of an image into one row so they are ranked together.
+        prob = out_logits.sigmoid().view(out_logits.shape[0], -1)
+        # Never ask for more candidates than there are scores.
+        k_value = min(top_k, prob.size(1))
+        topk_values, topk_indexes = torch.topk(prob, k_value, dim=1)
+        scores = topk_values
+        # Undo the flattening: floor-division by the last logits dimension gives the index into the box
+        # dimension, the remainder gives the label (assumes logits are (batch, num_queries, num_classes)).
+        topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
+        labels = topk_indexes % out_logits.shape[2]
+        boxes = center_to_corners_format(out_bbox)
+        boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
+
+        # and from relative [0, 1] to absolute [0, height] coordinates
+        if target_sizes is not None:
+            # Plain Python sequences (lists, and tuples as well) are converted to tensors first.
+            if isinstance(target_sizes, (list, tuple)):
+                img_h = torch.Tensor([i[0] for i in target_sizes])
+                img_w = torch.Tensor([i[1] for i in target_sizes])
+            else:
+                img_h, img_w = target_sizes.unbind(1)
+            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
+            boxes = boxes * scale_fct[:, None, :]
+
+        results = []
+        for s, l, b in zip(scores, labels, boxes):
+            # One mask, computed once, filters all three outputs so they stay aligned.
+            keep = s > threshold
+            results.append({"scores": s[keep], "labels": l[keep], "boxes": b[keep]})
+
+        return results
+
+    def post_process_segmentation(self, *args, **kwargs):
+        raise NotImplementedError("Segmentation post-processing is not implemented for YOLOS yet.")
+
+    def post_process_instance(self, *args, **kwargs):
+        raise NotImplementedError("Instance post-processing is not implemented for YOLOS yet.")
+
+    def post_process_panoptic(self, *args, **kwargs):
+        raise NotImplementedError("Panoptic post-processing is not implemented for YOLOS yet.")
+
+    def post_process_instance_segmentation(self, *args, **kwargs):
+        raise NotImplementedError("Segmentation post-processing is not implemented for YOLOS yet.")
+
+    def post_process_semantic_segmentation(self, *args, **kwargs):
+        raise NotImplementedError("Semantic segmentation post-processing is not implemented for YOLOS yet.")
+
+    def post_process_panoptic_segmentation(self, *args, **kwargs):
+        raise NotImplementedError("Panoptic segmentation post-processing is not implemented for YOLOS yet.")
+
+
+# Public names of this module, i.e. what a star-import of it exposes.
+__all__ = ["YolosImageProcessorFast"]
--- a/tests/models/yolos/test_image_processing_yolos.py
+++ b/tests/models/yolos/test_image_processing_yolos.py
@@ -21,7 +21,7 @@ import numpy as np
 from parameterized import parameterized

 from transformers.testing_utils import require_torch, require_vision, slow
-from transformers.utils import is_torch_available, is_vision_available
+from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available

 from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs

@@ -34,6 +34,9 @@ if is_vision_available():

    from transformers import YolosImageProcessor

+    if is_torchvision_available():
+        from transformers import YolosImageProcessorFast
+

 class YolosImageProcessingTester:
    def __init__(
@@ -143,6 +146,7 @@ class YolosImageProcessingTester:
@require_vision
 class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = YolosImageProcessor if is_vision_available() else None
+    fast_image_processing_class = YolosImageProcessorFast if is_torchvision_available() else None

    def setUp(self):
        super().setUp()
@@ -153,23 +157,25 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
        return self.image_processor_tester.prepare_image_processor_dict()

    def test_image_processor_properties(self):
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        self.assertTrue(hasattr(image_processing, "image_mean"))
-        self.assertTrue(hasattr(image_processing, "image_std"))
-        self.assertTrue(hasattr(image_processing, "do_normalize"))
-        self.assertTrue(hasattr(image_processing, "do_resize"))
-        self.assertTrue(hasattr(image_processing, "size"))
+        for image_processing_class in self.image_processor_list:
+            image_processing = image_processing_class(**self.image_processor_dict)
+            self.assertTrue(hasattr(image_processing, "image_mean"))
+            self.assertTrue(hasattr(image_processing, "image_std"))
+            self.assertTrue(hasattr(image_processing, "do_normalize"))
+            self.assertTrue(hasattr(image_processing, "do_resize"))
+            self.assertTrue(hasattr(image_processing, "size"))

    def test_image_processor_from_dict_with_kwargs(self):
-        image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
-        self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333})
-        self.assertEqual(image_processor.do_pad, True)
+        for image_processing_class in self.image_processor_list:
+            image_processor = image_processing_class.from_dict(self.image_processor_dict)
+            self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333})
+            self.assertEqual(image_processor.do_pad, True)

-        image_processor = self.image_processing_class.from_dict(
-            self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False
-        )
-        self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84})
-        self.assertEqual(image_processor.do_pad, False)
+            image_processor = image_processing_class.from_dict(
+                self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False
+            )
+            self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84})
+            self.assertEqual(image_processor.do_pad, False)

    def test_equivalence_padding(self):
        # Initialize image_processings
@@ -199,21 +205,22 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
        ]
    )
    def test_resize_max_size_respected(self, image_size, longest_edge, shortest_edge):
-        image_processor = self.image_processing_class(**self.image_processor_dict)
+        for image_processing_class in self.image_processor_list:
+            image_processor = image_processing_class(**self.image_processor_dict)

-        # create torch tensors as image
-        image = torch.randint(0, 256, image_size, dtype=torch.uint8)
-        processed_image = image_processor(
-            image,
-            size={"longest_edge": longest_edge, "shortest_edge": shortest_edge},
-            do_pad=False,
-            return_tensors="pt",
-        )["pixel_values"]
+            # create torch tensors as image
+            image = torch.randint(0, 256, image_size, dtype=torch.uint8)
+            processed_image = image_processor(
+                image,
+                size={"longest_edge": longest_edge, "shortest_edge": shortest_edge},
+                do_pad=False,
+                return_tensors="pt",
+            )["pixel_values"]

-        shape = list(processed_image.shape[-2:])
-        max_size, min_size = max(shape), min(shape)
-        self.assertTrue(max_size <= 1333, f"Expected max_size <= 1333, got image shape {shape}")
-        self.assertTrue(min_size <= 800, f"Expected min_size <= 800, got image shape {shape}")
+            shape = list(processed_image.shape[-2:])
+            max_size, min_size = max(shape), min(shape)
+            self.assertTrue(max_size <= 1333, f"Expected max_size <= 1333, got image shape {shape}")
+            self.assertTrue(min_size <= 800, f"Expected min_size <= 800, got image shape {shape}")

    @slow
    def test_call_pytorch_with_coco_detection_annotations(self):
@@ -224,40 +231,41 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix

        target = {"image_id": 39769, "annotations": target}

-        # encode them
-        image_processing = YolosImageProcessor.from_pretrained("hustvl/yolos-small")
-        encoding = image_processing(images=image, annotations=target, return_tensors="pt")
+        for image_processing_class in self.image_processor_list:
+            # encode them
+            image_processing = image_processing_class.from_pretrained("hustvl/yolos-small")
+            encoding = image_processing(images=image, annotations=target, return_tensors="pt")

-        # verify pixel values
-        expected_shape = torch.Size([1, 3, 800, 1056])
-        self.assertEqual(encoding["pixel_values"].shape, expected_shape)
+            # verify pixel values
+            expected_shape = torch.Size([1, 3, 800, 1056])
+            self.assertEqual(encoding["pixel_values"].shape, expected_shape)

-        expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
-        torch.testing.assert_close(encoding["pixel_values"][0, 0, 0, :3], expected_slice, rtol=1e-4, atol=1e-4)
+            expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
+            torch.testing.assert_close(encoding["pixel_values"][0, 0, 0, :3], expected_slice, rtol=1e-4, atol=1e-4)

-        # verify area
-        expected_area = torch.tensor([5832.7256, 11144.6689, 484763.2500, 829269.8125, 146579.4531, 164177.6250])
-        torch.testing.assert_close(encoding["labels"][0]["area"], expected_area)
-        # verify boxes
-        expected_boxes_shape = torch.Size([6, 4])
-        self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
-        expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215])
-        torch.testing.assert_close(encoding["labels"][0]["boxes"][0], expected_boxes_slice, rtol=1e-3, atol=1e-3)
-        # verify image_id
-        expected_image_id = torch.tensor([39769])
-        torch.testing.assert_close(encoding["labels"][0]["image_id"], expected_image_id)
-        # verify is_crowd
-        expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
-        torch.testing.assert_close(encoding["labels"][0]["iscrowd"], expected_is_crowd)
-        # verify class_labels
-        expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17])
-        torch.testing.assert_close(encoding["labels"][0]["class_labels"], expected_class_labels)
-        # verify orig_size
-        expected_orig_size = torch.tensor([480, 640])
-        torch.testing.assert_close(encoding["labels"][0]["orig_size"], expected_orig_size)
-        # verify size
-        expected_size = torch.tensor([800, 1056])
-        torch.testing.assert_close(encoding["labels"][0]["size"], expected_size)
+            # verify area
+            expected_area = torch.tensor([5832.7256, 11144.6689, 484763.2500, 829269.8125, 146579.4531, 164177.6250])
+            torch.testing.assert_close(encoding["labels"][0]["area"], expected_area)
+            # verify boxes
+            expected_boxes_shape = torch.Size([6, 4])
+            self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
+            expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215])
+            torch.testing.assert_close(encoding["labels"][0]["boxes"][0], expected_boxes_slice, rtol=1e-3, atol=1e-3)
+            # verify image_id
+            expected_image_id = torch.tensor([39769])
+            torch.testing.assert_close(encoding["labels"][0]["image_id"], expected_image_id)
+            # verify is_crowd
+            expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
+            torch.testing.assert_close(encoding["labels"][0]["iscrowd"], expected_is_crowd)
+            # verify class_labels
+            expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17])
+            torch.testing.assert_close(encoding["labels"][0]["class_labels"], expected_class_labels)
+            # verify orig_size
+            expected_orig_size = torch.tensor([480, 640])
+            torch.testing.assert_close(encoding["labels"][0]["orig_size"], expected_orig_size)
+            # verify size
+            expected_size = torch.tensor([800, 1056])
+            torch.testing.assert_close(encoding["labels"][0]["size"], expected_size)

    @slow
    def test_call_pytorch_with_coco_panoptic_annotations(self):
@@ -270,43 +278,45 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix

        masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")

-        # encode them
-        image_processing = YolosImageProcessor(format="coco_panoptic")
-        encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt")
+        for image_processing_class in self.image_processor_list:
+            # encode them
+            image_processing = image_processing_class(format="coco_panoptic")
+            encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt")

-        # verify pixel values
-        expected_shape = torch.Size([1, 3, 800, 1056])
-        self.assertEqual(encoding["pixel_values"].shape, expected_shape)
+            # verify pixel values
+            expected_shape = torch.Size([1, 3, 800, 1056])
+            self.assertEqual(encoding["pixel_values"].shape, expected_shape)

-        expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
-        torch.testing.assert_close(encoding["pixel_values"][0, 0, 0, :3], expected_slice, rtol=1e-4, atol=1e-4)
+            expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
+            torch.testing.assert_close(encoding["pixel_values"][0, 0, 0, :3], expected_slice, rtol=1e-4, atol=1e-4)

-        # verify area
-        expected_area = torch.tensor([146591.5000, 163974.2500, 480092.2500, 11187.0000, 5824.5000, 7562.5000])
-        torch.testing.assert_close(encoding["labels"][0]["area"], expected_area)
-        # verify boxes
-        expected_boxes_shape = torch.Size([6, 4])
-        self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
-        expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625])
-        torch.testing.assert_close(encoding["labels"][0]["boxes"][0], expected_boxes_slice, rtol=1e-3, atol=1e-3)
-        # verify image_id
-        expected_image_id = torch.tensor([39769])
-        torch.testing.assert_close(encoding["labels"][0]["image_id"], expected_image_id)
-        # verify is_crowd
-        expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
-        torch.testing.assert_close(encoding["labels"][0]["iscrowd"], expected_is_crowd)
-        # verify class_labels
-        expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93])
-        torch.testing.assert_close(encoding["labels"][0]["class_labels"], expected_class_labels)
-        # verify masks
-        expected_masks_sum = 815161
-        self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum)
-        # verify orig_size
-        expected_orig_size = torch.tensor([480, 640])
-        torch.testing.assert_close(encoding["labels"][0]["orig_size"], expected_orig_size)
-        # verify size
-        expected_size = torch.tensor([800, 1056])
-        torch.testing.assert_close(encoding["labels"][0]["size"], expected_size)
+            # verify area
+            expected_area = torch.tensor([146591.5000, 163974.2500, 480092.2500, 11187.0000, 5824.5000, 7562.5000])
+            torch.testing.assert_close(encoding["labels"][0]["area"], expected_area)
+            # verify boxes
+            expected_boxes_shape = torch.Size([6, 4])
+            self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
+            expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625])
+            torch.testing.assert_close(encoding["labels"][0]["boxes"][0], expected_boxes_slice, rtol=1e-3, atol=1e-3)
+            # verify image_id
+            expected_image_id = torch.tensor([39769])
+            torch.testing.assert_close(encoding["labels"][0]["image_id"], expected_image_id)
+            # verify is_crowd
+            expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
+            torch.testing.assert_close(encoding["labels"][0]["iscrowd"], expected_is_crowd)
+            # verify class_labels
+            expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93])
+            torch.testing.assert_close(encoding["labels"][0]["class_labels"], expected_class_labels)
+            # verify masks
+            expected_masks_sum = 815161
+            relative_error = torch.abs(encoding["labels"][0]["masks"].sum() - expected_masks_sum) / expected_masks_sum
+            self.assertTrue(relative_error < 1e-3)
+            # verify orig_size
+            expected_orig_size = torch.tensor([480, 640])
+            torch.testing.assert_close(encoding["labels"][0]["orig_size"], expected_orig_size)
+            # verify size
+            expected_size = torch.tensor([800, 1056])
+            torch.testing.assert_close(encoding["labels"][0]["size"], expected_size)

    # Output size is slight different from DETR as yolos takes mod of 16
    @slow
@@ -336,96 +346,97 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
        images = [image_0, image_1]
        annotations = [annotations_0, annotations_1]

-        image_processing = YolosImageProcessor()
-        encoding = image_processing(
-            images=images,
-            annotations=annotations,
-            return_segmentation_masks=True,
-            return_tensors="pt",  # do_convert_annotations=True
-        )
+        for image_processing_class in self.image_processor_list:
+            image_processing = image_processing_class()
+            encoding = image_processing(
+                images=images,
+                annotations=annotations,
+                return_segmentation_masks=True,
+                return_tensors="pt",  # do_convert_annotations=True
+            )

-        # Check the pixel values have been padded
-        postprocessed_height, postprocessed_width = 800, 1056
-        expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
-        self.assertEqual(encoding["pixel_values"].shape, expected_shape)
+            # Check the pixel values have been padded
+            postprocessed_height, postprocessed_width = 800, 1056
+            expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
+            self.assertEqual(encoding["pixel_values"].shape, expected_shape)

-        # Check the bounding boxes have been adjusted for padded images
-        self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
-        self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
-        expected_boxes_0 = torch.tensor(
-            [
-                [0.6879, 0.4609, 0.0755, 0.3691],
-                [0.2118, 0.3359, 0.2601, 0.1566],
-                [0.5011, 0.5000, 0.9979, 1.0000],
-                [0.5010, 0.5020, 0.9979, 0.9959],
-                [0.3284, 0.5944, 0.5884, 0.8112],
-                [0.8394, 0.5445, 0.3213, 0.9110],
-            ]
-        )
-        expected_boxes_1 = torch.tensor(
-            [
-                [0.4169, 0.2765, 0.0458, 0.2215],
-                [0.1284, 0.2016, 0.1576, 0.0940],
-                [0.3792, 0.4933, 0.7559, 0.9865],
-                [0.3794, 0.5002, 0.7563, 0.9955],
-                [0.1990, 0.5456, 0.3566, 0.8646],
-                [0.5845, 0.4115, 0.3462, 0.7161],
-            ]
-        )
-        torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3, atol=1e-3)
-        torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3, atol=1e-3)
+            # Check the bounding boxes have been adjusted for padded images
+            self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
+            self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
+            expected_boxes_0 = torch.tensor(
+                [
+                    [0.6879, 0.4609, 0.0755, 0.3691],
+                    [0.2118, 0.3359, 0.2601, 0.1566],
+                    [0.5011, 0.5000, 0.9979, 1.0000],
+                    [0.5010, 0.5020, 0.9979, 0.9959],
+                    [0.3284, 0.5944, 0.5884, 0.8112],
+                    [0.8394, 0.5445, 0.3213, 0.9110],
+                ]
+            )
+            expected_boxes_1 = torch.tensor(
+                [
+                    [0.4169, 0.2765, 0.0458, 0.2215],
+                    [0.1284, 0.2016, 0.1576, 0.0940],
+                    [0.3792, 0.4933, 0.7559, 0.9865],
+                    [0.3794, 0.5002, 0.7563, 0.9955],
+                    [0.1990, 0.5456, 0.3566, 0.8646],
+                    [0.5845, 0.4115, 0.3462, 0.7161],
+                ]
+            )
+            torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3, atol=1e-3)
+            torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3, atol=1e-3)

-        # Check the masks have also been padded
-        self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1056]))
-        self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1056]))
+            # Check the masks have also been padded
+            self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1056]))
+            self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1056]))

-        # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
-        # format and not in the range [0, 1]
-        encoding = image_processing(
-            images=images,
-            annotations=annotations,
-            return_segmentation_masks=True,
-            do_convert_annotations=False,
-            return_tensors="pt",
-        )
-        self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
-        self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
-        # Convert to absolute coordinates
-        unnormalized_boxes_0 = torch.vstack(
-            [
-                expected_boxes_0[:, 0] * postprocessed_width,
-                expected_boxes_0[:, 1] * postprocessed_height,
-                expected_boxes_0[:, 2] * postprocessed_width,
-                expected_boxes_0[:, 3] * postprocessed_height,
-            ]
-        ).T
-        unnormalized_boxes_1 = torch.vstack(
-            [
-                expected_boxes_1[:, 0] * postprocessed_width,
-                expected_boxes_1[:, 1] * postprocessed_height,
-                expected_boxes_1[:, 2] * postprocessed_width,
-                expected_boxes_1[:, 3] * postprocessed_height,
-            ]
-        ).T
-        # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
-        expected_boxes_0 = torch.vstack(
-            [
-                unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
-                unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
-                unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
-                unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
-            ]
-        ).T
-        expected_boxes_1 = torch.vstack(
-            [
-                unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
-                unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
-                unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
-                unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
-            ]
-        ).T
-        torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1, atol=1)
-        torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1, atol=1)
+            # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
+            # format and not in the range [0, 1]
+            encoding = image_processing(
+                images=images,
+                annotations=annotations,
+                return_segmentation_masks=True,
+                do_convert_annotations=False,
+                return_tensors="pt",
+            )
+            self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
+            self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
+            # Convert to absolute coordinates
+            unnormalized_boxes_0 = torch.vstack(
+                [
+                    expected_boxes_0[:, 0] * postprocessed_width,
+                    expected_boxes_0[:, 1] * postprocessed_height,
+                    expected_boxes_0[:, 2] * postprocessed_width,
+                    expected_boxes_0[:, 3] * postprocessed_height,
+                ]
+            ).T
+            unnormalized_boxes_1 = torch.vstack(
+                [
+                    expected_boxes_1[:, 0] * postprocessed_width,
+                    expected_boxes_1[:, 1] * postprocessed_height,
+                    expected_boxes_1[:, 2] * postprocessed_width,
+                    expected_boxes_1[:, 3] * postprocessed_height,
+                ]
+            ).T
+            # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
+            expected_boxes_0 = torch.vstack(
+                [
+                    unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
+                    unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
+                    unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
+                    unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
+                ]
+            ).T
+            expected_boxes_1 = torch.vstack(
+                [
+                    unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
+                    unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
+                    unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
+                    unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
+                ]
+            ).T
+            torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1, atol=1)
+            torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1, atol=1)

    # Output size is slight different from DETR as yolos takes mod of 16
    def test_batched_coco_panoptic_annotations(self):
@@ -457,98 +468,100 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
        annotations = [annotation_0, annotation_1]

        # encode them
-        image_processing = YolosImageProcessor(format="coco_panoptic")
-        encoding = image_processing(
-            images=images,
-            annotations=annotations,
-            masks_path=masks_path,
-            return_tensors="pt",
-            return_segmentation_masks=True,
-        )
+        for image_processing_class in self.image_processor_list:
+            # Instantiate the class under test so the fast processor is exercised too
+            image_processing = image_processing_class(format="coco_panoptic")
+            encoding = image_processing(
+                images=images,
+                annotations=annotations,
+                masks_path=masks_path,
+                return_tensors="pt",
+                return_segmentation_masks=True,
+            )

-        # Check the pixel values have been padded
-        postprocessed_height, postprocessed_width = 800, 1056
-        expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
-        self.assertEqual(encoding["pixel_values"].shape, expected_shape)
+            # Check the pixel values have been padded
+            postprocessed_height, postprocessed_width = 800, 1056
+            expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
+            self.assertEqual(encoding["pixel_values"].shape, expected_shape)

-        # Check the bounding boxes have been adjusted for padded images
-        self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
-        self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
-        expected_boxes_0 = torch.tensor(
-            [
-                [0.2625, 0.5437, 0.4688, 0.8625],
-                [0.7719, 0.4104, 0.4531, 0.7125],
-                [0.5000, 0.4927, 0.9969, 0.9854],
-                [0.1688, 0.2000, 0.2063, 0.0917],
-                [0.5492, 0.2760, 0.0578, 0.2187],
-                [0.4992, 0.4990, 0.9984, 0.9979],
-            ]
-        )
-        expected_boxes_1 = torch.tensor(
-            [
-                [0.1591, 0.3262, 0.2841, 0.5175],
-                [0.4678, 0.2463, 0.2746, 0.4275],
-                [0.3030, 0.2956, 0.6042, 0.5913],
-                [0.1023, 0.1200, 0.1250, 0.0550],
-                [0.3329, 0.1656, 0.0350, 0.1312],
-                [0.3026, 0.2994, 0.6051, 0.5987],
-            ]
-        )
-        torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3, atol=1e-3)
-        torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3, atol=1e-3)
+            # Check the bounding boxes have been adjusted for padded images
+            self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
+            self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
+            expected_boxes_0 = torch.tensor(
+                [
+                    [0.2625, 0.5437, 0.4688, 0.8625],
+                    [0.7719, 0.4104, 0.4531, 0.7125],
+                    [0.5000, 0.4927, 0.9969, 0.9854],
+                    [0.1688, 0.2000, 0.2063, 0.0917],
+                    [0.5492, 0.2760, 0.0578, 0.2187],
+                    [0.4992, 0.4990, 0.9984, 0.9979],
+                ]
+            )
+            expected_boxes_1 = torch.tensor(
+                [
+                    [0.1591, 0.3262, 0.2841, 0.5175],
+                    [0.4678, 0.2463, 0.2746, 0.4275],
+                    [0.3030, 0.2956, 0.6042, 0.5913],
+                    [0.1023, 0.1200, 0.1250, 0.0550],
+                    [0.3329, 0.1656, 0.0350, 0.1312],
+                    [0.3026, 0.2994, 0.6051, 0.5987],
+                ]
+            )
+            torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3, atol=1e-3)
+            torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3, atol=1e-3)

-        # Check the masks have also been padded
-        self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1056]))
-        self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1056]))
+            # Check the masks have also been padded
+            self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1056]))
+            self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1056]))

-        # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
-        # format and not in the range [0, 1]
-        encoding = image_processing(
-            images=images,
-            annotations=annotations,
-            masks_path=masks_path,
-            return_segmentation_masks=True,
-            do_convert_annotations=False,
-            return_tensors="pt",
-        )
-        self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
-        self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
-        # Convert to absolute coordinates
-        unnormalized_boxes_0 = torch.vstack(
-            [
-                expected_boxes_0[:, 0] * postprocessed_width,
-                expected_boxes_0[:, 1] * postprocessed_height,
-                expected_boxes_0[:, 2] * postprocessed_width,
-                expected_boxes_0[:, 3] * postprocessed_height,
-            ]
-        ).T
-        unnormalized_boxes_1 = torch.vstack(
-            [
-                expected_boxes_1[:, 0] * postprocessed_width,
-                expected_boxes_1[:, 1] * postprocessed_height,
-                expected_boxes_1[:, 2] * postprocessed_width,
-                expected_boxes_1[:, 3] * postprocessed_height,
-            ]
-        ).T
-        # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
-        expected_boxes_0 = torch.vstack(
-            [
-                unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
-                unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
-                unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
-                unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
-            ]
-        ).T
-        expected_boxes_1 = torch.vstack(
-            [
-                unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
-                unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
-                unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
-                unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
-            ]
-        ).T
-        torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1, rtol=1)
-        torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1, rtol=1)
+            # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
+            # format and not in the range [0, 1]
+            encoding = image_processing(
+                images=images,
+                annotations=annotations,
+                masks_path=masks_path,
+                return_segmentation_masks=True,
+                do_convert_annotations=False,
+                return_tensors="pt",
+            )
+            self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
+            self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
+            # Convert to absolute coordinates
+            unnormalized_boxes_0 = torch.vstack(
+                [
+                    expected_boxes_0[:, 0] * postprocessed_width,
+                    expected_boxes_0[:, 1] * postprocessed_height,
+                    expected_boxes_0[:, 2] * postprocessed_width,
+                    expected_boxes_0[:, 3] * postprocessed_height,
+                ]
+            ).T
+            unnormalized_boxes_1 = torch.vstack(
+                [
+                    expected_boxes_1[:, 0] * postprocessed_width,
+                    expected_boxes_1[:, 1] * postprocessed_height,
+                    expected_boxes_1[:, 2] * postprocessed_width,
+                    expected_boxes_1[:, 3] * postprocessed_height,
+                ]
+            ).T
+            # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
+            expected_boxes_0 = torch.vstack(
+                [
+                    unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
+                    unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
+                    unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
+                    unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
+                ]
+            ).T
+            expected_boxes_1 = torch.vstack(
+                [
+                    unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
+                    unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
+                    unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
+                    unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
+                ]
+            ).T
+            torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1, rtol=1)
+            torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1, rtol=1)

    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->Yolos
    def test_max_width_max_height_resizing_and_pad_strategy(self):