Add Fast Yolos Processor (#37292)
* Add Fast Yolos Processor * Update modular file * Fix copies --------- Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
This commit is contained in:
@@ -92,6 +92,11 @@ Use [`YolosImageProcessor`] for preparing images (and optional targets) for the
|
|||||||
|
|
||||||
[[autodoc]] YolosImageProcessor
|
[[autodoc]] YolosImageProcessor
|
||||||
- preprocess
|
- preprocess
|
||||||
|
|
||||||
|
## YolosImageProcessorFast
|
||||||
|
|
||||||
|
[[autodoc]] YolosImageProcessorFast
|
||||||
|
- preprocess
|
||||||
- pad
|
- pad
|
||||||
- post_process_object_detection
|
- post_process_object_detection
|
||||||
|
|
||||||
|
|||||||
@@ -167,7 +167,7 @@ else:
|
|||||||
("vit_msn", ("ViTImageProcessor", "ViTImageProcessorFast")),
|
("vit_msn", ("ViTImageProcessor", "ViTImageProcessorFast")),
|
||||||
("vitmatte", ("VitMatteImageProcessor",)),
|
("vitmatte", ("VitMatteImageProcessor",)),
|
||||||
("xclip", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
|
("xclip", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
|
||||||
("yolos", ("YolosImageProcessor",)),
|
("yolos", ("YolosImageProcessor", "YolosImageProcessorFast")),
|
||||||
("zoedepth", ("ZoeDepthImageProcessor",)),
|
("zoedepth", ("ZoeDepthImageProcessor",)),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ if TYPE_CHECKING:
|
|||||||
from .configuration_yolos import *
|
from .configuration_yolos import *
|
||||||
from .feature_extraction_yolos import *
|
from .feature_extraction_yolos import *
|
||||||
from .image_processing_yolos import *
|
from .image_processing_yolos import *
|
||||||
|
from .image_processing_yolos_fast import *
|
||||||
from .modeling_yolos import *
|
from .modeling_yolos import *
|
||||||
else:
|
else:
|
||||||
import sys
|
import sys
|
||||||
|
|||||||
893
src/transformers/models/yolos/image_processing_yolos_fast.py
Normal file
893
src/transformers/models/yolos/image_processing_yolos_fast.py
Normal file
@@ -0,0 +1,893 @@
|
|||||||
|
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||||
|
# This file was automatically generated from src/transformers/models/yolos/modular_yolos.py.
|
||||||
|
# Do NOT edit this file manually as any edits will be overwritten by the generation of
|
||||||
|
# the file from the modular. If any change should be done, please apply the change to the
|
||||||
|
# modular_yolos.py file directly. One of our CI enforces this.
|
||||||
|
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||||
|
import pathlib
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
from ...image_processing_utils import BatchFeature, get_size_dict
|
||||||
|
from ...image_processing_utils_fast import (
|
||||||
|
BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
|
||||||
|
BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
|
||||||
|
BaseImageProcessorFast,
|
||||||
|
DefaultFastImageProcessorKwargs,
|
||||||
|
SizeDict,
|
||||||
|
get_image_size_for_max_height_width,
|
||||||
|
get_max_height_width,
|
||||||
|
safe_squeeze,
|
||||||
|
)
|
||||||
|
from ...image_transforms import center_to_corners_format, corners_to_center_format
|
||||||
|
from ...image_utils import (
|
||||||
|
IMAGENET_DEFAULT_MEAN,
|
||||||
|
IMAGENET_DEFAULT_STD,
|
||||||
|
AnnotationFormat,
|
||||||
|
AnnotationType,
|
||||||
|
ChannelDimension,
|
||||||
|
ImageInput,
|
||||||
|
PILImageResampling,
|
||||||
|
get_image_size,
|
||||||
|
validate_annotations,
|
||||||
|
)
|
||||||
|
from ...processing_utils import Unpack
|
||||||
|
from ...utils import (
|
||||||
|
TensorType,
|
||||||
|
add_start_docstrings,
|
||||||
|
is_torch_available,
|
||||||
|
is_torchvision_available,
|
||||||
|
is_torchvision_v2_available,
|
||||||
|
logging,
|
||||||
|
)
|
||||||
|
from ...utils.import_utils import requires
|
||||||
|
|
||||||
|
|
||||||
|
if is_torch_available():
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
if is_torchvision_v2_available():
|
||||||
|
from torchvision.io import read_image
|
||||||
|
from torchvision.transforms.v2 import functional as F
|
||||||
|
elif is_torchvision_available():
|
||||||
|
from torchvision.io import read_image
|
||||||
|
from torchvision.transforms import functional as F
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class YolosFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||||
|
format: Optional[Union[str, AnnotationFormat]]
|
||||||
|
do_convert_annotations: Optional[bool]
|
||||||
|
do_pad: Optional[bool]
|
||||||
|
pad_size: Optional[Dict[str, int]]
|
||||||
|
return_segmentation_masks: Optional[bool]
|
||||||
|
|
||||||
|
|
||||||
|
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
|
||||||
|
|
||||||
|
|
||||||
|
# inspired by https://github.com/facebookresearch/yolos/blob/master/datasets/coco.py#L33
|
||||||
|
def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: torch.device) -> torch.Tensor:
|
||||||
|
"""
|
||||||
|
Convert a COCO polygon annotation to a mask.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
segmentations (`List[List[float]]`):
|
||||||
|
List of polygons, each polygon represented by a list of x-y coordinates.
|
||||||
|
height (`int`):
|
||||||
|
Height of the mask.
|
||||||
|
width (`int`):
|
||||||
|
Width of the mask.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
from pycocotools import mask as coco_mask
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError("Pycocotools is not installed in your environment.")
|
||||||
|
|
||||||
|
masks = []
|
||||||
|
for polygons in segmentations:
|
||||||
|
rles = coco_mask.frPyObjects(polygons, height, width)
|
||||||
|
mask = coco_mask.decode(rles)
|
||||||
|
if len(mask.shape) < 3:
|
||||||
|
mask = mask[..., None]
|
||||||
|
mask = torch.as_tensor(mask, dtype=torch.uint8, device=device)
|
||||||
|
mask = torch.any(mask, axis=2)
|
||||||
|
masks.append(mask)
|
||||||
|
if masks:
|
||||||
|
masks = torch.stack(masks, axis=0)
|
||||||
|
else:
|
||||||
|
masks = torch.zeros((0, height, width), dtype=torch.uint8, device=device)
|
||||||
|
|
||||||
|
return masks
|
||||||
|
|
||||||
|
|
||||||
|
# inspired by https://github.com/facebookresearch/yolos/blob/master/datasets/coco.py#L50
|
||||||
|
def prepare_coco_detection_annotation(
|
||||||
|
image,
|
||||||
|
target,
|
||||||
|
return_segmentation_masks: bool = False,
|
||||||
|
input_data_format: Optional[Union[ChannelDimension, str]] = None,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Convert the target in COCO format into the format expected by YOLOS.
|
||||||
|
"""
|
||||||
|
image_height, image_width = image.size()[-2:]
|
||||||
|
|
||||||
|
image_id = target["image_id"]
|
||||||
|
image_id = torch.as_tensor([image_id], dtype=torch.int64, device=image.device)
|
||||||
|
|
||||||
|
# Get all COCO annotations for the given image.
|
||||||
|
annotations = target["annotations"]
|
||||||
|
classes = []
|
||||||
|
area = []
|
||||||
|
boxes = []
|
||||||
|
keypoints = []
|
||||||
|
for obj in annotations:
|
||||||
|
if "iscrowd" not in obj or obj["iscrowd"] == 0:
|
||||||
|
classes.append(obj["category_id"])
|
||||||
|
area.append(obj["area"])
|
||||||
|
boxes.append(obj["bbox"])
|
||||||
|
if "keypoints" in obj:
|
||||||
|
keypoints.append(obj["keypoints"])
|
||||||
|
|
||||||
|
classes = torch.as_tensor(classes, dtype=torch.int64, device=image.device)
|
||||||
|
area = torch.as_tensor(area, dtype=torch.float32, device=image.device)
|
||||||
|
iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=image.device)
|
||||||
|
# guard against no boxes via resizing
|
||||||
|
boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device).reshape(-1, 4)
|
||||||
|
boxes[:, 2:] += boxes[:, :2]
|
||||||
|
boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
|
||||||
|
boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
|
||||||
|
|
||||||
|
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
|
||||||
|
|
||||||
|
new_target = {
|
||||||
|
"image_id": image_id,
|
||||||
|
"class_labels": classes[keep],
|
||||||
|
"boxes": boxes[keep],
|
||||||
|
"area": area[keep],
|
||||||
|
"iscrowd": iscrowd[keep],
|
||||||
|
"orig_size": torch.as_tensor([int(image_height), int(image_width)], dtype=torch.int64, device=image.device),
|
||||||
|
}
|
||||||
|
|
||||||
|
if keypoints:
|
||||||
|
keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=image.device)
|
||||||
|
# Apply the keep mask here to filter the relevant annotations
|
||||||
|
keypoints = keypoints[keep]
|
||||||
|
num_keypoints = keypoints.shape[0]
|
||||||
|
keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
|
||||||
|
new_target["keypoints"] = keypoints
|
||||||
|
|
||||||
|
if return_segmentation_masks:
|
||||||
|
segmentation_masks = [obj["segmentation"] for obj in annotations]
|
||||||
|
masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width, device=image.device)
|
||||||
|
new_target["masks"] = masks[keep]
|
||||||
|
|
||||||
|
return new_target
|
||||||
|
|
||||||
|
|
||||||
|
def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
|
||||||
|
"""
|
||||||
|
Compute the bounding boxes around the provided panoptic segmentation masks.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
masks: masks in format `[number_masks, height, width]` where N is the number of masks
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
boxes: bounding boxes in format `[number_masks, 4]` in xyxy format
|
||||||
|
"""
|
||||||
|
if masks.numel() == 0:
|
||||||
|
return torch.zeros((0, 4), device=masks.device)
|
||||||
|
|
||||||
|
h, w = masks.shape[-2:]
|
||||||
|
y = torch.arange(0, h, dtype=torch.float32, device=masks.device)
|
||||||
|
x = torch.arange(0, w, dtype=torch.float32, device=masks.device)
|
||||||
|
# see https://github.com/pytorch/pytorch/issues/50276
|
||||||
|
y, x = torch.meshgrid(y, x, indexing="ij")
|
||||||
|
|
||||||
|
x_mask = masks * torch.unsqueeze(x, 0)
|
||||||
|
x_max = x_mask.view(x_mask.shape[0], -1).max(-1)[0]
|
||||||
|
x_min = (
|
||||||
|
torch.where(masks, x.unsqueeze(0), torch.tensor(1e8, device=masks.device)).view(masks.shape[0], -1).min(-1)[0]
|
||||||
|
)
|
||||||
|
|
||||||
|
y_mask = masks * torch.unsqueeze(y, 0)
|
||||||
|
y_max = y_mask.view(y_mask.shape[0], -1).max(-1)[0]
|
||||||
|
y_min = (
|
||||||
|
torch.where(masks, y.unsqueeze(0), torch.tensor(1e8, device=masks.device)).view(masks.shape[0], -1).min(-1)[0]
|
||||||
|
)
|
||||||
|
|
||||||
|
return torch.stack([x_min, y_min, x_max, y_max], 1)
|
||||||
|
|
||||||
|
|
||||||
|
# 2 functions below adapted from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py
|
||||||
|
# Copyright (c) 2018, Alexander Kirillov
|
||||||
|
# All rights reserved.
|
||||||
|
def rgb_to_id(color):
|
||||||
|
"""
|
||||||
|
Converts RGB color to unique ID.
|
||||||
|
"""
|
||||||
|
if isinstance(color, torch.Tensor) and len(color.shape) == 3:
|
||||||
|
if color.dtype == torch.uint8:
|
||||||
|
color = color.to(torch.int32)
|
||||||
|
return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2]
|
||||||
|
return int(color[0] + 256 * color[1] + 256 * 256 * color[2])
|
||||||
|
|
||||||
|
|
||||||
|
def prepare_coco_panoptic_annotation(
|
||||||
|
image: torch.Tensor,
|
||||||
|
target: Dict,
|
||||||
|
masks_path: Union[str, pathlib.Path],
|
||||||
|
return_masks: bool = True,
|
||||||
|
input_data_format: Union[ChannelDimension, str] = None,
|
||||||
|
) -> Dict:
|
||||||
|
"""
|
||||||
|
Prepare a coco panoptic annotation for YOLOS.
|
||||||
|
"""
|
||||||
|
image_height, image_width = get_image_size(image, channel_dim=input_data_format)
|
||||||
|
annotation_path = pathlib.Path(masks_path) / target["file_name"]
|
||||||
|
|
||||||
|
new_target = {}
|
||||||
|
new_target["image_id"] = torch.as_tensor(
|
||||||
|
[target["image_id"] if "image_id" in target else target["id"]], dtype=torch.int64, device=image.device
|
||||||
|
)
|
||||||
|
new_target["size"] = torch.as_tensor([image_height, image_width], dtype=torch.int64, device=image.device)
|
||||||
|
new_target["orig_size"] = torch.as_tensor([image_height, image_width], dtype=torch.int64, device=image.device)
|
||||||
|
|
||||||
|
if "segments_info" in target:
|
||||||
|
masks = read_image(annotation_path).permute(1, 2, 0).to(dtype=torch.int32, device=image.device)
|
||||||
|
masks = rgb_to_id(masks)
|
||||||
|
|
||||||
|
ids = torch.as_tensor([segment_info["id"] for segment_info in target["segments_info"]], device=image.device)
|
||||||
|
masks = masks == ids[:, None, None]
|
||||||
|
masks = masks.to(torch.bool)
|
||||||
|
if return_masks:
|
||||||
|
new_target["masks"] = masks
|
||||||
|
new_target["boxes"] = masks_to_boxes(masks)
|
||||||
|
new_target["class_labels"] = torch.as_tensor(
|
||||||
|
[segment_info["category_id"] for segment_info in target["segments_info"]],
|
||||||
|
dtype=torch.int64,
|
||||||
|
device=image.device,
|
||||||
|
)
|
||||||
|
new_target["iscrowd"] = torch.as_tensor(
|
||||||
|
[segment_info["iscrowd"] for segment_info in target["segments_info"]],
|
||||||
|
dtype=torch.int64,
|
||||||
|
device=image.device,
|
||||||
|
)
|
||||||
|
new_target["area"] = torch.as_tensor(
|
||||||
|
[segment_info["area"] for segment_info in target["segments_info"]],
|
||||||
|
dtype=torch.float32,
|
||||||
|
device=image.device,
|
||||||
|
)
|
||||||
|
|
||||||
|
return new_target
|
||||||
|
|
||||||
|
|
||||||
|
def get_size_with_aspect_ratio(
|
||||||
|
image_size: Tuple[int, int], size: int, max_size: Optional[int] = None, mod_size: int = 16
|
||||||
|
) -> Tuple[int, int]:
|
||||||
|
"""
|
||||||
|
Computes the output image size given the input image size and the desired output size with multiple of divisible_size.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
image_size (`Tuple[int, int]`):
|
||||||
|
The input image size.
|
||||||
|
size (`int`):
|
||||||
|
The desired output size.
|
||||||
|
max_size (`int`, *optional*):
|
||||||
|
The maximum allowed output size.
|
||||||
|
mod_size (`int`, *optional*):
|
||||||
|
The size to make multiple of mod_size.
|
||||||
|
"""
|
||||||
|
height, width = image_size
|
||||||
|
raw_size = None
|
||||||
|
if max_size is not None:
|
||||||
|
min_original_size = float(min((height, width)))
|
||||||
|
max_original_size = float(max((height, width)))
|
||||||
|
if max_original_size / min_original_size * size > max_size:
|
||||||
|
raw_size = max_size * min_original_size / max_original_size
|
||||||
|
size = int(round(raw_size))
|
||||||
|
|
||||||
|
if width < height:
|
||||||
|
ow = size
|
||||||
|
if max_size is not None and raw_size is not None:
|
||||||
|
oh = int(raw_size * height / width)
|
||||||
|
else:
|
||||||
|
oh = int(size * height / width)
|
||||||
|
elif (height <= width and height == size) or (width <= height and width == size):
|
||||||
|
oh, ow = height, width
|
||||||
|
else:
|
||||||
|
oh = size
|
||||||
|
if max_size is not None and raw_size is not None:
|
||||||
|
ow = int(raw_size * width / height)
|
||||||
|
else:
|
||||||
|
ow = int(size * width / height)
|
||||||
|
|
||||||
|
if mod_size is not None:
|
||||||
|
ow_mod = torch.remainder(torch.tensor(ow), mod_size).item()
|
||||||
|
oh_mod = torch.remainder(torch.tensor(oh), mod_size).item()
|
||||||
|
ow = ow - ow_mod
|
||||||
|
oh = oh - oh_mod
|
||||||
|
|
||||||
|
return (oh, ow)
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings(
|
||||||
|
"Constructs a fast Yolos image processor.",
|
||||||
|
BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
|
||||||
|
"""
|
||||||
|
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
||||||
|
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
||||||
|
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||||
|
Controls whether to convert the annotations to the format expected by the YOLOS model. Converts the
|
||||||
|
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||||
|
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||||
|
do_pad (`bool`, *optional*, defaults to `True`):
|
||||||
|
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
|
||||||
|
method. If `True`, padding will be applied to the bottom and right of the image with zeros.
|
||||||
|
If `pad_size` is provided, the image will be padded to the specified dimensions.
|
||||||
|
Otherwise, the image will be padded to the maximum height and width of the batch.
|
||||||
|
pad_size (`Dict[str, int]`, *optional*):
|
||||||
|
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
|
||||||
|
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
|
||||||
|
height and width in the batch.
|
||||||
|
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
|
||||||
|
Whether to return segmentation masks.
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
@requires(backends=("torchvision", "torch"))
|
||||||
|
class YolosImageProcessorFast(BaseImageProcessorFast):
|
||||||
|
resample = PILImageResampling.BILINEAR
|
||||||
|
image_mean = IMAGENET_DEFAULT_MEAN
|
||||||
|
image_std = IMAGENET_DEFAULT_STD
|
||||||
|
format = AnnotationFormat.COCO_DETECTION
|
||||||
|
do_resize = True
|
||||||
|
do_rescale = True
|
||||||
|
do_normalize = True
|
||||||
|
do_pad = True
|
||||||
|
size = {"shortest_edge": 800, "longest_edge": 1333}
|
||||||
|
default_to_square = False
|
||||||
|
model_input_names = ["pixel_values", "pixel_mask"]
|
||||||
|
valid_kwargs = YolosFastImageProcessorKwargs
|
||||||
|
|
||||||
|
def __init__(self, **kwargs: Unpack[YolosFastImageProcessorKwargs]) -> None:
|
||||||
|
if "pad_and_return_pixel_mask" in kwargs:
|
||||||
|
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
|
||||||
|
|
||||||
|
size = kwargs.pop("size", None)
|
||||||
|
if "max_size" in kwargs:
|
||||||
|
logger.warning_once(
|
||||||
|
"The `max_size` parameter is deprecated and will be removed in v4.26. "
|
||||||
|
"Please specify in `size['longest_edge'] instead`.",
|
||||||
|
)
|
||||||
|
max_size = kwargs.pop("max_size")
|
||||||
|
else:
|
||||||
|
max_size = None if size is None else 1333
|
||||||
|
|
||||||
|
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
|
||||||
|
self.size = get_size_dict(size, max_size=max_size, default_to_square=False)
|
||||||
|
|
||||||
|
# Backwards compatibility
|
||||||
|
do_convert_annotations = kwargs.get("do_convert_annotations", None)
|
||||||
|
do_normalize = kwargs.get("do_normalize", None)
|
||||||
|
if do_convert_annotations is None and getattr(self, "do_convert_annotations", None) is None:
|
||||||
|
self.do_convert_annotations = do_normalize if do_normalize is not None else self.do_normalize
|
||||||
|
|
||||||
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
|
||||||
|
"""
|
||||||
|
Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is
|
||||||
|
created using from_dict and kwargs e.g. `YolosImageProcessorFast.from_pretrained(checkpoint, size=600,
|
||||||
|
max_size=800)`
|
||||||
|
"""
|
||||||
|
image_processor_dict = image_processor_dict.copy()
|
||||||
|
if "max_size" in kwargs:
|
||||||
|
image_processor_dict["max_size"] = kwargs.pop("max_size")
|
||||||
|
if "pad_and_return_pixel_mask" in kwargs:
|
||||||
|
image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask")
|
||||||
|
return super().from_dict(image_processor_dict, **kwargs)
|
||||||
|
|
||||||
|
def prepare_annotation(
|
||||||
|
self,
|
||||||
|
image: torch.Tensor,
|
||||||
|
target: Dict,
|
||||||
|
format: Optional[AnnotationFormat] = None,
|
||||||
|
return_segmentation_masks: Optional[bool] = None,
|
||||||
|
masks_path: Optional[Union[str, pathlib.Path]] = None,
|
||||||
|
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||||
|
) -> Dict:
|
||||||
|
"""
|
||||||
|
Prepare an annotation for feeding into YOLOS model.
|
||||||
|
"""
|
||||||
|
format = format if format is not None else self.format
|
||||||
|
|
||||||
|
if format == AnnotationFormat.COCO_DETECTION:
|
||||||
|
return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
|
||||||
|
target = prepare_coco_detection_annotation(
|
||||||
|
image, target, return_segmentation_masks, input_data_format=input_data_format
|
||||||
|
)
|
||||||
|
elif format == AnnotationFormat.COCO_PANOPTIC:
|
||||||
|
return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
|
||||||
|
target = prepare_coco_panoptic_annotation(
|
||||||
|
image,
|
||||||
|
target,
|
||||||
|
masks_path=masks_path,
|
||||||
|
return_masks=return_segmentation_masks,
|
||||||
|
input_data_format=input_data_format,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Format {format} is not supported.")
|
||||||
|
return target
|
||||||
|
|
||||||
|
def resize(
|
||||||
|
self,
|
||||||
|
image: torch.Tensor,
|
||||||
|
size: SizeDict,
|
||||||
|
interpolation: "F.InterpolationMode" = None,
|
||||||
|
**kwargs,
|
||||||
|
) -> torch.Tensor:
|
||||||
|
"""
|
||||||
|
Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
|
||||||
|
int, smaller edge of the image will be matched to this number.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
image (`torch.Tensor`):
|
||||||
|
Image to resize.
|
||||||
|
size (`SizeDict`):
|
||||||
|
Size of the image's `(height, width)` dimensions after resizing. Available options are:
|
||||||
|
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
|
||||||
|
Do NOT keep the aspect ratio.
|
||||||
|
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
|
||||||
|
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
|
||||||
|
less or equal to `longest_edge`.
|
||||||
|
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
|
||||||
|
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
|
||||||
|
`max_width`.
|
||||||
|
interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
|
||||||
|
Resampling filter to use if resizing the image.
|
||||||
|
"""
|
||||||
|
interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR
|
||||||
|
if size.shortest_edge and size.longest_edge:
|
||||||
|
# Resize the image so that the shortest edge or the longest edge is of the given size
|
||||||
|
# while maintaining the aspect ratio of the original image.
|
||||||
|
new_size = get_size_with_aspect_ratio(
|
||||||
|
image.size()[-2:],
|
||||||
|
size["shortest_edge"],
|
||||||
|
size["longest_edge"],
|
||||||
|
)
|
||||||
|
elif size.max_height and size.max_width:
|
||||||
|
new_size = get_image_size_for_max_height_width(image.size()[-2:], size["max_height"], size["max_width"])
|
||||||
|
elif size.height and size.width:
|
||||||
|
new_size = (size["height"], size["width"])
|
||||||
|
else:
|
||||||
|
raise ValueError(
|
||||||
|
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
|
||||||
|
f" {size.keys()}."
|
||||||
|
)
|
||||||
|
|
||||||
|
image = F.resize(
|
||||||
|
image,
|
||||||
|
size=new_size,
|
||||||
|
interpolation=interpolation,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
return image
|
||||||
|
|
||||||
|
def resize_annotation(
|
||||||
|
self,
|
||||||
|
annotation: Dict[str, Any],
|
||||||
|
orig_size: Tuple[int, int],
|
||||||
|
target_size: Tuple[int, int],
|
||||||
|
threshold: float = 0.5,
|
||||||
|
interpolation: "F.InterpolationMode" = None,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Resizes an annotation to a target size.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
annotation (`Dict[str, Any]`):
|
||||||
|
The annotation dictionary.
|
||||||
|
orig_size (`Tuple[int, int]`):
|
||||||
|
The original size of the input image.
|
||||||
|
target_size (`Tuple[int, int]`):
|
||||||
|
The target size of the image, as returned by the preprocessing `resize` step.
|
||||||
|
threshold (`float`, *optional*, defaults to 0.5):
|
||||||
|
The threshold used to binarize the segmentation masks.
|
||||||
|
resample (`InterpolationMode`, defaults to `InterpolationMode.NEAREST`):
|
||||||
|
The resampling filter to use when resizing the masks.
|
||||||
|
"""
|
||||||
|
interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST
|
||||||
|
ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)]
|
||||||
|
|
||||||
|
new_annotation = {}
|
||||||
|
new_annotation["size"] = target_size
|
||||||
|
|
||||||
|
for key, value in annotation.items():
|
||||||
|
if key == "boxes":
|
||||||
|
boxes = value
|
||||||
|
scaled_boxes = boxes * torch.as_tensor(
|
||||||
|
[ratio_width, ratio_height, ratio_width, ratio_height], dtype=torch.float32, device=boxes.device
|
||||||
|
)
|
||||||
|
new_annotation["boxes"] = scaled_boxes
|
||||||
|
elif key == "area":
|
||||||
|
area = value
|
||||||
|
scaled_area = area * (ratio_width * ratio_height)
|
||||||
|
new_annotation["area"] = scaled_area
|
||||||
|
elif key == "masks":
|
||||||
|
masks = value[:, None]
|
||||||
|
masks = [F.resize(mask, target_size, interpolation=interpolation) for mask in masks]
|
||||||
|
masks = torch.stack(masks).to(torch.float32)
|
||||||
|
masks = masks[:, 0] > threshold
|
||||||
|
new_annotation["masks"] = masks
|
||||||
|
elif key == "size":
|
||||||
|
new_annotation["size"] = target_size
|
||||||
|
else:
|
||||||
|
new_annotation[key] = value
|
||||||
|
|
||||||
|
return new_annotation
|
||||||
|
|
||||||
|
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
|
||||||
|
image_height, image_width = image_size
|
||||||
|
norm_annotation = {}
|
||||||
|
for key, value in annotation.items():
|
||||||
|
if key == "boxes":
|
||||||
|
boxes = value
|
||||||
|
boxes = corners_to_center_format(boxes)
|
||||||
|
boxes /= torch.as_tensor(
|
||||||
|
[image_width, image_height, image_width, image_height], dtype=torch.float32, device=boxes.device
|
||||||
|
)
|
||||||
|
norm_annotation[key] = boxes
|
||||||
|
else:
|
||||||
|
norm_annotation[key] = value
|
||||||
|
return norm_annotation
|
||||||
|
|
||||||
|
def _update_annotation_for_padded_image(
|
||||||
|
self,
|
||||||
|
annotation: Dict,
|
||||||
|
input_image_size: Tuple[int, int],
|
||||||
|
output_image_size: Tuple[int, int],
|
||||||
|
padding,
|
||||||
|
update_bboxes,
|
||||||
|
) -> Dict:
|
||||||
|
"""
|
||||||
|
Update the annotation for a padded image.
|
||||||
|
"""
|
||||||
|
new_annotation = {}
|
||||||
|
new_annotation["size"] = output_image_size
|
||||||
|
ratio_height, ratio_width = (input / output for output, input in zip(output_image_size, input_image_size))
|
||||||
|
|
||||||
|
for key, value in annotation.items():
|
||||||
|
if key == "masks":
|
||||||
|
masks = value
|
||||||
|
masks = F.pad(
|
||||||
|
masks,
|
||||||
|
padding,
|
||||||
|
fill=0,
|
||||||
|
)
|
||||||
|
masks = safe_squeeze(masks, 1)
|
||||||
|
new_annotation["masks"] = masks
|
||||||
|
elif key == "boxes" and update_bboxes:
|
||||||
|
boxes = value
|
||||||
|
boxes *= torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height], device=boxes.device)
|
||||||
|
new_annotation["boxes"] = boxes
|
||||||
|
elif key == "size":
|
||||||
|
new_annotation["size"] = output_image_size
|
||||||
|
else:
|
||||||
|
new_annotation[key] = value
|
||||||
|
return new_annotation
|
||||||
|
|
||||||
|
def pad(
|
||||||
|
self,
|
||||||
|
image: torch.Tensor,
|
||||||
|
padded_size: Tuple[int, int],
|
||||||
|
annotation: Optional[Dict[str, Any]] = None,
|
||||||
|
update_bboxes: bool = True,
|
||||||
|
fill: int = 0,
|
||||||
|
):
|
||||||
|
original_size = image.size()[-2:]
|
||||||
|
padding_bottom = padded_size[0] - original_size[0]
|
||||||
|
padding_right = padded_size[1] - original_size[1]
|
||||||
|
if padding_bottom < 0 or padding_right < 0:
|
||||||
|
raise ValueError(
|
||||||
|
f"Padding dimensions are negative. Please make sure that the padded size is larger than the "
|
||||||
|
f"original size. Got padded size: {padded_size}, original size: {original_size}."
|
||||||
|
)
|
||||||
|
if original_size != padded_size:
|
||||||
|
padding = [0, 0, padding_right, padding_bottom]
|
||||||
|
image = F.pad(image, padding, fill=fill)
|
||||||
|
if annotation is not None:
|
||||||
|
annotation = self._update_annotation_for_padded_image(
|
||||||
|
annotation, original_size, padded_size, padding, update_bboxes
|
||||||
|
)
|
||||||
|
|
||||||
|
# Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
|
||||||
|
pixel_mask = torch.zeros(padded_size, dtype=torch.int64, device=image.device)
|
||||||
|
pixel_mask[: original_size[0], : original_size[1]] = 1
|
||||||
|
|
||||||
|
return image, pixel_mask, annotation
|
||||||
|
|
||||||
|
@add_start_docstrings(
|
||||||
|
BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
|
||||||
|
"""
|
||||||
|
annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
|
||||||
|
List of annotations associated with the image or batch of images. If annotation is for object
|
||||||
|
detection, the annotations should be a dictionary with the following keys:
|
||||||
|
- "image_id" (`int`): The image id.
|
||||||
|
- "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a
|
||||||
|
dictionary. An image can have no annotations, in which case the list should be empty.
|
||||||
|
If annotation is for segmentation, the annotations should be a dictionary with the following keys:
|
||||||
|
- "image_id" (`int`): The image id.
|
||||||
|
- "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary.
|
||||||
|
An image can have no segments, in which case the list should be empty.
|
||||||
|
- "file_name" (`str`): The file name of the image.
|
||||||
|
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
||||||
|
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
||||||
|
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||||
|
Controls whether to convert the annotations to the format expected by the YOLOS model. Converts the
|
||||||
|
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||||
|
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||||
|
do_pad (`bool`, *optional*, defaults to `True`):
|
||||||
|
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
|
||||||
|
method. If `True`, padding will be applied to the bottom and right of the image with zeros.
|
||||||
|
If `pad_size` is provided, the image will be padded to the specified dimensions.
|
||||||
|
Otherwise, the image will be padded to the maximum height and width of the batch.
|
||||||
|
pad_size (`Dict[str, int]`, *optional*):
|
||||||
|
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
|
||||||
|
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
|
||||||
|
height and width in the batch.
|
||||||
|
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
|
||||||
|
Whether to return segmentation masks.
|
||||||
|
masks_path (`str` or `pathlib.Path`, *optional*):
|
||||||
|
Path to the directory containing the segmentation masks.
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
def preprocess(
|
||||||
|
self,
|
||||||
|
images: ImageInput,
|
||||||
|
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
|
||||||
|
masks_path: Optional[Union[str, pathlib.Path]] = None,
|
||||||
|
**kwargs: Unpack[YolosFastImageProcessorKwargs],
|
||||||
|
) -> BatchFeature:
|
||||||
|
if "pad_and_return_pixel_mask" in kwargs:
|
||||||
|
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
|
||||||
|
logger.warning_once(
|
||||||
|
"The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, "
|
||||||
|
"use `do_pad` instead."
|
||||||
|
)
|
||||||
|
|
||||||
|
if "max_size" in kwargs:
|
||||||
|
logger.warning_once(
|
||||||
|
"The `max_size` argument is deprecated and will be removed in a future version, use"
|
||||||
|
" `size['longest_edge']` instead."
|
||||||
|
)
|
||||||
|
kwargs["size"] = kwargs.pop("max_size")
|
||||||
|
|
||||||
|
return super().preprocess(images, annotations=annotations, masks_path=masks_path, **kwargs)
|
||||||
|
|
||||||
|
def _preprocess(
|
||||||
|
self,
|
||||||
|
images: List["torch.Tensor"],
|
||||||
|
annotations: Optional[Union[AnnotationType, List[AnnotationType]]],
|
||||||
|
return_segmentation_masks: bool,
|
||||||
|
masks_path: Optional[Union[str, pathlib.Path]],
|
||||||
|
do_resize: bool,
|
||||||
|
size: SizeDict,
|
||||||
|
interpolation: Optional["F.InterpolationMode"],
|
||||||
|
do_center_crop: bool,
|
||||||
|
crop_size: SizeDict,
|
||||||
|
do_rescale: bool,
|
||||||
|
rescale_factor: float,
|
||||||
|
do_normalize: bool,
|
||||||
|
do_convert_annotations: bool,
|
||||||
|
image_mean: Optional[Union[float, List[float]]],
|
||||||
|
image_std: Optional[Union[float, List[float]]],
|
||||||
|
do_pad: bool,
|
||||||
|
pad_size: Optional[Dict[str, int]],
|
||||||
|
format: Optional[Union[str, AnnotationFormat]],
|
||||||
|
return_tensors: Optional[Union[str, TensorType]],
|
||||||
|
) -> BatchFeature:
|
||||||
|
"""
|
||||||
|
Preprocess an image or a batch of images so that it can be used by the model.
|
||||||
|
"""
|
||||||
|
if annotations is not None and isinstance(annotations, dict):
|
||||||
|
annotations = [annotations]
|
||||||
|
|
||||||
|
if annotations is not None and len(images) != len(annotations):
|
||||||
|
raise ValueError(
|
||||||
|
f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match."
|
||||||
|
)
|
||||||
|
|
||||||
|
format = AnnotationFormat(format)
|
||||||
|
if annotations is not None:
|
||||||
|
validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
|
||||||
|
|
||||||
|
if (
|
||||||
|
masks_path is not None
|
||||||
|
and format == AnnotationFormat.COCO_PANOPTIC
|
||||||
|
and not isinstance(masks_path, (pathlib.Path, str))
|
||||||
|
):
|
||||||
|
raise ValueError(
|
||||||
|
"The path to the directory containing the mask PNG files should be provided as a"
|
||||||
|
f" `pathlib.Path` or string object, but is {type(masks_path)} instead."
|
||||||
|
)
|
||||||
|
|
||||||
|
data = {}
|
||||||
|
|
||||||
|
processed_images = []
|
||||||
|
processed_annotations = []
|
||||||
|
pixel_masks = [] # Initialize pixel_masks here
|
||||||
|
for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
|
||||||
|
# prepare (COCO annotations as a list of Dict -> YOLOS target as a single Dict per image)
|
||||||
|
if annotations is not None:
|
||||||
|
annotation = self.prepare_annotation(
|
||||||
|
image,
|
||||||
|
annotation,
|
||||||
|
format,
|
||||||
|
return_segmentation_masks=return_segmentation_masks,
|
||||||
|
masks_path=masks_path,
|
||||||
|
input_data_format=ChannelDimension.FIRST,
|
||||||
|
)
|
||||||
|
|
||||||
|
if do_resize:
|
||||||
|
resized_image = self.resize(image, size=size, interpolation=interpolation)
|
||||||
|
if annotations is not None:
|
||||||
|
annotation = self.resize_annotation(
|
||||||
|
annotation,
|
||||||
|
orig_size=image.size()[-2:],
|
||||||
|
target_size=resized_image.size()[-2:],
|
||||||
|
)
|
||||||
|
image = resized_image
|
||||||
|
# Fused rescale and normalize
|
||||||
|
image = self.rescale_and_normalize(image, do_rescale, rescale_factor, do_normalize, image_mean, image_std)
|
||||||
|
if do_convert_annotations and annotations is not None:
|
||||||
|
annotation = self.normalize_annotation(annotation, get_image_size(image, ChannelDimension.FIRST))
|
||||||
|
|
||||||
|
processed_images.append(image)
|
||||||
|
processed_annotations.append(annotation)
|
||||||
|
images = processed_images
|
||||||
|
annotations = processed_annotations if annotations is not None else None
|
||||||
|
|
||||||
|
if do_pad:
|
||||||
|
# depends on all resized image shapes so we need another loop
|
||||||
|
if pad_size is not None:
|
||||||
|
padded_size = (pad_size["height"], pad_size["width"])
|
||||||
|
else:
|
||||||
|
padded_size = get_max_height_width(images)
|
||||||
|
|
||||||
|
padded_images = []
|
||||||
|
padded_annotations = []
|
||||||
|
for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
|
||||||
|
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
|
||||||
|
if padded_size == image.size()[-2:]:
|
||||||
|
padded_images.append(image)
|
||||||
|
pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device))
|
||||||
|
padded_annotations.append(annotation)
|
||||||
|
continue
|
||||||
|
image, pixel_mask, annotation = self.pad(
|
||||||
|
image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations
|
||||||
|
)
|
||||||
|
padded_images.append(image)
|
||||||
|
padded_annotations.append(annotation)
|
||||||
|
pixel_masks.append(pixel_mask)
|
||||||
|
images = padded_images
|
||||||
|
annotations = padded_annotations if annotations is not None else None
|
||||||
|
data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)})
|
||||||
|
|
||||||
|
data.update({"pixel_values": torch.stack(images, dim=0)})
|
||||||
|
encoded_inputs = BatchFeature(data, tensor_type=return_tensors)
|
||||||
|
if annotations is not None:
|
||||||
|
encoded_inputs["labels"] = [
|
||||||
|
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
|
||||||
|
]
|
||||||
|
return encoded_inputs
|
||||||
|
|
||||||
|
def post_process(self, outputs, target_sizes):
|
||||||
|
"""
|
||||||
|
Converts the raw output of [`YolosForObjectDetection`] into final bounding boxes in (top_left_x,
|
||||||
|
top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
outputs ([`YolosObjectDetectionOutput`]):
|
||||||
|
Raw outputs of the model.
|
||||||
|
target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
|
||||||
|
Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the
|
||||||
|
original image size (before any data augmentation). For visualization, this should be the image size
|
||||||
|
after data augment, but before padding.
|
||||||
|
Returns:
|
||||||
|
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
|
||||||
|
in the batch as predicted by the model.
|
||||||
|
"""
|
||||||
|
logger.warning_once(
|
||||||
|
"`post_process` is deprecated and will be removed in v5 of Transformers, please use"
|
||||||
|
" `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
|
||||||
|
)
|
||||||
|
|
||||||
|
out_logits, out_bbox = outputs.logits, outputs.pred_boxes
|
||||||
|
|
||||||
|
if len(out_logits) != len(target_sizes):
|
||||||
|
raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
|
||||||
|
if target_sizes.shape[1] != 2:
|
||||||
|
raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
|
||||||
|
|
||||||
|
prob = out_logits.sigmoid()
|
||||||
|
topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
|
||||||
|
scores = topk_values
|
||||||
|
topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
|
||||||
|
labels = topk_indexes % out_logits.shape[2]
|
||||||
|
boxes = center_to_corners_format(out_bbox)
|
||||||
|
boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
|
||||||
|
|
||||||
|
# and from relative [0, 1] to absolute [0, height] coordinates
|
||||||
|
img_h, img_w = target_sizes.unbind(1)
|
||||||
|
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
|
||||||
|
boxes = boxes * scale_fct[:, None, :]
|
||||||
|
|
||||||
|
results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
def post_process_object_detection(
|
||||||
|
self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Converts the raw output of [`YolosForObjectDetection`] into final bounding boxes in (top_left_x,
|
||||||
|
top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
outputs ([`YolosObjectDetectionOutput`]):
|
||||||
|
Raw outputs of the model.
|
||||||
|
threshold (`float`, *optional*):
|
||||||
|
Score threshold to keep object detection predictions.
|
||||||
|
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
|
||||||
|
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
|
||||||
|
(height, width) of each image in the batch. If left to None, predictions will not be resized.
|
||||||
|
top_k (`int`, *optional*, defaults to 100):
|
||||||
|
Keep only top k bounding boxes before filtering by thresholding.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
|
||||||
|
in the batch as predicted by the model.
|
||||||
|
"""
|
||||||
|
out_logits, out_bbox = outputs.logits, outputs.pred_boxes
|
||||||
|
|
||||||
|
if target_sizes is not None:
|
||||||
|
if len(out_logits) != len(target_sizes):
|
||||||
|
raise ValueError(
|
||||||
|
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
|
||||||
|
)
|
||||||
|
|
||||||
|
prob = out_logits.sigmoid()
|
||||||
|
prob = prob.view(out_logits.shape[0], -1)
|
||||||
|
k_value = min(top_k, prob.size(1))
|
||||||
|
topk_values, topk_indexes = torch.topk(prob, k_value, dim=1)
|
||||||
|
scores = topk_values
|
||||||
|
topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
|
||||||
|
labels = topk_indexes % out_logits.shape[2]
|
||||||
|
boxes = center_to_corners_format(out_bbox)
|
||||||
|
boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
|
||||||
|
|
||||||
|
# and from relative [0, 1] to absolute [0, height] coordinates
|
||||||
|
if target_sizes is not None:
|
||||||
|
if isinstance(target_sizes, List):
|
||||||
|
img_h = torch.Tensor([i[0] for i in target_sizes])
|
||||||
|
img_w = torch.Tensor([i[1] for i in target_sizes])
|
||||||
|
else:
|
||||||
|
img_h, img_w = target_sizes.unbind(1)
|
||||||
|
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
|
||||||
|
boxes = boxes * scale_fct[:, None, :]
|
||||||
|
|
||||||
|
results = []
|
||||||
|
for s, l, b in zip(scores, labels, boxes):
|
||||||
|
score = s[s > threshold]
|
||||||
|
label = l[s > threshold]
|
||||||
|
box = b[s > threshold]
|
||||||
|
results.append({"scores": score, "labels": label, "boxes": box})
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["YolosImageProcessorFast"]
|
||||||
193
src/transformers/models/yolos/modular_yolos.py
Normal file
193
src/transformers/models/yolos/modular_yolos.py
Normal file
@@ -0,0 +1,193 @@
|
|||||||
|
from typing import List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
from transformers.models.detr.image_processing_detr_fast import DetrImageProcessorFast
|
||||||
|
|
||||||
|
from ...image_transforms import center_to_corners_format
|
||||||
|
from ...utils import (
|
||||||
|
TensorType,
|
||||||
|
is_torch_available,
|
||||||
|
logging,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if is_torch_available():
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def get_size_with_aspect_ratio(
|
||||||
|
image_size: Tuple[int, int], size: int, max_size: Optional[int] = None, mod_size: int = 16
|
||||||
|
) -> Tuple[int, int]:
|
||||||
|
"""
|
||||||
|
Computes the output image size given the input image size and the desired output size with multiple of divisible_size.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
image_size (`Tuple[int, int]`):
|
||||||
|
The input image size.
|
||||||
|
size (`int`):
|
||||||
|
The desired output size.
|
||||||
|
max_size (`int`, *optional*):
|
||||||
|
The maximum allowed output size.
|
||||||
|
mod_size (`int`, *optional*):
|
||||||
|
The size to make multiple of mod_size.
|
||||||
|
"""
|
||||||
|
height, width = image_size
|
||||||
|
raw_size = None
|
||||||
|
if max_size is not None:
|
||||||
|
min_original_size = float(min((height, width)))
|
||||||
|
max_original_size = float(max((height, width)))
|
||||||
|
if max_original_size / min_original_size * size > max_size:
|
||||||
|
raw_size = max_size * min_original_size / max_original_size
|
||||||
|
size = int(round(raw_size))
|
||||||
|
|
||||||
|
if width < height:
|
||||||
|
ow = size
|
||||||
|
if max_size is not None and raw_size is not None:
|
||||||
|
oh = int(raw_size * height / width)
|
||||||
|
else:
|
||||||
|
oh = int(size * height / width)
|
||||||
|
elif (height <= width and height == size) or (width <= height and width == size):
|
||||||
|
oh, ow = height, width
|
||||||
|
else:
|
||||||
|
oh = size
|
||||||
|
if max_size is not None and raw_size is not None:
|
||||||
|
ow = int(raw_size * width / height)
|
||||||
|
else:
|
||||||
|
ow = int(size * width / height)
|
||||||
|
|
||||||
|
if mod_size is not None:
|
||||||
|
ow_mod = torch.remainder(torch.tensor(ow), mod_size).item()
|
||||||
|
oh_mod = torch.remainder(torch.tensor(oh), mod_size).item()
|
||||||
|
ow = ow - ow_mod
|
||||||
|
oh = oh - oh_mod
|
||||||
|
|
||||||
|
return (oh, ow)
|
||||||
|
|
||||||
|
|
||||||
|
class YolosImageProcessorFast(DetrImageProcessorFast):
|
||||||
|
def post_process(self, outputs, target_sizes):
|
||||||
|
"""
|
||||||
|
Converts the raw output of [`YolosForObjectDetection`] into final bounding boxes in (top_left_x,
|
||||||
|
top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
outputs ([`YolosObjectDetectionOutput`]):
|
||||||
|
Raw outputs of the model.
|
||||||
|
target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
|
||||||
|
Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the
|
||||||
|
original image size (before any data augmentation). For visualization, this should be the image size
|
||||||
|
after data augment, but before padding.
|
||||||
|
Returns:
|
||||||
|
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
|
||||||
|
in the batch as predicted by the model.
|
||||||
|
"""
|
||||||
|
logger.warning_once(
|
||||||
|
"`post_process` is deprecated and will be removed in v5 of Transformers, please use"
|
||||||
|
" `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
|
||||||
|
)
|
||||||
|
|
||||||
|
out_logits, out_bbox = outputs.logits, outputs.pred_boxes
|
||||||
|
|
||||||
|
if len(out_logits) != len(target_sizes):
|
||||||
|
raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
|
||||||
|
if target_sizes.shape[1] != 2:
|
||||||
|
raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
|
||||||
|
|
||||||
|
prob = out_logits.sigmoid()
|
||||||
|
topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
|
||||||
|
scores = topk_values
|
||||||
|
topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
|
||||||
|
labels = topk_indexes % out_logits.shape[2]
|
||||||
|
boxes = center_to_corners_format(out_bbox)
|
||||||
|
boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
|
||||||
|
|
||||||
|
# and from relative [0, 1] to absolute [0, height] coordinates
|
||||||
|
img_h, img_w = target_sizes.unbind(1)
|
||||||
|
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
|
||||||
|
boxes = boxes * scale_fct[:, None, :]
|
||||||
|
|
||||||
|
results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
def post_process_object_detection(
|
||||||
|
self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Converts the raw output of [`YolosForObjectDetection`] into final bounding boxes in (top_left_x,
|
||||||
|
top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
outputs ([`YolosObjectDetectionOutput`]):
|
||||||
|
Raw outputs of the model.
|
||||||
|
threshold (`float`, *optional*):
|
||||||
|
Score threshold to keep object detection predictions.
|
||||||
|
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
|
||||||
|
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
|
||||||
|
(height, width) of each image in the batch. If left to None, predictions will not be resized.
|
||||||
|
top_k (`int`, *optional*, defaults to 100):
|
||||||
|
Keep only top k bounding boxes before filtering by thresholding.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
|
||||||
|
in the batch as predicted by the model.
|
||||||
|
"""
|
||||||
|
out_logits, out_bbox = outputs.logits, outputs.pred_boxes
|
||||||
|
|
||||||
|
if target_sizes is not None:
|
||||||
|
if len(out_logits) != len(target_sizes):
|
||||||
|
raise ValueError(
|
||||||
|
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
|
||||||
|
)
|
||||||
|
|
||||||
|
prob = out_logits.sigmoid()
|
||||||
|
prob = prob.view(out_logits.shape[0], -1)
|
||||||
|
k_value = min(top_k, prob.size(1))
|
||||||
|
topk_values, topk_indexes = torch.topk(prob, k_value, dim=1)
|
||||||
|
scores = topk_values
|
||||||
|
topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
|
||||||
|
labels = topk_indexes % out_logits.shape[2]
|
||||||
|
boxes = center_to_corners_format(out_bbox)
|
||||||
|
boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
|
||||||
|
|
||||||
|
# and from relative [0, 1] to absolute [0, height] coordinates
|
||||||
|
if target_sizes is not None:
|
||||||
|
if isinstance(target_sizes, List):
|
||||||
|
img_h = torch.Tensor([i[0] for i in target_sizes])
|
||||||
|
img_w = torch.Tensor([i[1] for i in target_sizes])
|
||||||
|
else:
|
||||||
|
img_h, img_w = target_sizes.unbind(1)
|
||||||
|
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
|
||||||
|
boxes = boxes * scale_fct[:, None, :]
|
||||||
|
|
||||||
|
results = []
|
||||||
|
for s, l, b in zip(scores, labels, boxes):
|
||||||
|
score = s[s > threshold]
|
||||||
|
label = l[s > threshold]
|
||||||
|
box = b[s > threshold]
|
||||||
|
results.append({"scores": score, "labels": label, "boxes": box})
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
def post_process_segmentation():
|
||||||
|
raise NotImplementedError("Segmentation post-processing is not implemented for Deformable DETR yet.")
|
||||||
|
|
||||||
|
def post_process_instance():
|
||||||
|
raise NotImplementedError("Instance post-processing is not implemented for Deformable DETR yet.")
|
||||||
|
|
||||||
|
def post_process_panoptic():
|
||||||
|
raise NotImplementedError("Panoptic post-processing is not implemented for Deformable DETR yet.")
|
||||||
|
|
||||||
|
def post_process_instance_segmentation():
|
||||||
|
raise NotImplementedError("Segmentation post-processing is not implemented for Deformable DETR yet.")
|
||||||
|
|
||||||
|
def post_process_semantic_segmentation():
|
||||||
|
raise NotImplementedError("Semantic segmentation post-processing is not implemented for Deformable DETR yet.")
|
||||||
|
|
||||||
|
def post_process_panoptic_segmentation():
|
||||||
|
raise NotImplementedError("Panoptic segmentation post-processing is not implemented for Deformable DETR yet.")
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["YolosImageProcessorFast"]
|
||||||
@@ -21,7 +21,7 @@ import numpy as np
|
|||||||
from parameterized import parameterized
|
from parameterized import parameterized
|
||||||
|
|
||||||
from transformers.testing_utils import require_torch, require_vision, slow
|
from transformers.testing_utils import require_torch, require_vision, slow
|
||||||
from transformers.utils import is_torch_available, is_vision_available
|
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||||
|
|
||||||
from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs
|
from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs
|
||||||
|
|
||||||
@@ -34,6 +34,9 @@ if is_vision_available():
|
|||||||
|
|
||||||
from transformers import YolosImageProcessor
|
from transformers import YolosImageProcessor
|
||||||
|
|
||||||
|
if is_torchvision_available():
|
||||||
|
from transformers import YolosImageProcessorFast
|
||||||
|
|
||||||
|
|
||||||
class YolosImageProcessingTester:
|
class YolosImageProcessingTester:
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -143,6 +146,7 @@ class YolosImageProcessingTester:
|
|||||||
@require_vision
|
@require_vision
|
||||||
class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase):
|
class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase):
|
||||||
image_processing_class = YolosImageProcessor if is_vision_available() else None
|
image_processing_class = YolosImageProcessor if is_vision_available() else None
|
||||||
|
fast_image_processing_class = YolosImageProcessorFast if is_torchvision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
super().setUp()
|
super().setUp()
|
||||||
@@ -153,7 +157,8 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
|
|||||||
return self.image_processor_tester.prepare_image_processor_dict()
|
return self.image_processor_tester.prepare_image_processor_dict()
|
||||||
|
|
||||||
def test_image_processor_properties(self):
|
def test_image_processor_properties(self):
|
||||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
for image_processing_class in self.image_processor_list:
|
||||||
|
image_processing = image_processing_class(**self.image_processor_dict)
|
||||||
self.assertTrue(hasattr(image_processing, "image_mean"))
|
self.assertTrue(hasattr(image_processing, "image_mean"))
|
||||||
self.assertTrue(hasattr(image_processing, "image_std"))
|
self.assertTrue(hasattr(image_processing, "image_std"))
|
||||||
self.assertTrue(hasattr(image_processing, "do_normalize"))
|
self.assertTrue(hasattr(image_processing, "do_normalize"))
|
||||||
@@ -161,11 +166,12 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
|
|||||||
self.assertTrue(hasattr(image_processing, "size"))
|
self.assertTrue(hasattr(image_processing, "size"))
|
||||||
|
|
||||||
def test_image_processor_from_dict_with_kwargs(self):
|
def test_image_processor_from_dict_with_kwargs(self):
|
||||||
image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
|
for image_processing_class in self.image_processor_list:
|
||||||
|
image_processor = image_processing_class.from_dict(self.image_processor_dict)
|
||||||
self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333})
|
self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333})
|
||||||
self.assertEqual(image_processor.do_pad, True)
|
self.assertEqual(image_processor.do_pad, True)
|
||||||
|
|
||||||
image_processor = self.image_processing_class.from_dict(
|
image_processor = image_processing_class.from_dict(
|
||||||
self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False
|
self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False
|
||||||
)
|
)
|
||||||
self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84})
|
self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84})
|
||||||
@@ -199,7 +205,8 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
|
|||||||
]
|
]
|
||||||
)
|
)
|
||||||
def test_resize_max_size_respected(self, image_size, longest_edge, shortest_edge):
|
def test_resize_max_size_respected(self, image_size, longest_edge, shortest_edge):
|
||||||
image_processor = self.image_processing_class(**self.image_processor_dict)
|
for image_processing_class in self.image_processor_list:
|
||||||
|
image_processor = image_processing_class(**self.image_processor_dict)
|
||||||
|
|
||||||
# create torch tensors as image
|
# create torch tensors as image
|
||||||
image = torch.randint(0, 256, image_size, dtype=torch.uint8)
|
image = torch.randint(0, 256, image_size, dtype=torch.uint8)
|
||||||
@@ -224,8 +231,9 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
|
|||||||
|
|
||||||
target = {"image_id": 39769, "annotations": target}
|
target = {"image_id": 39769, "annotations": target}
|
||||||
|
|
||||||
|
for image_processing_class in self.image_processor_list:
|
||||||
# encode them
|
# encode them
|
||||||
image_processing = YolosImageProcessor.from_pretrained("hustvl/yolos-small")
|
image_processing = image_processing_class.from_pretrained("hustvl/yolos-small")
|
||||||
encoding = image_processing(images=image, annotations=target, return_tensors="pt")
|
encoding = image_processing(images=image, annotations=target, return_tensors="pt")
|
||||||
|
|
||||||
# verify pixel values
|
# verify pixel values
|
||||||
@@ -270,8 +278,9 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
|
|||||||
|
|
||||||
masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
|
masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
|
||||||
|
|
||||||
|
for image_processing_class in self.image_processor_list:
|
||||||
# encode them
|
# encode them
|
||||||
image_processing = YolosImageProcessor(format="coco_panoptic")
|
image_processing = image_processing_class(format="coco_panoptic")
|
||||||
encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt")
|
encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt")
|
||||||
|
|
||||||
# verify pixel values
|
# verify pixel values
|
||||||
@@ -300,7 +309,8 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
|
|||||||
torch.testing.assert_close(encoding["labels"][0]["class_labels"], expected_class_labels)
|
torch.testing.assert_close(encoding["labels"][0]["class_labels"], expected_class_labels)
|
||||||
# verify masks
|
# verify masks
|
||||||
expected_masks_sum = 815161
|
expected_masks_sum = 815161
|
||||||
self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum)
|
relative_error = torch.abs(encoding["labels"][0]["masks"].sum() - expected_masks_sum) / expected_masks_sum
|
||||||
|
self.assertTrue(relative_error < 1e-3)
|
||||||
# verify orig_size
|
# verify orig_size
|
||||||
expected_orig_size = torch.tensor([480, 640])
|
expected_orig_size = torch.tensor([480, 640])
|
||||||
torch.testing.assert_close(encoding["labels"][0]["orig_size"], expected_orig_size)
|
torch.testing.assert_close(encoding["labels"][0]["orig_size"], expected_orig_size)
|
||||||
@@ -336,7 +346,8 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
|
|||||||
images = [image_0, image_1]
|
images = [image_0, image_1]
|
||||||
annotations = [annotations_0, annotations_1]
|
annotations = [annotations_0, annotations_1]
|
||||||
|
|
||||||
image_processing = YolosImageProcessor()
|
for image_processing_class in self.image_processor_list:
|
||||||
|
image_processing = image_processing_class()
|
||||||
encoding = image_processing(
|
encoding = image_processing(
|
||||||
images=images,
|
images=images,
|
||||||
annotations=annotations,
|
annotations=annotations,
|
||||||
@@ -457,6 +468,8 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
|
|||||||
annotations = [annotation_0, annotation_1]
|
annotations = [annotation_0, annotation_1]
|
||||||
|
|
||||||
# encode them
|
# encode them
|
||||||
|
for image_processing_class in self.image_processor_list:
|
||||||
|
image_processing = image_processing_class()
|
||||||
image_processing = YolosImageProcessor(format="coco_panoptic")
|
image_processing = YolosImageProcessor(format="coco_panoptic")
|
||||||
encoding = image_processing(
|
encoding = image_processing(
|
||||||
images=images,
|
images=images,
|
||||||
|
|||||||
Reference in New Issue
Block a user