Add support for modular with fast image processors (#35379)
* Add support for modular with fast image processors * fix order and remove copied from * add comment for "image_processing*_fast"
This commit is contained in:
@@ -1,19 +1,9 @@
|
|||||||
# coding=utf-8
|
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
# This file was automatically generated from src/transformers/models/deformable_detr/modular_deformable_detr.py.
|
||||||
#
|
# Do NOT edit this file manually as any edits will be overwritten by the generation of
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
# the file from the modular. If any change should be done, please apply the change to the
|
||||||
# you may not use this file except in compliance with the License.
|
# modular_deformable_detr.py file directly. One of our CI enforces this.
|
||||||
# You may obtain a copy of the License at
|
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
"""Fast Image processor class for Deformable DETR."""
|
|
||||||
|
|
||||||
import functools
|
import functools
|
||||||
import pathlib
|
import pathlib
|
||||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||||
@@ -26,10 +16,7 @@ from ...image_processing_utils_fast import (
|
|||||||
get_max_height_width,
|
get_max_height_width,
|
||||||
safe_squeeze,
|
safe_squeeze,
|
||||||
)
|
)
|
||||||
from ...image_transforms import (
|
from ...image_transforms import center_to_corners_format, corners_to_center_format
|
||||||
center_to_corners_format,
|
|
||||||
corners_to_center_format,
|
|
||||||
)
|
|
||||||
from ...image_utils import (
|
from ...image_utils import (
|
||||||
IMAGENET_DEFAULT_MEAN,
|
IMAGENET_DEFAULT_MEAN,
|
||||||
IMAGENET_DEFAULT_STD,
|
IMAGENET_DEFAULT_STD,
|
||||||
@@ -43,7 +30,6 @@ from ...image_utils import (
|
|||||||
get_image_type,
|
get_image_type,
|
||||||
infer_channel_dimension_format,
|
infer_channel_dimension_format,
|
||||||
make_list_of_images,
|
make_list_of_images,
|
||||||
pil_torch_interpolation_mapping,
|
|
||||||
validate_annotations,
|
validate_annotations,
|
||||||
validate_kwargs,
|
validate_kwargs,
|
||||||
)
|
)
|
||||||
@@ -55,23 +41,21 @@ from ...utils import (
|
|||||||
is_vision_available,
|
is_vision_available,
|
||||||
logging,
|
logging,
|
||||||
)
|
)
|
||||||
from .image_processing_deformable_detr import (
|
from .image_processing_deformable_detr import get_size_with_aspect_ratio
|
||||||
get_size_with_aspect_ratio,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
if is_torchvision_available():
|
|
||||||
from torchvision.io import read_image
|
|
||||||
|
|
||||||
if is_vision_available():
|
if is_vision_available():
|
||||||
from ...image_utils import pil_torch_interpolation_mapping
|
from ...image_utils import pil_torch_interpolation_mapping
|
||||||
|
|
||||||
|
|
||||||
if is_torchvision_v2_available():
|
if is_torchvision_v2_available():
|
||||||
|
from torchvision.io import read_image
|
||||||
from torchvision.transforms.v2 import functional as F
|
from torchvision.transforms.v2 import functional as F
|
||||||
else:
|
elif is_torchvision_available():
|
||||||
|
from torchvision.io import read_image
|
||||||
from torchvision.transforms import functional as F
|
from torchvision.transforms import functional as F
|
||||||
|
|
||||||
|
|
||||||
@@ -80,7 +64,7 @@ logger = logging.get_logger(__name__)
|
|||||||
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
|
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr_fast.convert_coco_poly_to_mask
|
# inspired by https://github.com/facebookresearch/deformable_detr/blob/master/datasets/coco.py#L33
|
||||||
def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: torch.device) -> torch.Tensor:
|
def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: torch.device) -> torch.Tensor:
|
||||||
"""
|
"""
|
||||||
Convert a COCO polygon annotation to a mask.
|
Convert a COCO polygon annotation to a mask.
|
||||||
@@ -115,7 +99,7 @@ def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: to
|
|||||||
return masks
|
return masks
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr_fast.prepare_coco_detection_annotation with DETR->DeformableDetr
|
# inspired by https://github.com/facebookresearch/deformable_detr/blob/master/datasets/coco.py#L50
|
||||||
def prepare_coco_detection_annotation(
|
def prepare_coco_detection_annotation(
|
||||||
image,
|
image,
|
||||||
target,
|
target,
|
||||||
@@ -123,7 +107,7 @@ def prepare_coco_detection_annotation(
|
|||||||
input_data_format: Optional[Union[ChannelDimension, str]] = None,
|
input_data_format: Optional[Union[ChannelDimension, str]] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Convert the target in COCO format into the format expected by DeformableDetr.
|
Convert the target in COCO format into the format expected by DEFORMABLE_DETR.
|
||||||
"""
|
"""
|
||||||
image_height, image_width = image.size()[-2:]
|
image_height, image_width = image.size()[-2:]
|
||||||
|
|
||||||
@@ -180,7 +164,6 @@ def prepare_coco_detection_annotation(
|
|||||||
return new_target
|
return new_target
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr_fast.masks_to_boxes
|
|
||||||
def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
|
def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
|
||||||
"""
|
"""
|
||||||
Compute the bounding boxes around the provided panoptic segmentation masks.
|
Compute the bounding boxes around the provided panoptic segmentation masks.
|
||||||
@@ -215,7 +198,9 @@ def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
|
|||||||
return torch.stack([x_min, y_min, x_max, y_max], 1)
|
return torch.stack([x_min, y_min, x_max, y_max], 1)
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr_fast.rgb_to_id
|
# 2 functions below adapted from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py
|
||||||
|
# Copyright (c) 2018, Alexander Kirillov
|
||||||
|
# All rights reserved.
|
||||||
def rgb_to_id(color):
|
def rgb_to_id(color):
|
||||||
"""
|
"""
|
||||||
Converts RGB color to unique ID.
|
Converts RGB color to unique ID.
|
||||||
@@ -227,7 +212,6 @@ def rgb_to_id(color):
|
|||||||
return int(color[0] + 256 * color[1] + 256 * 256 * color[2])
|
return int(color[0] + 256 * color[1] + 256 * 256 * color[2])
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr_fast.prepare_coco_panoptic_annotation with DETR->DeformableDetr
|
|
||||||
def prepare_coco_panoptic_annotation(
|
def prepare_coco_panoptic_annotation(
|
||||||
image: torch.Tensor,
|
image: torch.Tensor,
|
||||||
target: Dict,
|
target: Dict,
|
||||||
@@ -236,7 +220,7 @@ def prepare_coco_panoptic_annotation(
|
|||||||
input_data_format: Union[ChannelDimension, str] = None,
|
input_data_format: Union[ChannelDimension, str] = None,
|
||||||
) -> Dict:
|
) -> Dict:
|
||||||
"""
|
"""
|
||||||
Prepare a coco panoptic annotation for DeformableDetr.
|
Prepare a coco panoptic annotation for DEFORMABLE_DETR.
|
||||||
"""
|
"""
|
||||||
image_height, image_width = get_image_size(image, channel_dim=input_data_format)
|
image_height, image_width = get_image_size(image, channel_dim=input_data_format)
|
||||||
annotation_path = pathlib.Path(masks_path) / target["file_name"]
|
annotation_path = pathlib.Path(masks_path) / target["file_name"]
|
||||||
@@ -279,13 +263,13 @@ def prepare_coco_panoptic_annotation(
|
|||||||
|
|
||||||
class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
|
class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
|
||||||
r"""
|
r"""
|
||||||
Constructs a fast Deformable DETR image processor.
|
Constructs a fast DeformableDetr image processor.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
||||||
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
||||||
do_resize (`bool`, *optional*, defaults to `True`):
|
do_resize (`bool`, *optional*, defaults to `True`):
|
||||||
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
|
Controls whether to resize the image's `(height, width)` dimensions to the specified `size`. Can be
|
||||||
overridden by the `do_resize` parameter in the `preprocess` method.
|
overridden by the `do_resize` parameter in the `preprocess` method.
|
||||||
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
|
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
|
||||||
Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
|
Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
|
||||||
@@ -316,7 +300,7 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
|
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
|
||||||
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
||||||
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||||
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
|
Controls whether to convert the annotations to the format expected by the DEFORMABLE_DETR model. Converts the
|
||||||
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||||
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||||
do_pad (`bool`, *optional*, defaults to `True`):
|
do_pad (`bool`, *optional*, defaults to `True`):
|
||||||
@@ -332,7 +316,6 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
|
|
||||||
model_input_names = ["pixel_values", "pixel_mask"]
|
model_input_names = ["pixel_values", "pixel_mask"]
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.__init__
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
|
format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
|
||||||
@@ -404,7 +387,6 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
]
|
]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.from_dict with Detr->DeformableDetr
|
|
||||||
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
|
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
|
||||||
"""
|
"""
|
||||||
Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is
|
Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is
|
||||||
@@ -418,7 +400,6 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask")
|
image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask")
|
||||||
return super().from_dict(image_processor_dict, **kwargs)
|
return super().from_dict(image_processor_dict, **kwargs)
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.prepare_annotation with DETR->DeformableDetr
|
|
||||||
def prepare_annotation(
|
def prepare_annotation(
|
||||||
self,
|
self,
|
||||||
image: torch.Tensor,
|
image: torch.Tensor,
|
||||||
@@ -429,7 +410,7 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||||
) -> Dict:
|
) -> Dict:
|
||||||
"""
|
"""
|
||||||
Prepare an annotation for feeding into DeformableDetr model.
|
Prepare an annotation for feeding into DEFORMABLE_DETR model.
|
||||||
"""
|
"""
|
||||||
format = format if format is not None else self.format
|
format = format if format is not None else self.format
|
||||||
|
|
||||||
@@ -451,7 +432,6 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
raise ValueError(f"Format {format} is not supported.")
|
raise ValueError(f"Format {format} is not supported.")
|
||||||
return target
|
return target
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.resize
|
|
||||||
def resize(
|
def resize(
|
||||||
self,
|
self,
|
||||||
image: torch.Tensor,
|
image: torch.Tensor,
|
||||||
@@ -506,7 +486,6 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
)
|
)
|
||||||
return image
|
return image
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.resize_annotation
|
|
||||||
def resize_annotation(
|
def resize_annotation(
|
||||||
self,
|
self,
|
||||||
annotation: Dict[str, Any],
|
annotation: Dict[str, Any],
|
||||||
@@ -560,7 +539,6 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
|
|
||||||
return new_annotation
|
return new_annotation
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.normalize_annotation
|
|
||||||
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
|
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
|
||||||
image_height, image_width = image_size
|
image_height, image_width = image_size
|
||||||
norm_annotation = {}
|
norm_annotation = {}
|
||||||
@@ -576,7 +554,6 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
norm_annotation[key] = value
|
norm_annotation[key] = value
|
||||||
return norm_annotation
|
return norm_annotation
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast._update_annotation_for_padded_image
|
|
||||||
def _update_annotation_for_padded_image(
|
def _update_annotation_for_padded_image(
|
||||||
self,
|
self,
|
||||||
annotation: Dict,
|
annotation: Dict,
|
||||||
@@ -612,7 +589,6 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
new_annotation[key] = value
|
new_annotation[key] = value
|
||||||
return new_annotation
|
return new_annotation
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.pad
|
|
||||||
def pad(
|
def pad(
|
||||||
self,
|
self,
|
||||||
image: torch.Tensor,
|
image: torch.Tensor,
|
||||||
@@ -644,7 +620,6 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
return image, pixel_mask, annotation
|
return image, pixel_mask, annotation
|
||||||
|
|
||||||
@functools.lru_cache(maxsize=1)
|
@functools.lru_cache(maxsize=1)
|
||||||
# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast._validate_input_arguments
|
|
||||||
def _validate_input_arguments(
|
def _validate_input_arguments(
|
||||||
self,
|
self,
|
||||||
do_rescale: bool,
|
do_rescale: bool,
|
||||||
@@ -673,7 +648,6 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
if do_normalize and None in (image_mean, image_std):
|
if do_normalize and None in (image_mean, image_std):
|
||||||
raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.")
|
raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.")
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.preprocess
|
|
||||||
def preprocess(
|
def preprocess(
|
||||||
self,
|
self,
|
||||||
images: ImageInput,
|
images: ImageInput,
|
||||||
@@ -874,7 +848,7 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
processed_annotations = []
|
processed_annotations = []
|
||||||
pixel_masks = [] # Initialize pixel_masks here
|
pixel_masks = [] # Initialize pixel_masks here
|
||||||
for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
|
for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
|
||||||
# prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
|
# prepare (COCO annotations as a list of Dict -> DEFORMABLE_DETR target as a single Dict per image)
|
||||||
if annotations is not None:
|
if annotations is not None:
|
||||||
annotation = self.prepare_annotation(
|
annotation = self.prepare_annotation(
|
||||||
image,
|
image,
|
||||||
@@ -950,7 +924,6 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
]
|
]
|
||||||
return encoded_inputs
|
return encoded_inputs
|
||||||
|
|
||||||
# Copied from transformers.models.deformable_detr.image_processing_deformable_detr.DeformableDetrImageProcessor.post_process
|
|
||||||
def post_process(self, outputs, target_sizes):
|
def post_process(self, outputs, target_sizes):
|
||||||
"""
|
"""
|
||||||
Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x,
|
Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x,
|
||||||
@@ -996,7 +969,6 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
# Copied from transformers.models.deformable_detr.image_processing_deformable_detr.DeformableDetrImageProcessor.post_process_object_detection
|
|
||||||
def post_process_object_detection(
|
def post_process_object_detection(
|
||||||
self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100
|
self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100
|
||||||
):
|
):
|
||||||
|
|||||||
@@ -0,0 +1,144 @@
|
|||||||
|
from typing import List, Tuple, Union
|
||||||
|
|
||||||
|
from transformers.models.detr.image_processing_detr_fast import DetrImageProcessorFast
|
||||||
|
|
||||||
|
from ...image_transforms import center_to_corners_format
|
||||||
|
from ...utils import (
|
||||||
|
TensorType,
|
||||||
|
is_torch_available,
|
||||||
|
logging,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if is_torch_available():
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level logger, named after this module; used below for the `post_process` deprecation warning.
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# Fast image processor for Deformable DETR. Everything not overridden here is inherited from
# `DetrImageProcessorFast`; this class only replaces the post-processing entry points.
# (A `#` comment is used instead of a class docstring so that no inherited docstring is shadowed.)
class DeformableDetrImageProcessorFast(DetrImageProcessorFast):
    def post_process(self, outputs, target_sizes):
        """
        Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x,
        top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.

        Args:
            outputs ([`DeformableDetrObjectDetectionOutput`]):
                Raw outputs of the model.
            target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
                Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the
                original image size (before any data augmentation). For visualization, this should be the image size
                after data augmentation, but before padding.
        Returns:
            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
            in the batch as predicted by the model.

        Raises:
            ValueError: If the number of target sizes differs from the batch dimension of the logits, or if
                `target_sizes` does not have exactly two columns.
        """
        logger.warning_once(
            "`post_process` is deprecated and will be removed in v5 of Transformers, please use"
            " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
        )

        out_logits, out_bbox = outputs.logits, outputs.pred_boxes

        if len(out_logits) != len(target_sizes):
            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
        if target_sizes.shape[1] != 2:
            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")

        # Flatten everything after the batch dimension so that top-k runs jointly over all candidates.
        prob = out_logits.sigmoid().view(out_logits.shape[0], -1)
        # Never request more candidates than exist; `torch.topk` raises otherwise.
        k_value = min(100, prob.size(1))
        topk_values, topk_indexes = torch.topk(prob, k_value, dim=1)
        scores = topk_values
        # Decompose each flat index into (box index, label index) using the size of the last logits dimension.
        topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
        labels = topk_indexes % out_logits.shape[2]
        boxes = center_to_corners_format(out_bbox)
        boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))

        # and from relative [0, 1] to absolute [0, height] coordinates
        img_h, img_w = target_sizes.unbind(1)
        # Keep the scale factors on the same device as the boxes (mirrors `post_process_object_detection`).
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
        boxes = boxes * scale_fct[:, None, :]

        results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]

        return results

    def post_process_object_detection(
        self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100
    ):
        """
        Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x,
        top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.

        Args:
            outputs ([`DeformableDetrObjectDetectionOutput`]):
                Raw outputs of the model.
            threshold (`float`, *optional*, defaults to 0.5):
                Score threshold to keep object detection predictions.
            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
                (height, width) of each image in the batch. If left to None, predictions will not be resized.
            top_k (`int`, *optional*, defaults to 100):
                Keep only top k bounding boxes before filtering by thresholding.

        Returns:
            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
            in the batch as predicted by the model.

        Raises:
            ValueError: If `target_sizes` is given and its length differs from the batch dimension of the logits.
        """
        out_logits, out_bbox = outputs.logits, outputs.pred_boxes

        if target_sizes is not None:
            if len(out_logits) != len(target_sizes):
                raise ValueError(
                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
                )

        prob = out_logits.sigmoid()
        prob = prob.view(out_logits.shape[0], -1)
        # Clamp so that `torch.topk` is never asked for more entries than are available.
        k_value = min(top_k, prob.size(1))
        topk_values, topk_indexes = torch.topk(prob, k_value, dim=1)
        scores = topk_values
        # Decompose each flat index into (box index, label index) using the size of the last logits dimension.
        topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
        labels = topk_indexes % out_logits.shape[2]
        boxes = center_to_corners_format(out_bbox)
        boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))

        # and from relative [0, 1] to absolute [0, height] coordinates
        if target_sizes is not None:
            if isinstance(target_sizes, (list, tuple)):
                img_h = torch.Tensor([size[0] for size in target_sizes])
                img_w = torch.Tensor([size[1] for size in target_sizes])
            else:
                img_h, img_w = target_sizes.unbind(1)
            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
            boxes = boxes * scale_fct[:, None, :]

        results = []
        for s, l, b in zip(scores, labels, boxes):
            # Build the selection mask once and reuse it for all three tensors.
            keep = s > threshold
            results.append({"scores": s[keep], "labels": l[keep], "boxes": b[keep]})

        return results

    # The stubs below accept `self` plus arbitrary arguments so that calling them on an instance
    # raises the intended `NotImplementedError` rather than a `TypeError` about the argument count.

    def post_process_segmentation(self, *args, **kwargs):
        """Not available for Deformable DETR; always raises `NotImplementedError`."""
        raise NotImplementedError("Segmentation post-processing is not implemented for Deformable DETR yet.")

    def post_process_instance(self, *args, **kwargs):
        """Not available for Deformable DETR; always raises `NotImplementedError`."""
        raise NotImplementedError("Instance post-processing is not implemented for Deformable DETR yet.")

    def post_process_panoptic(self, *args, **kwargs):
        """Not available for Deformable DETR; always raises `NotImplementedError`."""
        raise NotImplementedError("Panoptic post-processing is not implemented for Deformable DETR yet.")

    def post_process_instance_segmentation(self, *args, **kwargs):
        """Not available for Deformable DETR; always raises `NotImplementedError`."""
        raise NotImplementedError("Segmentation post-processing is not implemented for Deformable DETR yet.")

    def post_process_semantic_segmentation(self, *args, **kwargs):
        """Not available for Deformable DETR; always raises `NotImplementedError`."""
        raise NotImplementedError("Semantic segmentation post-processing is not implemented for Deformable DETR yet.")

    def post_process_panoptic_segmentation(self, *args, **kwargs):
        """Not available for Deformable DETR; always raises `NotImplementedError`."""
        raise NotImplementedError("Panoptic segmentation post-processing is not implemented for Deformable DETR yet.")
|
||||||
|
|
||||||
|
|
||||||
|
# Names exported by `from <this module> import *`.
__all__ = ["DeformableDetrImageProcessorFast"]
|
||||||
@@ -72,16 +72,14 @@ if is_torch_available():
|
|||||||
if is_vision_available():
|
if is_vision_available():
|
||||||
import PIL
|
import PIL
|
||||||
|
|
||||||
|
|
||||||
if is_torchvision_available():
|
|
||||||
from torchvision.io import read_image
|
|
||||||
|
|
||||||
if is_vision_available():
|
|
||||||
from ...image_utils import pil_torch_interpolation_mapping
|
from ...image_utils import pil_torch_interpolation_mapping
|
||||||
|
|
||||||
|
|
||||||
if is_torchvision_v2_available():
|
if is_torchvision_v2_available():
|
||||||
|
from torchvision.io import read_image
|
||||||
from torchvision.transforms.v2 import functional as F
|
from torchvision.transforms.v2 import functional as F
|
||||||
else:
|
elif is_torchvision_available():
|
||||||
|
from torchvision.io import read_image
|
||||||
from torchvision.transforms import functional as F
|
from torchvision.transforms import functional as F
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,19 +1,9 @@
|
|||||||
# coding=utf-8
|
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
# This file was automatically generated from src/transformers/models/rt_detr/modular_rt_detr.py.
|
||||||
#
|
# Do NOT edit this file manually as any edits will be overwritten by the generation of
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
# the file from the modular. If any change should be done, please apply the change to the
|
||||||
# you may not use this file except in compliance with the License.
|
# modular_rt_detr.py file directly. One of our CI enforces this.
|
||||||
# You may obtain a copy of the License at
|
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
"""Fast Image processor class for RT-DETR."""
|
|
||||||
|
|
||||||
import functools
|
import functools
|
||||||
import pathlib
|
import pathlib
|
||||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||||
@@ -26,10 +16,7 @@ from ...image_processing_utils_fast import (
|
|||||||
get_max_height_width,
|
get_max_height_width,
|
||||||
safe_squeeze,
|
safe_squeeze,
|
||||||
)
|
)
|
||||||
from ...image_transforms import (
|
from ...image_transforms import center_to_corners_format, corners_to_center_format
|
||||||
center_to_corners_format,
|
|
||||||
corners_to_center_format,
|
|
||||||
)
|
|
||||||
from ...image_utils import (
|
from ...image_utils import (
|
||||||
IMAGENET_DEFAULT_MEAN,
|
IMAGENET_DEFAULT_MEAN,
|
||||||
IMAGENET_DEFAULT_STD,
|
IMAGENET_DEFAULT_STD,
|
||||||
@@ -51,30 +38,25 @@ from ...utils import (
|
|||||||
is_torch_available,
|
is_torch_available,
|
||||||
is_torchvision_available,
|
is_torchvision_available,
|
||||||
is_torchvision_v2_available,
|
is_torchvision_v2_available,
|
||||||
logging,
|
is_vision_available,
|
||||||
requires_backends,
|
requires_backends,
|
||||||
)
|
)
|
||||||
from .image_processing_rt_detr import (
|
from .image_processing_rt_detr import get_size_with_aspect_ratio
|
||||||
get_size_with_aspect_ratio,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
if is_vision_available():
|
||||||
if is_torchvision_available():
|
|
||||||
from ...image_utils import pil_torch_interpolation_mapping
|
from ...image_utils import pil_torch_interpolation_mapping
|
||||||
|
|
||||||
|
|
||||||
if is_torchvision_v2_available():
|
if is_torchvision_v2_available():
|
||||||
from torchvision.transforms.v2 import functional as F
|
from torchvision.transforms.v2 import functional as F
|
||||||
else:
|
elif is_torchvision_available():
|
||||||
from torchvision.transforms import functional as F
|
from torchvision.transforms import functional as F
|
||||||
|
|
||||||
|
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
|
||||||
logger = logging.get_logger(__name__)
|
|
||||||
|
|
||||||
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION,)
|
|
||||||
|
|
||||||
|
|
||||||
def prepare_coco_detection_annotation(
|
def prepare_coco_detection_annotation(
|
||||||
@@ -138,13 +120,13 @@ def prepare_coco_detection_annotation(
|
|||||||
|
|
||||||
class RTDetrImageProcessorFast(BaseImageProcessorFast):
|
class RTDetrImageProcessorFast(BaseImageProcessorFast):
|
||||||
r"""
|
r"""
|
||||||
Constructs a fast RT-DETR DETR image processor.
|
Constructs a fast RTDetr image processor.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
||||||
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
||||||
do_resize (`bool`, *optional*, defaults to `True`):
|
do_resize (`bool`, *optional*, defaults to `True`):
|
||||||
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
|
Controls whether to resize the image's `(height, width)` dimensions to the specified `size`. Can be
|
||||||
overridden by the `do_resize` parameter in the `preprocess` method.
|
overridden by the `do_resize` parameter in the `preprocess` method.
|
||||||
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
|
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
|
||||||
Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
|
Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
|
||||||
@@ -175,7 +157,7 @@ class RTDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
|
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
|
||||||
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
||||||
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||||
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
|
Controls whether to convert the annotations to the format expected by the RT_DETR model. Converts the
|
||||||
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||||
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||||
do_pad (`bool`, *optional*, defaults to `False`):
|
do_pad (`bool`, *optional*, defaults to `False`):
|
||||||
@@ -237,7 +219,7 @@ class RTDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||||
) -> Dict:
|
) -> Dict:
|
||||||
"""
|
"""
|
||||||
Prepare an annotation for feeding into RTDETR model.
|
Prepare an annotation for feeding into RT_DETR model.
|
||||||
"""
|
"""
|
||||||
format = format if format is not None else self.format
|
format = format if format is not None else self.format
|
||||||
|
|
||||||
@@ -250,7 +232,6 @@ class RTDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
raise ValueError(f"Format {format} is not supported.")
|
raise ValueError(f"Format {format} is not supported.")
|
||||||
return target
|
return target
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.resize
|
|
||||||
def resize(
|
def resize(
|
||||||
self,
|
self,
|
||||||
image: torch.Tensor,
|
image: torch.Tensor,
|
||||||
@@ -305,7 +286,6 @@ class RTDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
)
|
)
|
||||||
return image
|
return image
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.resize_annotation
|
|
||||||
def resize_annotation(
|
def resize_annotation(
|
||||||
self,
|
self,
|
||||||
annotation: Dict[str, Any],
|
annotation: Dict[str, Any],
|
||||||
@@ -359,7 +339,6 @@ class RTDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
|
|
||||||
return new_annotation
|
return new_annotation
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.normalize_annotation
|
|
||||||
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
|
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
|
||||||
image_height, image_width = image_size
|
image_height, image_width = image_size
|
||||||
norm_annotation = {}
|
norm_annotation = {}
|
||||||
@@ -375,7 +354,6 @@ class RTDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
norm_annotation[key] = value
|
norm_annotation[key] = value
|
||||||
return norm_annotation
|
return norm_annotation
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast._update_annotation_for_padded_image
|
|
||||||
def _update_annotation_for_padded_image(
|
def _update_annotation_for_padded_image(
|
||||||
self,
|
self,
|
||||||
annotation: Dict,
|
annotation: Dict,
|
||||||
@@ -411,7 +389,6 @@ class RTDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
new_annotation[key] = value
|
new_annotation[key] = value
|
||||||
return new_annotation
|
return new_annotation
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.pad
|
|
||||||
def pad(
|
def pad(
|
||||||
self,
|
self,
|
||||||
image: torch.Tensor,
|
image: torch.Tensor,
|
||||||
@@ -443,7 +420,6 @@ class RTDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
return image, pixel_mask, annotation
|
return image, pixel_mask, annotation
|
||||||
|
|
||||||
@functools.lru_cache(maxsize=1)
|
@functools.lru_cache(maxsize=1)
|
||||||
# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast._validate_input_arguments
|
|
||||||
def _validate_input_arguments(
|
def _validate_input_arguments(
|
||||||
self,
|
self,
|
||||||
do_rescale: bool,
|
do_rescale: bool,
|
||||||
@@ -726,7 +702,6 @@ class RTDetrImageProcessorFast(BaseImageProcessorFast):
|
|||||||
]
|
]
|
||||||
return encoded_inputs
|
return encoded_inputs
|
||||||
|
|
||||||
# Copied from transformers.models.rt_detr.image_processing_rt_detr.RTDetrImageProcessor.post_process_object_detection
|
|
||||||
def post_process_object_detection(
|
def post_process_object_detection(
|
||||||
self,
|
self,
|
||||||
outputs,
|
outputs,
|
||||||
|
|||||||
577
src/transformers/models/rt_detr/modular_rt_detr.py
Normal file
577
src/transformers/models/rt_detr/modular_rt_detr.py
Normal file
@@ -0,0 +1,577 @@
|
|||||||
|
import pathlib
|
||||||
|
from typing import Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
from transformers.models.detr.image_processing_detr_fast import DetrImageProcessorFast
|
||||||
|
|
||||||
|
from ...image_processing_utils import BatchFeature, get_size_dict
|
||||||
|
from ...image_processing_utils_fast import (
|
||||||
|
BaseImageProcessorFast,
|
||||||
|
SizeDict,
|
||||||
|
get_max_height_width,
|
||||||
|
)
|
||||||
|
from ...image_transforms import center_to_corners_format
|
||||||
|
from ...image_utils import (
|
||||||
|
IMAGENET_DEFAULT_MEAN,
|
||||||
|
IMAGENET_DEFAULT_STD,
|
||||||
|
AnnotationFormat,
|
||||||
|
AnnotationType,
|
||||||
|
ChannelDimension,
|
||||||
|
ImageInput,
|
||||||
|
ImageType,
|
||||||
|
PILImageResampling,
|
||||||
|
get_image_size,
|
||||||
|
get_image_type,
|
||||||
|
infer_channel_dimension_format,
|
||||||
|
make_list_of_images,
|
||||||
|
validate_annotations,
|
||||||
|
)
|
||||||
|
from ...utils import (
|
||||||
|
TensorType,
|
||||||
|
filter_out_non_signature_kwargs,
|
||||||
|
is_torch_available,
|
||||||
|
is_torchvision_available,
|
||||||
|
is_torchvision_v2_available,
|
||||||
|
is_vision_available,
|
||||||
|
logging,
|
||||||
|
requires_backends,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if is_torch_available():
|
||||||
|
import torch
|
||||||
|
|
||||||
|
if is_vision_available():
|
||||||
|
from ...image_utils import pil_torch_interpolation_mapping
|
||||||
|
|
||||||
|
|
||||||
|
if is_torchvision_v2_available():
|
||||||
|
from torchvision.transforms.v2 import functional as F
|
||||||
|
elif is_torchvision_available():
|
||||||
|
from torchvision.transforms import functional as F
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION,)
|
||||||
|
|
||||||
|
|
||||||
|
def prepare_coco_detection_annotation(
    image,
    target,
    return_segmentation_masks: bool = False,
    input_data_format: Optional[Union[ChannelDimension, str]] = None,
):
    """
    Convert the target in COCO format into the format expected by RT-DETR.

    Args:
        image (`torch.Tensor`):
            Image the annotations belong to. Only its last two dimensions (height, width) and its device are used.
        target (`Dict`):
            COCO-style dictionary with an "image_id" entry and an "annotations" list. Every object must provide
            "category_id", "area" and "bbox" (`[x, y, width, height]`); "iscrowd" and "keypoints" are optional.
        return_segmentation_masks (`bool`, *optional*, defaults to `False`):
            Accepted for signature compatibility; not used by this function.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            Accepted for signature compatibility; not used by this function.

    Returns:
        `Dict`: Tensors on the image's device under the keys "image_id", "class_labels", "boxes" (corner format,
        clipped to the image), "area", "iscrowd", "orig_size" and, when any object carries keypoints, "keypoints".
    """
    device = image.device
    height, width = image.size()[-2:]

    image_id = torch.as_tensor([target["image_id"]], dtype=torch.int64, device=device)

    # Crowd regions are dropped up front; an object without an "iscrowd" key counts as a regular instance.
    objects = [obj for obj in target["annotations"] if "iscrowd" not in obj or obj["iscrowd"] == 0]

    classes = torch.as_tensor([obj["category_id"] for obj in objects], dtype=torch.int64, device=device)
    area = torch.as_tensor([obj["area"] for obj in objects], dtype=torch.float32, device=device)
    iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=device)
    keypoints = [obj["keypoints"] for obj in objects if "keypoints" in obj]

    # The reshape keeps a well-formed (0, 4) tensor when there are no boxes at all.
    boxes = torch.as_tensor([obj["bbox"] for obj in objects], dtype=torch.float32, device=device).reshape(-1, 4)
    # (x, y, w, h) -> (x_min, y_min, x_max, y_max), then clip the corners to the image bounds.
    boxes[:, 2:] += boxes[:, :2]
    boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=width)
    boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=height)

    # Only boxes that still have a strictly positive height and width survive.
    keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])

    new_target = {
        "image_id": image_id,
        "class_labels": classes[keep],
        "boxes": boxes[keep],
        "area": area[keep],
        "iscrowd": iscrowd[keep],
        "orig_size": torch.as_tensor([int(height), int(width)], dtype=torch.int64, device=device),
    }

    if keypoints:
        kept_keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=device)[keep]
        # Flatten to rows of 3 values unless filtering removed every object.
        if kept_keypoints.shape[0]:
            kept_keypoints = kept_keypoints.reshape((-1, 3))
        new_target["keypoints"] = kept_keypoints

    return new_target
|
||||||
|
|
||||||
|
|
||||||
|
class RTDetrImageProcessorFast(DetrImageProcessorFast, BaseImageProcessorFast):
|
||||||
|
r"""
|
||||||
|
Constructs a fast RTDetr image processor.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
||||||
|
Data format of the annotations. Only "coco_detection" is supported.
|
||||||
|
do_resize (`bool`, *optional*, defaults to `True`):
|
||||||
|
Controls whether to resize the image's `(height, width)` dimensions to the specified `size`. Can be
|
||||||
|
overridden by the `do_resize` parameter in the `preprocess` method.
|
||||||
|
size (`Dict[str, int]` *optional*, defaults to `{"height": 640, "width": 640}`):
|
||||||
|
Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
|
||||||
|
in the `preprocess` method. Available options are:
|
||||||
|
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
|
||||||
|
Do NOT keep the aspect ratio.
|
||||||
|
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
|
||||||
|
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
|
||||||
|
less or equal to `longest_edge`.
|
||||||
|
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
|
||||||
|
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
|
||||||
|
`max_width`.
|
||||||
|
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
|
||||||
|
Resampling filter to use if resizing the image.
|
||||||
|
do_rescale (`bool`, *optional*, defaults to `True`):
|
||||||
|
Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
|
||||||
|
`do_rescale` parameter in the `preprocess` method.
|
||||||
|
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
||||||
|
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
|
||||||
|
`preprocess` method.
|
||||||
|
do_normalize (`bool`, *optional*, defaults to `False`):
|
||||||
|
Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
|
||||||
|
`preprocess` method.
|
||||||
|
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
|
||||||
|
Mean values to use when normalizing the image. Can be a single value or a list of values, one for each
|
||||||
|
channel. Can be overridden by the `image_mean` parameter in the `preprocess` method.
|
||||||
|
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
|
||||||
|
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
|
||||||
|
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
||||||
|
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||||
|
Controls whether to convert the annotations to the format expected by the RT_DETR model. Converts the
|
||||||
|
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||||
|
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||||
|
do_pad (`bool`, *optional*, defaults to `False`):
|
||||||
|
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
|
||||||
|
method. If `True`, padding will be applied to the bottom and right of the image with zeros.
|
||||||
|
If `pad_size` is provided, the image will be padded to the specified dimensions.
|
||||||
|
Otherwise, the image will be padded to the maximum height and width of the batch.
|
||||||
|
pad_size (`Dict[str, int]`, *optional*):
|
||||||
|
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
|
||||||
|
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
|
||||||
|
height and width in the batch.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
    self,
    format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
    do_resize: bool = True,
    size: Dict[str, int] = None,
    resample: Union[PILImageResampling, "F.InterpolationMode"] = PILImageResampling.BILINEAR,
    do_rescale: bool = True,
    rescale_factor: Union[int, float] = 1 / 255,
    do_normalize: bool = False,
    image_mean: Union[float, List[float]] = None,
    image_std: Union[float, List[float]] = None,
    do_convert_annotations: bool = True,
    do_pad: bool = False,
    pad_size: Optional[Dict[str, int]] = None,
    **kwargs,
) -> None:
    """
    Store the preprocessing configuration on the instance.

    `size` falls back to `{"height": 640, "width": 640}`, and `image_mean` / `image_std` fall back to
    `IMAGENET_DEFAULT_MEAN` / `IMAGENET_DEFAULT_STD` when left as `None`. Remaining keyword arguments are
    forwarded to `BaseImageProcessorFast.__init__`.
    """
    size = size if size is not None else {"height": 640, "width": 640}
    size = get_size_dict(size, default_to_square=False)

    # An explicit `None` means "follow `do_normalize`".
    if do_convert_annotations is None:
        do_convert_annotations = do_normalize

    # Call the base initializer directly rather than going through `DetrImageProcessorFast.__init__`.
    # The instance must be passed explicitly: accessed on the class, `__init__` is a plain function
    # and calling it without `self` raises a TypeError.
    BaseImageProcessorFast.__init__(self, **kwargs)
    self.format = format
    self.do_resize = do_resize
    self.size = size
    self.resample = resample
    self.do_rescale = do_rescale
    self.rescale_factor = rescale_factor
    self.do_normalize = do_normalize
    self.do_convert_annotations = do_convert_annotations
    self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
    self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
    self.do_pad = do_pad
    self.pad_size = pad_size
|
||||||
|
|
||||||
|
def prepare_annotation(
    self,
    image: torch.Tensor,
    target: Dict,
    format: Optional[AnnotationFormat] = None,
    return_segmentation_masks: bool = None,
    masks_path: Optional[Union[str, pathlib.Path]] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Dict:
    """
    Prepare an annotation for feeding into RT_DETR model.

    Args:
        image (`torch.Tensor`):
            Image the annotation belongs to.
        target (`Dict`):
            Raw annotation of the image.
        format (`AnnotationFormat`, *optional*):
            Annotation format; falls back to `self.format`. Only `AnnotationFormat.COCO_DETECTION` is handled.
        return_segmentation_masks (`bool`, *optional*):
            Treated as `False` when `None`; forwarded to `prepare_coco_detection_annotation`.
        masks_path (`str` or `pathlib.Path`, *optional*):
            Accepted for signature compatibility; not used by this method.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            Forwarded to `prepare_coco_detection_annotation`.

    Returns:
        `Dict`: The converted annotation.

    Raises:
        ValueError: If the resolved format is not `AnnotationFormat.COCO_DETECTION`.
    """
    if format is None:
        format = self.format

    # Reject anything but COCO detection before doing any work.
    if format != AnnotationFormat.COCO_DETECTION:
        raise ValueError(f"Format {format} is not supported.")

    if return_segmentation_masks is None:
        return_segmentation_masks = False
    return prepare_coco_detection_annotation(
        image, target, return_segmentation_masks, input_data_format=input_data_format
    )
|
||||||
|
|
||||||
|
@filter_out_non_signature_kwargs(extra=["device"])
def preprocess(
    self,
    images: ImageInput,
    annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
    return_segmentation_masks: bool = None,
    masks_path: Optional[Union[str, pathlib.Path]] = None,
    do_resize: Optional[bool] = None,
    size: Optional[Dict[str, int]] = None,
    resample: Optional[Union[PILImageResampling, "F.InterpolationMode"]] = None,
    do_rescale: Optional[bool] = None,
    rescale_factor: Optional[Union[int, float]] = None,
    do_normalize: Optional[bool] = None,
    do_convert_annotations: Optional[bool] = None,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    do_pad: Optional[bool] = None,
    format: Optional[Union[str, AnnotationFormat]] = None,
    return_tensors: Optional[Union[TensorType, str]] = None,
    data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
    pad_size: Optional[Dict[str, int]] = None,
    **kwargs,
) -> BatchFeature:
    """
    Preprocess an image or a batch of images (and optionally their annotations) for the model.

    Unless stated otherwise, an argument left as `None` falls back to the attribute of the same name stored on
    the processor.

    Args:
        images (`ImageInput`):
            Image or batch of images (PIL, NumPy or torch). Pixel values are expected in the range 0-255; for
            images already scaled to 0-1, pass `do_rescale=False`.
        annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
            One annotation dictionary per image; a single dictionary is wrapped into a list. For object
            detection each dictionary holds:
            - "image_id" (`int`): The image id.
            - "annotations" (`List[Dict]`): The objects of the image; the list may be empty.
        return_segmentation_masks (`bool`, *optional*):
            Forwarded to `prepare_annotation`.
        masks_path (`str` or `pathlib.Path`, *optional*):
            Forwarded to `prepare_annotation`.
        do_resize (`bool`, *optional*):
            Whether to resize the images.
        size (`Dict[str, int]`, *optional*):
            Target size, normalized with `get_size_dict(..., default_to_square=True)`. Available options are:
            - `{"height": int, "width": int}`: resize to exactly `(height, width)`, ignoring the aspect ratio.
            - `{"shortest_edge": int, "longest_edge": int}`: resize while keeping the aspect ratio, with the
              shortest edge at most `shortest_edge` and the longest edge at most `longest_edge`.
            - `{"max_height": int, "max_width": int}`: resize while keeping the aspect ratio, with the height
              at most `max_height` and the width at most `max_width`.
        resample (`PILImageResampling` or `InterpolationMode`, *optional*):
            Resampling filter. `PILImageResampling` / `int` values are translated through
            `pil_torch_interpolation_mapping`; anything else is passed to `resize` as is.
        do_rescale (`bool`, *optional*):
            Whether to multiply the images by `rescale_factor`.
        rescale_factor (`int` or `float`, *optional*):
            Factor used when rescaling.
        do_normalize (`bool`, *optional*):
            Whether to normalize the images with `image_mean` and `image_std`.
        do_convert_annotations (`bool`, *optional*):
            Whether to run `normalize_annotation` on the annotations (relative, center-format boxes) and to
            update boxes when padding.
        image_mean (`float` or `List[float]`, *optional*):
            Mean used for normalization.
        image_std (`float` or `List[float]`, *optional*):
            Standard deviation used for normalization.
        do_pad (`bool`, *optional*):
            Whether to pad the images. Pads to `pad_size` when given, otherwise to the largest height and
            width in the batch, and adds a "pixel_mask" entry to the output.
        format (`str` or `AnnotationFormat`, *optional*):
            Format of the annotations.
        return_tensors (`str` or `TensorType`, *optional*):
            Tensor type of the returned `BatchFeature`. `None` is treated as `"pt"`.
        data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
            Passed to `_validate_input_arguments`.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            Channel layout of the input images, inferred from the first image when unset. Channels-last
            images are permuted to channels-first before processing.
        pad_size (`Dict[str, int]`, *optional*):
            The size `{"height": int, "width" int}` to pad the images to.
        device (via `kwargs`, *optional*):
            If given, every image is moved to this device before processing.

    Returns:
        `BatchFeature`: Holds "pixel_values", plus "pixel_mask" when `do_pad` is set and "labels" (one
        `BatchFeature` per image) when annotations were given.

    Raises:
        ValueError: If the image type is not PIL, torch or NumPy, or if the number of annotations differs
            from the number of images.
    """
    # Per-call arguments win over the values configured on the instance.
    if do_resize is None:
        do_resize = self.do_resize
    if size is None:
        size = self.size
    size = get_size_dict(size=size, default_to_square=True)
    if resample is None:
        resample = self.resample
    if do_rescale is None:
        do_rescale = self.do_rescale
    if rescale_factor is None:
        rescale_factor = self.rescale_factor
    if do_normalize is None:
        do_normalize = self.do_normalize
    if image_mean is None:
        image_mean = self.image_mean
    if image_std is None:
        image_std = self.image_std
    if do_convert_annotations is None:
        do_convert_annotations = self.do_convert_annotations
    if do_pad is None:
        do_pad = self.do_pad
    if pad_size is None:
        pad_size = self.pad_size
    if format is None:
        format = self.format
    if return_tensors is None:
        return_tensors = "pt"
    device = kwargs.pop("device", None)

    # The validation call below needs hashable arguments, so dicts and lists are converted first.
    size = SizeDict(**size)
    if isinstance(image_mean, list):
        image_mean = tuple(image_mean)
    if isinstance(image_std, list):
        image_std = tuple(image_std)

    images = make_list_of_images(images)
    image_type = get_image_type(images[0])
    if image_type not in (ImageType.PIL, ImageType.TORCH, ImageType.NUMPY):
        raise ValueError(f"Unsupported input image type {image_type}")

    self._validate_input_arguments(
        do_rescale=do_rescale,
        rescale_factor=rescale_factor,
        do_normalize=do_normalize,
        image_mean=image_mean,
        image_std=image_std,
        do_resize=do_resize,
        size=size,
        resample=resample,
        return_tensors=return_tensors,
        data_format=data_format,
    )

    # A lone annotation dict is treated as a batch of one.
    if isinstance(annotations, dict):
        annotations = [annotations]
    has_annotations = annotations is not None

    if has_annotations and len(images) != len(annotations):
        raise ValueError(
            f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match."
        )

    format = AnnotationFormat(format)
    if has_annotations:
        validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)

    # Bring every input to a torch tensor.
    if image_type == ImageType.PIL:
        images = [F.pil_to_tensor(pil_image) for pil_image in images]
    elif image_type == ImageType.NUMPY:
        # not using F.to_tensor as it doesn't handle (C, H, W) numpy arrays
        images = [torch.from_numpy(array).contiguous() for array in images]

    if device is not None:
        images = [tensor.to(device) for tensor in images]

    # We assume that all images have the same channel dimension format.
    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(images[0])
    if input_data_format == ChannelDimension.LAST:
        images = [tensor.permute(2, 0, 1).contiguous() for tensor in images]
        input_data_format = ChannelDimension.FIRST

    fuse_rescale_and_normalize = do_rescale and do_normalize
    if fuse_rescale_and_normalize:
        # Scaling mean and std by 1 / rescale_factor lets a single normalize call do both steps.
        inverse_scale = 1.0 / rescale_factor
        new_mean = torch.tensor(image_mean, device=images[0].device) * inverse_scale
        new_std = torch.tensor(image_std, device=images[0].device) * inverse_scale

    processed_images = []
    processed_annotations = []
    per_image_annotations = annotations if has_annotations else [None] * len(images)
    for image, annotation in zip(images, per_image_annotations):
        # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
        if has_annotations:
            annotation = self.prepare_annotation(
                image,
                annotation,
                format,
                return_segmentation_masks=return_segmentation_masks,
                masks_path=masks_path,
                input_data_format=input_data_format,
            )

        if do_resize:
            if isinstance(resample, (PILImageResampling, int)):
                interpolation = pil_torch_interpolation_mapping[resample]
            else:
                interpolation = resample
            resized_image = self.resize(image, size=size, interpolation=interpolation)
            if has_annotations:
                annotation = self.resize_annotation(
                    annotation,
                    orig_size=image.size()[-2:],
                    target_size=resized_image.size()[-2:],
                )
            image = resized_image

        if fuse_rescale_and_normalize:
            image = F.normalize(image.to(dtype=torch.float32), new_mean, new_std)
        elif do_rescale:
            image = image * rescale_factor
        elif do_normalize:
            image = F.normalize(image, image_mean, image_std)

        if do_convert_annotations and has_annotations:
            annotation = self.normalize_annotation(annotation, get_image_size(image, input_data_format))

        processed_images.append(image)
        processed_annotations.append(annotation)

    images = processed_images
    annotations = processed_annotations if has_annotations else None

    data = {}
    if do_pad:
        # depends on all resized image shapes so we need another loop
        if pad_size is not None:
            padded_size = (pad_size["height"], pad_size["width"])
        else:
            padded_size = get_max_height_width(images)

        padded_images = []
        padded_annotations = []
        pixel_masks = []
        per_image_annotations = annotations if has_annotations else [None] * len(images)
        for image, annotation in zip(images, per_image_annotations):
            if padded_size == image.size()[-2:]:
                # Already at the target size: keep the image and mark every pixel as valid.
                pixel_mask = torch.ones(padded_size, dtype=torch.int64, device=image.device)
            else:
                image, pixel_mask, annotation = self.pad(
                    image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations
                )
            padded_images.append(image)
            padded_annotations.append(annotation)
            pixel_masks.append(pixel_mask)

        images = padded_images
        annotations = padded_annotations if has_annotations else None
        data["pixel_mask"] = torch.stack(pixel_masks, dim=0)

    data["pixel_values"] = torch.stack(images, dim=0)
    encoded_inputs = BatchFeature(data, tensor_type=return_tensors)
    if has_annotations:
        encoded_inputs["labels"] = [
            BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
        ]
    return encoded_inputs
|
||||||
|
|
||||||
|
def post_process_object_detection(
    self,
    outputs,
    threshold: float = 0.5,
    target_sizes: Union[TensorType, List[Tuple]] = None,
    use_focal_loss: bool = True,
):
    """
    Converts the raw output of an object detection model into final bounding boxes in (top_left_x, top_left_y,
    bottom_right_x, bottom_right_y) format. Only supports PyTorch.

    Args:
        outputs:
            Raw outputs of the model. Only `outputs.logits` (indexed here as `(batch, queries, classes)`) and
            `outputs.pred_boxes` (center-format boxes, converted to corner format below) are read.
        threshold (`float`, *optional*, defaults to 0.5):
            Score threshold to keep object detection predictions.
        target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
            Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
            `(height, width)` of each image in the batch. If unset, predictions will not be resized.
        use_focal_loss (`bool` defaults to `True`):
            Variable informing if the focal loss was used to predict the outputs. If `True`, a sigmoid is applied
            to compute the scores of each detection, otherwise, a softmax function is used.

    Returns:
        `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
        in the batch as predicted by the model.

    Raises:
        ValueError: If `target_sizes` is given and its length differs from the batch dimension of the logits.
    """
    requires_backends(self, ["torch"])
    out_logits, out_bbox = outputs.logits, outputs.pred_boxes
    # convert from relative cxcywh to absolute xyxy
    boxes = center_to_corners_format(out_bbox)
    if target_sizes is not None:
        if len(out_logits) != len(target_sizes):
            raise ValueError(
                "Make sure that you pass in as many target sizes as the batch dimension of the logits"
            )
        # Accept any plain Python sequence of (height, width) pairs, not only lists.
        if isinstance(target_sizes, (list, tuple)):
            img_h, img_w = torch.as_tensor(target_sizes).unbind(1)
        else:
            img_h, img_w = target_sizes.unbind(1)
        # Boxes are (x0, y0, x1, y1), so the per-image scale is (w, h, w, h).
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
        boxes = boxes * scale_fct[:, None, :]

    num_top_queries = out_logits.shape[1]
    num_classes = out_logits.shape[2]

    if use_focal_loss:
        scores = torch.sigmoid(out_logits)
        # Rank every (query, class) pair jointly, then recover the class id and the query index
        # from the position in the flattened tensor.
        scores, index = torch.topk(scores.flatten(1), num_top_queries, dim=-1)
        labels = index % num_classes
        index = index // num_classes
        boxes = boxes.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, boxes.shape[-1]))
    else:
        # `dim` must be explicit: for a 3-D input the implicit default normalises over dim 0 (the batch),
        # not over the classes. The last class column is dropped before taking the best class.
        scores = torch.nn.functional.softmax(out_logits, dim=-1)[:, :, :-1]
        # One (score, label) pair per query; the query count already equals `num_top_queries`,
        # so no further top-k selection is needed.
        scores, labels = scores.max(dim=-1)

    results = []
    for score, label, box in zip(scores, labels, boxes):
        keep = score > threshold
        results.append(
            {
                "scores": score[keep],
                "labels": label[keep],
                "boxes": box[keep],
            }
        )

    return results
|
||||||
|
|
||||||
|
def from_dict():
    raise NotImplementedError("No need to override this method for RT-DETR yet.")  # parameterless stub (no `self`), always raises. NOTE(review): presumably a marker for the modular converter rather than a callable method - confirm
|
||||||
|
|
||||||
|
def post_process():
    raise NotImplementedError("Post-processing is not implemented for RT-DETR yet.")  # parameterless stub (no `self`), always raises. NOTE(review): presumably a marker for the modular converter rather than a callable method - confirm
|
||||||
|
|
||||||
|
def post_process_segmentation():
    raise NotImplementedError("Segmentation post-processing is not implemented for RT-DETR yet.")  # parameterless stub (no `self`), always raises. NOTE(review): presumably a marker for the modular converter rather than a callable method - confirm
|
||||||
|
|
||||||
|
def post_process_instance():
    raise NotImplementedError("Instance post-processing is not implemented for RT-DETR yet.")  # parameterless stub (no `self`), always raises. NOTE(review): presumably a marker for the modular converter rather than a callable method - confirm
|
||||||
|
|
||||||
|
def post_process_panoptic():
    raise NotImplementedError("Panoptic post-processing is not implemented for RT-DETR yet.")  # parameterless stub (no `self`), always raises. NOTE(review): presumably a marker for the modular converter rather than a callable method - confirm
|
||||||
|
|
||||||
|
def post_process_instance_segmentation():
    raise NotImplementedError("Segmentation post-processing is not implemented for RT-DETR yet.")  # parameterless stub (no `self`), always raises; message says "Segmentation", not "Instance segmentation". NOTE(review): presumably a marker for the modular converter - confirm
|
||||||
|
|
||||||
|
def post_process_semantic_segmentation():
    raise NotImplementedError("Semantic segmentation post-processing is not implemented for RT-DETR yet.")  # parameterless stub (no `self`), always raises. NOTE(review): presumably a marker for the modular converter rather than a callable method - confirm
|
||||||
|
|
||||||
|
def post_process_panoptic_segmentation():
    raise NotImplementedError("Panoptic segmentation post-processing is not implemented for RT-DETR yet.")  # parameterless stub (no `self`), always raises. NOTE(review): presumably a marker for the modular converter rather than a callable method - confirm
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["RTDetrImageProcessorFast"]
|
||||||
@@ -18,7 +18,9 @@ console = Console()
|
|||||||
|
|
||||||
|
|
||||||
def process_file(modular_file_path, generated_modeling_content, file_type="modeling_", fix_and_overwrite=False):
|
def process_file(modular_file_path, generated_modeling_content, file_type="modeling_", fix_and_overwrite=False):
|
||||||
file_path = modular_file_path.replace("modular_", f"{file_type}_")
|
file_name_prefix = file_type.split("*")[0]
|
||||||
|
file_name_suffix = file_type.split("*")[-1] if "*" in file_type else ""
|
||||||
|
file_path = modular_file_path.replace("modular_", f"{file_name_prefix}_").replace(".py", f"{file_name_suffix}.py")
|
||||||
# Read the actual modeling file
|
# Read the actual modeling file
|
||||||
with open(file_path, "r") as modeling_file:
|
with open(file_path, "r") as modeling_file:
|
||||||
content = modeling_file.read()
|
content = modeling_file.read()
|
||||||
|
|||||||
@@ -7,10 +7,15 @@ def topological_sort(dependencies):
|
|||||||
new_dependencies = {}
|
new_dependencies = {}
|
||||||
graph = defaultdict(list)
|
graph = defaultdict(list)
|
||||||
for node, deps in dependencies.items():
|
for node, deps in dependencies.items():
|
||||||
|
node_name = node.split("/")[-2]
|
||||||
for dep in deps:
|
for dep in deps:
|
||||||
if "example" not in node and "auto" not in dep:
|
dep_name = dep.split(".")[-2]
|
||||||
graph[dep.split(".")[-2]].append(node.split("/")[-2])
|
if dep_name == node_name:
|
||||||
new_dependencies[node.split("/")[-2]] = node
|
# Skip self dependencies for topological sort as they create cycles
|
||||||
|
continue
|
||||||
|
if "example" not in node and "auto" not in dep and node_name not in graph[dep_name]:
|
||||||
|
graph[dep_name].append(node_name)
|
||||||
|
new_dependencies[node_name] = node
|
||||||
|
|
||||||
# Create a graph and in-degree count for each node
|
# Create a graph and in-degree count for each node
|
||||||
def filter_one_by_one(filtered_list, reverse):
|
def filter_one_by_one(filtered_list, reverse):
|
||||||
@@ -54,7 +59,7 @@ def extract_classes_and_imports(file_path):
|
|||||||
for node in ast.walk(tree):
|
for node in ast.walk(tree):
|
||||||
if isinstance(node, (ast.Import, ast.ImportFrom)):
|
if isinstance(node, (ast.Import, ast.ImportFrom)):
|
||||||
module = node.module if isinstance(node, ast.ImportFrom) else None
|
module = node.module if isinstance(node, ast.ImportFrom) else None
|
||||||
if module and (".modeling_" in module):
|
if module and (".modeling_" in module or "transformers.models" in module):
|
||||||
imports.add(module)
|
imports.add(module)
|
||||||
return imports
|
return imports
|
||||||
|
|
||||||
|
|||||||
@@ -1059,6 +1059,7 @@ TYPE_TO_FILE_TYPE = {
|
|||||||
"Tokenizer": "tokenization",
|
"Tokenizer": "tokenization",
|
||||||
"Processor": "processing",
|
"Processor": "processing",
|
||||||
"ImageProcessor": "image_processing",
|
"ImageProcessor": "image_processing",
|
||||||
|
"ImageProcessorFast": "image_processing*_fast", # "*" indicates where to insert the model name before the "_fast" suffix
|
||||||
"FeatureExtractor": "feature_extractor",
|
"FeatureExtractor": "feature_extractor",
|
||||||
"ProcessorKwargs": "processing",
|
"ProcessorKwargs": "processing",
|
||||||
"ImagesKwargs": "processing",
|
"ImagesKwargs": "processing",
|
||||||
@@ -1658,11 +1659,16 @@ def convert_modular_file(modular_file):
|
|||||||
|
|
||||||
def save_modeling_file(modular_file, converted_file):
|
def save_modeling_file(modular_file, converted_file):
|
||||||
for file_type in converted_file.keys():
|
for file_type in converted_file.keys():
|
||||||
|
file_name_prefix = file_type.split("*")[0]
|
||||||
|
file_name_suffix = file_type.split("*")[-1] if "*" in file_type else ""
|
||||||
|
new_file_name = modular_file.replace("modular_", f"{file_name_prefix}_").replace(
|
||||||
|
".py", f"{file_name_suffix}.py"
|
||||||
|
)
|
||||||
non_comment_lines = len(
|
non_comment_lines = len(
|
||||||
[line for line in converted_file[file_type][0].strip().split("\n") if not line.strip().startswith("#")]
|
[line for line in converted_file[file_type][0].strip().split("\n") if not line.strip().startswith("#")]
|
||||||
)
|
)
|
||||||
if len(converted_file[file_type][0].strip()) > 0 and non_comment_lines > 0:
|
if len(converted_file[file_type][0].strip()) > 0 and non_comment_lines > 0:
|
||||||
with open(modular_file.replace("modular_", f"{file_type}_"), "w", encoding="utf-8") as f:
|
with open(new_file_name, "w", encoding="utf-8") as f:
|
||||||
f.write(converted_file[file_type][0])
|
f.write(converted_file[file_type][0])
|
||||||
else:
|
else:
|
||||||
non_comment_lines = len(
|
non_comment_lines = len(
|
||||||
@@ -1670,7 +1676,7 @@ def save_modeling_file(modular_file, converted_file):
|
|||||||
)
|
)
|
||||||
if len(converted_file[file_type][1].strip()) > 0 and non_comment_lines > 0:
|
if len(converted_file[file_type][1].strip()) > 0 and non_comment_lines > 0:
|
||||||
logger.warning("The modeling code contains errors, it's written without formatting")
|
logger.warning("The modeling code contains errors, it's written without formatting")
|
||||||
with open(modular_file.replace("modular_", f"{file_type}_"), "w", encoding="utf-8") as f:
|
with open(new_file_name, "w", encoding="utf-8") as f:
|
||||||
f.write(converted_file[file_type][1])
|
f.write(converted_file[file_type][1])
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user