[DETR] Update the processing to adapt masks & bboxes to reflect padding (#28363)
* Update the processing so bbox coords are adjusted for padding * Just pad masks * Tidy up, add tests * Better tests * Fix yolos and mark as slow for pycocotools * Fix yolos - return_tensors * Clarify padding and normalization behaviour
This commit is contained in:
@@ -280,7 +280,7 @@ class BridgeTowerImageProcessor(BaseImageProcessor):
|
|||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
|
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
|
||||||
def _pad_image(
|
def _pad_image(
|
||||||
self,
|
self,
|
||||||
image: np.ndarray,
|
image: np.ndarray,
|
||||||
@@ -308,7 +308,7 @@ class BridgeTowerImageProcessor(BaseImageProcessor):
|
|||||||
)
|
)
|
||||||
return padded_image
|
return padded_image
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
|
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
|
||||||
def pad(
|
def pad(
|
||||||
self,
|
self,
|
||||||
images: List[np.ndarray],
|
images: List[np.ndarray],
|
||||||
|
|||||||
@@ -785,9 +785,14 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
|||||||
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
|
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
|
||||||
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
|
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
|
||||||
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
||||||
|
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||||
|
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
|
||||||
|
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||||
|
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||||
do_pad (`bool`, *optional*, defaults to `True`):
|
do_pad (`bool`, *optional*, defaults to `True`):
|
||||||
Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
|
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
|
||||||
overridden by the `do_pad` parameter in the `preprocess` method.
|
method. If `True` will pad the images in the batch to the largest height and width in the batch.
|
||||||
|
Padding will be applied to the bottom and right of the image with zeros.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
model_input_names = ["pixel_values", "pixel_mask"]
|
model_input_names = ["pixel_values", "pixel_mask"]
|
||||||
@@ -804,6 +809,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
|||||||
do_normalize: bool = True,
|
do_normalize: bool = True,
|
||||||
image_mean: Union[float, List[float]] = None,
|
image_mean: Union[float, List[float]] = None,
|
||||||
image_std: Union[float, List[float]] = None,
|
image_std: Union[float, List[float]] = None,
|
||||||
|
do_convert_annotations: Optional[bool] = None,
|
||||||
do_pad: bool = True,
|
do_pad: bool = True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> None:
|
) -> None:
|
||||||
@@ -822,6 +828,10 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
|||||||
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
|
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
|
||||||
size = get_size_dict(size, max_size=max_size, default_to_square=False)
|
size = get_size_dict(size, max_size=max_size, default_to_square=False)
|
||||||
|
|
||||||
|
# Backwards compatibility
|
||||||
|
if do_convert_annotations is None:
|
||||||
|
do_convert_annotations = do_normalize
|
||||||
|
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.format = format
|
self.format = format
|
||||||
self.do_resize = do_resize
|
self.do_resize = do_resize
|
||||||
@@ -830,6 +840,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
|||||||
self.do_rescale = do_rescale
|
self.do_rescale = do_rescale
|
||||||
self.rescale_factor = rescale_factor
|
self.rescale_factor = rescale_factor
|
||||||
self.do_normalize = do_normalize
|
self.do_normalize = do_normalize
|
||||||
|
self.do_convert_annotations = do_convert_annotations
|
||||||
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
|
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
|
||||||
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
|
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
|
||||||
self.do_pad = do_pad
|
self.do_pad = do_pad
|
||||||
@@ -1007,18 +1018,64 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
|||||||
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
|
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
|
||||||
"""
|
"""
|
||||||
Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
|
Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
|
||||||
`[center_x, center_y, width, height]` format.
|
`[center_x, center_y, width, height]` format and from absolute to relative pixel values.
|
||||||
"""
|
"""
|
||||||
return normalize_annotation(annotation, image_size=image_size)
|
return normalize_annotation(annotation, image_size=image_size)
|
||||||
|
|
||||||
|
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
|
||||||
|
def _update_annotation_for_padded_image(
|
||||||
|
self,
|
||||||
|
annotation: Dict,
|
||||||
|
input_image_size: Tuple[int, int],
|
||||||
|
output_image_size: Tuple[int, int],
|
||||||
|
padding,
|
||||||
|
update_bboxes,
|
||||||
|
) -> Dict:
|
||||||
|
"""
|
||||||
|
Update the annotation for a padded image.
|
||||||
|
"""
|
||||||
|
new_annotation = {}
|
||||||
|
new_annotation["size"] = output_image_size
|
||||||
|
|
||||||
|
for key, value in annotation.items():
|
||||||
|
if key == "masks":
|
||||||
|
masks = value
|
||||||
|
masks = pad(
|
||||||
|
masks,
|
||||||
|
padding,
|
||||||
|
mode=PaddingMode.CONSTANT,
|
||||||
|
constant_values=0,
|
||||||
|
input_data_format=ChannelDimension.FIRST,
|
||||||
|
)
|
||||||
|
masks = safe_squeeze(masks, 1)
|
||||||
|
new_annotation["masks"] = masks
|
||||||
|
elif key == "boxes" and update_bboxes:
|
||||||
|
boxes = value
|
||||||
|
boxes *= np.asarray(
|
||||||
|
[
|
||||||
|
input_image_size[1] / output_image_size[1],
|
||||||
|
input_image_size[0] / output_image_size[0],
|
||||||
|
input_image_size[1] / output_image_size[1],
|
||||||
|
input_image_size[0] / output_image_size[0],
|
||||||
|
]
|
||||||
|
)
|
||||||
|
new_annotation["boxes"] = boxes
|
||||||
|
elif key == "size":
|
||||||
|
new_annotation["size"] = output_image_size
|
||||||
|
else:
|
||||||
|
new_annotation[key] = value
|
||||||
|
return new_annotation
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
|
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
|
||||||
def _pad_image(
|
def _pad_image(
|
||||||
self,
|
self,
|
||||||
image: np.ndarray,
|
image: np.ndarray,
|
||||||
output_size: Tuple[int, int],
|
output_size: Tuple[int, int],
|
||||||
|
annotation: Optional[Dict[str, Any]] = None,
|
||||||
constant_values: Union[float, Iterable[float]] = 0,
|
constant_values: Union[float, Iterable[float]] = 0,
|
||||||
data_format: Optional[ChannelDimension] = None,
|
data_format: Optional[ChannelDimension] = None,
|
||||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||||
|
update_bboxes: bool = True,
|
||||||
) -> np.ndarray:
|
) -> np.ndarray:
|
||||||
"""
|
"""
|
||||||
Pad an image with zeros to the given size.
|
Pad an image with zeros to the given size.
|
||||||
@@ -1037,25 +1094,33 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
|||||||
data_format=data_format,
|
data_format=data_format,
|
||||||
input_data_format=input_data_format,
|
input_data_format=input_data_format,
|
||||||
)
|
)
|
||||||
return padded_image
|
if annotation is not None:
|
||||||
|
annotation = self._update_annotation_for_padded_image(
|
||||||
|
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
|
||||||
|
)
|
||||||
|
return padded_image, annotation
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
|
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
|
||||||
def pad(
|
def pad(
|
||||||
self,
|
self,
|
||||||
images: List[np.ndarray],
|
images: List[np.ndarray],
|
||||||
|
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
|
||||||
constant_values: Union[float, Iterable[float]] = 0,
|
constant_values: Union[float, Iterable[float]] = 0,
|
||||||
return_pixel_mask: bool = True,
|
return_pixel_mask: bool = True,
|
||||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||||
data_format: Optional[ChannelDimension] = None,
|
data_format: Optional[ChannelDimension] = None,
|
||||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||||
|
update_bboxes: bool = True,
|
||||||
) -> BatchFeature:
|
) -> BatchFeature:
|
||||||
"""
|
"""
|
||||||
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
|
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
|
||||||
in the batch and optionally returns their corresponding pixel mask.
|
in the batch and optionally returns their corresponding pixel mask.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
image (`np.ndarray`):
|
images (List[`np.ndarray`]):
|
||||||
Image to pad.
|
Images to pad.
|
||||||
|
annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
|
||||||
|
Annotations to transform according to the padding that is applied to the images.
|
||||||
constant_values (`float` or `Iterable[float]`, *optional*):
|
constant_values (`float` or `Iterable[float]`, *optional*):
|
||||||
The value to use for the padding if `mode` is `"constant"`.
|
The value to use for the padding if `mode` is `"constant"`.
|
||||||
return_pixel_mask (`bool`, *optional*, defaults to `True`):
|
return_pixel_mask (`bool`, *optional*, defaults to `True`):
|
||||||
@@ -1071,19 +1136,29 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
|||||||
The channel dimension format of the image. If not provided, it will be the same as the input image.
|
The channel dimension format of the image. If not provided, it will be the same as the input image.
|
||||||
input_data_format (`ChannelDimension` or `str`, *optional*):
|
input_data_format (`ChannelDimension` or `str`, *optional*):
|
||||||
The channel dimension format of the input image. If not provided, it will be inferred.
|
The channel dimension format of the input image. If not provided, it will be inferred.
|
||||||
|
update_bboxes (`bool`, *optional*, defaults to `True`):
|
||||||
|
Whether to update the bounding boxes in the annotations to match the padded images. If the
|
||||||
|
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
|
||||||
|
format, the bounding boxes will not be updated.
|
||||||
"""
|
"""
|
||||||
pad_size = get_max_height_width(images, input_data_format=input_data_format)
|
pad_size = get_max_height_width(images, input_data_format=input_data_format)
|
||||||
|
|
||||||
padded_images = [
|
annotation_list = annotations if annotations is not None else [None] * len(images)
|
||||||
self._pad_image(
|
padded_images = []
|
||||||
|
padded_annotations = []
|
||||||
|
for image, annotation in zip(images, annotation_list):
|
||||||
|
padded_image, padded_annotation = self._pad_image(
|
||||||
image,
|
image,
|
||||||
pad_size,
|
pad_size,
|
||||||
|
annotation,
|
||||||
constant_values=constant_values,
|
constant_values=constant_values,
|
||||||
data_format=data_format,
|
data_format=data_format,
|
||||||
input_data_format=input_data_format,
|
input_data_format=input_data_format,
|
||||||
|
update_bboxes=update_bboxes,
|
||||||
)
|
)
|
||||||
for image in images
|
padded_images.append(padded_image)
|
||||||
]
|
padded_annotations.append(padded_annotation)
|
||||||
|
|
||||||
data = {"pixel_values": padded_images}
|
data = {"pixel_values": padded_images}
|
||||||
|
|
||||||
if return_pixel_mask:
|
if return_pixel_mask:
|
||||||
@@ -1093,7 +1168,14 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
|||||||
]
|
]
|
||||||
data["pixel_mask"] = masks
|
data["pixel_mask"] = masks
|
||||||
|
|
||||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
|
||||||
|
|
||||||
|
if annotations is not None:
|
||||||
|
encoded_inputs["labels"] = [
|
||||||
|
BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
|
||||||
|
]
|
||||||
|
|
||||||
|
return encoded_inputs
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess
|
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess
|
||||||
def preprocess(
|
def preprocess(
|
||||||
@@ -1108,6 +1190,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
|||||||
do_rescale: Optional[bool] = None,
|
do_rescale: Optional[bool] = None,
|
||||||
rescale_factor: Optional[Union[int, float]] = None,
|
rescale_factor: Optional[Union[int, float]] = None,
|
||||||
do_normalize: Optional[bool] = None,
|
do_normalize: Optional[bool] = None,
|
||||||
|
do_convert_annotations: Optional[bool] = None,
|
||||||
image_mean: Optional[Union[float, List[float]]] = None,
|
image_mean: Optional[Union[float, List[float]]] = None,
|
||||||
image_std: Optional[Union[float, List[float]]] = None,
|
image_std: Optional[Union[float, List[float]]] = None,
|
||||||
do_pad: Optional[bool] = None,
|
do_pad: Optional[bool] = None,
|
||||||
@@ -1151,12 +1234,17 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
|||||||
Rescale factor to use when rescaling the image.
|
Rescale factor to use when rescaling the image.
|
||||||
do_normalize (`bool`, *optional*, defaults to self.do_normalize):
|
do_normalize (`bool`, *optional*, defaults to self.do_normalize):
|
||||||
Whether to normalize the image.
|
Whether to normalize the image.
|
||||||
|
do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
|
||||||
|
Whether to convert the annotations to the format expected by the model. Converts the bounding
|
||||||
|
boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
|
||||||
|
and in relative coordinates.
|
||||||
image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
|
image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
|
||||||
Mean to use when normalizing the image.
|
Mean to use when normalizing the image.
|
||||||
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
|
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
|
||||||
Standard deviation to use when normalizing the image.
|
Standard deviation to use when normalizing the image.
|
||||||
do_pad (`bool`, *optional*, defaults to self.do_pad):
|
do_pad (`bool`, *optional*, defaults to self.do_pad):
|
||||||
Whether to pad the image.
|
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
|
||||||
|
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
|
||||||
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
|
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
|
||||||
Format of the annotations.
|
Format of the annotations.
|
||||||
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
|
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
|
||||||
@@ -1197,6 +1285,9 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
|||||||
do_normalize = self.do_normalize if do_normalize is None else do_normalize
|
do_normalize = self.do_normalize if do_normalize is None else do_normalize
|
||||||
image_mean = self.image_mean if image_mean is None else image_mean
|
image_mean = self.image_mean if image_mean is None else image_mean
|
||||||
image_std = self.image_std if image_std is None else image_std
|
image_std = self.image_std if image_std is None else image_std
|
||||||
|
do_convert_annotations = (
|
||||||
|
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
|
||||||
|
)
|
||||||
do_pad = self.do_pad if do_pad is None else do_pad
|
do_pad = self.do_pad if do_pad is None else do_pad
|
||||||
format = self.format if format is None else format
|
format = self.format if format is None else format
|
||||||
|
|
||||||
@@ -1300,29 +1391,34 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
|||||||
images = [
|
images = [
|
||||||
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
|
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
|
||||||
]
|
]
|
||||||
if annotations is not None:
|
|
||||||
annotations = [
|
if do_convert_annotations and annotations is not None:
|
||||||
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
|
annotations = [
|
||||||
for annotation, image in zip(annotations, images)
|
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
|
||||||
]
|
for annotation, image in zip(annotations, images)
|
||||||
|
]
|
||||||
|
|
||||||
if do_pad:
|
if do_pad:
|
||||||
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
|
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
|
||||||
data = self.pad(
|
encoded_inputs = self.pad(
|
||||||
images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format
|
images,
|
||||||
|
annotations=annotations,
|
||||||
|
return_pixel_mask=True,
|
||||||
|
data_format=data_format,
|
||||||
|
input_data_format=input_data_format,
|
||||||
|
return_tensors=return_tensors,
|
||||||
|
update_bboxes=do_convert_annotations,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
images = [
|
images = [
|
||||||
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
||||||
for image in images
|
for image in images
|
||||||
]
|
]
|
||||||
data = {"pixel_values": images}
|
encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
|
||||||
|
if annotations is not None:
|
||||||
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
|
encoded_inputs["labels"] = [
|
||||||
if annotations is not None:
|
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
|
||||||
encoded_inputs["labels"] = [
|
]
|
||||||
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
|
|
||||||
]
|
|
||||||
|
|
||||||
return encoded_inputs
|
return encoded_inputs
|
||||||
|
|
||||||
|
|||||||
@@ -783,9 +783,14 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
|||||||
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
|
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
|
||||||
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
|
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
|
||||||
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
||||||
|
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||||
|
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
|
||||||
|
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||||
|
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||||
do_pad (`bool`, *optional*, defaults to `True`):
|
do_pad (`bool`, *optional*, defaults to `True`):
|
||||||
Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
|
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
|
||||||
overridden by the `do_pad` parameter in the `preprocess` method.
|
method. If `True` will pad the images in the batch to the largest height and width in the batch.
|
||||||
|
Padding will be applied to the bottom and right of the image with zeros.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
model_input_names = ["pixel_values", "pixel_mask"]
|
model_input_names = ["pixel_values", "pixel_mask"]
|
||||||
@@ -802,6 +807,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
|||||||
do_normalize: bool = True,
|
do_normalize: bool = True,
|
||||||
image_mean: Union[float, List[float]] = None,
|
image_mean: Union[float, List[float]] = None,
|
||||||
image_std: Union[float, List[float]] = None,
|
image_std: Union[float, List[float]] = None,
|
||||||
|
do_convert_annotations: Optional[bool] = None,
|
||||||
do_pad: bool = True,
|
do_pad: bool = True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> None:
|
) -> None:
|
||||||
@@ -820,6 +826,10 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
|||||||
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
|
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
|
||||||
size = get_size_dict(size, max_size=max_size, default_to_square=False)
|
size = get_size_dict(size, max_size=max_size, default_to_square=False)
|
||||||
|
|
||||||
|
# Backwards compatibility
|
||||||
|
if do_convert_annotations is None:
|
||||||
|
do_convert_annotations = do_normalize
|
||||||
|
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.format = format
|
self.format = format
|
||||||
self.do_resize = do_resize
|
self.do_resize = do_resize
|
||||||
@@ -828,6 +838,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
|||||||
self.do_rescale = do_rescale
|
self.do_rescale = do_rescale
|
||||||
self.rescale_factor = rescale_factor
|
self.rescale_factor = rescale_factor
|
||||||
self.do_normalize = do_normalize
|
self.do_normalize = do_normalize
|
||||||
|
self.do_convert_annotations = do_convert_annotations
|
||||||
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
|
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
|
||||||
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
|
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
|
||||||
self.do_pad = do_pad
|
self.do_pad = do_pad
|
||||||
@@ -1005,18 +1016,64 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
|||||||
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
|
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
|
||||||
"""
|
"""
|
||||||
Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
|
Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
|
||||||
`[center_x, center_y, width, height]` format.
|
`[center_x, center_y, width, height]` format and from absolute to relative pixel values.
|
||||||
"""
|
"""
|
||||||
return normalize_annotation(annotation, image_size=image_size)
|
return normalize_annotation(annotation, image_size=image_size)
|
||||||
|
|
||||||
|
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
|
||||||
|
def _update_annotation_for_padded_image(
|
||||||
|
self,
|
||||||
|
annotation: Dict,
|
||||||
|
input_image_size: Tuple[int, int],
|
||||||
|
output_image_size: Tuple[int, int],
|
||||||
|
padding,
|
||||||
|
update_bboxes,
|
||||||
|
) -> Dict:
|
||||||
|
"""
|
||||||
|
Update the annotation for a padded image.
|
||||||
|
"""
|
||||||
|
new_annotation = {}
|
||||||
|
new_annotation["size"] = output_image_size
|
||||||
|
|
||||||
|
for key, value in annotation.items():
|
||||||
|
if key == "masks":
|
||||||
|
masks = value
|
||||||
|
masks = pad(
|
||||||
|
masks,
|
||||||
|
padding,
|
||||||
|
mode=PaddingMode.CONSTANT,
|
||||||
|
constant_values=0,
|
||||||
|
input_data_format=ChannelDimension.FIRST,
|
||||||
|
)
|
||||||
|
masks = safe_squeeze(masks, 1)
|
||||||
|
new_annotation["masks"] = masks
|
||||||
|
elif key == "boxes" and update_bboxes:
|
||||||
|
boxes = value
|
||||||
|
boxes *= np.asarray(
|
||||||
|
[
|
||||||
|
input_image_size[1] / output_image_size[1],
|
||||||
|
input_image_size[0] / output_image_size[0],
|
||||||
|
input_image_size[1] / output_image_size[1],
|
||||||
|
input_image_size[0] / output_image_size[0],
|
||||||
|
]
|
||||||
|
)
|
||||||
|
new_annotation["boxes"] = boxes
|
||||||
|
elif key == "size":
|
||||||
|
new_annotation["size"] = output_image_size
|
||||||
|
else:
|
||||||
|
new_annotation[key] = value
|
||||||
|
return new_annotation
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
|
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
|
||||||
def _pad_image(
|
def _pad_image(
|
||||||
self,
|
self,
|
||||||
image: np.ndarray,
|
image: np.ndarray,
|
||||||
output_size: Tuple[int, int],
|
output_size: Tuple[int, int],
|
||||||
|
annotation: Optional[Dict[str, Any]] = None,
|
||||||
constant_values: Union[float, Iterable[float]] = 0,
|
constant_values: Union[float, Iterable[float]] = 0,
|
||||||
data_format: Optional[ChannelDimension] = None,
|
data_format: Optional[ChannelDimension] = None,
|
||||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||||
|
update_bboxes: bool = True,
|
||||||
) -> np.ndarray:
|
) -> np.ndarray:
|
||||||
"""
|
"""
|
||||||
Pad an image with zeros to the given size.
|
Pad an image with zeros to the given size.
|
||||||
@@ -1035,25 +1092,33 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
|||||||
data_format=data_format,
|
data_format=data_format,
|
||||||
input_data_format=input_data_format,
|
input_data_format=input_data_format,
|
||||||
)
|
)
|
||||||
return padded_image
|
if annotation is not None:
|
||||||
|
annotation = self._update_annotation_for_padded_image(
|
||||||
|
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
|
||||||
|
)
|
||||||
|
return padded_image, annotation
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
|
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
|
||||||
def pad(
|
def pad(
|
||||||
self,
|
self,
|
||||||
images: List[np.ndarray],
|
images: List[np.ndarray],
|
||||||
|
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
|
||||||
constant_values: Union[float, Iterable[float]] = 0,
|
constant_values: Union[float, Iterable[float]] = 0,
|
||||||
return_pixel_mask: bool = True,
|
return_pixel_mask: bool = True,
|
||||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||||
data_format: Optional[ChannelDimension] = None,
|
data_format: Optional[ChannelDimension] = None,
|
||||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||||
|
update_bboxes: bool = True,
|
||||||
) -> BatchFeature:
|
) -> BatchFeature:
|
||||||
"""
|
"""
|
||||||
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
|
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
|
||||||
in the batch and optionally returns their corresponding pixel mask.
|
in the batch and optionally returns their corresponding pixel mask.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
image (`np.ndarray`):
|
images (List[`np.ndarray`]):
|
||||||
Image to pad.
|
Images to pad.
|
||||||
|
annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
|
||||||
|
Annotations to transform according to the padding that is applied to the images.
|
||||||
constant_values (`float` or `Iterable[float]`, *optional*):
|
constant_values (`float` or `Iterable[float]`, *optional*):
|
||||||
The value to use for the padding if `mode` is `"constant"`.
|
The value to use for the padding if `mode` is `"constant"`.
|
||||||
return_pixel_mask (`bool`, *optional*, defaults to `True`):
|
return_pixel_mask (`bool`, *optional*, defaults to `True`):
|
||||||
@@ -1069,19 +1134,29 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
|||||||
The channel dimension format of the image. If not provided, it will be the same as the input image.
|
The channel dimension format of the image. If not provided, it will be the same as the input image.
|
||||||
input_data_format (`ChannelDimension` or `str`, *optional*):
|
input_data_format (`ChannelDimension` or `str`, *optional*):
|
||||||
The channel dimension format of the input image. If not provided, it will be inferred.
|
The channel dimension format of the input image. If not provided, it will be inferred.
|
||||||
|
update_bboxes (`bool`, *optional*, defaults to `True`):
|
||||||
|
Whether to update the bounding boxes in the annotations to match the padded images. If the
|
||||||
|
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
|
||||||
|
format, the bounding boxes will not be updated.
|
||||||
"""
|
"""
|
||||||
pad_size = get_max_height_width(images, input_data_format=input_data_format)
|
pad_size = get_max_height_width(images, input_data_format=input_data_format)
|
||||||
|
|
||||||
padded_images = [
|
annotation_list = annotations if annotations is not None else [None] * len(images)
|
||||||
self._pad_image(
|
padded_images = []
|
||||||
|
padded_annotations = []
|
||||||
|
for image, annotation in zip(images, annotation_list):
|
||||||
|
padded_image, padded_annotation = self._pad_image(
|
||||||
image,
|
image,
|
||||||
pad_size,
|
pad_size,
|
||||||
|
annotation,
|
||||||
constant_values=constant_values,
|
constant_values=constant_values,
|
||||||
data_format=data_format,
|
data_format=data_format,
|
||||||
input_data_format=input_data_format,
|
input_data_format=input_data_format,
|
||||||
|
update_bboxes=update_bboxes,
|
||||||
)
|
)
|
||||||
for image in images
|
padded_images.append(padded_image)
|
||||||
]
|
padded_annotations.append(padded_annotation)
|
||||||
|
|
||||||
data = {"pixel_values": padded_images}
|
data = {"pixel_values": padded_images}
|
||||||
|
|
||||||
if return_pixel_mask:
|
if return_pixel_mask:
|
||||||
@@ -1091,7 +1166,14 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
|||||||
]
|
]
|
||||||
data["pixel_mask"] = masks
|
data["pixel_mask"] = masks
|
||||||
|
|
||||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
|
||||||
|
|
||||||
|
if annotations is not None:
|
||||||
|
encoded_inputs["labels"] = [
|
||||||
|
BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
|
||||||
|
]
|
||||||
|
|
||||||
|
return encoded_inputs
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess
|
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess
|
||||||
def preprocess(
|
def preprocess(
|
||||||
@@ -1106,6 +1188,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
|||||||
do_rescale: Optional[bool] = None,
|
do_rescale: Optional[bool] = None,
|
||||||
rescale_factor: Optional[Union[int, float]] = None,
|
rescale_factor: Optional[Union[int, float]] = None,
|
||||||
do_normalize: Optional[bool] = None,
|
do_normalize: Optional[bool] = None,
|
||||||
|
do_convert_annotations: Optional[bool] = None,
|
||||||
image_mean: Optional[Union[float, List[float]]] = None,
|
image_mean: Optional[Union[float, List[float]]] = None,
|
||||||
image_std: Optional[Union[float, List[float]]] = None,
|
image_std: Optional[Union[float, List[float]]] = None,
|
||||||
do_pad: Optional[bool] = None,
|
do_pad: Optional[bool] = None,
|
||||||
@@ -1149,12 +1232,17 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
|||||||
Rescale factor to use when rescaling the image.
|
Rescale factor to use when rescaling the image.
|
||||||
do_normalize (`bool`, *optional*, defaults to self.do_normalize):
|
do_normalize (`bool`, *optional*, defaults to self.do_normalize):
|
||||||
Whether to normalize the image.
|
Whether to normalize the image.
|
||||||
|
do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
|
||||||
|
Whether to convert the annotations to the format expected by the model. Converts the bounding
|
||||||
|
boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
|
||||||
|
and in relative coordinates.
|
||||||
image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
|
image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
|
||||||
Mean to use when normalizing the image.
|
Mean to use when normalizing the image.
|
||||||
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
|
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
|
||||||
Standard deviation to use when normalizing the image.
|
Standard deviation to use when normalizing the image.
|
||||||
do_pad (`bool`, *optional*, defaults to self.do_pad):
|
do_pad (`bool`, *optional*, defaults to self.do_pad):
|
||||||
Whether to pad the image.
|
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
|
||||||
|
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
|
||||||
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
|
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
|
||||||
Format of the annotations.
|
Format of the annotations.
|
||||||
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
|
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
|
||||||
@@ -1195,6 +1283,9 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
|||||||
do_normalize = self.do_normalize if do_normalize is None else do_normalize
|
do_normalize = self.do_normalize if do_normalize is None else do_normalize
|
||||||
image_mean = self.image_mean if image_mean is None else image_mean
|
image_mean = self.image_mean if image_mean is None else image_mean
|
||||||
image_std = self.image_std if image_std is None else image_std
|
image_std = self.image_std if image_std is None else image_std
|
||||||
|
do_convert_annotations = (
|
||||||
|
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
|
||||||
|
)
|
||||||
do_pad = self.do_pad if do_pad is None else do_pad
|
do_pad = self.do_pad if do_pad is None else do_pad
|
||||||
format = self.format if format is None else format
|
format = self.format if format is None else format
|
||||||
|
|
||||||
@@ -1298,29 +1389,34 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
|||||||
images = [
|
images = [
|
||||||
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
|
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
|
||||||
]
|
]
|
||||||
if annotations is not None:
|
|
||||||
annotations = [
|
if do_convert_annotations and annotations is not None:
|
||||||
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
|
annotations = [
|
||||||
for annotation, image in zip(annotations, images)
|
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
|
||||||
]
|
for annotation, image in zip(annotations, images)
|
||||||
|
]
|
||||||
|
|
||||||
if do_pad:
|
if do_pad:
|
||||||
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
|
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
|
||||||
data = self.pad(
|
encoded_inputs = self.pad(
|
||||||
images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format
|
images,
|
||||||
|
annotations=annotations,
|
||||||
|
return_pixel_mask=True,
|
||||||
|
data_format=data_format,
|
||||||
|
input_data_format=input_data_format,
|
||||||
|
return_tensors=return_tensors,
|
||||||
|
update_bboxes=do_convert_annotations,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
images = [
|
images = [
|
||||||
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
||||||
for image in images
|
for image in images
|
||||||
]
|
]
|
||||||
data = {"pixel_values": images}
|
encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
|
||||||
|
if annotations is not None:
|
||||||
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
|
encoded_inputs["labels"] = [
|
||||||
if annotations is not None:
|
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
|
||||||
encoded_inputs["labels"] = [
|
]
|
||||||
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
|
|
||||||
]
|
|
||||||
|
|
||||||
return encoded_inputs
|
return encoded_inputs
|
||||||
|
|
||||||
|
|||||||
@@ -35,6 +35,7 @@ from ...image_utils import (
|
|||||||
IMAGENET_DEFAULT_MEAN,
|
IMAGENET_DEFAULT_MEAN,
|
||||||
IMAGENET_DEFAULT_STD,
|
IMAGENET_DEFAULT_STD,
|
||||||
AnnotationFormat,
|
AnnotationFormat,
|
||||||
|
AnnotationType,
|
||||||
ChannelDimension,
|
ChannelDimension,
|
||||||
ImageInput,
|
ImageInput,
|
||||||
PILImageResampling,
|
PILImageResampling,
|
||||||
@@ -492,9 +493,14 @@ class DetaImageProcessor(BaseImageProcessor):
|
|||||||
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
|
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
|
||||||
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
|
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
|
||||||
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
||||||
|
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||||
|
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
|
||||||
|
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||||
|
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||||
do_pad (`bool`, *optional*, defaults to `True`):
|
do_pad (`bool`, *optional*, defaults to `True`):
|
||||||
Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
|
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
|
||||||
overridden by the `do_pad` parameter in the `preprocess` method.
|
method. If `True` will pad the images in the batch to the largest height and width in the batch.
|
||||||
|
Padding will be applied to the bottom and right of the image with zeros.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
model_input_names = ["pixel_values", "pixel_mask"]
|
model_input_names = ["pixel_values", "pixel_mask"]
|
||||||
@@ -510,6 +516,7 @@ class DetaImageProcessor(BaseImageProcessor):
|
|||||||
do_normalize: bool = True,
|
do_normalize: bool = True,
|
||||||
image_mean: Union[float, List[float]] = None,
|
image_mean: Union[float, List[float]] = None,
|
||||||
image_std: Union[float, List[float]] = None,
|
image_std: Union[float, List[float]] = None,
|
||||||
|
do_convert_annotations: bool = True,
|
||||||
do_pad: bool = True,
|
do_pad: bool = True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> None:
|
) -> None:
|
||||||
@@ -519,6 +526,9 @@ class DetaImageProcessor(BaseImageProcessor):
|
|||||||
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
|
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
|
||||||
size = get_size_dict(size, default_to_square=False)
|
size = get_size_dict(size, default_to_square=False)
|
||||||
|
|
||||||
|
if do_convert_annotations is None:
|
||||||
|
do_convert_annotations = do_normalize
|
||||||
|
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.format = format
|
self.format = format
|
||||||
self.do_resize = do_resize
|
self.do_resize = do_resize
|
||||||
@@ -527,6 +537,7 @@ class DetaImageProcessor(BaseImageProcessor):
|
|||||||
self.do_rescale = do_rescale
|
self.do_rescale = do_rescale
|
||||||
self.rescale_factor = rescale_factor
|
self.rescale_factor = rescale_factor
|
||||||
self.do_normalize = do_normalize
|
self.do_normalize = do_normalize
|
||||||
|
self.do_convert_annotations = do_convert_annotations
|
||||||
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
|
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
|
||||||
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
|
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
|
||||||
self.do_pad = do_pad
|
self.do_pad = do_pad
|
||||||
@@ -680,18 +691,64 @@ class DetaImageProcessor(BaseImageProcessor):
|
|||||||
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
    """
    Convert the boxes of `annotation` from absolute `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]`
    pixel values to relative `[center_x, center_y, width, height]` values.

    Thin method wrapper: the work is delegated to the `normalize_annotation` function that lives outside the
    class, and its result is returned unchanged.
    """
    normalized = normalize_annotation(annotation, image_size=image_size)
    return normalized
|
||||||
|
|
||||||
|
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
|
||||||
|
def _update_annotation_for_padded_image(
    self,
    annotation: Dict,
    input_image_size: Tuple[int, int],
    output_image_size: Tuple[int, int],
    padding,
    update_bboxes,
) -> Dict:
    """
    Update the annotation for a padded image.

    A new dict is built and returned; neither `annotation` nor the arrays it holds are modified.

    Args:
        annotation (`Dict`):
            Annotation of the image before padding.
        input_image_size (`Tuple[int, int]`):
            `(height, width)` of the image before padding.
        output_image_size (`Tuple[int, int]`):
            `(height, width)` of the image after padding.
        padding:
            Padding specification, forwarded unchanged to `pad` when padding the `"masks"` entry.
        update_bboxes (`bool`):
            Whether to rescale the `"boxes"` entry. When `False` the boxes are passed through as-is.

    Returns:
        `Dict`: The annotation with `"size"` set to `output_image_size`, `"masks"` zero-padded, `"boxes"`
        rescaled (if requested) and every other entry copied over by reference.
    """
    # "size" is written first so it is always present, even if the input annotation has no such key.
    new_annotation = {"size": output_image_size}

    input_height, input_width = input_image_size[0], input_image_size[1]
    output_height, output_width = output_image_size[0], output_image_size[1]

    for key, value in annotation.items():
        if key == "masks":
            masks = pad(
                value,
                padding,
                mode=PaddingMode.CONSTANT,
                constant_values=0,
                input_data_format=ChannelDimension.FIRST,
            )
            masks = safe_squeeze(masks, 1)
            new_annotation["masks"] = masks
        elif key == "boxes" and update_bboxes:
            # Columns 0 and 2 are scaled by the width ratio, columns 1 and 3 by the height ratio.
            # NOTE(review): this is only meaningful for boxes in relative coordinates, as described by the
            # `update_bboxes` documentation of `pad` - confirm callers respect that.
            scale = np.asarray(
                [
                    input_width / output_width,
                    input_height / output_height,
                    input_width / output_width,
                    input_height / output_height,
                ]
            )
            # Multiply out of place: `boxes *= scale` would rescale the caller's array as a side effect
            # (so padding the same annotation twice double-scales it) and raises a casting error when the
            # boxes have an integer dtype.
            scaled_boxes = value * scale
            if isinstance(value, np.ndarray) and np.issubdtype(value.dtype, np.floating):
                # Keep the floating dtype of the input, exactly as the in-place multiply did.
                scaled_boxes = scaled_boxes.astype(value.dtype, copy=False)
            new_annotation["boxes"] = scaled_boxes
        elif key == "size":
            new_annotation["size"] = output_image_size
        else:
            new_annotation[key] = value
    return new_annotation
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
|
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
|
||||||
def _pad_image(
|
def _pad_image(
|
||||||
self,
|
self,
|
||||||
image: np.ndarray,
|
image: np.ndarray,
|
||||||
output_size: Tuple[int, int],
|
output_size: Tuple[int, int],
|
||||||
|
annotation: Optional[Dict[str, Any]] = None,
|
||||||
constant_values: Union[float, Iterable[float]] = 0,
|
constant_values: Union[float, Iterable[float]] = 0,
|
||||||
data_format: Optional[ChannelDimension] = None,
|
data_format: Optional[ChannelDimension] = None,
|
||||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||||
|
update_bboxes: bool = True,
|
||||||
) -> np.ndarray:
|
) -> np.ndarray:
|
||||||
"""
|
"""
|
||||||
Pad an image with zeros to the given size.
|
Pad an image with zeros to the given size.
|
||||||
@@ -710,25 +767,33 @@ class DetaImageProcessor(BaseImageProcessor):
|
|||||||
data_format=data_format,
|
data_format=data_format,
|
||||||
input_data_format=input_data_format,
|
input_data_format=input_data_format,
|
||||||
)
|
)
|
||||||
return padded_image
|
if annotation is not None:
|
||||||
|
annotation = self._update_annotation_for_padded_image(
|
||||||
|
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
|
||||||
|
)
|
||||||
|
return padded_image, annotation
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
|
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
|
||||||
def pad(
|
def pad(
|
||||||
self,
|
self,
|
||||||
images: List[np.ndarray],
|
images: List[np.ndarray],
|
||||||
|
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
|
||||||
constant_values: Union[float, Iterable[float]] = 0,
|
constant_values: Union[float, Iterable[float]] = 0,
|
||||||
return_pixel_mask: bool = True,
|
return_pixel_mask: bool = True,
|
||||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||||
data_format: Optional[ChannelDimension] = None,
|
data_format: Optional[ChannelDimension] = None,
|
||||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||||
|
update_bboxes: bool = True,
|
||||||
) -> BatchFeature:
|
) -> BatchFeature:
|
||||||
"""
|
"""
|
||||||
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
|
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
|
||||||
in the batch and optionally returns their corresponding pixel mask.
|
in the batch and optionally returns their corresponding pixel mask.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
image (`np.ndarray`):
|
images (List[`np.ndarray`]):
|
||||||
Image to pad.
|
Images to pad.
|
||||||
|
annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
|
||||||
|
Annotations to transform according to the padding that is applied to the images.
|
||||||
constant_values (`float` or `Iterable[float]`, *optional*):
|
constant_values (`float` or `Iterable[float]`, *optional*):
|
||||||
The value to use for the padding if `mode` is `"constant"`.
|
The value to use for the padding if `mode` is `"constant"`.
|
||||||
return_pixel_mask (`bool`, *optional*, defaults to `True`):
|
return_pixel_mask (`bool`, *optional*, defaults to `True`):
|
||||||
@@ -744,19 +809,29 @@ class DetaImageProcessor(BaseImageProcessor):
|
|||||||
The channel dimension format of the image. If not provided, it will be the same as the input image.
|
The channel dimension format of the image. If not provided, it will be the same as the input image.
|
||||||
input_data_format (`ChannelDimension` or `str`, *optional*):
|
input_data_format (`ChannelDimension` or `str`, *optional*):
|
||||||
The channel dimension format of the input image. If not provided, it will be inferred.
|
The channel dimension format of the input image. If not provided, it will be inferred.
|
||||||
|
update_bboxes (`bool`, *optional*, defaults to `True`):
|
||||||
|
Whether to update the bounding boxes in the annotations to match the padded images. If the
|
||||||
|
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
|
||||||
|
format, the bounding boxes will not be updated.
|
||||||
"""
|
"""
|
||||||
pad_size = get_max_height_width(images, input_data_format=input_data_format)
|
pad_size = get_max_height_width(images, input_data_format=input_data_format)
|
||||||
|
|
||||||
padded_images = [
|
annotation_list = annotations if annotations is not None else [None] * len(images)
|
||||||
self._pad_image(
|
padded_images = []
|
||||||
|
padded_annotations = []
|
||||||
|
for image, annotation in zip(images, annotation_list):
|
||||||
|
padded_image, padded_annotation = self._pad_image(
|
||||||
image,
|
image,
|
||||||
pad_size,
|
pad_size,
|
||||||
|
annotation,
|
||||||
constant_values=constant_values,
|
constant_values=constant_values,
|
||||||
data_format=data_format,
|
data_format=data_format,
|
||||||
input_data_format=input_data_format,
|
input_data_format=input_data_format,
|
||||||
|
update_bboxes=update_bboxes,
|
||||||
)
|
)
|
||||||
for image in images
|
padded_images.append(padded_image)
|
||||||
]
|
padded_annotations.append(padded_annotation)
|
||||||
|
|
||||||
data = {"pixel_values": padded_images}
|
data = {"pixel_values": padded_images}
|
||||||
|
|
||||||
if return_pixel_mask:
|
if return_pixel_mask:
|
||||||
@@ -766,7 +841,14 @@ class DetaImageProcessor(BaseImageProcessor):
|
|||||||
]
|
]
|
||||||
data["pixel_mask"] = masks
|
data["pixel_mask"] = masks
|
||||||
|
|
||||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
|
||||||
|
|
||||||
|
if annotations is not None:
|
||||||
|
encoded_inputs["labels"] = [
|
||||||
|
BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
|
||||||
|
]
|
||||||
|
|
||||||
|
return encoded_inputs
|
||||||
|
|
||||||
def preprocess(
|
def preprocess(
|
||||||
self,
|
self,
|
||||||
@@ -782,6 +864,7 @@ class DetaImageProcessor(BaseImageProcessor):
|
|||||||
do_normalize: Optional[bool] = None,
|
do_normalize: Optional[bool] = None,
|
||||||
image_mean: Optional[Union[float, List[float]]] = None,
|
image_mean: Optional[Union[float, List[float]]] = None,
|
||||||
image_std: Optional[Union[float, List[float]]] = None,
|
image_std: Optional[Union[float, List[float]]] = None,
|
||||||
|
do_convert_annotations: Optional[bool] = None,
|
||||||
do_pad: Optional[bool] = None,
|
do_pad: Optional[bool] = None,
|
||||||
format: Optional[Union[str, AnnotationFormat]] = None,
|
format: Optional[Union[str, AnnotationFormat]] = None,
|
||||||
return_tensors: Optional[Union[TensorType, str]] = None,
|
return_tensors: Optional[Union[TensorType, str]] = None,
|
||||||
@@ -827,8 +910,13 @@ class DetaImageProcessor(BaseImageProcessor):
|
|||||||
Mean to use when normalizing the image.
|
Mean to use when normalizing the image.
|
||||||
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
|
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
|
||||||
Standard deviation to use when normalizing the image.
|
Standard deviation to use when normalizing the image.
|
||||||
|
do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
|
||||||
|
Whether to convert the annotations to the format expected by the model. Converts the bounding
|
||||||
|
boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
|
||||||
|
and in relative coordinates.
|
||||||
do_pad (`bool`, *optional*, defaults to self.do_pad):
|
do_pad (`bool`, *optional*, defaults to self.do_pad):
|
||||||
Whether to pad the image.
|
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
|
||||||
|
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
|
||||||
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
|
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
|
||||||
Format of the annotations.
|
Format of the annotations.
|
||||||
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
|
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
|
||||||
@@ -861,6 +949,9 @@ class DetaImageProcessor(BaseImageProcessor):
|
|||||||
do_normalize = self.do_normalize if do_normalize is None else do_normalize
|
do_normalize = self.do_normalize if do_normalize is None else do_normalize
|
||||||
image_mean = self.image_mean if image_mean is None else image_mean
|
image_mean = self.image_mean if image_mean is None else image_mean
|
||||||
image_std = self.image_std if image_std is None else image_std
|
image_std = self.image_std if image_std is None else image_std
|
||||||
|
do_convert_annotations = (
|
||||||
|
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
|
||||||
|
)
|
||||||
do_pad = self.do_pad if do_pad is None else do_pad
|
do_pad = self.do_pad if do_pad is None else do_pad
|
||||||
format = self.format if format is None else format
|
format = self.format if format is None else format
|
||||||
|
|
||||||
@@ -964,29 +1055,34 @@ class DetaImageProcessor(BaseImageProcessor):
|
|||||||
images = [
|
images = [
|
||||||
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
|
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
|
||||||
]
|
]
|
||||||
if annotations is not None:
|
|
||||||
annotations = [
|
if do_convert_annotations and annotations is not None:
|
||||||
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
|
annotations = [
|
||||||
for annotation, image in zip(annotations, images)
|
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
|
||||||
]
|
for annotation, image in zip(annotations, images)
|
||||||
|
]
|
||||||
|
|
||||||
if do_pad:
|
if do_pad:
|
||||||
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
|
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
|
||||||
data = self.pad(
|
encoded_inputs = self.pad(
|
||||||
images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format
|
images,
|
||||||
|
annotations=annotations,
|
||||||
|
return_pixel_mask=True,
|
||||||
|
data_format=data_format,
|
||||||
|
input_data_format=input_data_format,
|
||||||
|
return_tensors=return_tensors,
|
||||||
|
update_bboxes=do_convert_annotations,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
images = [
|
images = [
|
||||||
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
||||||
for image in images
|
for image in images
|
||||||
]
|
]
|
||||||
data = {"pixel_values": images}
|
encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
|
||||||
|
if annotations is not None:
|
||||||
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
|
encoded_inputs["labels"] = [
|
||||||
if annotations is not None:
|
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
|
||||||
encoded_inputs["labels"] = [
|
]
|
||||||
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
|
|
||||||
]
|
|
||||||
|
|
||||||
return encoded_inputs
|
return encoded_inputs
|
||||||
|
|
||||||
|
|||||||
@@ -760,7 +760,7 @@ class DetrImageProcessor(BaseImageProcessor):
|
|||||||
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
||||||
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
|
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
|
||||||
`preprocess` method.
|
`preprocess` method.
|
||||||
do_normalize:
|
do_normalize (`bool`, *optional*, defaults to True):
|
||||||
Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
|
Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
|
||||||
`preprocess` method.
|
`preprocess` method.
|
||||||
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
|
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
|
||||||
@@ -769,9 +769,14 @@ class DetrImageProcessor(BaseImageProcessor):
|
|||||||
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
|
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
|
||||||
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
|
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
|
||||||
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
||||||
|
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||||
|
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
|
||||||
|
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||||
|
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||||
do_pad (`bool`, *optional*, defaults to `True`):
|
do_pad (`bool`, *optional*, defaults to `True`):
|
||||||
Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
|
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
|
||||||
overridden by the `do_pad` parameter in the `preprocess` method.
|
method. If `True` will pad the images in the batch to the largest height and width in the batch.
|
||||||
|
Padding will be applied to the bottom and right of the image with zeros.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
model_input_names = ["pixel_values", "pixel_mask"]
|
model_input_names = ["pixel_values", "pixel_mask"]
|
||||||
@@ -787,6 +792,7 @@ class DetrImageProcessor(BaseImageProcessor):
|
|||||||
do_normalize: bool = True,
|
do_normalize: bool = True,
|
||||||
image_mean: Union[float, List[float]] = None,
|
image_mean: Union[float, List[float]] = None,
|
||||||
image_std: Union[float, List[float]] = None,
|
image_std: Union[float, List[float]] = None,
|
||||||
|
do_convert_annotations: Optional[bool] = None,
|
||||||
do_pad: bool = True,
|
do_pad: bool = True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> None:
|
) -> None:
|
||||||
@@ -805,6 +811,10 @@ class DetrImageProcessor(BaseImageProcessor):
|
|||||||
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
|
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
|
||||||
size = get_size_dict(size, max_size=max_size, default_to_square=False)
|
size = get_size_dict(size, max_size=max_size, default_to_square=False)
|
||||||
|
|
||||||
|
# Backwards compatibility
|
||||||
|
if do_convert_annotations is None:
|
||||||
|
do_convert_annotations = do_normalize
|
||||||
|
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.format = format
|
self.format = format
|
||||||
self.do_resize = do_resize
|
self.do_resize = do_resize
|
||||||
@@ -813,6 +823,7 @@ class DetrImageProcessor(BaseImageProcessor):
|
|||||||
self.do_rescale = do_rescale
|
self.do_rescale = do_rescale
|
||||||
self.rescale_factor = rescale_factor
|
self.rescale_factor = rescale_factor
|
||||||
self.do_normalize = do_normalize
|
self.do_normalize = do_normalize
|
||||||
|
self.do_convert_annotations = do_convert_annotations
|
||||||
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
|
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
|
||||||
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
|
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
|
||||||
self.do_pad = do_pad
|
self.do_pad = do_pad
|
||||||
@@ -981,17 +992,62 @@ class DetrImageProcessor(BaseImageProcessor):
|
|||||||
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
    """
    Convert the boxes of `annotation` from absolute `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]`
    pixel values to relative `[center_x, center_y, width, height]` values.

    Thin method wrapper: the work is delegated to the `normalize_annotation` function that lives outside the
    class, and its result is returned unchanged.
    """
    normalized = normalize_annotation(annotation, image_size=image_size)
    return normalized
|
||||||
|
|
||||||
|
def _update_annotation_for_padded_image(
    self,
    annotation: Dict,
    input_image_size: Tuple[int, int],
    output_image_size: Tuple[int, int],
    padding,
    update_bboxes,
) -> Dict:
    """
    Update the annotation for a padded image.

    A new dict is built and returned; neither `annotation` nor the arrays it holds are modified.

    Args:
        annotation (`Dict`):
            Annotation of the image before padding.
        input_image_size (`Tuple[int, int]`):
            `(height, width)` of the image before padding.
        output_image_size (`Tuple[int, int]`):
            `(height, width)` of the image after padding.
        padding:
            Padding specification, forwarded unchanged to `pad` when padding the `"masks"` entry.
        update_bboxes (`bool`):
            Whether to rescale the `"boxes"` entry. When `False` the boxes are passed through as-is.

    Returns:
        `Dict`: The annotation with `"size"` set to `output_image_size`, `"masks"` zero-padded, `"boxes"`
        rescaled (if requested) and every other entry copied over by reference.
    """
    # "size" is written first so it is always present, even if the input annotation has no such key.
    new_annotation = {"size": output_image_size}

    input_height, input_width = input_image_size[0], input_image_size[1]
    output_height, output_width = output_image_size[0], output_image_size[1]

    for key, value in annotation.items():
        if key == "masks":
            masks = pad(
                value,
                padding,
                mode=PaddingMode.CONSTANT,
                constant_values=0,
                input_data_format=ChannelDimension.FIRST,
            )
            masks = safe_squeeze(masks, 1)
            new_annotation["masks"] = masks
        elif key == "boxes" and update_bboxes:
            # Columns 0 and 2 are scaled by the width ratio, columns 1 and 3 by the height ratio.
            # NOTE(review): this is only meaningful for boxes in relative coordinates, as described by the
            # `update_bboxes` documentation of `pad` - confirm callers respect that.
            scale = np.asarray(
                [
                    input_width / output_width,
                    input_height / output_height,
                    input_width / output_width,
                    input_height / output_height,
                ]
            )
            # Multiply out of place: `boxes *= scale` would rescale the caller's array as a side effect
            # (so padding the same annotation twice double-scales it) and raises a casting error when the
            # boxes have an integer dtype.
            scaled_boxes = value * scale
            if isinstance(value, np.ndarray) and np.issubdtype(value.dtype, np.floating):
                # Keep the floating dtype of the input, exactly as the in-place multiply did.
                scaled_boxes = scaled_boxes.astype(value.dtype, copy=False)
            new_annotation["boxes"] = scaled_boxes
        elif key == "size":
            new_annotation["size"] = output_image_size
        else:
            new_annotation[key] = value
    return new_annotation
|
||||||
|
|
||||||
def _pad_image(
|
def _pad_image(
|
||||||
self,
|
self,
|
||||||
image: np.ndarray,
|
image: np.ndarray,
|
||||||
output_size: Tuple[int, int],
|
output_size: Tuple[int, int],
|
||||||
|
annotation: Optional[Dict[str, Any]] = None,
|
||||||
constant_values: Union[float, Iterable[float]] = 0,
|
constant_values: Union[float, Iterable[float]] = 0,
|
||||||
data_format: Optional[ChannelDimension] = None,
|
data_format: Optional[ChannelDimension] = None,
|
||||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||||
|
update_bboxes: bool = True,
|
||||||
) -> np.ndarray:
|
) -> np.ndarray:
|
||||||
"""
|
"""
|
||||||
Pad an image with zeros to the given size.
|
Pad an image with zeros to the given size.
|
||||||
@@ -1010,24 +1066,32 @@ class DetrImageProcessor(BaseImageProcessor):
|
|||||||
data_format=data_format,
|
data_format=data_format,
|
||||||
input_data_format=input_data_format,
|
input_data_format=input_data_format,
|
||||||
)
|
)
|
||||||
return padded_image
|
if annotation is not None:
|
||||||
|
annotation = self._update_annotation_for_padded_image(
|
||||||
|
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
|
||||||
|
)
|
||||||
|
return padded_image, annotation
|
||||||
|
|
||||||
def pad(
|
def pad(
|
||||||
self,
|
self,
|
||||||
images: List[np.ndarray],
|
images: List[np.ndarray],
|
||||||
|
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
|
||||||
constant_values: Union[float, Iterable[float]] = 0,
|
constant_values: Union[float, Iterable[float]] = 0,
|
||||||
return_pixel_mask: bool = True,
|
return_pixel_mask: bool = True,
|
||||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||||
data_format: Optional[ChannelDimension] = None,
|
data_format: Optional[ChannelDimension] = None,
|
||||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||||
|
update_bboxes: bool = True,
|
||||||
) -> BatchFeature:
|
) -> BatchFeature:
|
||||||
"""
|
"""
|
||||||
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
|
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
|
||||||
in the batch and optionally returns their corresponding pixel mask.
|
in the batch and optionally returns their corresponding pixel mask.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
image (`np.ndarray`):
|
images (List[`np.ndarray`]):
|
||||||
Image to pad.
|
Images to pad.
|
||||||
|
annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
|
||||||
|
Annotations to transform according to the padding that is applied to the images.
|
||||||
constant_values (`float` or `Iterable[float]`, *optional*):
|
constant_values (`float` or `Iterable[float]`, *optional*):
|
||||||
The value to use for the padding if `mode` is `"constant"`.
|
The value to use for the padding if `mode` is `"constant"`.
|
||||||
return_pixel_mask (`bool`, *optional*, defaults to `True`):
|
return_pixel_mask (`bool`, *optional*, defaults to `True`):
|
||||||
@@ -1043,19 +1107,29 @@ class DetrImageProcessor(BaseImageProcessor):
|
|||||||
The channel dimension format of the image. If not provided, it will be the same as the input image.
|
The channel dimension format of the image. If not provided, it will be the same as the input image.
|
||||||
input_data_format (`ChannelDimension` or `str`, *optional*):
|
input_data_format (`ChannelDimension` or `str`, *optional*):
|
||||||
The channel dimension format of the input image. If not provided, it will be inferred.
|
The channel dimension format of the input image. If not provided, it will be inferred.
|
||||||
|
update_bboxes (`bool`, *optional*, defaults to `True`):
|
||||||
|
Whether to update the bounding boxes in the annotations to match the padded images. If the
|
||||||
|
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
|
||||||
|
format, the bounding boxes will not be updated.
|
||||||
"""
|
"""
|
||||||
pad_size = get_max_height_width(images, input_data_format=input_data_format)
|
pad_size = get_max_height_width(images, input_data_format=input_data_format)
|
||||||
|
|
||||||
padded_images = [
|
annotation_list = annotations if annotations is not None else [None] * len(images)
|
||||||
self._pad_image(
|
padded_images = []
|
||||||
|
padded_annotations = []
|
||||||
|
for image, annotation in zip(images, annotation_list):
|
||||||
|
padded_image, padded_annotation = self._pad_image(
|
||||||
image,
|
image,
|
||||||
pad_size,
|
pad_size,
|
||||||
|
annotation,
|
||||||
constant_values=constant_values,
|
constant_values=constant_values,
|
||||||
data_format=data_format,
|
data_format=data_format,
|
||||||
input_data_format=input_data_format,
|
input_data_format=input_data_format,
|
||||||
|
update_bboxes=update_bboxes,
|
||||||
)
|
)
|
||||||
for image in images
|
padded_images.append(padded_image)
|
||||||
]
|
padded_annotations.append(padded_annotation)
|
||||||
|
|
||||||
data = {"pixel_values": padded_images}
|
data = {"pixel_values": padded_images}
|
||||||
|
|
||||||
if return_pixel_mask:
|
if return_pixel_mask:
|
||||||
@@ -1065,7 +1139,14 @@ class DetrImageProcessor(BaseImageProcessor):
|
|||||||
]
|
]
|
||||||
data["pixel_mask"] = masks
|
data["pixel_mask"] = masks
|
||||||
|
|
||||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
|
||||||
|
|
||||||
|
if annotations is not None:
|
||||||
|
encoded_inputs["labels"] = [
|
||||||
|
BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
|
||||||
|
]
|
||||||
|
|
||||||
|
return encoded_inputs
|
||||||
|
|
||||||
def preprocess(
|
def preprocess(
|
||||||
self,
|
self,
|
||||||
@@ -1079,6 +1160,7 @@ class DetrImageProcessor(BaseImageProcessor):
|
|||||||
do_rescale: Optional[bool] = None,
|
do_rescale: Optional[bool] = None,
|
||||||
rescale_factor: Optional[Union[int, float]] = None,
|
rescale_factor: Optional[Union[int, float]] = None,
|
||||||
do_normalize: Optional[bool] = None,
|
do_normalize: Optional[bool] = None,
|
||||||
|
do_convert_annotations: Optional[bool] = None,
|
||||||
image_mean: Optional[Union[float, List[float]]] = None,
|
image_mean: Optional[Union[float, List[float]]] = None,
|
||||||
image_std: Optional[Union[float, List[float]]] = None,
|
image_std: Optional[Union[float, List[float]]] = None,
|
||||||
do_pad: Optional[bool] = None,
|
do_pad: Optional[bool] = None,
|
||||||
@@ -1122,12 +1204,17 @@ class DetrImageProcessor(BaseImageProcessor):
|
|||||||
Rescale factor to use when rescaling the image.
|
Rescale factor to use when rescaling the image.
|
||||||
do_normalize (`bool`, *optional*, defaults to self.do_normalize):
|
do_normalize (`bool`, *optional*, defaults to self.do_normalize):
|
||||||
Whether to normalize the image.
|
Whether to normalize the image.
|
||||||
|
do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
|
||||||
|
Whether to convert the annotations to the format expected by the model. Converts the bounding
|
||||||
|
boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
|
||||||
|
and to relative coordinates.
|
||||||
image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
|
image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
|
||||||
Mean to use when normalizing the image.
|
Mean to use when normalizing the image.
|
||||||
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
|
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
|
||||||
Standard deviation to use when normalizing the image.
|
Standard deviation to use when normalizing the image.
|
||||||
do_pad (`bool`, *optional*, defaults to self.do_pad):
|
do_pad (`bool`, *optional*, defaults to self.do_pad):
|
||||||
Whether to pad the image.
|
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
|
||||||
|
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
|
||||||
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
|
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
|
||||||
Format of the annotations.
|
Format of the annotations.
|
||||||
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
|
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
|
||||||
@@ -1168,6 +1255,9 @@ class DetrImageProcessor(BaseImageProcessor):
|
|||||||
do_normalize = self.do_normalize if do_normalize is None else do_normalize
|
do_normalize = self.do_normalize if do_normalize is None else do_normalize
|
||||||
image_mean = self.image_mean if image_mean is None else image_mean
|
image_mean = self.image_mean if image_mean is None else image_mean
|
||||||
image_std = self.image_std if image_std is None else image_std
|
image_std = self.image_std if image_std is None else image_std
|
||||||
|
do_convert_annotations = (
|
||||||
|
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
|
||||||
|
)
|
||||||
do_pad = self.do_pad if do_pad is None else do_pad
|
do_pad = self.do_pad if do_pad is None else do_pad
|
||||||
format = self.format if format is None else format
|
format = self.format if format is None else format
|
||||||
|
|
||||||
@@ -1271,29 +1361,34 @@ class DetrImageProcessor(BaseImageProcessor):
|
|||||||
images = [
|
images = [
|
||||||
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
|
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
|
||||||
]
|
]
|
||||||
if annotations is not None:
|
|
||||||
annotations = [
|
if do_convert_annotations and annotations is not None:
|
||||||
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
|
annotations = [
|
||||||
for annotation, image in zip(annotations, images)
|
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
|
||||||
]
|
for annotation, image in zip(annotations, images)
|
||||||
|
]
|
||||||
|
|
||||||
if do_pad:
|
if do_pad:
|
||||||
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
|
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
|
||||||
data = self.pad(
|
encoded_inputs = self.pad(
|
||||||
images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format
|
images,
|
||||||
|
annotations=annotations,
|
||||||
|
return_pixel_mask=True,
|
||||||
|
data_format=data_format,
|
||||||
|
input_data_format=input_data_format,
|
||||||
|
return_tensors=return_tensors,
|
||||||
|
update_bboxes=do_convert_annotations,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
images = [
|
images = [
|
||||||
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
||||||
for image in images
|
for image in images
|
||||||
]
|
]
|
||||||
data = {"pixel_values": images}
|
encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
|
||||||
|
if annotations is not None:
|
||||||
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
|
encoded_inputs["labels"] = [
|
||||||
if annotations is not None:
|
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
|
||||||
encoded_inputs["labels"] = [
|
]
|
||||||
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
|
|
||||||
]
|
|
||||||
|
|
||||||
return encoded_inputs
|
return encoded_inputs
|
||||||
|
|
||||||
|
|||||||
@@ -771,7 +771,7 @@ class Mask2FormerImageProcessor(BaseImageProcessor):
|
|||||||
)
|
)
|
||||||
return encoded_inputs
|
return encoded_inputs
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
|
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
|
||||||
def _pad_image(
|
def _pad_image(
|
||||||
self,
|
self,
|
||||||
image: np.ndarray,
|
image: np.ndarray,
|
||||||
@@ -799,7 +799,7 @@ class Mask2FormerImageProcessor(BaseImageProcessor):
|
|||||||
)
|
)
|
||||||
return padded_image
|
return padded_image
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
|
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
|
||||||
def pad(
|
def pad(
|
||||||
self,
|
self,
|
||||||
images: List[np.ndarray],
|
images: List[np.ndarray],
|
||||||
|
|||||||
@@ -788,7 +788,7 @@ class MaskFormerImageProcessor(BaseImageProcessor):
|
|||||||
)
|
)
|
||||||
return encoded_inputs
|
return encoded_inputs
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
|
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
|
||||||
def _pad_image(
|
def _pad_image(
|
||||||
self,
|
self,
|
||||||
image: np.ndarray,
|
image: np.ndarray,
|
||||||
@@ -816,7 +816,7 @@ class MaskFormerImageProcessor(BaseImageProcessor):
|
|||||||
)
|
)
|
||||||
return padded_image
|
return padded_image
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
|
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
|
||||||
def pad(
|
def pad(
|
||||||
self,
|
self,
|
||||||
images: List[np.ndarray],
|
images: List[np.ndarray],
|
||||||
|
|||||||
@@ -770,7 +770,7 @@ class OneFormerImageProcessor(BaseImageProcessor):
|
|||||||
)
|
)
|
||||||
return encoded_inputs
|
return encoded_inputs
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
|
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
|
||||||
def _pad_image(
|
def _pad_image(
|
||||||
self,
|
self,
|
||||||
image: np.ndarray,
|
image: np.ndarray,
|
||||||
@@ -798,7 +798,7 @@ class OneFormerImageProcessor(BaseImageProcessor):
|
|||||||
)
|
)
|
||||||
return padded_image
|
return padded_image
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
|
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
|
||||||
def pad(
|
def pad(
|
||||||
self,
|
self,
|
||||||
images: List[np.ndarray],
|
images: List[np.ndarray],
|
||||||
|
|||||||
@@ -251,7 +251,6 @@ class ViltImageProcessor(BaseImageProcessor):
|
|||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
|
|
||||||
def _pad_image(
|
def _pad_image(
|
||||||
self,
|
self,
|
||||||
image: np.ndarray,
|
image: np.ndarray,
|
||||||
@@ -279,7 +278,6 @@ class ViltImageProcessor(BaseImageProcessor):
|
|||||||
)
|
)
|
||||||
return padded_image
|
return padded_image
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
|
|
||||||
def pad(
|
def pad(
|
||||||
self,
|
self,
|
||||||
images: List[np.ndarray],
|
images: List[np.ndarray],
|
||||||
|
|||||||
@@ -696,8 +696,9 @@ class YolosImageProcessor(BaseImageProcessor):
|
|||||||
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
|
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
|
||||||
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
||||||
do_pad (`bool`, *optional*, defaults to `True`):
|
do_pad (`bool`, *optional*, defaults to `True`):
|
||||||
Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
|
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
|
||||||
overridden by the `do_pad` parameter in the `preprocess` method.
|
method. If `True` will pad the images in the batch to the largest height and width in the batch.
|
||||||
|
Padding will be applied to the bottom and right of the image with zeros.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
model_input_names = ["pixel_values", "pixel_mask"]
|
model_input_names = ["pixel_values", "pixel_mask"]
|
||||||
@@ -713,6 +714,7 @@ class YolosImageProcessor(BaseImageProcessor):
|
|||||||
do_normalize: bool = True,
|
do_normalize: bool = True,
|
||||||
image_mean: Union[float, List[float]] = None,
|
image_mean: Union[float, List[float]] = None,
|
||||||
image_std: Union[float, List[float]] = None,
|
image_std: Union[float, List[float]] = None,
|
||||||
|
do_convert_annotations: Optional[bool] = None,
|
||||||
do_pad: bool = True,
|
do_pad: bool = True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> None:
|
) -> None:
|
||||||
@@ -731,6 +733,10 @@ class YolosImageProcessor(BaseImageProcessor):
|
|||||||
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
|
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
|
||||||
size = get_size_dict(size, max_size=max_size, default_to_square=False)
|
size = get_size_dict(size, max_size=max_size, default_to_square=False)
|
||||||
|
|
||||||
|
# Backwards compatibility
|
||||||
|
if do_convert_annotations is None:
|
||||||
|
do_convert_annotations = do_normalize
|
||||||
|
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.format = format
|
self.format = format
|
||||||
self.do_resize = do_resize
|
self.do_resize = do_resize
|
||||||
@@ -739,6 +745,7 @@ class YolosImageProcessor(BaseImageProcessor):
|
|||||||
self.do_rescale = do_rescale
|
self.do_rescale = do_rescale
|
||||||
self.rescale_factor = rescale_factor
|
self.rescale_factor = rescale_factor
|
||||||
self.do_normalize = do_normalize
|
self.do_normalize = do_normalize
|
||||||
|
self.do_convert_annotations = do_convert_annotations
|
||||||
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
|
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
|
||||||
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
|
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
|
||||||
self.do_pad = do_pad
|
self.do_pad = do_pad
|
||||||
@@ -916,18 +923,64 @@ class YolosImageProcessor(BaseImageProcessor):
|
|||||||
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
|
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
|
||||||
"""
|
"""
|
||||||
Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
|
Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
|
||||||
`[center_x, center_y, width, height]` format.
|
`[center_x, center_y, width, height]` format and from absolute to relative pixel values.
|
||||||
"""
|
"""
|
||||||
return normalize_annotation(annotation, image_size=image_size)
|
return normalize_annotation(annotation, image_size=image_size)
|
||||||
|
|
||||||
|
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
|
||||||
|
def _update_annotation_for_padded_image(
    self,
    annotation: Dict,
    input_image_size: Tuple[int, int],
    output_image_size: Tuple[int, int],
    padding,
    update_bboxes,
) -> Dict:
    """
    Update the annotation for a padded image.

    A new dictionary is returned; neither `annotation` nor the arrays it holds are modified.

    Args:
        annotation (`Dict`):
            Annotation of the image before padding. The `"masks"`, `"boxes"` and `"size"` entries are
            updated; every other entry is carried over unchanged.
        input_image_size (`Tuple[int, int]`):
            `(height, width)` of the image before padding.
        output_image_size (`Tuple[int, int]`):
            `(height, width)` of the image after padding. Stored under `"size"` in the result.
        padding:
            Padding specification forwarded unchanged to `pad` for the masks.
            NOTE(review): presumably the same padding that was applied to the image - confirm at the caller.
        update_bboxes (`bool`):
            Whether to rescale `"boxes"`. When `False` the boxes are carried over untouched.

    Returns:
        `Dict`: The annotation matching the padded image.
    """
    input_height, input_width = input_image_size[0], input_image_size[1]
    output_height, output_width = output_image_size[0], output_image_size[1]

    # "size" is written first so it always reflects the padded image, whether or not the
    # incoming annotation carried a "size" entry.
    new_annotation = {"size": output_image_size}

    for key, value in annotation.items():
        if key == "size":
            # Already replaced by the padded size above.
            continue

        if key == "masks":
            # Masks are zero padded. They are passed to `pad` as channels-first data and axis 1 is
            # squeezed afterwards when possible.
            padded_masks = pad(
                value,
                padding,
                mode=PaddingMode.CONSTANT,
                constant_values=0,
                input_data_format=ChannelDimension.FIRST,
            )
            new_annotation["masks"] = safe_squeeze(padded_masks, 1)
        elif key == "boxes" and update_bboxes:
            # Assumes boxes are relative to the image size (TODO confirm): padding enlarges the
            # canvas, so each coordinate shrinks by input / output, in (w, h, w, h) order.
            width_ratio = input_width / output_width
            height_ratio = input_height / output_height
            scale = np.asarray([width_ratio, height_ratio, width_ratio, height_ratio])

            boxes = np.asarray(value)
            # Multiply out of place: an in-place `*=` would rescale the caller's array a second
            # time if the same annotation were padded again, and fails on integer boxes.
            scaled_boxes = boxes * scale
            if np.issubdtype(boxes.dtype, np.floating):
                # Keep the input precision (e.g. float32), as the in-place version did.
                scaled_boxes = scaled_boxes.astype(boxes.dtype, copy=False)
            new_annotation["boxes"] = scaled_boxes
        else:
            new_annotation[key] = value

    return new_annotation
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
|
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
|
||||||
def _pad_image(
|
def _pad_image(
|
||||||
self,
|
self,
|
||||||
image: np.ndarray,
|
image: np.ndarray,
|
||||||
output_size: Tuple[int, int],
|
output_size: Tuple[int, int],
|
||||||
|
annotation: Optional[Dict[str, Any]] = None,
|
||||||
constant_values: Union[float, Iterable[float]] = 0,
|
constant_values: Union[float, Iterable[float]] = 0,
|
||||||
data_format: Optional[ChannelDimension] = None,
|
data_format: Optional[ChannelDimension] = None,
|
||||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||||
|
update_bboxes: bool = True,
|
||||||
) -> np.ndarray:
|
) -> np.ndarray:
|
||||||
"""
|
"""
|
||||||
Pad an image with zeros to the given size.
|
Pad an image with zeros to the given size.
|
||||||
@@ -946,16 +999,22 @@ class YolosImageProcessor(BaseImageProcessor):
|
|||||||
data_format=data_format,
|
data_format=data_format,
|
||||||
input_data_format=input_data_format,
|
input_data_format=input_data_format,
|
||||||
)
|
)
|
||||||
return padded_image
|
if annotation is not None:
|
||||||
|
annotation = self._update_annotation_for_padded_image(
|
||||||
|
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
|
||||||
|
)
|
||||||
|
return padded_image, annotation
|
||||||
|
|
||||||
def pad(
|
def pad(
|
||||||
self,
|
self,
|
||||||
images: List[np.ndarray],
|
images: List[np.ndarray],
|
||||||
|
annotations: Optional[List[Dict[str, Any]]] = None,
|
||||||
constant_values: Union[float, Iterable[float]] = 0,
|
constant_values: Union[float, Iterable[float]] = 0,
|
||||||
return_pixel_mask: bool = False,
|
return_pixel_mask: bool = False,
|
||||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||||
data_format: Optional[ChannelDimension] = None,
|
data_format: Optional[ChannelDimension] = None,
|
||||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||||
|
update_bboxes: bool = True,
|
||||||
) -> BatchFeature:
|
) -> BatchFeature:
|
||||||
"""
|
"""
|
||||||
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
|
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
|
||||||
@@ -964,6 +1023,9 @@ class YolosImageProcessor(BaseImageProcessor):
|
|||||||
Args:
|
Args:
|
||||||
image (`np.ndarray`):
|
image (`np.ndarray`):
|
||||||
Image to pad.
|
Image to pad.
|
||||||
|
annotations (`List[Dict[str, Any]]`, *optional*):
|
||||||
|
Annotations to pad along with the images. If provided, the bounding boxes will be updated to match the
|
||||||
|
padded images.
|
||||||
constant_values (`float` or `Iterable[float]`, *optional*):
|
constant_values (`float` or `Iterable[float]`, *optional*):
|
||||||
The value to use for the padding if `mode` is `"constant"`.
|
The value to use for the padding if `mode` is `"constant"`.
|
||||||
return_pixel_mask (`bool`, *optional*, defaults to `True`):
|
return_pixel_mask (`bool`, *optional*, defaults to `True`):
|
||||||
@@ -979,19 +1041,29 @@ class YolosImageProcessor(BaseImageProcessor):
|
|||||||
The channel dimension format of the image. If not provided, it will be the same as the input image.
|
The channel dimension format of the image. If not provided, it will be the same as the input image.
|
||||||
input_data_format (`ChannelDimension` or `str`, *optional*):
|
input_data_format (`ChannelDimension` or `str`, *optional*):
|
||||||
The channel dimension format of the input image. If not provided, it will be inferred.
|
The channel dimension format of the input image. If not provided, it will be inferred.
|
||||||
|
update_bboxes (`bool`, *optional*, defaults to `True`):
|
||||||
|
Whether to update the bounding boxes in the annotations to match the padded images. If the
|
||||||
|
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
|
||||||
|
format, the bounding boxes will not be updated.
|
||||||
"""
|
"""
|
||||||
pad_size = get_max_height_width(images, input_data_format=input_data_format)
|
pad_size = get_max_height_width(images, input_data_format=input_data_format)
|
||||||
|
|
||||||
padded_images = [
|
annotation_list = annotations if annotations is not None else [None] * len(images)
|
||||||
self._pad_image(
|
padded_images = []
|
||||||
|
padded_annotations = []
|
||||||
|
for image, annotation in zip(images, annotation_list):
|
||||||
|
padded_image, padded_annotation = self._pad_image(
|
||||||
image,
|
image,
|
||||||
pad_size,
|
pad_size,
|
||||||
|
annotation,
|
||||||
constant_values=constant_values,
|
constant_values=constant_values,
|
||||||
data_format=data_format,
|
data_format=data_format,
|
||||||
input_data_format=input_data_format,
|
input_data_format=input_data_format,
|
||||||
|
update_bboxes=update_bboxes,
|
||||||
)
|
)
|
||||||
for image in images
|
padded_images.append(padded_image)
|
||||||
]
|
padded_annotations.append(padded_annotation)
|
||||||
|
|
||||||
data = {"pixel_values": padded_images}
|
data = {"pixel_values": padded_images}
|
||||||
|
|
||||||
if return_pixel_mask:
|
if return_pixel_mask:
|
||||||
@@ -1017,6 +1089,7 @@ class YolosImageProcessor(BaseImageProcessor):
|
|||||||
do_normalize: Optional[bool] = None,
|
do_normalize: Optional[bool] = None,
|
||||||
image_mean: Optional[Union[float, List[float]]] = None,
|
image_mean: Optional[Union[float, List[float]]] = None,
|
||||||
image_std: Optional[Union[float, List[float]]] = None,
|
image_std: Optional[Union[float, List[float]]] = None,
|
||||||
|
do_convert_annotations: Optional[bool] = None,
|
||||||
do_pad: Optional[bool] = None,
|
do_pad: Optional[bool] = None,
|
||||||
format: Optional[Union[str, AnnotationFormat]] = None,
|
format: Optional[Union[str, AnnotationFormat]] = None,
|
||||||
return_tensors: Optional[Union[TensorType, str]] = None,
|
return_tensors: Optional[Union[TensorType, str]] = None,
|
||||||
@@ -1062,8 +1135,13 @@ class YolosImageProcessor(BaseImageProcessor):
|
|||||||
Mean to use when normalizing the image.
|
Mean to use when normalizing the image.
|
||||||
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
|
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
|
||||||
Standard deviation to use when normalizing the image.
|
Standard deviation to use when normalizing the image.
|
||||||
|
do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
|
||||||
|
Whether to convert the annotations to the format expected by the model. Converts the bounding
|
||||||
|
boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
|
||||||
|
and to relative coordinates.
|
||||||
do_pad (`bool`, *optional*, defaults to self.do_pad):
|
do_pad (`bool`, *optional*, defaults to self.do_pad):
|
||||||
Whether to pad the image.
|
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
|
||||||
|
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
|
||||||
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
|
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
|
||||||
Format of the annotations.
|
Format of the annotations.
|
||||||
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
|
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
|
||||||
@@ -1101,6 +1179,9 @@ class YolosImageProcessor(BaseImageProcessor):
|
|||||||
do_normalize = self.do_normalize if do_normalize is None else do_normalize
|
do_normalize = self.do_normalize if do_normalize is None else do_normalize
|
||||||
image_mean = self.image_mean if image_mean is None else image_mean
|
image_mean = self.image_mean if image_mean is None else image_mean
|
||||||
image_std = self.image_std if image_std is None else image_std
|
image_std = self.image_std if image_std is None else image_std
|
||||||
|
do_convert_annotations = (
|
||||||
|
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
|
||||||
|
)
|
||||||
do_pad = self.do_pad if do_pad is None else do_pad
|
do_pad = self.do_pad if do_pad is None else do_pad
|
||||||
format = self.format if format is None else format
|
format = self.format if format is None else format
|
||||||
|
|
||||||
@@ -1204,26 +1285,34 @@ class YolosImageProcessor(BaseImageProcessor):
|
|||||||
images = [
|
images = [
|
||||||
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
|
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
|
||||||
]
|
]
|
||||||
if annotations is not None:
|
|
||||||
annotations = [
|
if do_convert_annotations and annotations is not None:
|
||||||
self.normalize_annotation(annotation, get_image_size(image))
|
annotations = [
|
||||||
for annotation, image in zip(annotations, images)
|
self.normalize_annotation(annotation, get_image_size(image))
|
||||||
]
|
for annotation, image in zip(annotations, images)
|
||||||
|
]
|
||||||
|
|
||||||
if do_pad:
|
if do_pad:
|
||||||
data = self.pad(images, data_format=data_format, input_data_format=input_data_format)
|
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
|
||||||
|
encoded_inputs = self.pad(
|
||||||
|
images,
|
||||||
|
annotations=annotations,
|
||||||
|
return_pixel_mask=True,
|
||||||
|
data_format=data_format,
|
||||||
|
input_data_format=input_data_format,
|
||||||
|
update_bboxes=do_convert_annotations,
|
||||||
|
return_tensors=return_tensors,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
images = [
|
images = [
|
||||||
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
||||||
for image in images
|
for image in images
|
||||||
]
|
]
|
||||||
data = {"pixel_values": images}
|
encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
|
||||||
|
if annotations is not None:
|
||||||
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
|
encoded_inputs["labels"] = [
|
||||||
if annotations is not None:
|
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
|
||||||
encoded_inputs["labels"] = [
|
]
|
||||||
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
|
|
||||||
]
|
|
||||||
|
|
||||||
return encoded_inputs
|
return encoded_inputs
|
||||||
|
|
||||||
|
|||||||
@@ -248,3 +248,246 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess
|
|||||||
# verify size
|
# verify size
|
||||||
expected_size = torch.tensor([800, 1066])
|
expected_size = torch.tensor([800, 1066])
|
||||||
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
|
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
|
||||||
|
|
||||||
|
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->ConditionalDetr, facebook/detr-resnet-50 ->microsoft/conditional-detr-resnet-50
def test_batched_coco_detection_annotations(self):
    # Runs two differently sized copies of one COCO image through the processor as a batch and
    # compares pixel values, boxes and masks against expectations for the padded output.
    # NOTE(review): the `# Copied from` marker above suggests this body is meant to mirror the
    # DETR test -- confirm against the repo's copy check before letting the two diverge.
    image_path = "./tests/fixtures/tests_samples/COCO/000000039769.png"
    first_image = Image.open(image_path)
    second_image = Image.open(image_path).resize((800, 800))

    with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as annotation_file:
        coco_objects = json.loads(annotation_file.read())

    first_target = {"image_id": 39769, "annotations": coco_objects}
    second_target = {"image_id": 39769, "annotations": coco_objects}

    # Rescale the boxes to the resized copy. Both targets wrap the same `coco_objects` list, so
    # the rewritten "bbox" entries are seen through `first_target` too.
    # NOTE(review): the hard-coded expectations below presumably account for that -- confirm.
    first_width, first_height = first_image.size
    second_width, second_height = second_image.size
    for coco_object in second_target["annotations"]:
        box = coco_object["bbox"]
        coco_object["bbox"] = [
            box[0] * second_width / first_width,
            box[1] * second_height / first_height,
            box[2] * second_width / first_width,
            box[3] * second_height / first_height,
        ]

    image_batch = [first_image, second_image]
    target_batch = [first_target, second_target]

    processor = ConditionalDetrImageProcessor()
    # `do_convert_annotations` is not passed on this first call.
    encoding = processor(
        images=image_batch,
        annotations=target_batch,
        return_segmentation_masks=True,
        return_tensors="pt",
    )

    # The pixel values of both images must share one padded size.
    postprocessed_height, postprocessed_width = 800, 1066
    self.assertEqual(
        encoding["pixel_values"].shape,
        torch.Size([2, 3, postprocessed_height, postprocessed_width]),
    )

    # The boxes must have been adjusted for the padded images.
    for label in (encoding["labels"][0], encoding["labels"][1]):
        self.assertEqual(label["boxes"].shape, torch.Size([6, 4]))
    normalized_expected_0 = torch.tensor(
        [
            [0.6879, 0.4609, 0.0755, 0.3691],
            [0.2118, 0.3359, 0.2601, 0.1566],
            [0.5011, 0.5000, 0.9979, 1.0000],
            [0.5010, 0.5020, 0.9979, 0.9959],
            [0.3284, 0.5944, 0.5884, 0.8112],
            [0.8394, 0.5445, 0.3213, 0.9110],
        ]
    )
    normalized_expected_1 = torch.tensor(
        [
            [0.4130, 0.2765, 0.0453, 0.2215],
            [0.1272, 0.2016, 0.1561, 0.0940],
            [0.3757, 0.4933, 0.7488, 0.9865],
            [0.3759, 0.5002, 0.7492, 0.9955],
            [0.1971, 0.5456, 0.3532, 0.8646],
            [0.5790, 0.4115, 0.3430, 0.7161],
        ]
    )
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], normalized_expected_0, rtol=1e-3))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], normalized_expected_1, rtol=1e-3))

    # The masks must have been padded as well.
    for label in (encoding["labels"][0], encoding["labels"][1]):
        self.assertEqual(label["masks"].shape, torch.Size([6, 800, 1066]))

    # With do_convert_annotations=False the boxes are expected to stay out of the normalised
    # centre_x, centre_y, width, height format, i.e. not in the range [0, 1].
    encoding = processor(
        images=image_batch,
        annotations=target_batch,
        return_segmentation_masks=True,
        do_convert_annotations=False,
        return_tensors="pt",
    )
    for label in (encoding["labels"][0], encoding["labels"][1]):
        self.assertEqual(label["boxes"].shape, torch.Size([6, 4]))

    def to_absolute_corners(normalized_boxes):
        # Scale to absolute coordinates, then go from centre_x, centre_y, width, height
        # to x_min, y_min, x_max, y_max.
        centre_x = normalized_boxes[:, 0] * postprocessed_width
        centre_y = normalized_boxes[:, 1] * postprocessed_height
        box_width = normalized_boxes[:, 2] * postprocessed_width
        box_height = normalized_boxes[:, 3] * postprocessed_height
        return torch.vstack(
            [
                centre_x - box_width / 2,
                centre_y - box_height / 2,
                centre_x + box_width / 2,
                centre_y + box_height / 2,
            ]
        ).T

    absolute_expected_0 = to_absolute_corners(normalized_expected_0)
    absolute_expected_1 = to_absolute_corners(normalized_expected_1)
    # NOTE(review): rtol=1 tolerates a difference as large as the expected value itself, so this
    # comparison is very loose -- consider tightening it.
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], absolute_expected_0, rtol=1))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], absolute_expected_1, rtol=1))
|
||||||
|
|
||||||
|
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->ConditionalDetr
def test_batched_coco_panoptic_annotations(self):
    # Runs two differently sized copies of one COCO image, with panoptic annotations, through
    # the processor as a batch and compares pixel values, boxes and masks against expectations
    # for the padded output.
    # NOTE(review): the `# Copied from` marker above suggests this body is meant to mirror the
    # DETR test -- confirm against the repo's copy check before letting the two diverge.
    image_path = "./tests/fixtures/tests_samples/COCO/000000039769.png"
    first_image = Image.open(image_path)
    second_image = Image.open(image_path).resize((800, 800))

    with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as annotation_file:
        segments = json.loads(annotation_file.read())

    first_target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": segments}
    second_target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": segments}

    # Rescale the boxes to the resized copy. Both targets wrap the same `segments` list, so the
    # rewritten "bbox" entries are seen through `first_target` too.
    # NOTE(review): the hard-coded expectations below presumably account for that -- confirm.
    first_width, first_height = first_image.size
    second_width, second_height = second_image.size
    for segment in second_target["segments_info"]:
        box = segment["bbox"]
        segment["bbox"] = [
            box[0] * second_width / first_width,
            box[1] * second_height / first_height,
            box[2] * second_width / first_width,
            box[3] * second_height / first_height,
        ]

    masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")

    image_batch = [first_image, second_image]
    target_batch = [first_target, second_target]

    # Encode the batch with the panoptic annotation format.
    processor = ConditionalDetrImageProcessor(format="coco_panoptic")
    encoding = processor(
        images=image_batch,
        annotations=target_batch,
        masks_path=masks_path,
        return_tensors="pt",
        return_segmentation_masks=True,
    )

    # The pixel values of both images must share one padded size.
    postprocessed_height, postprocessed_width = 800, 1066
    self.assertEqual(
        encoding["pixel_values"].shape,
        torch.Size([2, 3, postprocessed_height, postprocessed_width]),
    )

    # The boxes must have been adjusted for the padded images.
    for label in (encoding["labels"][0], encoding["labels"][1]):
        self.assertEqual(label["boxes"].shape, torch.Size([6, 4]))
    normalized_expected_0 = torch.tensor(
        [
            [0.2625, 0.5437, 0.4688, 0.8625],
            [0.7719, 0.4104, 0.4531, 0.7125],
            [0.5000, 0.4927, 0.9969, 0.9854],
            [0.1688, 0.2000, 0.2063, 0.0917],
            [0.5492, 0.2760, 0.0578, 0.2187],
            [0.4992, 0.4990, 0.9984, 0.9979],
        ]
    )
    normalized_expected_1 = torch.tensor(
        [
            [0.1576, 0.3262, 0.2814, 0.5175],
            [0.4634, 0.2463, 0.2720, 0.4275],
            [0.3002, 0.2956, 0.5985, 0.5913],
            [0.1013, 0.1200, 0.1238, 0.0550],
            [0.3297, 0.1656, 0.0347, 0.1312],
            [0.2997, 0.2994, 0.5994, 0.5987],
        ]
    )
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], normalized_expected_0, rtol=1e-3))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], normalized_expected_1, rtol=1e-3))

    # The masks must have been padded as well.
    for label in (encoding["labels"][0], encoding["labels"][1]):
        self.assertEqual(label["masks"].shape, torch.Size([6, 800, 1066]))

    # With do_convert_annotations=False the boxes are expected to stay out of the normalised
    # centre_x, centre_y, width, height format, i.e. not in the range [0, 1].
    encoding = processor(
        images=image_batch,
        annotations=target_batch,
        masks_path=masks_path,
        return_segmentation_masks=True,
        do_convert_annotations=False,
        return_tensors="pt",
    )
    for label in (encoding["labels"][0], encoding["labels"][1]):
        self.assertEqual(label["boxes"].shape, torch.Size([6, 4]))

    def to_absolute_corners(normalized_boxes):
        # Scale to absolute coordinates, then go from centre_x, centre_y, width, height
        # to x_min, y_min, x_max, y_max.
        centre_x = normalized_boxes[:, 0] * postprocessed_width
        centre_y = normalized_boxes[:, 1] * postprocessed_height
        box_width = normalized_boxes[:, 2] * postprocessed_width
        box_height = normalized_boxes[:, 3] * postprocessed_height
        return torch.vstack(
            [
                centre_x - box_width / 2,
                centre_y - box_height / 2,
                centre_x + box_width / 2,
                centre_y + box_height / 2,
            ]
        ).T

    absolute_expected_0 = to_absolute_corners(normalized_expected_0)
    absolute_expected_1 = to_absolute_corners(normalized_expected_1)
    # NOTE(review): rtol=1 tolerates a difference as large as the expected value itself, so this
    # comparison is very loose -- consider tightening it.
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], absolute_expected_0, rtol=1))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], absolute_expected_1, rtol=1))
|
||||||
|
|||||||
@@ -250,3 +250,246 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
|
|||||||
# verify size
|
# verify size
|
||||||
expected_size = torch.tensor([800, 1066])
|
expected_size = torch.tensor([800, 1066])
|
||||||
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
|
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
|
||||||
|
|
||||||
|
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->DeformableDetr
def test_batched_coco_detection_annotations(self):
    # Runs two differently sized copies of one COCO image through the processor as a batch and
    # compares pixel values, boxes and masks against expectations for the padded output.
    # NOTE(review): the `# Copied from` marker above suggests this body is meant to mirror the
    # DETR test -- confirm against the repo's copy check before letting the two diverge.
    image_path = "./tests/fixtures/tests_samples/COCO/000000039769.png"
    first_image = Image.open(image_path)
    second_image = Image.open(image_path).resize((800, 800))

    with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as annotation_file:
        coco_objects = json.loads(annotation_file.read())

    first_target = {"image_id": 39769, "annotations": coco_objects}
    second_target = {"image_id": 39769, "annotations": coco_objects}

    # Rescale the boxes to the resized copy. Both targets wrap the same `coco_objects` list, so
    # the rewritten "bbox" entries are seen through `first_target` too.
    # NOTE(review): the hard-coded expectations below presumably account for that -- confirm.
    first_width, first_height = first_image.size
    second_width, second_height = second_image.size
    for coco_object in second_target["annotations"]:
        box = coco_object["bbox"]
        coco_object["bbox"] = [
            box[0] * second_width / first_width,
            box[1] * second_height / first_height,
            box[2] * second_width / first_width,
            box[3] * second_height / first_height,
        ]

    image_batch = [first_image, second_image]
    target_batch = [first_target, second_target]

    processor = DeformableDetrImageProcessor()
    # `do_convert_annotations` is not passed on this first call.
    encoding = processor(
        images=image_batch,
        annotations=target_batch,
        return_segmentation_masks=True,
        return_tensors="pt",
    )

    # The pixel values of both images must share one padded size.
    postprocessed_height, postprocessed_width = 800, 1066
    self.assertEqual(
        encoding["pixel_values"].shape,
        torch.Size([2, 3, postprocessed_height, postprocessed_width]),
    )

    # The boxes must have been adjusted for the padded images.
    for label in (encoding["labels"][0], encoding["labels"][1]):
        self.assertEqual(label["boxes"].shape, torch.Size([6, 4]))
    normalized_expected_0 = torch.tensor(
        [
            [0.6879, 0.4609, 0.0755, 0.3691],
            [0.2118, 0.3359, 0.2601, 0.1566],
            [0.5011, 0.5000, 0.9979, 1.0000],
            [0.5010, 0.5020, 0.9979, 0.9959],
            [0.3284, 0.5944, 0.5884, 0.8112],
            [0.8394, 0.5445, 0.3213, 0.9110],
        ]
    )
    normalized_expected_1 = torch.tensor(
        [
            [0.4130, 0.2765, 0.0453, 0.2215],
            [0.1272, 0.2016, 0.1561, 0.0940],
            [0.3757, 0.4933, 0.7488, 0.9865],
            [0.3759, 0.5002, 0.7492, 0.9955],
            [0.1971, 0.5456, 0.3532, 0.8646],
            [0.5790, 0.4115, 0.3430, 0.7161],
        ]
    )
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], normalized_expected_0, rtol=1e-3))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], normalized_expected_1, rtol=1e-3))

    # The masks must have been padded as well.
    for label in (encoding["labels"][0], encoding["labels"][1]):
        self.assertEqual(label["masks"].shape, torch.Size([6, 800, 1066]))

    # With do_convert_annotations=False the boxes are expected to stay out of the normalised
    # centre_x, centre_y, width, height format, i.e. not in the range [0, 1].
    encoding = processor(
        images=image_batch,
        annotations=target_batch,
        return_segmentation_masks=True,
        do_convert_annotations=False,
        return_tensors="pt",
    )
    for label in (encoding["labels"][0], encoding["labels"][1]):
        self.assertEqual(label["boxes"].shape, torch.Size([6, 4]))

    def to_absolute_corners(normalized_boxes):
        # Scale to absolute coordinates, then go from centre_x, centre_y, width, height
        # to x_min, y_min, x_max, y_max.
        centre_x = normalized_boxes[:, 0] * postprocessed_width
        centre_y = normalized_boxes[:, 1] * postprocessed_height
        box_width = normalized_boxes[:, 2] * postprocessed_width
        box_height = normalized_boxes[:, 3] * postprocessed_height
        return torch.vstack(
            [
                centre_x - box_width / 2,
                centre_y - box_height / 2,
                centre_x + box_width / 2,
                centre_y + box_height / 2,
            ]
        ).T

    absolute_expected_0 = to_absolute_corners(normalized_expected_0)
    absolute_expected_1 = to_absolute_corners(normalized_expected_1)
    # NOTE(review): rtol=1 tolerates a difference as large as the expected value itself, so this
    # comparison is very loose -- consider tightening it.
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], absolute_expected_0, rtol=1))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], absolute_expected_1, rtol=1))
|
||||||
|
|
||||||
|
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->DeformableDetr
def test_batched_coco_panoptic_annotations(self):
    # Runs two differently sized copies of one COCO image, with panoptic annotations, through
    # the processor as a batch and compares pixel values, boxes and masks against expectations
    # for the padded output.
    # NOTE(review): the `# Copied from` marker above suggests this body is meant to mirror the
    # DETR test -- confirm against the repo's copy check before letting the two diverge.
    image_path = "./tests/fixtures/tests_samples/COCO/000000039769.png"
    first_image = Image.open(image_path)
    second_image = Image.open(image_path).resize((800, 800))

    with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as annotation_file:
        segments = json.loads(annotation_file.read())

    first_target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": segments}
    second_target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": segments}

    # Rescale the boxes to the resized copy. Both targets wrap the same `segments` list, so the
    # rewritten "bbox" entries are seen through `first_target` too.
    # NOTE(review): the hard-coded expectations below presumably account for that -- confirm.
    first_width, first_height = first_image.size
    second_width, second_height = second_image.size
    for segment in second_target["segments_info"]:
        box = segment["bbox"]
        segment["bbox"] = [
            box[0] * second_width / first_width,
            box[1] * second_height / first_height,
            box[2] * second_width / first_width,
            box[3] * second_height / first_height,
        ]

    masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")

    image_batch = [first_image, second_image]
    target_batch = [first_target, second_target]

    # Encode the batch with the panoptic annotation format.
    processor = DeformableDetrImageProcessor(format="coco_panoptic")
    encoding = processor(
        images=image_batch,
        annotations=target_batch,
        masks_path=masks_path,
        return_tensors="pt",
        return_segmentation_masks=True,
    )

    # The pixel values of both images must share one padded size.
    postprocessed_height, postprocessed_width = 800, 1066
    self.assertEqual(
        encoding["pixel_values"].shape,
        torch.Size([2, 3, postprocessed_height, postprocessed_width]),
    )

    # The boxes must have been adjusted for the padded images.
    for label in (encoding["labels"][0], encoding["labels"][1]):
        self.assertEqual(label["boxes"].shape, torch.Size([6, 4]))
    normalized_expected_0 = torch.tensor(
        [
            [0.2625, 0.5437, 0.4688, 0.8625],
            [0.7719, 0.4104, 0.4531, 0.7125],
            [0.5000, 0.4927, 0.9969, 0.9854],
            [0.1688, 0.2000, 0.2063, 0.0917],
            [0.5492, 0.2760, 0.0578, 0.2187],
            [0.4992, 0.4990, 0.9984, 0.9979],
        ]
    )
    normalized_expected_1 = torch.tensor(
        [
            [0.1576, 0.3262, 0.2814, 0.5175],
            [0.4634, 0.2463, 0.2720, 0.4275],
            [0.3002, 0.2956, 0.5985, 0.5913],
            [0.1013, 0.1200, 0.1238, 0.0550],
            [0.3297, 0.1656, 0.0347, 0.1312],
            [0.2997, 0.2994, 0.5994, 0.5987],
        ]
    )
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], normalized_expected_0, rtol=1e-3))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], normalized_expected_1, rtol=1e-3))

    # The masks must have been padded as well.
    for label in (encoding["labels"][0], encoding["labels"][1]):
        self.assertEqual(label["masks"].shape, torch.Size([6, 800, 1066]))

    # With do_convert_annotations=False the boxes are expected to stay out of the normalised
    # centre_x, centre_y, width, height format, i.e. not in the range [0, 1].
    encoding = processor(
        images=image_batch,
        annotations=target_batch,
        masks_path=masks_path,
        return_segmentation_masks=True,
        do_convert_annotations=False,
        return_tensors="pt",
    )
    for label in (encoding["labels"][0], encoding["labels"][1]):
        self.assertEqual(label["boxes"].shape, torch.Size([6, 4]))

    def to_absolute_corners(normalized_boxes):
        # Scale to absolute coordinates, then go from centre_x, centre_y, width, height
        # to x_min, y_min, x_max, y_max.
        centre_x = normalized_boxes[:, 0] * postprocessed_width
        centre_y = normalized_boxes[:, 1] * postprocessed_height
        box_width = normalized_boxes[:, 2] * postprocessed_width
        box_height = normalized_boxes[:, 3] * postprocessed_height
        return torch.vstack(
            [
                centre_x - box_width / 2,
                centre_y - box_height / 2,
                centre_x + box_width / 2,
                centre_y + box_height / 2,
            ]
        ).T

    absolute_expected_0 = to_absolute_corners(normalized_expected_0)
    absolute_expected_1 = to_absolute_corners(normalized_expected_1)
    # NOTE(review): rtol=1 tolerates a difference as large as the expected value itself, so this
    # comparison is very loose -- consider tightening it.
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], absolute_expected_0, rtol=1))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], absolute_expected_1, rtol=1))
|
||||||
|
|||||||
@@ -244,3 +244,246 @@ class DetaImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
|
|||||||
# verify size
|
# verify size
|
||||||
expected_size = torch.tensor([800, 1066])
|
expected_size = torch.tensor([800, 1066])
|
||||||
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
|
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
|
||||||
|
|
||||||
|
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->Deta
def test_batched_coco_detection_annotations(self):
    # Runs two differently sized copies of one COCO image through the processor as a batch and
    # compares pixel values, boxes and masks against expectations for the padded output.
    # NOTE(review): the `# Copied from` marker above suggests this body is meant to mirror the
    # DETR test -- confirm against the repo's copy check before letting the two diverge.
    image_path = "./tests/fixtures/tests_samples/COCO/000000039769.png"
    first_image = Image.open(image_path)
    second_image = Image.open(image_path).resize((800, 800))

    with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as annotation_file:
        coco_objects = json.loads(annotation_file.read())

    first_target = {"image_id": 39769, "annotations": coco_objects}
    second_target = {"image_id": 39769, "annotations": coco_objects}

    # Rescale the boxes to the resized copy. Both targets wrap the same `coco_objects` list, so
    # the rewritten "bbox" entries are seen through `first_target` too.
    # NOTE(review): the hard-coded expectations below presumably account for that -- confirm.
    first_width, first_height = first_image.size
    second_width, second_height = second_image.size
    for coco_object in second_target["annotations"]:
        box = coco_object["bbox"]
        coco_object["bbox"] = [
            box[0] * second_width / first_width,
            box[1] * second_height / first_height,
            box[2] * second_width / first_width,
            box[3] * second_height / first_height,
        ]

    image_batch = [first_image, second_image]
    target_batch = [first_target, second_target]

    processor = DetaImageProcessor()
    # `do_convert_annotations` is not passed on this first call.
    encoding = processor(
        images=image_batch,
        annotations=target_batch,
        return_segmentation_masks=True,
        return_tensors="pt",
    )

    # The pixel values of both images must share one padded size.
    postprocessed_height, postprocessed_width = 800, 1066
    self.assertEqual(
        encoding["pixel_values"].shape,
        torch.Size([2, 3, postprocessed_height, postprocessed_width]),
    )

    # The boxes must have been adjusted for the padded images.
    for label in (encoding["labels"][0], encoding["labels"][1]):
        self.assertEqual(label["boxes"].shape, torch.Size([6, 4]))
    normalized_expected_0 = torch.tensor(
        [
            [0.6879, 0.4609, 0.0755, 0.3691],
            [0.2118, 0.3359, 0.2601, 0.1566],
            [0.5011, 0.5000, 0.9979, 1.0000],
            [0.5010, 0.5020, 0.9979, 0.9959],
            [0.3284, 0.5944, 0.5884, 0.8112],
            [0.8394, 0.5445, 0.3213, 0.9110],
        ]
    )
    normalized_expected_1 = torch.tensor(
        [
            [0.4130, 0.2765, 0.0453, 0.2215],
            [0.1272, 0.2016, 0.1561, 0.0940],
            [0.3757, 0.4933, 0.7488, 0.9865],
            [0.3759, 0.5002, 0.7492, 0.9955],
            [0.1971, 0.5456, 0.3532, 0.8646],
            [0.5790, 0.4115, 0.3430, 0.7161],
        ]
    )
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], normalized_expected_0, rtol=1e-3))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], normalized_expected_1, rtol=1e-3))

    # The masks must have been padded as well.
    for label in (encoding["labels"][0], encoding["labels"][1]):
        self.assertEqual(label["masks"].shape, torch.Size([6, 800, 1066]))

    # With do_convert_annotations=False the boxes are expected to stay out of the normalised
    # centre_x, centre_y, width, height format, i.e. not in the range [0, 1].
    encoding = processor(
        images=image_batch,
        annotations=target_batch,
        return_segmentation_masks=True,
        do_convert_annotations=False,
        return_tensors="pt",
    )
    for label in (encoding["labels"][0], encoding["labels"][1]):
        self.assertEqual(label["boxes"].shape, torch.Size([6, 4]))

    def to_absolute_corners(normalized_boxes):
        # Scale to absolute coordinates, then go from centre_x, centre_y, width, height
        # to x_min, y_min, x_max, y_max.
        centre_x = normalized_boxes[:, 0] * postprocessed_width
        centre_y = normalized_boxes[:, 1] * postprocessed_height
        box_width = normalized_boxes[:, 2] * postprocessed_width
        box_height = normalized_boxes[:, 3] * postprocessed_height
        return torch.vstack(
            [
                centre_x - box_width / 2,
                centre_y - box_height / 2,
                centre_x + box_width / 2,
                centre_y + box_height / 2,
            ]
        ).T

    absolute_expected_0 = to_absolute_corners(normalized_expected_0)
    absolute_expected_1 = to_absolute_corners(normalized_expected_1)
    # NOTE(review): rtol=1 tolerates a difference as large as the expected value itself, so this
    # comparison is very loose -- consider tightening it.
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], absolute_expected_0, rtol=1))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], absolute_expected_1, rtol=1))
|
||||||
|
|
||||||
|
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->Deta
def test_batched_coco_panoptic_annotations(self):
    """Check batched COCO panoptic preprocessing keeps boxes and masks aligned with the padded images.

    Two differently sized copies of the same COCO image are encoded in one batch. The test
    verifies the padded pixel-value shape, the normalized (centre_x, centre_y, width, height)
    boxes, the padded mask shapes and, with `do_convert_annotations=False`, the absolute
    (x_min, y_min, x_max, y_max) boxes.
    """
    # prepare image, target and masks_path
    image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))

    with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
        target = json.load(f)

    # NOTE(review): both annotation dicts wrap the same `target` list, so the in-place bbox
    # rescale below is also visible through `annotation_0`. Whether the processor reads these
    # `bbox` entries for the panoptic format is not visible here - confirm before decoupling.
    annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
    annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}

    # Adjust the bounding boxes for the resized image
    w_0, h_0 = image_0.size
    w_1, h_1 = image_1.size
    for segment in annotation_1["segments_info"]:
        coords = segment["bbox"]
        segment["bbox"] = [
            coords[0] * w_1 / w_0,
            coords[1] * h_1 / h_0,
            coords[2] * w_1 / w_0,
            coords[3] * h_1 / h_0,
        ]

    masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")

    images = [image_0, image_1]
    annotations = [annotation_0, annotation_1]

    # encode them
    image_processing = DetaImageProcessor(format="coco_panoptic")
    encoding = image_processing(
        images=images,
        annotations=annotations,
        masks_path=masks_path,
        return_tensors="pt",
        return_segmentation_masks=True,
    )

    # Check the pixel values have been padded
    postprocessed_height, postprocessed_width = 800, 1066
    expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
    self.assertEqual(encoding["pixel_values"].shape, expected_shape)

    # Check the bounding boxes have been adjusted for padded images
    self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
    self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
    expected_boxes_0 = torch.tensor(
        [
            [0.2625, 0.5437, 0.4688, 0.8625],
            [0.7719, 0.4104, 0.4531, 0.7125],
            [0.5000, 0.4927, 0.9969, 0.9854],
            [0.1688, 0.2000, 0.2063, 0.0917],
            [0.5492, 0.2760, 0.0578, 0.2187],
            [0.4992, 0.4990, 0.9984, 0.9979],
        ]
    )
    expected_boxes_1 = torch.tensor(
        [
            [0.1576, 0.3262, 0.2814, 0.5175],
            [0.4634, 0.2463, 0.2720, 0.4275],
            [0.3002, 0.2956, 0.5985, 0.5913],
            [0.1013, 0.1200, 0.1238, 0.0550],
            [0.3297, 0.1656, 0.0347, 0.1312],
            [0.2997, 0.2994, 0.5994, 0.5987],
        ]
    )
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))

    # Check the masks have also been padded
    expected_mask_shape = torch.Size([6, postprocessed_height, postprocessed_width])
    self.assertEqual(encoding["labels"][0]["masks"].shape, expected_mask_shape)
    self.assertEqual(encoding["labels"][1]["masks"].shape, expected_mask_shape)

    # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
    # format and not in the range [0, 1]
    encoding = image_processing(
        images=images,
        annotations=annotations,
        masks_path=masks_path,
        return_segmentation_masks=True,
        do_convert_annotations=False,
        return_tensors="pt",
    )
    self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
    self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
    # Convert to absolute coordinates
    pixel_scale = torch.tensor(
        [postprocessed_width, postprocessed_height, postprocessed_width, postprocessed_height],
        dtype=expected_boxes_0.dtype,
    )
    unnormalized_boxes_0 = expected_boxes_0 * pixel_scale
    unnormalized_boxes_1 = expected_boxes_1 * pixel_scale
    # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
    half_sizes_0 = unnormalized_boxes_0[:, 2:] / 2
    half_sizes_1 = unnormalized_boxes_1[:, 2:] / 2
    expected_boxes_0 = torch.cat(
        [unnormalized_boxes_0[:, :2] - half_sizes_0, unnormalized_boxes_0[:, :2] + half_sizes_0], dim=1
    )
    expected_boxes_1 = torch.cat(
        [unnormalized_boxes_1[:, :2] - half_sizes_1, unnormalized_boxes_1[:, :2] + half_sizes_1], dim=1
    )
    # Compare with an absolute tolerance of one pixel: the expectations are rebuilt from values
    # rounded to four decimals. A relative tolerance of 1 would accept any box within 100% of
    # the expected value and so would check almost nothing.
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1))
|
||||||
|
|||||||
@@ -13,7 +13,6 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import pathlib
|
import pathlib
|
||||||
import unittest
|
import unittest
|
||||||
@@ -308,3 +307,244 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
|
|||||||
# verify size
|
# verify size
|
||||||
expected_size = torch.tensor([800, 1066])
|
expected_size = torch.tensor([800, 1066])
|
||||||
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
|
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
|
||||||
|
|
||||||
|
@slow
def test_batched_coco_detection_annotations(self):
    """Check batched COCO detection preprocessing keeps boxes and masks aligned with the padded images.

    Two differently sized copies of the same COCO image are encoded in one batch. The test
    verifies the padded pixel-value shape, the normalized (centre_x, centre_y, width, height)
    boxes, the padded mask shapes and, with `do_convert_annotations=False`, the absolute
    (x_min, y_min, x_max, y_max) boxes.
    """
    image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))

    with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
        target = json.load(f)

    # NOTE(review): both annotation dicts wrap the same `target` list, so the in-place bbox
    # rescale below is also visible through `annotations_0`. The hard-coded expectations
    # further down were presumably recorded with this aliasing in effect - confirm before
    # decoupling the two.
    annotations_0 = {"image_id": 39769, "annotations": target}
    annotations_1 = {"image_id": 39769, "annotations": target}

    # Adjust the bounding boxes for the resized image
    w_0, h_0 = image_0.size
    w_1, h_1 = image_1.size
    for annotation in annotations_1["annotations"]:
        coords = annotation["bbox"]
        annotation["bbox"] = [
            coords[0] * w_1 / w_0,
            coords[1] * h_1 / h_0,
            coords[2] * w_1 / w_0,
            coords[3] * h_1 / h_0,
        ]

    images = [image_0, image_1]
    annotations = [annotations_0, annotations_1]

    image_processing = DetrImageProcessor()
    encoding = image_processing(
        images=images,
        annotations=annotations,
        return_segmentation_masks=True,
        return_tensors="pt",  # do_convert_annotations=True
    )

    # Check the pixel values have been padded
    postprocessed_height, postprocessed_width = 800, 1066
    expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
    self.assertEqual(encoding["pixel_values"].shape, expected_shape)

    # Check the bounding boxes have been adjusted for padded images
    self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
    self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
    expected_boxes_0 = torch.tensor(
        [
            [0.6879, 0.4609, 0.0755, 0.3691],
            [0.2118, 0.3359, 0.2601, 0.1566],
            [0.5011, 0.5000, 0.9979, 1.0000],
            [0.5010, 0.5020, 0.9979, 0.9959],
            [0.3284, 0.5944, 0.5884, 0.8112],
            [0.8394, 0.5445, 0.3213, 0.9110],
        ]
    )
    expected_boxes_1 = torch.tensor(
        [
            [0.4130, 0.2765, 0.0453, 0.2215],
            [0.1272, 0.2016, 0.1561, 0.0940],
            [0.3757, 0.4933, 0.7488, 0.9865],
            [0.3759, 0.5002, 0.7492, 0.9955],
            [0.1971, 0.5456, 0.3532, 0.8646],
            [0.5790, 0.4115, 0.3430, 0.7161],
        ]
    )
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))

    # Check the masks have also been padded
    expected_mask_shape = torch.Size([6, postprocessed_height, postprocessed_width])
    self.assertEqual(encoding["labels"][0]["masks"].shape, expected_mask_shape)
    self.assertEqual(encoding["labels"][1]["masks"].shape, expected_mask_shape)

    # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
    # format and not in the range [0, 1]
    encoding = image_processing(
        images=images,
        annotations=annotations,
        return_segmentation_masks=True,
        do_convert_annotations=False,
        return_tensors="pt",
    )
    self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
    self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
    # Convert to absolute coordinates
    pixel_scale = torch.tensor(
        [postprocessed_width, postprocessed_height, postprocessed_width, postprocessed_height],
        dtype=expected_boxes_0.dtype,
    )
    unnormalized_boxes_0 = expected_boxes_0 * pixel_scale
    unnormalized_boxes_1 = expected_boxes_1 * pixel_scale
    # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
    half_sizes_0 = unnormalized_boxes_0[:, 2:] / 2
    half_sizes_1 = unnormalized_boxes_1[:, 2:] / 2
    expected_boxes_0 = torch.cat(
        [unnormalized_boxes_0[:, :2] - half_sizes_0, unnormalized_boxes_0[:, :2] + half_sizes_0], dim=1
    )
    expected_boxes_1 = torch.cat(
        [unnormalized_boxes_1[:, :2] - half_sizes_1, unnormalized_boxes_1[:, :2] + half_sizes_1], dim=1
    )
    # Compare with an absolute tolerance of one pixel: the expectations are rebuilt from values
    # rounded to four decimals. A relative tolerance of 1 would accept any box within 100% of
    # the expected value and so would check almost nothing.
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1))
|
||||||
|
|
||||||
|
@slow
def test_batched_coco_panoptic_annotations(self):
    """Check batched COCO panoptic preprocessing keeps boxes and masks aligned with the padded images.

    Two differently sized copies of the same COCO image are encoded in one batch. The test
    verifies the padded pixel-value shape, the normalized (centre_x, centre_y, width, height)
    boxes, the padded mask shapes and, with `do_convert_annotations=False`, the absolute
    (x_min, y_min, x_max, y_max) boxes.
    """
    # prepare image, target and masks_path
    image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))

    with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
        target = json.load(f)

    # NOTE(review): both annotation dicts wrap the same `target` list, so the in-place bbox
    # rescale below is also visible through `annotation_0`. Whether the processor reads these
    # `bbox` entries for the panoptic format is not visible here - confirm before decoupling.
    annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
    annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}

    # Adjust the bounding boxes for the resized image
    w_0, h_0 = image_0.size
    w_1, h_1 = image_1.size
    for segment in annotation_1["segments_info"]:
        coords = segment["bbox"]
        segment["bbox"] = [
            coords[0] * w_1 / w_0,
            coords[1] * h_1 / h_0,
            coords[2] * w_1 / w_0,
            coords[3] * h_1 / h_0,
        ]

    masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")

    images = [image_0, image_1]
    annotations = [annotation_0, annotation_1]

    # encode them
    image_processing = DetrImageProcessor(format="coco_panoptic")
    encoding = image_processing(
        images=images,
        annotations=annotations,
        masks_path=masks_path,
        return_tensors="pt",
        return_segmentation_masks=True,
    )

    # Check the pixel values have been padded
    postprocessed_height, postprocessed_width = 800, 1066
    expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
    self.assertEqual(encoding["pixel_values"].shape, expected_shape)

    # Check the bounding boxes have been adjusted for padded images
    self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
    self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
    expected_boxes_0 = torch.tensor(
        [
            [0.2625, 0.5437, 0.4688, 0.8625],
            [0.7719, 0.4104, 0.4531, 0.7125],
            [0.5000, 0.4927, 0.9969, 0.9854],
            [0.1688, 0.2000, 0.2063, 0.0917],
            [0.5492, 0.2760, 0.0578, 0.2187],
            [0.4992, 0.4990, 0.9984, 0.9979],
        ]
    )
    expected_boxes_1 = torch.tensor(
        [
            [0.1576, 0.3262, 0.2814, 0.5175],
            [0.4634, 0.2463, 0.2720, 0.4275],
            [0.3002, 0.2956, 0.5985, 0.5913],
            [0.1013, 0.1200, 0.1238, 0.0550],
            [0.3297, 0.1656, 0.0347, 0.1312],
            [0.2997, 0.2994, 0.5994, 0.5987],
        ]
    )
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))

    # Check the masks have also been padded
    expected_mask_shape = torch.Size([6, postprocessed_height, postprocessed_width])
    self.assertEqual(encoding["labels"][0]["masks"].shape, expected_mask_shape)
    self.assertEqual(encoding["labels"][1]["masks"].shape, expected_mask_shape)

    # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
    # format and not in the range [0, 1]
    encoding = image_processing(
        images=images,
        annotations=annotations,
        masks_path=masks_path,
        return_segmentation_masks=True,
        do_convert_annotations=False,
        return_tensors="pt",
    )
    self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
    self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
    # Convert to absolute coordinates
    pixel_scale = torch.tensor(
        [postprocessed_width, postprocessed_height, postprocessed_width, postprocessed_height],
        dtype=expected_boxes_0.dtype,
    )
    unnormalized_boxes_0 = expected_boxes_0 * pixel_scale
    unnormalized_boxes_1 = expected_boxes_1 * pixel_scale
    # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
    half_sizes_0 = unnormalized_boxes_0[:, 2:] / 2
    half_sizes_1 = unnormalized_boxes_1[:, 2:] / 2
    expected_boxes_0 = torch.cat(
        [unnormalized_boxes_0[:, :2] - half_sizes_0, unnormalized_boxes_0[:, :2] + half_sizes_0], dim=1
    )
    expected_boxes_1 = torch.cat(
        [unnormalized_boxes_1[:, :2] - half_sizes_1, unnormalized_boxes_1[:, :2] + half_sizes_1], dim=1
    )
    # Compare with an absolute tolerance of one pixel: the expectations are rebuilt from values
    # rounded to four decimals. A relative tolerance of 1 would accept any box within 100% of
    # the expected value and so would check almost nothing.
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1))
|
||||||
|
|||||||
@@ -287,3 +287,246 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
|
|||||||
# verify size
|
# verify size
|
||||||
expected_size = torch.tensor([800, 1056])
|
expected_size = torch.tensor([800, 1056])
|
||||||
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
|
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
|
||||||
|
|
||||||
|
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->Yolos
def test_batched_coco_detection_annotations(self):
    """Check batched COCO detection preprocessing keeps boxes and masks aligned with the padded images.

    Two differently sized copies of the same COCO image are encoded in one batch. The test
    verifies the padded pixel-value shape, the normalized (centre_x, centre_y, width, height)
    boxes, the padded mask shapes and, with `do_convert_annotations=False`, the absolute
    (x_min, y_min, x_max, y_max) boxes.
    """
    image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))

    with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
        target = json.load(f)

    # NOTE(review): both annotation dicts wrap the same `target` list, so the in-place bbox
    # rescale below is also visible through `annotations_0`. The hard-coded expectations
    # further down were presumably recorded with this aliasing in effect - confirm before
    # decoupling the two.
    annotations_0 = {"image_id": 39769, "annotations": target}
    annotations_1 = {"image_id": 39769, "annotations": target}

    # Adjust the bounding boxes for the resized image
    w_0, h_0 = image_0.size
    w_1, h_1 = image_1.size
    for annotation in annotations_1["annotations"]:
        coords = annotation["bbox"]
        annotation["bbox"] = [
            coords[0] * w_1 / w_0,
            coords[1] * h_1 / h_0,
            coords[2] * w_1 / w_0,
            coords[3] * h_1 / h_0,
        ]

    images = [image_0, image_1]
    annotations = [annotations_0, annotations_1]

    image_processing = YolosImageProcessor()
    encoding = image_processing(
        images=images,
        annotations=annotations,
        return_segmentation_masks=True,
        return_tensors="pt",  # do_convert_annotations=True
    )

    # Check the pixel values have been padded
    postprocessed_height, postprocessed_width = 800, 1066
    expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
    self.assertEqual(encoding["pixel_values"].shape, expected_shape)

    # Check the bounding boxes have been adjusted for padded images
    self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
    self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
    expected_boxes_0 = torch.tensor(
        [
            [0.6879, 0.4609, 0.0755, 0.3691],
            [0.2118, 0.3359, 0.2601, 0.1566],
            [0.5011, 0.5000, 0.9979, 1.0000],
            [0.5010, 0.5020, 0.9979, 0.9959],
            [0.3284, 0.5944, 0.5884, 0.8112],
            [0.8394, 0.5445, 0.3213, 0.9110],
        ]
    )
    expected_boxes_1 = torch.tensor(
        [
            [0.4130, 0.2765, 0.0453, 0.2215],
            [0.1272, 0.2016, 0.1561, 0.0940],
            [0.3757, 0.4933, 0.7488, 0.9865],
            [0.3759, 0.5002, 0.7492, 0.9955],
            [0.1971, 0.5456, 0.3532, 0.8646],
            [0.5790, 0.4115, 0.3430, 0.7161],
        ]
    )
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))

    # Check the masks have also been padded
    expected_mask_shape = torch.Size([6, postprocessed_height, postprocessed_width])
    self.assertEqual(encoding["labels"][0]["masks"].shape, expected_mask_shape)
    self.assertEqual(encoding["labels"][1]["masks"].shape, expected_mask_shape)

    # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
    # format and not in the range [0, 1]
    encoding = image_processing(
        images=images,
        annotations=annotations,
        return_segmentation_masks=True,
        do_convert_annotations=False,
        return_tensors="pt",
    )
    self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
    self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
    # Convert to absolute coordinates
    pixel_scale = torch.tensor(
        [postprocessed_width, postprocessed_height, postprocessed_width, postprocessed_height],
        dtype=expected_boxes_0.dtype,
    )
    unnormalized_boxes_0 = expected_boxes_0 * pixel_scale
    unnormalized_boxes_1 = expected_boxes_1 * pixel_scale
    # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
    half_sizes_0 = unnormalized_boxes_0[:, 2:] / 2
    half_sizes_1 = unnormalized_boxes_1[:, 2:] / 2
    expected_boxes_0 = torch.cat(
        [unnormalized_boxes_0[:, :2] - half_sizes_0, unnormalized_boxes_0[:, :2] + half_sizes_0], dim=1
    )
    expected_boxes_1 = torch.cat(
        [unnormalized_boxes_1[:, :2] - half_sizes_1, unnormalized_boxes_1[:, :2] + half_sizes_1], dim=1
    )
    # Compare with an absolute tolerance of one pixel: the expectations are rebuilt from values
    # rounded to four decimals. A relative tolerance of 1 would accept any box within 100% of
    # the expected value and so would check almost nothing.
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1))
|
||||||
|
|
||||||
|
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->Yolos
def test_batched_coco_panoptic_annotations(self):
    """Check batched COCO panoptic preprocessing keeps boxes and masks aligned with the padded images.

    Two differently sized copies of the same COCO image are encoded in one batch. The test
    verifies the padded pixel-value shape, the normalized (centre_x, centre_y, width, height)
    boxes, the padded mask shapes and, with `do_convert_annotations=False`, the absolute
    (x_min, y_min, x_max, y_max) boxes.
    """
    # prepare image, target and masks_path
    image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))

    with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
        target = json.load(f)

    # NOTE(review): both annotation dicts wrap the same `target` list, so the in-place bbox
    # rescale below is also visible through `annotation_0`. Whether the processor reads these
    # `bbox` entries for the panoptic format is not visible here - confirm before decoupling.
    annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
    annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}

    # Adjust the bounding boxes for the resized image
    w_0, h_0 = image_0.size
    w_1, h_1 = image_1.size
    for segment in annotation_1["segments_info"]:
        coords = segment["bbox"]
        segment["bbox"] = [
            coords[0] * w_1 / w_0,
            coords[1] * h_1 / h_0,
            coords[2] * w_1 / w_0,
            coords[3] * h_1 / h_0,
        ]

    masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")

    images = [image_0, image_1]
    annotations = [annotation_0, annotation_1]

    # encode them
    image_processing = YolosImageProcessor(format="coco_panoptic")
    encoding = image_processing(
        images=images,
        annotations=annotations,
        masks_path=masks_path,
        return_tensors="pt",
        return_segmentation_masks=True,
    )

    # Check the pixel values have been padded
    postprocessed_height, postprocessed_width = 800, 1066
    expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
    self.assertEqual(encoding["pixel_values"].shape, expected_shape)

    # Check the bounding boxes have been adjusted for padded images
    self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
    self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
    expected_boxes_0 = torch.tensor(
        [
            [0.2625, 0.5437, 0.4688, 0.8625],
            [0.7719, 0.4104, 0.4531, 0.7125],
            [0.5000, 0.4927, 0.9969, 0.9854],
            [0.1688, 0.2000, 0.2063, 0.0917],
            [0.5492, 0.2760, 0.0578, 0.2187],
            [0.4992, 0.4990, 0.9984, 0.9979],
        ]
    )
    expected_boxes_1 = torch.tensor(
        [
            [0.1576, 0.3262, 0.2814, 0.5175],
            [0.4634, 0.2463, 0.2720, 0.4275],
            [0.3002, 0.2956, 0.5985, 0.5913],
            [0.1013, 0.1200, 0.1238, 0.0550],
            [0.3297, 0.1656, 0.0347, 0.1312],
            [0.2997, 0.2994, 0.5994, 0.5987],
        ]
    )
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))

    # Check the masks have also been padded
    expected_mask_shape = torch.Size([6, postprocessed_height, postprocessed_width])
    self.assertEqual(encoding["labels"][0]["masks"].shape, expected_mask_shape)
    self.assertEqual(encoding["labels"][1]["masks"].shape, expected_mask_shape)

    # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
    # format and not in the range [0, 1]
    encoding = image_processing(
        images=images,
        annotations=annotations,
        masks_path=masks_path,
        return_segmentation_masks=True,
        do_convert_annotations=False,
        return_tensors="pt",
    )
    self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
    self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
    # Convert to absolute coordinates
    pixel_scale = torch.tensor(
        [postprocessed_width, postprocessed_height, postprocessed_width, postprocessed_height],
        dtype=expected_boxes_0.dtype,
    )
    unnormalized_boxes_0 = expected_boxes_0 * pixel_scale
    unnormalized_boxes_1 = expected_boxes_1 * pixel_scale
    # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
    half_sizes_0 = unnormalized_boxes_0[:, 2:] / 2
    half_sizes_1 = unnormalized_boxes_1[:, 2:] / 2
    expected_boxes_0 = torch.cat(
        [unnormalized_boxes_0[:, :2] - half_sizes_0, unnormalized_boxes_0[:, :2] + half_sizes_0], dim=1
    )
    expected_boxes_1 = torch.cat(
        [unnormalized_boxes_1[:, :2] - half_sizes_1, unnormalized_boxes_1[:, :2] + half_sizes_1], dim=1
    )
    # Compare with an absolute tolerance of one pixel: the expectations are rebuilt from values
    # rounded to four decimals. A relative tolerance of 1 would accept any box within 100% of
    # the expected value and so would check almost nothing.
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1))
|
||||||
|
|||||||
Reference in New Issue
Block a user