[DETR] Update the processing to adapt masks & bboxes to reflect padding (#28363)
* Update the processing so bbox coords are adjusted for padding * Just pad masks * Tidy up, add tests * Better tests * Fix yolos and mark as slow for pycocotools * Fix yolos - return_tensors * Clarify padding and normalization behaviour
This commit is contained in:
@@ -280,7 +280,7 @@ class BridgeTowerImageProcessor(BaseImageProcessor):
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
|
||||
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
|
||||
def _pad_image(
|
||||
self,
|
||||
image: np.ndarray,
|
||||
@@ -308,7 +308,7 @@ class BridgeTowerImageProcessor(BaseImageProcessor):
|
||||
)
|
||||
return padded_image
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
|
||||
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
|
||||
def pad(
|
||||
self,
|
||||
images: List[np.ndarray],
|
||||
|
||||
@@ -785,9 +785,14 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
||||
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
|
||||
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
|
||||
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
||||
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
|
||||
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||
do_pad (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
|
||||
overridden by the `do_pad` parameter in the `preprocess` method.
|
||||
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
|
||||
method. If `True` will pad the images in the batch to the largest height and width in the batch.
|
||||
Padding will be applied to the bottom and right of the image with zeros.
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
@@ -804,6 +809,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
||||
do_normalize: bool = True,
|
||||
image_mean: Union[float, List[float]] = None,
|
||||
image_std: Union[float, List[float]] = None,
|
||||
do_convert_annotations: Optional[bool] = None,
|
||||
do_pad: bool = True,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
@@ -822,6 +828,10 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
||||
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
|
||||
size = get_size_dict(size, max_size=max_size, default_to_square=False)
|
||||
|
||||
# Backwards compatibility
|
||||
if do_convert_annotations is None:
|
||||
do_convert_annotations = do_normalize
|
||||
|
||||
super().__init__(**kwargs)
|
||||
self.format = format
|
||||
self.do_resize = do_resize
|
||||
@@ -830,6 +840,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
||||
self.do_rescale = do_rescale
|
||||
self.rescale_factor = rescale_factor
|
||||
self.do_normalize = do_normalize
|
||||
self.do_convert_annotations = do_convert_annotations
|
||||
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
|
||||
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
|
||||
self.do_pad = do_pad
|
||||
@@ -1007,18 +1018,64 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
||||
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
    """
    Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
    `[center_x, center_y, width, height]` format and from absolute to relative pixel values.
    """
    # Thin wrapper: delegates to the module-level `normalize_annotation` function.
    return normalize_annotation(annotation, image_size=image_size)
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
|
||||
def _update_annotation_for_padded_image(
|
||||
self,
|
||||
annotation: Dict,
|
||||
input_image_size: Tuple[int, int],
|
||||
output_image_size: Tuple[int, int],
|
||||
padding,
|
||||
update_bboxes,
|
||||
) -> Dict:
|
||||
"""
|
||||
Update the annotation for a padded image.
|
||||
"""
|
||||
new_annotation = {}
|
||||
new_annotation["size"] = output_image_size
|
||||
|
||||
for key, value in annotation.items():
|
||||
if key == "masks":
|
||||
masks = value
|
||||
masks = pad(
|
||||
masks,
|
||||
padding,
|
||||
mode=PaddingMode.CONSTANT,
|
||||
constant_values=0,
|
||||
input_data_format=ChannelDimension.FIRST,
|
||||
)
|
||||
masks = safe_squeeze(masks, 1)
|
||||
new_annotation["masks"] = masks
|
||||
elif key == "boxes" and update_bboxes:
|
||||
boxes = value
|
||||
boxes *= np.asarray(
|
||||
[
|
||||
input_image_size[1] / output_image_size[1],
|
||||
input_image_size[0] / output_image_size[0],
|
||||
input_image_size[1] / output_image_size[1],
|
||||
input_image_size[0] / output_image_size[0],
|
||||
]
|
||||
)
|
||||
new_annotation["boxes"] = boxes
|
||||
elif key == "size":
|
||||
new_annotation["size"] = output_image_size
|
||||
else:
|
||||
new_annotation[key] = value
|
||||
return new_annotation
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
|
||||
def _pad_image(
|
||||
self,
|
||||
image: np.ndarray,
|
||||
output_size: Tuple[int, int],
|
||||
annotation: Optional[Dict[str, Any]] = None,
|
||||
constant_values: Union[float, Iterable[float]] = 0,
|
||||
data_format: Optional[ChannelDimension] = None,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
update_bboxes: bool = True,
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
Pad an image with zeros to the given size.
|
||||
@@ -1037,25 +1094,33 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
||||
data_format=data_format,
|
||||
input_data_format=input_data_format,
|
||||
)
|
||||
return padded_image
|
||||
if annotation is not None:
|
||||
annotation = self._update_annotation_for_padded_image(
|
||||
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
|
||||
)
|
||||
return padded_image, annotation
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
|
||||
def pad(
|
||||
self,
|
||||
images: List[np.ndarray],
|
||||
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
|
||||
constant_values: Union[float, Iterable[float]] = 0,
|
||||
return_pixel_mask: bool = True,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
data_format: Optional[ChannelDimension] = None,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
update_bboxes: bool = True,
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
|
||||
in the batch and optionally returns their corresponding pixel mask.
|
||||
|
||||
Args:
|
||||
image (`np.ndarray`):
|
||||
Image to pad.
|
||||
images (List[`np.ndarray`]):
|
||||
Images to pad.
|
||||
annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
|
||||
Annotations to transform according to the padding that is applied to the images.
|
||||
constant_values (`float` or `Iterable[float]`, *optional*):
|
||||
The value to use for the padding if `mode` is `"constant"`.
|
||||
return_pixel_mask (`bool`, *optional*, defaults to `True`):
|
||||
@@ -1071,19 +1136,29 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
||||
The channel dimension format of the image. If not provided, it will be the same as the input image.
|
||||
input_data_format (`ChannelDimension` or `str`, *optional*):
|
||||
The channel dimension format of the input image. If not provided, it will be inferred.
|
||||
update_bboxes (`bool`, *optional*, defaults to `True`):
|
||||
Whether to update the bounding boxes in the annotations to match the padded images. If the
|
||||
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
|
||||
format, the bounding boxes will not be updated.
|
||||
"""
|
||||
pad_size = get_max_height_width(images, input_data_format=input_data_format)
|
||||
|
||||
padded_images = [
|
||||
self._pad_image(
|
||||
annotation_list = annotations if annotations is not None else [None] * len(images)
|
||||
padded_images = []
|
||||
padded_annotations = []
|
||||
for image, annotation in zip(images, annotation_list):
|
||||
padded_image, padded_annotation = self._pad_image(
|
||||
image,
|
||||
pad_size,
|
||||
annotation,
|
||||
constant_values=constant_values,
|
||||
data_format=data_format,
|
||||
input_data_format=input_data_format,
|
||||
update_bboxes=update_bboxes,
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
padded_images.append(padded_image)
|
||||
padded_annotations.append(padded_annotation)
|
||||
|
||||
data = {"pixel_values": padded_images}
|
||||
|
||||
if return_pixel_mask:
|
||||
@@ -1093,7 +1168,14 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
||||
]
|
||||
data["pixel_mask"] = masks
|
||||
|
||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
|
||||
|
||||
if annotations is not None:
|
||||
encoded_inputs["labels"] = [
|
||||
BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
|
||||
]
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess
|
||||
def preprocess(
|
||||
@@ -1108,6 +1190,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
||||
do_rescale: Optional[bool] = None,
|
||||
rescale_factor: Optional[Union[int, float]] = None,
|
||||
do_normalize: Optional[bool] = None,
|
||||
do_convert_annotations: Optional[bool] = None,
|
||||
image_mean: Optional[Union[float, List[float]]] = None,
|
||||
image_std: Optional[Union[float, List[float]]] = None,
|
||||
do_pad: Optional[bool] = None,
|
||||
@@ -1151,12 +1234,17 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
||||
Rescale factor to use when rescaling the image.
|
||||
do_normalize (`bool`, *optional*, defaults to self.do_normalize):
|
||||
Whether to normalize the image.
|
||||
do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
|
||||
Whether to convert the annotations to the format expected by the model. Converts the bounding
|
||||
boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
|
||||
and in relative coordinates.
|
||||
image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
|
||||
Mean to use when normalizing the image.
|
||||
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
|
||||
Standard deviation to use when normalizing the image.
|
||||
do_pad (`bool`, *optional*, defaults to self.do_pad):
|
||||
Whether to pad the image.
|
||||
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
|
||||
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
|
||||
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
|
||||
Format of the annotations.
|
||||
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
|
||||
@@ -1197,6 +1285,9 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
||||
do_normalize = self.do_normalize if do_normalize is None else do_normalize
|
||||
image_mean = self.image_mean if image_mean is None else image_mean
|
||||
image_std = self.image_std if image_std is None else image_std
|
||||
do_convert_annotations = (
|
||||
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
|
||||
)
|
||||
do_pad = self.do_pad if do_pad is None else do_pad
|
||||
format = self.format if format is None else format
|
||||
|
||||
@@ -1300,29 +1391,34 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
||||
images = [
|
||||
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
|
||||
]
|
||||
if annotations is not None:
|
||||
annotations = [
|
||||
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
|
||||
for annotation, image in zip(annotations, images)
|
||||
]
|
||||
|
||||
if do_convert_annotations and annotations is not None:
|
||||
annotations = [
|
||||
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
|
||||
for annotation, image in zip(annotations, images)
|
||||
]
|
||||
|
||||
if do_pad:
|
||||
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
|
||||
data = self.pad(
|
||||
images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format
|
||||
encoded_inputs = self.pad(
|
||||
images,
|
||||
annotations=annotations,
|
||||
return_pixel_mask=True,
|
||||
data_format=data_format,
|
||||
input_data_format=input_data_format,
|
||||
return_tensors=return_tensors,
|
||||
update_bboxes=do_convert_annotations,
|
||||
)
|
||||
else:
|
||||
images = [
|
||||
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
||||
for image in images
|
||||
]
|
||||
data = {"pixel_values": images}
|
||||
|
||||
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
|
||||
if annotations is not None:
|
||||
encoded_inputs["labels"] = [
|
||||
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
|
||||
]
|
||||
encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
|
||||
if annotations is not None:
|
||||
encoded_inputs["labels"] = [
|
||||
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
|
||||
]
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
|
||||
@@ -783,9 +783,14 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
||||
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
|
||||
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
|
||||
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
||||
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
|
||||
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||
do_pad (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
|
||||
overridden by the `do_pad` parameter in the `preprocess` method.
|
||||
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
|
||||
method. If `True` will pad the images in the batch to the largest height and width in the batch.
|
||||
Padding will be applied to the bottom and right of the image with zeros.
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
@@ -802,6 +807,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
||||
do_normalize: bool = True,
|
||||
image_mean: Union[float, List[float]] = None,
|
||||
image_std: Union[float, List[float]] = None,
|
||||
do_convert_annotations: Optional[bool] = None,
|
||||
do_pad: bool = True,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
@@ -820,6 +826,10 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
||||
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
|
||||
size = get_size_dict(size, max_size=max_size, default_to_square=False)
|
||||
|
||||
# Backwards compatibility
|
||||
if do_convert_annotations is None:
|
||||
do_convert_annotations = do_normalize
|
||||
|
||||
super().__init__(**kwargs)
|
||||
self.format = format
|
||||
self.do_resize = do_resize
|
||||
@@ -828,6 +838,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
||||
self.do_rescale = do_rescale
|
||||
self.rescale_factor = rescale_factor
|
||||
self.do_normalize = do_normalize
|
||||
self.do_convert_annotations = do_convert_annotations
|
||||
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
|
||||
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
|
||||
self.do_pad = do_pad
|
||||
@@ -1005,18 +1016,64 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
||||
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
    """
    Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
    `[center_x, center_y, width, height]` format and from absolute to relative pixel values.
    """
    # Thin wrapper: delegates to the module-level `normalize_annotation` function.
    return normalize_annotation(annotation, image_size=image_size)
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
|
||||
def _update_annotation_for_padded_image(
|
||||
self,
|
||||
annotation: Dict,
|
||||
input_image_size: Tuple[int, int],
|
||||
output_image_size: Tuple[int, int],
|
||||
padding,
|
||||
update_bboxes,
|
||||
) -> Dict:
|
||||
"""
|
||||
Update the annotation for a padded image.
|
||||
"""
|
||||
new_annotation = {}
|
||||
new_annotation["size"] = output_image_size
|
||||
|
||||
for key, value in annotation.items():
|
||||
if key == "masks":
|
||||
masks = value
|
||||
masks = pad(
|
||||
masks,
|
||||
padding,
|
||||
mode=PaddingMode.CONSTANT,
|
||||
constant_values=0,
|
||||
input_data_format=ChannelDimension.FIRST,
|
||||
)
|
||||
masks = safe_squeeze(masks, 1)
|
||||
new_annotation["masks"] = masks
|
||||
elif key == "boxes" and update_bboxes:
|
||||
boxes = value
|
||||
boxes *= np.asarray(
|
||||
[
|
||||
input_image_size[1] / output_image_size[1],
|
||||
input_image_size[0] / output_image_size[0],
|
||||
input_image_size[1] / output_image_size[1],
|
||||
input_image_size[0] / output_image_size[0],
|
||||
]
|
||||
)
|
||||
new_annotation["boxes"] = boxes
|
||||
elif key == "size":
|
||||
new_annotation["size"] = output_image_size
|
||||
else:
|
||||
new_annotation[key] = value
|
||||
return new_annotation
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
|
||||
def _pad_image(
|
||||
self,
|
||||
image: np.ndarray,
|
||||
output_size: Tuple[int, int],
|
||||
annotation: Optional[Dict[str, Any]] = None,
|
||||
constant_values: Union[float, Iterable[float]] = 0,
|
||||
data_format: Optional[ChannelDimension] = None,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
update_bboxes: bool = True,
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
Pad an image with zeros to the given size.
|
||||
@@ -1035,25 +1092,33 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
||||
data_format=data_format,
|
||||
input_data_format=input_data_format,
|
||||
)
|
||||
return padded_image
|
||||
if annotation is not None:
|
||||
annotation = self._update_annotation_for_padded_image(
|
||||
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
|
||||
)
|
||||
return padded_image, annotation
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
|
||||
def pad(
|
||||
self,
|
||||
images: List[np.ndarray],
|
||||
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
|
||||
constant_values: Union[float, Iterable[float]] = 0,
|
||||
return_pixel_mask: bool = True,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
data_format: Optional[ChannelDimension] = None,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
update_bboxes: bool = True,
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
|
||||
in the batch and optionally returns their corresponding pixel mask.
|
||||
|
||||
Args:
|
||||
image (`np.ndarray`):
|
||||
Image to pad.
|
||||
images (List[`np.ndarray`]):
|
||||
Images to pad.
|
||||
annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
|
||||
Annotations to transform according to the padding that is applied to the images.
|
||||
constant_values (`float` or `Iterable[float]`, *optional*):
|
||||
The value to use for the padding if `mode` is `"constant"`.
|
||||
return_pixel_mask (`bool`, *optional*, defaults to `True`):
|
||||
@@ -1069,19 +1134,29 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
||||
The channel dimension format of the image. If not provided, it will be the same as the input image.
|
||||
input_data_format (`ChannelDimension` or `str`, *optional*):
|
||||
The channel dimension format of the input image. If not provided, it will be inferred.
|
||||
update_bboxes (`bool`, *optional*, defaults to `True`):
|
||||
Whether to update the bounding boxes in the annotations to match the padded images. If the
|
||||
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
|
||||
format, the bounding boxes will not be updated.
|
||||
"""
|
||||
pad_size = get_max_height_width(images, input_data_format=input_data_format)
|
||||
|
||||
padded_images = [
|
||||
self._pad_image(
|
||||
annotation_list = annotations if annotations is not None else [None] * len(images)
|
||||
padded_images = []
|
||||
padded_annotations = []
|
||||
for image, annotation in zip(images, annotation_list):
|
||||
padded_image, padded_annotation = self._pad_image(
|
||||
image,
|
||||
pad_size,
|
||||
annotation,
|
||||
constant_values=constant_values,
|
||||
data_format=data_format,
|
||||
input_data_format=input_data_format,
|
||||
update_bboxes=update_bboxes,
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
padded_images.append(padded_image)
|
||||
padded_annotations.append(padded_annotation)
|
||||
|
||||
data = {"pixel_values": padded_images}
|
||||
|
||||
if return_pixel_mask:
|
||||
@@ -1091,7 +1166,14 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
||||
]
|
||||
data["pixel_mask"] = masks
|
||||
|
||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
|
||||
|
||||
if annotations is not None:
|
||||
encoded_inputs["labels"] = [
|
||||
BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
|
||||
]
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess
|
||||
def preprocess(
|
||||
@@ -1106,6 +1188,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
||||
do_rescale: Optional[bool] = None,
|
||||
rescale_factor: Optional[Union[int, float]] = None,
|
||||
do_normalize: Optional[bool] = None,
|
||||
do_convert_annotations: Optional[bool] = None,
|
||||
image_mean: Optional[Union[float, List[float]]] = None,
|
||||
image_std: Optional[Union[float, List[float]]] = None,
|
||||
do_pad: Optional[bool] = None,
|
||||
@@ -1149,12 +1232,17 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
||||
Rescale factor to use when rescaling the image.
|
||||
do_normalize (`bool`, *optional*, defaults to self.do_normalize):
|
||||
Whether to normalize the image.
|
||||
do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
|
||||
Whether to convert the annotations to the format expected by the model. Converts the bounding
|
||||
boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
|
||||
and in relative coordinates.
|
||||
image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
|
||||
Mean to use when normalizing the image.
|
||||
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
|
||||
Standard deviation to use when normalizing the image.
|
||||
do_pad (`bool`, *optional*, defaults to self.do_pad):
|
||||
Whether to pad the image.
|
||||
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
|
||||
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
|
||||
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
|
||||
Format of the annotations.
|
||||
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
|
||||
@@ -1195,6 +1283,9 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
||||
do_normalize = self.do_normalize if do_normalize is None else do_normalize
|
||||
image_mean = self.image_mean if image_mean is None else image_mean
|
||||
image_std = self.image_std if image_std is None else image_std
|
||||
do_convert_annotations = (
|
||||
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
|
||||
)
|
||||
do_pad = self.do_pad if do_pad is None else do_pad
|
||||
format = self.format if format is None else format
|
||||
|
||||
@@ -1298,29 +1389,34 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
||||
images = [
|
||||
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
|
||||
]
|
||||
if annotations is not None:
|
||||
annotations = [
|
||||
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
|
||||
for annotation, image in zip(annotations, images)
|
||||
]
|
||||
|
||||
if do_convert_annotations and annotations is not None:
|
||||
annotations = [
|
||||
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
|
||||
for annotation, image in zip(annotations, images)
|
||||
]
|
||||
|
||||
if do_pad:
|
||||
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
|
||||
data = self.pad(
|
||||
images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format
|
||||
encoded_inputs = self.pad(
|
||||
images,
|
||||
annotations=annotations,
|
||||
return_pixel_mask=True,
|
||||
data_format=data_format,
|
||||
input_data_format=input_data_format,
|
||||
return_tensors=return_tensors,
|
||||
update_bboxes=do_convert_annotations,
|
||||
)
|
||||
else:
|
||||
images = [
|
||||
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
||||
for image in images
|
||||
]
|
||||
data = {"pixel_values": images}
|
||||
|
||||
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
|
||||
if annotations is not None:
|
||||
encoded_inputs["labels"] = [
|
||||
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
|
||||
]
|
||||
encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
|
||||
if annotations is not None:
|
||||
encoded_inputs["labels"] = [
|
||||
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
|
||||
]
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
|
||||
@@ -35,6 +35,7 @@ from ...image_utils import (
|
||||
IMAGENET_DEFAULT_MEAN,
|
||||
IMAGENET_DEFAULT_STD,
|
||||
AnnotationFormat,
|
||||
AnnotationType,
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
@@ -492,9 +493,14 @@ class DetaImageProcessor(BaseImageProcessor):
|
||||
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
|
||||
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
|
||||
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
||||
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
|
||||
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||
do_pad (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
|
||||
overridden by the `do_pad` parameter in the `preprocess` method.
|
||||
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
|
||||
method. If `True` will pad the images in the batch to the largest height and width in the batch.
|
||||
Padding will be applied to the bottom and right of the image with zeros.
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
@@ -510,6 +516,7 @@ class DetaImageProcessor(BaseImageProcessor):
|
||||
do_normalize: bool = True,
|
||||
image_mean: Union[float, List[float]] = None,
|
||||
image_std: Union[float, List[float]] = None,
|
||||
do_convert_annotations: bool = True,
|
||||
do_pad: bool = True,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
@@ -519,6 +526,9 @@ class DetaImageProcessor(BaseImageProcessor):
|
||||
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
|
||||
size = get_size_dict(size, default_to_square=False)
|
||||
|
||||
if do_convert_annotations is None:
|
||||
do_convert_annotations = do_normalize
|
||||
|
||||
super().__init__(**kwargs)
|
||||
self.format = format
|
||||
self.do_resize = do_resize
|
||||
@@ -527,6 +537,7 @@ class DetaImageProcessor(BaseImageProcessor):
|
||||
self.do_rescale = do_rescale
|
||||
self.rescale_factor = rescale_factor
|
||||
self.do_normalize = do_normalize
|
||||
self.do_convert_annotations = do_convert_annotations
|
||||
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
|
||||
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
|
||||
self.do_pad = do_pad
|
||||
@@ -680,18 +691,64 @@ class DetaImageProcessor(BaseImageProcessor):
|
||||
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
    """
    Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
    `[center_x, center_y, width, height]` format and from absolute to relative pixel values.
    """
    # Thin wrapper: delegates to the module-level `normalize_annotation` function.
    return normalize_annotation(annotation, image_size=image_size)
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
|
||||
def _update_annotation_for_padded_image(
|
||||
self,
|
||||
annotation: Dict,
|
||||
input_image_size: Tuple[int, int],
|
||||
output_image_size: Tuple[int, int],
|
||||
padding,
|
||||
update_bboxes,
|
||||
) -> Dict:
|
||||
"""
|
||||
Update the annotation for a padded image.
|
||||
"""
|
||||
new_annotation = {}
|
||||
new_annotation["size"] = output_image_size
|
||||
|
||||
for key, value in annotation.items():
|
||||
if key == "masks":
|
||||
masks = value
|
||||
masks = pad(
|
||||
masks,
|
||||
padding,
|
||||
mode=PaddingMode.CONSTANT,
|
||||
constant_values=0,
|
||||
input_data_format=ChannelDimension.FIRST,
|
||||
)
|
||||
masks = safe_squeeze(masks, 1)
|
||||
new_annotation["masks"] = masks
|
||||
elif key == "boxes" and update_bboxes:
|
||||
boxes = value
|
||||
boxes *= np.asarray(
|
||||
[
|
||||
input_image_size[1] / output_image_size[1],
|
||||
input_image_size[0] / output_image_size[0],
|
||||
input_image_size[1] / output_image_size[1],
|
||||
input_image_size[0] / output_image_size[0],
|
||||
]
|
||||
)
|
||||
new_annotation["boxes"] = boxes
|
||||
elif key == "size":
|
||||
new_annotation["size"] = output_image_size
|
||||
else:
|
||||
new_annotation[key] = value
|
||||
return new_annotation
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
|
||||
def _pad_image(
|
||||
self,
|
||||
image: np.ndarray,
|
||||
output_size: Tuple[int, int],
|
||||
annotation: Optional[Dict[str, Any]] = None,
|
||||
constant_values: Union[float, Iterable[float]] = 0,
|
||||
data_format: Optional[ChannelDimension] = None,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
update_bboxes: bool = True,
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
Pad an image with zeros to the given size.
|
||||
@@ -710,25 +767,33 @@ class DetaImageProcessor(BaseImageProcessor):
|
||||
data_format=data_format,
|
||||
input_data_format=input_data_format,
|
||||
)
|
||||
return padded_image
|
||||
if annotation is not None:
|
||||
annotation = self._update_annotation_for_padded_image(
|
||||
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
|
||||
)
|
||||
return padded_image, annotation
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
|
||||
def pad(
|
||||
self,
|
||||
images: List[np.ndarray],
|
||||
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
|
||||
constant_values: Union[float, Iterable[float]] = 0,
|
||||
return_pixel_mask: bool = True,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
data_format: Optional[ChannelDimension] = None,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
update_bboxes: bool = True,
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
|
||||
in the batch and optionally returns their corresponding pixel mask.
|
||||
|
||||
Args:
|
||||
image (`np.ndarray`):
|
||||
Image to pad.
|
||||
images (List[`np.ndarray`]):
|
||||
Images to pad.
|
||||
annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
|
||||
Annotations to transform according to the padding that is applied to the images.
|
||||
constant_values (`float` or `Iterable[float]`, *optional*):
|
||||
The value to use for the padding if `mode` is `"constant"`.
|
||||
return_pixel_mask (`bool`, *optional*, defaults to `True`):
|
||||
@@ -744,19 +809,29 @@ class DetaImageProcessor(BaseImageProcessor):
|
||||
The channel dimension format of the image. If not provided, it will be the same as the input image.
|
||||
input_data_format (`ChannelDimension` or `str`, *optional*):
|
||||
The channel dimension format of the input image. If not provided, it will be inferred.
|
||||
update_bboxes (`bool`, *optional*, defaults to `True`):
|
||||
Whether to update the bounding boxes in the annotations to match the padded images. If the
|
||||
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
|
||||
format, the bounding boxes will not be updated.
|
||||
"""
|
||||
pad_size = get_max_height_width(images, input_data_format=input_data_format)
|
||||
|
||||
padded_images = [
|
||||
self._pad_image(
|
||||
annotation_list = annotations if annotations is not None else [None] * len(images)
|
||||
padded_images = []
|
||||
padded_annotations = []
|
||||
for image, annotation in zip(images, annotation_list):
|
||||
padded_image, padded_annotation = self._pad_image(
|
||||
image,
|
||||
pad_size,
|
||||
annotation,
|
||||
constant_values=constant_values,
|
||||
data_format=data_format,
|
||||
input_data_format=input_data_format,
|
||||
update_bboxes=update_bboxes,
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
padded_images.append(padded_image)
|
||||
padded_annotations.append(padded_annotation)
|
||||
|
||||
data = {"pixel_values": padded_images}
|
||||
|
||||
if return_pixel_mask:
|
||||
@@ -766,7 +841,14 @@ class DetaImageProcessor(BaseImageProcessor):
|
||||
]
|
||||
data["pixel_mask"] = masks
|
||||
|
||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
|
||||
|
||||
if annotations is not None:
|
||||
encoded_inputs["labels"] = [
|
||||
BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
|
||||
]
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
def preprocess(
|
||||
self,
|
||||
@@ -782,6 +864,7 @@ class DetaImageProcessor(BaseImageProcessor):
|
||||
do_normalize: Optional[bool] = None,
|
||||
image_mean: Optional[Union[float, List[float]]] = None,
|
||||
image_std: Optional[Union[float, List[float]]] = None,
|
||||
do_convert_annotations: Optional[bool] = None,
|
||||
do_pad: Optional[bool] = None,
|
||||
format: Optional[Union[str, AnnotationFormat]] = None,
|
||||
return_tensors: Optional[Union[TensorType, str]] = None,
|
||||
@@ -827,8 +910,13 @@ class DetaImageProcessor(BaseImageProcessor):
|
||||
Mean to use when normalizing the image.
|
||||
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
|
||||
Standard deviation to use when normalizing the image.
|
||||
do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
|
||||
Whether to convert the annotations to the format expected by the model. Converts the bounding
|
||||
boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
|
||||
and in relative coordinates.
|
||||
do_pad (`bool`, *optional*, defaults to self.do_pad):
|
||||
Whether to pad the image.
|
||||
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
|
||||
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
|
||||
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
|
||||
Format of the annotations.
|
||||
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
|
||||
@@ -861,6 +949,9 @@ class DetaImageProcessor(BaseImageProcessor):
|
||||
do_normalize = self.do_normalize if do_normalize is None else do_normalize
|
||||
image_mean = self.image_mean if image_mean is None else image_mean
|
||||
image_std = self.image_std if image_std is None else image_std
|
||||
do_convert_annotations = (
|
||||
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
|
||||
)
|
||||
do_pad = self.do_pad if do_pad is None else do_pad
|
||||
format = self.format if format is None else format
|
||||
|
||||
@@ -964,29 +1055,34 @@ class DetaImageProcessor(BaseImageProcessor):
|
||||
images = [
|
||||
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
|
||||
]
|
||||
if annotations is not None:
|
||||
annotations = [
|
||||
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
|
||||
for annotation, image in zip(annotations, images)
|
||||
]
|
||||
|
||||
if do_convert_annotations and annotations is not None:
|
||||
annotations = [
|
||||
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
|
||||
for annotation, image in zip(annotations, images)
|
||||
]
|
||||
|
||||
if do_pad:
|
||||
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
|
||||
data = self.pad(
|
||||
images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format
|
||||
encoded_inputs = self.pad(
|
||||
images,
|
||||
annotations=annotations,
|
||||
return_pixel_mask=True,
|
||||
data_format=data_format,
|
||||
input_data_format=input_data_format,
|
||||
return_tensors=return_tensors,
|
||||
update_bboxes=do_convert_annotations,
|
||||
)
|
||||
else:
|
||||
images = [
|
||||
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
||||
for image in images
|
||||
]
|
||||
data = {"pixel_values": images}
|
||||
|
||||
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
|
||||
if annotations is not None:
|
||||
encoded_inputs["labels"] = [
|
||||
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
|
||||
]
|
||||
encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
|
||||
if annotations is not None:
|
||||
encoded_inputs["labels"] = [
|
||||
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
|
||||
]
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
|
||||
@@ -760,7 +760,7 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
||||
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
|
||||
`preprocess` method.
|
||||
do_normalize:
|
||||
do_normalize (`bool`, *optional*, defaults to True):
|
||||
Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
|
||||
`preprocess` method.
|
||||
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
|
||||
@@ -769,9 +769,14 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
|
||||
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
|
||||
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
||||
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
|
||||
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||
do_pad (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
|
||||
overridden by the `do_pad` parameter in the `preprocess` method.
|
||||
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
|
||||
method. If `True` will pad the images in the batch to the largest height and width in the batch.
|
||||
Padding will be applied to the bottom and right of the image with zeros.
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
@@ -787,6 +792,7 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
do_normalize: bool = True,
|
||||
image_mean: Union[float, List[float]] = None,
|
||||
image_std: Union[float, List[float]] = None,
|
||||
do_convert_annotations: Optional[bool] = None,
|
||||
do_pad: bool = True,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
@@ -805,6 +811,10 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
|
||||
size = get_size_dict(size, max_size=max_size, default_to_square=False)
|
||||
|
||||
# Backwards compatibility
|
||||
if do_convert_annotations is None:
|
||||
do_convert_annotations = do_normalize
|
||||
|
||||
super().__init__(**kwargs)
|
||||
self.format = format
|
||||
self.do_resize = do_resize
|
||||
@@ -813,6 +823,7 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
self.do_rescale = do_rescale
|
||||
self.rescale_factor = rescale_factor
|
||||
self.do_normalize = do_normalize
|
||||
self.do_convert_annotations = do_convert_annotations
|
||||
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
|
||||
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
|
||||
self.do_pad = do_pad
|
||||
@@ -981,17 +992,62 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
    """
    Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
    `[center_x, center_y, width, height]` format and from absolute to relative pixel values.

    Thin wrapper that forwards both arguments to the module-level `normalize_annotation` helper.

    Args:
        annotation (`Dict`):
            Annotation to normalize; handed to the helper as-is.
        image_size (`Tuple[int, int]`):
            Size of the image the annotation belongs to. Presumably `(height, width)` -- TODO confirm against
            the module-level helper.

    Returns:
        `Dict`: The value returned by the module-level `normalize_annotation` helper.
    """
    return normalize_annotation(annotation, image_size=image_size)
|
||||
|
||||
def _update_annotation_for_padded_image(
    self,
    annotation: Dict,
    input_image_size: Tuple[int, int],
    output_image_size: Tuple[int, int],
    padding,
    update_bboxes,
) -> Dict:
    """
    Update the annotation for a padded image.

    Builds a new annotation dict; the input dict and the arrays it holds are left untouched.

    Args:
        annotation (`Dict`):
            Annotation of the un-padded image. The keys `"masks"`, `"boxes"` and `"size"` are handled
            specially; every other entry is carried over unchanged.
        input_image_size (`Tuple[int, int]`):
            `(height, width)` of the image before padding.
        output_image_size (`Tuple[int, int]`):
            `(height, width)` of the image after padding. Stored under `"size"` in the result.
        padding:
            Padding specification forwarded to `pad` when padding the masks.
        update_bboxes (`bool`):
            Whether to rescale `"boxes"`. When `False` the boxes are carried over unchanged.

    Returns:
        `Dict`: The annotation matching the padded image.
    """
    new_annotation = {"size": output_image_size}

    for key, value in annotation.items():
        if key == "masks":
            # Masks get the same zero padding as the image so they stay aligned with it.
            masks = pad(
                value,
                padding,
                mode=PaddingMode.CONSTANT,
                constant_values=0,
                input_data_format=ChannelDimension.FIRST,
            )
            new_annotation["masks"] = safe_squeeze(masks, 1)
        elif key == "boxes" and update_bboxes:
            # Per-coordinate shrink factor (x, y, x, y): input width / output width for the x-like entries,
            # input height / output height for the y-like entries.
            # NOTE(review): this only makes sense for boxes in relative coordinates -- confirm with callers.
            width_ratio = input_image_size[1] / output_image_size[1]
            height_ratio = input_image_size[0] / output_image_size[0]
            scale = np.asarray([width_ratio, height_ratio, width_ratio, height_ratio])

            boxes = np.asarray(value)
            # Scale out of place: an in-place multiply would overwrite the caller's array and cannot store
            # float results in an integer array.
            scaled_boxes = boxes * scale
            if np.issubdtype(boxes.dtype, np.floating):
                # Keep the caller's floating dtype (e.g. float32) instead of promoting to float64.
                scaled_boxes = scaled_boxes.astype(boxes.dtype, copy=False)
            new_annotation["boxes"] = scaled_boxes
        elif key == "size":
            new_annotation["size"] = output_image_size
        else:
            new_annotation[key] = value
    return new_annotation
|
||||
|
||||
def _pad_image(
|
||||
self,
|
||||
image: np.ndarray,
|
||||
output_size: Tuple[int, int],
|
||||
annotation: Optional[Dict[str, Any]] = None,
|
||||
constant_values: Union[float, Iterable[float]] = 0,
|
||||
data_format: Optional[ChannelDimension] = None,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
update_bboxes: bool = True,
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
Pad an image with zeros to the given size.
|
||||
@@ -1010,24 +1066,32 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
data_format=data_format,
|
||||
input_data_format=input_data_format,
|
||||
)
|
||||
return padded_image
|
||||
if annotation is not None:
|
||||
annotation = self._update_annotation_for_padded_image(
|
||||
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
|
||||
)
|
||||
return padded_image, annotation
|
||||
|
||||
def pad(
|
||||
self,
|
||||
images: List[np.ndarray],
|
||||
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
|
||||
constant_values: Union[float, Iterable[float]] = 0,
|
||||
return_pixel_mask: bool = True,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
data_format: Optional[ChannelDimension] = None,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
update_bboxes: bool = True,
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
|
||||
in the batch and optionally returns their corresponding pixel mask.
|
||||
|
||||
Args:
|
||||
image (`np.ndarray`):
|
||||
Image to pad.
|
||||
images (List[`np.ndarray`]):
|
||||
Images to pad.
|
||||
annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
|
||||
Annotations to transform according to the padding that is applied to the images.
|
||||
constant_values (`float` or `Iterable[float]`, *optional*):
|
||||
The value to use for the padding if `mode` is `"constant"`.
|
||||
return_pixel_mask (`bool`, *optional*, defaults to `True`):
|
||||
@@ -1043,19 +1107,29 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
The channel dimension format of the image. If not provided, it will be the same as the input image.
|
||||
input_data_format (`ChannelDimension` or `str`, *optional*):
|
||||
The channel dimension format of the input image. If not provided, it will be inferred.
|
||||
update_bboxes (`bool`, *optional*, defaults to `True`):
|
||||
Whether to update the bounding boxes in the annotations to match the padded images. If the
|
||||
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
|
||||
format, the bounding boxes will not be updated.
|
||||
"""
|
||||
pad_size = get_max_height_width(images, input_data_format=input_data_format)
|
||||
|
||||
padded_images = [
|
||||
self._pad_image(
|
||||
annotation_list = annotations if annotations is not None else [None] * len(images)
|
||||
padded_images = []
|
||||
padded_annotations = []
|
||||
for image, annotation in zip(images, annotation_list):
|
||||
padded_image, padded_annotation = self._pad_image(
|
||||
image,
|
||||
pad_size,
|
||||
annotation,
|
||||
constant_values=constant_values,
|
||||
data_format=data_format,
|
||||
input_data_format=input_data_format,
|
||||
update_bboxes=update_bboxes,
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
padded_images.append(padded_image)
|
||||
padded_annotations.append(padded_annotation)
|
||||
|
||||
data = {"pixel_values": padded_images}
|
||||
|
||||
if return_pixel_mask:
|
||||
@@ -1065,7 +1139,14 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
]
|
||||
data["pixel_mask"] = masks
|
||||
|
||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
|
||||
|
||||
if annotations is not None:
|
||||
encoded_inputs["labels"] = [
|
||||
BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
|
||||
]
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
def preprocess(
|
||||
self,
|
||||
@@ -1079,6 +1160,7 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
do_rescale: Optional[bool] = None,
|
||||
rescale_factor: Optional[Union[int, float]] = None,
|
||||
do_normalize: Optional[bool] = None,
|
||||
do_convert_annotations: Optional[bool] = None,
|
||||
image_mean: Optional[Union[float, List[float]]] = None,
|
||||
image_std: Optional[Union[float, List[float]]] = None,
|
||||
do_pad: Optional[bool] = None,
|
||||
@@ -1122,12 +1204,17 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
Rescale factor to use when rescaling the image.
|
||||
do_normalize (`bool`, *optional*, defaults to self.do_normalize):
|
||||
Whether to normalize the image.
|
||||
do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
|
||||
Whether to convert the annotations to the format expected by the model. Converts the bounding
|
||||
boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
|
||||
and in relative coordinates.
|
||||
image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
|
||||
Mean to use when normalizing the image.
|
||||
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
|
||||
Standard deviation to use when normalizing the image.
|
||||
do_pad (`bool`, *optional*, defaults to self.do_pad):
|
||||
Whether to pad the image.
|
||||
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
|
||||
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
|
||||
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
|
||||
Format of the annotations.
|
||||
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
|
||||
@@ -1168,6 +1255,9 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
do_normalize = self.do_normalize if do_normalize is None else do_normalize
|
||||
image_mean = self.image_mean if image_mean is None else image_mean
|
||||
image_std = self.image_std if image_std is None else image_std
|
||||
do_convert_annotations = (
|
||||
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
|
||||
)
|
||||
do_pad = self.do_pad if do_pad is None else do_pad
|
||||
format = self.format if format is None else format
|
||||
|
||||
@@ -1271,29 +1361,34 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
images = [
|
||||
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
|
||||
]
|
||||
if annotations is not None:
|
||||
annotations = [
|
||||
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
|
||||
for annotation, image in zip(annotations, images)
|
||||
]
|
||||
|
||||
if do_convert_annotations and annotations is not None:
|
||||
annotations = [
|
||||
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
|
||||
for annotation, image in zip(annotations, images)
|
||||
]
|
||||
|
||||
if do_pad:
|
||||
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
|
||||
data = self.pad(
|
||||
images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format
|
||||
encoded_inputs = self.pad(
|
||||
images,
|
||||
annotations=annotations,
|
||||
return_pixel_mask=True,
|
||||
data_format=data_format,
|
||||
input_data_format=input_data_format,
|
||||
return_tensors=return_tensors,
|
||||
update_bboxes=do_convert_annotations,
|
||||
)
|
||||
else:
|
||||
images = [
|
||||
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
||||
for image in images
|
||||
]
|
||||
data = {"pixel_values": images}
|
||||
|
||||
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
|
||||
if annotations is not None:
|
||||
encoded_inputs["labels"] = [
|
||||
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
|
||||
]
|
||||
encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
|
||||
if annotations is not None:
|
||||
encoded_inputs["labels"] = [
|
||||
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
|
||||
]
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
|
||||
@@ -771,7 +771,7 @@ class Mask2FormerImageProcessor(BaseImageProcessor):
|
||||
)
|
||||
return encoded_inputs
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
|
||||
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
|
||||
def _pad_image(
|
||||
self,
|
||||
image: np.ndarray,
|
||||
@@ -799,7 +799,7 @@ class Mask2FormerImageProcessor(BaseImageProcessor):
|
||||
)
|
||||
return padded_image
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
|
||||
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
|
||||
def pad(
|
||||
self,
|
||||
images: List[np.ndarray],
|
||||
|
||||
@@ -788,7 +788,7 @@ class MaskFormerImageProcessor(BaseImageProcessor):
|
||||
)
|
||||
return encoded_inputs
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
|
||||
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
|
||||
def _pad_image(
|
||||
self,
|
||||
image: np.ndarray,
|
||||
@@ -816,7 +816,7 @@ class MaskFormerImageProcessor(BaseImageProcessor):
|
||||
)
|
||||
return padded_image
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
|
||||
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
|
||||
def pad(
|
||||
self,
|
||||
images: List[np.ndarray],
|
||||
|
||||
@@ -770,7 +770,7 @@ class OneFormerImageProcessor(BaseImageProcessor):
|
||||
)
|
||||
return encoded_inputs
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
|
||||
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
|
||||
def _pad_image(
|
||||
self,
|
||||
image: np.ndarray,
|
||||
@@ -798,7 +798,7 @@ class OneFormerImageProcessor(BaseImageProcessor):
|
||||
)
|
||||
return padded_image
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
|
||||
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
|
||||
def pad(
|
||||
self,
|
||||
images: List[np.ndarray],
|
||||
|
||||
@@ -251,7 +251,6 @@ class ViltImageProcessor(BaseImageProcessor):
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
|
||||
def _pad_image(
|
||||
self,
|
||||
image: np.ndarray,
|
||||
@@ -279,7 +278,6 @@ class ViltImageProcessor(BaseImageProcessor):
|
||||
)
|
||||
return padded_image
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
|
||||
def pad(
|
||||
self,
|
||||
images: List[np.ndarray],
|
||||
|
||||
@@ -696,8 +696,9 @@ class YolosImageProcessor(BaseImageProcessor):
|
||||
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
|
||||
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
||||
do_pad (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
|
||||
overridden by the `do_pad` parameter in the `preprocess` method.
|
||||
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
|
||||
method. If `True` will pad the images in the batch to the largest height and width in the batch.
|
||||
Padding will be applied to the bottom and right of the image with zeros.
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
@@ -713,6 +714,7 @@ class YolosImageProcessor(BaseImageProcessor):
|
||||
do_normalize: bool = True,
|
||||
image_mean: Union[float, List[float]] = None,
|
||||
image_std: Union[float, List[float]] = None,
|
||||
do_convert_annotations: Optional[bool] = None,
|
||||
do_pad: bool = True,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
@@ -731,6 +733,10 @@ class YolosImageProcessor(BaseImageProcessor):
|
||||
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
|
||||
size = get_size_dict(size, max_size=max_size, default_to_square=False)
|
||||
|
||||
# Backwards compatibility
|
||||
if do_convert_annotations is None:
|
||||
do_convert_annotations = do_normalize
|
||||
|
||||
super().__init__(**kwargs)
|
||||
self.format = format
|
||||
self.do_resize = do_resize
|
||||
@@ -739,6 +745,7 @@ class YolosImageProcessor(BaseImageProcessor):
|
||||
self.do_rescale = do_rescale
|
||||
self.rescale_factor = rescale_factor
|
||||
self.do_normalize = do_normalize
|
||||
self.do_convert_annotations = do_convert_annotations
|
||||
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
|
||||
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
|
||||
self.do_pad = do_pad
|
||||
@@ -916,18 +923,64 @@ class YolosImageProcessor(BaseImageProcessor):
|
||||
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
    """
    Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
    `[center_x, center_y, width, height]` format and from absolute to relative pixel values.

    Thin wrapper that forwards both arguments to the module-level `normalize_annotation` helper.

    Args:
        annotation (`Dict`):
            Annotation to normalize; handed to the helper as-is.
        image_size (`Tuple[int, int]`):
            Size of the image the annotation belongs to. Presumably `(height, width)` -- TODO confirm against
            the module-level helper.

    Returns:
        `Dict`: The value returned by the module-level `normalize_annotation` helper.
    """
    return normalize_annotation(annotation, image_size=image_size)
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
|
||||
def _update_annotation_for_padded_image(
    self,
    annotation: Dict,
    input_image_size: Tuple[int, int],
    output_image_size: Tuple[int, int],
    padding,
    update_bboxes,
) -> Dict:
    """
    Update the annotation for a padded image.

    Builds a new annotation dict; the input dict and the arrays it holds are left untouched.

    Args:
        annotation (`Dict`):
            Annotation of the un-padded image. The keys `"masks"`, `"boxes"` and `"size"` are handled
            specially; every other entry is carried over unchanged.
        input_image_size (`Tuple[int, int]`):
            `(height, width)` of the image before padding.
        output_image_size (`Tuple[int, int]`):
            `(height, width)` of the image after padding. Stored under `"size"` in the result.
        padding:
            Padding specification forwarded to `pad` when padding the masks.
        update_bboxes (`bool`):
            Whether to rescale `"boxes"`. When `False` the boxes are carried over unchanged.

    Returns:
        `Dict`: The annotation matching the padded image.
    """
    new_annotation = {"size": output_image_size}

    for key, value in annotation.items():
        if key == "masks":
            # Masks get the same zero padding as the image so they stay aligned with it.
            masks = pad(
                value,
                padding,
                mode=PaddingMode.CONSTANT,
                constant_values=0,
                input_data_format=ChannelDimension.FIRST,
            )
            new_annotation["masks"] = safe_squeeze(masks, 1)
        elif key == "boxes" and update_bboxes:
            # Per-coordinate shrink factor (x, y, x, y): input width / output width for the x-like entries,
            # input height / output height for the y-like entries.
            # NOTE(review): this only makes sense for boxes in relative coordinates -- confirm with callers.
            width_ratio = input_image_size[1] / output_image_size[1]
            height_ratio = input_image_size[0] / output_image_size[0]
            scale = np.asarray([width_ratio, height_ratio, width_ratio, height_ratio])

            boxes = np.asarray(value)
            # Scale out of place: an in-place multiply would overwrite the caller's array and cannot store
            # float results in an integer array.
            scaled_boxes = boxes * scale
            if np.issubdtype(boxes.dtype, np.floating):
                # Keep the caller's floating dtype (e.g. float32) instead of promoting to float64.
                scaled_boxes = scaled_boxes.astype(boxes.dtype, copy=False)
            new_annotation["boxes"] = scaled_boxes
        elif key == "size":
            new_annotation["size"] = output_image_size
        else:
            new_annotation[key] = value
    return new_annotation
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
|
||||
def _pad_image(
|
||||
self,
|
||||
image: np.ndarray,
|
||||
output_size: Tuple[int, int],
|
||||
annotation: Optional[Dict[str, Any]] = None,
|
||||
constant_values: Union[float, Iterable[float]] = 0,
|
||||
data_format: Optional[ChannelDimension] = None,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
update_bboxes: bool = True,
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
Pad an image with zeros to the given size.
|
||||
@@ -946,16 +999,22 @@ class YolosImageProcessor(BaseImageProcessor):
|
||||
data_format=data_format,
|
||||
input_data_format=input_data_format,
|
||||
)
|
||||
return padded_image
|
||||
if annotation is not None:
|
||||
annotation = self._update_annotation_for_padded_image(
|
||||
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
|
||||
)
|
||||
return padded_image, annotation
|
||||
|
||||
def pad(
|
||||
self,
|
||||
images: List[np.ndarray],
|
||||
annotations: Optional[List[Dict[str, Any]]] = None,
|
||||
constant_values: Union[float, Iterable[float]] = 0,
|
||||
return_pixel_mask: bool = False,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
data_format: Optional[ChannelDimension] = None,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
update_bboxes: bool = True,
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
|
||||
@@ -964,6 +1023,9 @@ class YolosImageProcessor(BaseImageProcessor):
|
||||
Args:
|
||||
images (List[`np.ndarray`]):
|
||||
Images to pad.
|
||||
annotations (`List[Dict[str, any]]`, *optional*):
|
||||
Annotations to pad along with the images. If provided, the bounding boxes will be updated to match the
|
||||
padded images.
|
||||
constant_values (`float` or `Iterable[float]`, *optional*):
|
||||
The value to use for the padding if `mode` is `"constant"`.
|
||||
return_pixel_mask (`bool`, *optional*, defaults to `False`):
|
||||
@@ -979,19 +1041,29 @@ class YolosImageProcessor(BaseImageProcessor):
|
||||
The channel dimension format of the image. If not provided, it will be the same as the input image.
|
||||
input_data_format (`ChannelDimension` or `str`, *optional*):
|
||||
The channel dimension format of the input image. If not provided, it will be inferred.
|
||||
update_bboxes (`bool`, *optional*, defaults to `True`):
|
||||
Whether to update the bounding boxes in the annotations to match the padded images. If the
|
||||
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
|
||||
format, the bounding boxes will not be updated.
|
||||
"""
|
||||
pad_size = get_max_height_width(images, input_data_format=input_data_format)
|
||||
|
||||
padded_images = [
|
||||
self._pad_image(
|
||||
annotation_list = annotations if annotations is not None else [None] * len(images)
|
||||
padded_images = []
|
||||
padded_annotations = []
|
||||
for image, annotation in zip(images, annotation_list):
|
||||
padded_image, padded_annotation = self._pad_image(
|
||||
image,
|
||||
pad_size,
|
||||
annotation,
|
||||
constant_values=constant_values,
|
||||
data_format=data_format,
|
||||
input_data_format=input_data_format,
|
||||
update_bboxes=update_bboxes,
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
padded_images.append(padded_image)
|
||||
padded_annotations.append(padded_annotation)
|
||||
|
||||
data = {"pixel_values": padded_images}
|
||||
|
||||
if return_pixel_mask:
|
||||
@@ -1017,6 +1089,7 @@ class YolosImageProcessor(BaseImageProcessor):
|
||||
do_normalize: Optional[bool] = None,
|
||||
image_mean: Optional[Union[float, List[float]]] = None,
|
||||
image_std: Optional[Union[float, List[float]]] = None,
|
||||
do_convert_annotations: Optional[bool] = None,
|
||||
do_pad: Optional[bool] = None,
|
||||
format: Optional[Union[str, AnnotationFormat]] = None,
|
||||
return_tensors: Optional[Union[TensorType, str]] = None,
|
||||
@@ -1062,8 +1135,13 @@ class YolosImageProcessor(BaseImageProcessor):
|
||||
Mean to use when normalizing the image.
|
||||
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
|
||||
Standard deviation to use when normalizing the image.
|
||||
do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
|
||||
Whether to convert the annotations to the format expected by the model. Converts the bounding
|
||||
boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
|
||||
and in relative coordinates.
|
||||
do_pad (`bool`, *optional*, defaults to self.do_pad):
|
||||
Whether to pad the image.
|
||||
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
|
||||
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
|
||||
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
|
||||
Format of the annotations.
|
||||
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
|
||||
@@ -1101,6 +1179,9 @@ class YolosImageProcessor(BaseImageProcessor):
|
||||
do_normalize = self.do_normalize if do_normalize is None else do_normalize
|
||||
image_mean = self.image_mean if image_mean is None else image_mean
|
||||
image_std = self.image_std if image_std is None else image_std
|
||||
do_convert_annotations = (
|
||||
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
|
||||
)
|
||||
do_pad = self.do_pad if do_pad is None else do_pad
|
||||
format = self.format if format is None else format
|
||||
|
||||
@@ -1204,26 +1285,34 @@ class YolosImageProcessor(BaseImageProcessor):
|
||||
images = [
|
||||
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
|
||||
]
|
||||
if annotations is not None:
|
||||
annotations = [
|
||||
self.normalize_annotation(annotation, get_image_size(image))
|
||||
for annotation, image in zip(annotations, images)
|
||||
]
|
||||
|
||||
if do_convert_annotations and annotations is not None:
|
||||
annotations = [
|
||||
self.normalize_annotation(annotation, get_image_size(image))
|
||||
for annotation, image in zip(annotations, images)
|
||||
]
|
||||
|
||||
if do_pad:
|
||||
data = self.pad(images, data_format=data_format, input_data_format=input_data_format)
|
||||
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
|
||||
encoded_inputs = self.pad(
|
||||
images,
|
||||
annotations=annotations,
|
||||
return_pixel_mask=True,
|
||||
data_format=data_format,
|
||||
input_data_format=input_data_format,
|
||||
update_bboxes=do_convert_annotations,
|
||||
return_tensors=return_tensors,
|
||||
)
|
||||
else:
|
||||
images = [
|
||||
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
||||
for image in images
|
||||
]
|
||||
data = {"pixel_values": images}
|
||||
|
||||
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
|
||||
if annotations is not None:
|
||||
encoded_inputs["labels"] = [
|
||||
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
|
||||
]
|
||||
encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
|
||||
if annotations is not None:
|
||||
encoded_inputs["labels"] = [
|
||||
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
|
||||
]
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
|
||||
@@ -248,3 +248,246 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess
|
||||
# verify size
|
||||
expected_size = torch.tensor([800, 1066])
|
||||
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
|
||||
|
||||
@slow
|
||||
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->ConditionalDetr, facebook/detr-resnet-50 ->microsoft/conditional-detr-resnet-50
|
||||
def test_batched_coco_detection_annotations(self):
    """
    Process a batch of two differently sized images with COCO detection annotations and check that the pixel
    values, bounding boxes and masks reflect the padding, both with and without `do_convert_annotations`.
    """
    image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))

    with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
        target = json.loads(f.read())

    # NOTE(review): both dicts reference the same `target` list, so the rescaling below is also visible
    # through `annotations_0`. The hard-coded expectations appear to have been recorded with this sharing
    # in place - confirm before decoupling the two.
    annotations_0 = {"image_id": 39769, "annotations": target}
    annotations_1 = {"image_id": 39769, "annotations": target}

    # Adjust the bounding boxes for the resized image
    w_0, h_0 = image_0.size
    w_1, h_1 = image_1.size
    for annotation in annotations_1["annotations"]:
        coords = annotation["bbox"]
        annotation["bbox"] = [
            coords[0] * w_1 / w_0,
            coords[1] * h_1 / h_0,
            coords[2] * w_1 / w_0,
            coords[3] * h_1 / h_0,
        ]

    images = [image_0, image_1]
    annotations = [annotations_0, annotations_1]

    image_processing = ConditionalDetrImageProcessor()
    encoding = image_processing(
        images=images,
        annotations=annotations,
        return_segmentation_masks=True,
        return_tensors="pt",  # do_convert_annotations=True
    )

    # Check the pixel values have been padded
    postprocessed_height, postprocessed_width = 800, 1066
    expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
    self.assertEqual(encoding["pixel_values"].shape, expected_shape)

    # Check the bounding boxes have been adjusted for padded images
    self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
    self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
    expected_boxes_0 = torch.tensor(
        [
            [0.6879, 0.4609, 0.0755, 0.3691],
            [0.2118, 0.3359, 0.2601, 0.1566],
            [0.5011, 0.5000, 0.9979, 1.0000],
            [0.5010, 0.5020, 0.9979, 0.9959],
            [0.3284, 0.5944, 0.5884, 0.8112],
            [0.8394, 0.5445, 0.3213, 0.9110],
        ]
    )
    expected_boxes_1 = torch.tensor(
        [
            [0.4130, 0.2765, 0.0453, 0.2215],
            [0.1272, 0.2016, 0.1561, 0.0940],
            [0.3757, 0.4933, 0.7488, 0.9865],
            [0.3759, 0.5002, 0.7492, 0.9955],
            [0.1971, 0.5456, 0.3532, 0.8646],
            [0.5790, 0.4115, 0.3430, 0.7161],
        ]
    )
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))

    # Check the masks have also been padded
    self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
    self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))

    # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
    # format and not in the range [0, 1]
    encoding = image_processing(
        images=images,
        annotations=annotations,
        return_segmentation_masks=True,
        do_convert_annotations=False,
        return_tensors="pt",
    )
    self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
    self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
    # Convert to absolute coordinates
    unnormalized_boxes_0 = torch.vstack(
        [
            expected_boxes_0[:, 0] * postprocessed_width,
            expected_boxes_0[:, 1] * postprocessed_height,
            expected_boxes_0[:, 2] * postprocessed_width,
            expected_boxes_0[:, 3] * postprocessed_height,
        ]
    ).T
    unnormalized_boxes_1 = torch.vstack(
        [
            expected_boxes_1[:, 0] * postprocessed_width,
            expected_boxes_1[:, 1] * postprocessed_height,
            expected_boxes_1[:, 2] * postprocessed_width,
            expected_boxes_1[:, 3] * postprocessed_height,
        ]
    ).T
    # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
    expected_boxes_0 = torch.vstack(
        [
            unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
            unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
            unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
            unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
        ]
    ).T
    expected_boxes_1 = torch.vstack(
        [
            unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
            unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
            unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
            unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
        ]
    ).T
    # Boxes are in pixels here, so compare with an absolute tolerance of one pixel. A relative tolerance
    # of 1 would accept any value between zero and twice the expectation.
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1))
|
||||
|
||||
@slow
|
||||
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->ConditionalDetr
|
||||
def test_batched_coco_panoptic_annotations(self):
    """
    Process a batch of two differently sized images with COCO panoptic annotations and check that the pixel
    values, bounding boxes and masks reflect the padding, both with and without `do_convert_annotations`.
    """
    # prepare image, target and masks_path
    image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))

    with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
        target = json.loads(f.read())

    # NOTE(review): both dicts reference the same `target` list, so the rescaling below is also visible
    # through `annotation_0` - confirm whether the hard-coded expectations depend on this before decoupling.
    annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
    annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}

    # Adjust the bounding boxes for the resized image
    w_0, h_0 = image_0.size
    w_1, h_1 = image_1.size
    for segment in annotation_1["segments_info"]:
        coords = segment["bbox"]
        segment["bbox"] = [
            coords[0] * w_1 / w_0,
            coords[1] * h_1 / h_0,
            coords[2] * w_1 / w_0,
            coords[3] * h_1 / h_0,
        ]

    masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")

    images = [image_0, image_1]
    annotations = [annotation_0, annotation_1]

    # encode them
    image_processing = ConditionalDetrImageProcessor(format="coco_panoptic")
    encoding = image_processing(
        images=images,
        annotations=annotations,
        masks_path=masks_path,
        return_tensors="pt",
        return_segmentation_masks=True,
    )

    # Check the pixel values have been padded
    postprocessed_height, postprocessed_width = 800, 1066
    expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
    self.assertEqual(encoding["pixel_values"].shape, expected_shape)

    # Check the bounding boxes have been adjusted for padded images
    self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
    self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
    expected_boxes_0 = torch.tensor(
        [
            [0.2625, 0.5437, 0.4688, 0.8625],
            [0.7719, 0.4104, 0.4531, 0.7125],
            [0.5000, 0.4927, 0.9969, 0.9854],
            [0.1688, 0.2000, 0.2063, 0.0917],
            [0.5492, 0.2760, 0.0578, 0.2187],
            [0.4992, 0.4990, 0.9984, 0.9979],
        ]
    )
    expected_boxes_1 = torch.tensor(
        [
            [0.1576, 0.3262, 0.2814, 0.5175],
            [0.4634, 0.2463, 0.2720, 0.4275],
            [0.3002, 0.2956, 0.5985, 0.5913],
            [0.1013, 0.1200, 0.1238, 0.0550],
            [0.3297, 0.1656, 0.0347, 0.1312],
            [0.2997, 0.2994, 0.5994, 0.5987],
        ]
    )
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))

    # Check the masks have also been padded
    self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
    self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))

    # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
    # format and not in the range [0, 1]
    encoding = image_processing(
        images=images,
        annotations=annotations,
        masks_path=masks_path,
        return_segmentation_masks=True,
        do_convert_annotations=False,
        return_tensors="pt",
    )
    self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
    self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
    # Convert to absolute coordinates
    unnormalized_boxes_0 = torch.vstack(
        [
            expected_boxes_0[:, 0] * postprocessed_width,
            expected_boxes_0[:, 1] * postprocessed_height,
            expected_boxes_0[:, 2] * postprocessed_width,
            expected_boxes_0[:, 3] * postprocessed_height,
        ]
    ).T
    unnormalized_boxes_1 = torch.vstack(
        [
            expected_boxes_1[:, 0] * postprocessed_width,
            expected_boxes_1[:, 1] * postprocessed_height,
            expected_boxes_1[:, 2] * postprocessed_width,
            expected_boxes_1[:, 3] * postprocessed_height,
        ]
    ).T
    # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
    expected_boxes_0 = torch.vstack(
        [
            unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
            unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
            unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
            unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
        ]
    ).T
    expected_boxes_1 = torch.vstack(
        [
            unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
            unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
            unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
            unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
        ]
    ).T
    # Boxes are in pixels here, so compare with an absolute tolerance of one pixel. A relative tolerance
    # of 1 would accept any value between zero and twice the expectation.
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1))
|
||||
|
||||
@@ -250,3 +250,246 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
|
||||
# verify size
|
||||
expected_size = torch.tensor([800, 1066])
|
||||
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
|
||||
|
||||
@slow
|
||||
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->DeformableDetr
|
||||
def test_batched_coco_detection_annotations(self):
    """
    Process a batch of two differently sized images with COCO detection annotations and check that the pixel
    values, bounding boxes and masks reflect the padding, both with and without `do_convert_annotations`.
    """
    image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))

    with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
        target = json.loads(f.read())

    # NOTE(review): both dicts reference the same `target` list, so the rescaling below is also visible
    # through `annotations_0`. The hard-coded expectations appear to have been recorded with this sharing
    # in place - confirm before decoupling the two.
    annotations_0 = {"image_id": 39769, "annotations": target}
    annotations_1 = {"image_id": 39769, "annotations": target}

    # Adjust the bounding boxes for the resized image
    w_0, h_0 = image_0.size
    w_1, h_1 = image_1.size
    for annotation in annotations_1["annotations"]:
        coords = annotation["bbox"]
        annotation["bbox"] = [
            coords[0] * w_1 / w_0,
            coords[1] * h_1 / h_0,
            coords[2] * w_1 / w_0,
            coords[3] * h_1 / h_0,
        ]

    images = [image_0, image_1]
    annotations = [annotations_0, annotations_1]

    image_processing = DeformableDetrImageProcessor()
    encoding = image_processing(
        images=images,
        annotations=annotations,
        return_segmentation_masks=True,
        return_tensors="pt",  # do_convert_annotations=True
    )

    # Check the pixel values have been padded
    postprocessed_height, postprocessed_width = 800, 1066
    expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
    self.assertEqual(encoding["pixel_values"].shape, expected_shape)

    # Check the bounding boxes have been adjusted for padded images
    self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
    self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
    expected_boxes_0 = torch.tensor(
        [
            [0.6879, 0.4609, 0.0755, 0.3691],
            [0.2118, 0.3359, 0.2601, 0.1566],
            [0.5011, 0.5000, 0.9979, 1.0000],
            [0.5010, 0.5020, 0.9979, 0.9959],
            [0.3284, 0.5944, 0.5884, 0.8112],
            [0.8394, 0.5445, 0.3213, 0.9110],
        ]
    )
    expected_boxes_1 = torch.tensor(
        [
            [0.4130, 0.2765, 0.0453, 0.2215],
            [0.1272, 0.2016, 0.1561, 0.0940],
            [0.3757, 0.4933, 0.7488, 0.9865],
            [0.3759, 0.5002, 0.7492, 0.9955],
            [0.1971, 0.5456, 0.3532, 0.8646],
            [0.5790, 0.4115, 0.3430, 0.7161],
        ]
    )
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))

    # Check the masks have also been padded
    self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
    self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))

    # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
    # format and not in the range [0, 1]
    encoding = image_processing(
        images=images,
        annotations=annotations,
        return_segmentation_masks=True,
        do_convert_annotations=False,
        return_tensors="pt",
    )
    self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
    self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
    # Convert to absolute coordinates
    unnormalized_boxes_0 = torch.vstack(
        [
            expected_boxes_0[:, 0] * postprocessed_width,
            expected_boxes_0[:, 1] * postprocessed_height,
            expected_boxes_0[:, 2] * postprocessed_width,
            expected_boxes_0[:, 3] * postprocessed_height,
        ]
    ).T
    unnormalized_boxes_1 = torch.vstack(
        [
            expected_boxes_1[:, 0] * postprocessed_width,
            expected_boxes_1[:, 1] * postprocessed_height,
            expected_boxes_1[:, 2] * postprocessed_width,
            expected_boxes_1[:, 3] * postprocessed_height,
        ]
    ).T
    # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
    expected_boxes_0 = torch.vstack(
        [
            unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
            unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
            unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
            unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
        ]
    ).T
    expected_boxes_1 = torch.vstack(
        [
            unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
            unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
            unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
            unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
        ]
    ).T
    # Boxes are in pixels here, so compare with an absolute tolerance of one pixel. A relative tolerance
    # of 1 would accept any value between zero and twice the expectation.
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1))
|
||||
|
||||
@slow
|
||||
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->DeformableDetr
|
||||
def test_batched_coco_panoptic_annotations(self):
    """
    Process a batch of two differently sized images with COCO panoptic annotations and check that the pixel
    values, bounding boxes and masks reflect the padding, both with and without `do_convert_annotations`.
    """
    # prepare image, target and masks_path
    image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))

    with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
        target = json.loads(f.read())

    # NOTE(review): both dicts reference the same `target` list, so the rescaling below is also visible
    # through `annotation_0` - confirm whether the hard-coded expectations depend on this before decoupling.
    annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
    annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}

    # Adjust the bounding boxes for the resized image
    w_0, h_0 = image_0.size
    w_1, h_1 = image_1.size
    for segment in annotation_1["segments_info"]:
        coords = segment["bbox"]
        segment["bbox"] = [
            coords[0] * w_1 / w_0,
            coords[1] * h_1 / h_0,
            coords[2] * w_1 / w_0,
            coords[3] * h_1 / h_0,
        ]

    masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")

    images = [image_0, image_1]
    annotations = [annotation_0, annotation_1]

    # encode them
    image_processing = DeformableDetrImageProcessor(format="coco_panoptic")
    encoding = image_processing(
        images=images,
        annotations=annotations,
        masks_path=masks_path,
        return_tensors="pt",
        return_segmentation_masks=True,
    )

    # Check the pixel values have been padded
    postprocessed_height, postprocessed_width = 800, 1066
    expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
    self.assertEqual(encoding["pixel_values"].shape, expected_shape)

    # Check the bounding boxes have been adjusted for padded images
    self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
    self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
    expected_boxes_0 = torch.tensor(
        [
            [0.2625, 0.5437, 0.4688, 0.8625],
            [0.7719, 0.4104, 0.4531, 0.7125],
            [0.5000, 0.4927, 0.9969, 0.9854],
            [0.1688, 0.2000, 0.2063, 0.0917],
            [0.5492, 0.2760, 0.0578, 0.2187],
            [0.4992, 0.4990, 0.9984, 0.9979],
        ]
    )
    expected_boxes_1 = torch.tensor(
        [
            [0.1576, 0.3262, 0.2814, 0.5175],
            [0.4634, 0.2463, 0.2720, 0.4275],
            [0.3002, 0.2956, 0.5985, 0.5913],
            [0.1013, 0.1200, 0.1238, 0.0550],
            [0.3297, 0.1656, 0.0347, 0.1312],
            [0.2997, 0.2994, 0.5994, 0.5987],
        ]
    )
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))

    # Check the masks have also been padded
    self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
    self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))

    # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
    # format and not in the range [0, 1]
    encoding = image_processing(
        images=images,
        annotations=annotations,
        masks_path=masks_path,
        return_segmentation_masks=True,
        do_convert_annotations=False,
        return_tensors="pt",
    )
    self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
    self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
    # Convert to absolute coordinates
    unnormalized_boxes_0 = torch.vstack(
        [
            expected_boxes_0[:, 0] * postprocessed_width,
            expected_boxes_0[:, 1] * postprocessed_height,
            expected_boxes_0[:, 2] * postprocessed_width,
            expected_boxes_0[:, 3] * postprocessed_height,
        ]
    ).T
    unnormalized_boxes_1 = torch.vstack(
        [
            expected_boxes_1[:, 0] * postprocessed_width,
            expected_boxes_1[:, 1] * postprocessed_height,
            expected_boxes_1[:, 2] * postprocessed_width,
            expected_boxes_1[:, 3] * postprocessed_height,
        ]
    ).T
    # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
    expected_boxes_0 = torch.vstack(
        [
            unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
            unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
            unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
            unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
        ]
    ).T
    expected_boxes_1 = torch.vstack(
        [
            unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
            unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
            unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
            unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
        ]
    ).T
    # Boxes are in pixels here, so compare with an absolute tolerance of one pixel. A relative tolerance
    # of 1 would accept any value between zero and twice the expectation.
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1))
|
||||
|
||||
@@ -244,3 +244,246 @@ class DetaImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
|
||||
# verify size
|
||||
expected_size = torch.tensor([800, 1066])
|
||||
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
|
||||
|
||||
@slow
|
||||
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->Deta
|
||||
def test_batched_coco_detection_annotations(self):
    """
    Process a batch of two differently sized images with COCO detection annotations and check that the pixel
    values, bounding boxes and masks reflect the padding, both with and without `do_convert_annotations`.
    """
    image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))

    with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
        target = json.loads(f.read())

    # NOTE(review): both dicts reference the same `target` list, so the rescaling below is also visible
    # through `annotations_0`. The hard-coded expectations appear to have been recorded with this sharing
    # in place - confirm before decoupling the two.
    annotations_0 = {"image_id": 39769, "annotations": target}
    annotations_1 = {"image_id": 39769, "annotations": target}

    # Adjust the bounding boxes for the resized image
    w_0, h_0 = image_0.size
    w_1, h_1 = image_1.size
    for annotation in annotations_1["annotations"]:
        coords = annotation["bbox"]
        annotation["bbox"] = [
            coords[0] * w_1 / w_0,
            coords[1] * h_1 / h_0,
            coords[2] * w_1 / w_0,
            coords[3] * h_1 / h_0,
        ]

    images = [image_0, image_1]
    annotations = [annotations_0, annotations_1]

    image_processing = DetaImageProcessor()
    encoding = image_processing(
        images=images,
        annotations=annotations,
        return_segmentation_masks=True,
        return_tensors="pt",  # do_convert_annotations=True
    )

    # Check the pixel values have been padded
    postprocessed_height, postprocessed_width = 800, 1066
    expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
    self.assertEqual(encoding["pixel_values"].shape, expected_shape)

    # Check the bounding boxes have been adjusted for padded images
    self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
    self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
    expected_boxes_0 = torch.tensor(
        [
            [0.6879, 0.4609, 0.0755, 0.3691],
            [0.2118, 0.3359, 0.2601, 0.1566],
            [0.5011, 0.5000, 0.9979, 1.0000],
            [0.5010, 0.5020, 0.9979, 0.9959],
            [0.3284, 0.5944, 0.5884, 0.8112],
            [0.8394, 0.5445, 0.3213, 0.9110],
        ]
    )
    expected_boxes_1 = torch.tensor(
        [
            [0.4130, 0.2765, 0.0453, 0.2215],
            [0.1272, 0.2016, 0.1561, 0.0940],
            [0.3757, 0.4933, 0.7488, 0.9865],
            [0.3759, 0.5002, 0.7492, 0.9955],
            [0.1971, 0.5456, 0.3532, 0.8646],
            [0.5790, 0.4115, 0.3430, 0.7161],
        ]
    )
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))

    # Check the masks have also been padded
    self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
    self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))

    # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
    # format and not in the range [0, 1]
    encoding = image_processing(
        images=images,
        annotations=annotations,
        return_segmentation_masks=True,
        do_convert_annotations=False,
        return_tensors="pt",
    )
    self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
    self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
    # Convert to absolute coordinates
    unnormalized_boxes_0 = torch.vstack(
        [
            expected_boxes_0[:, 0] * postprocessed_width,
            expected_boxes_0[:, 1] * postprocessed_height,
            expected_boxes_0[:, 2] * postprocessed_width,
            expected_boxes_0[:, 3] * postprocessed_height,
        ]
    ).T
    unnormalized_boxes_1 = torch.vstack(
        [
            expected_boxes_1[:, 0] * postprocessed_width,
            expected_boxes_1[:, 1] * postprocessed_height,
            expected_boxes_1[:, 2] * postprocessed_width,
            expected_boxes_1[:, 3] * postprocessed_height,
        ]
    ).T
    # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
    expected_boxes_0 = torch.vstack(
        [
            unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
            unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
            unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
            unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
        ]
    ).T
    expected_boxes_1 = torch.vstack(
        [
            unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
            unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
            unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
            unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
        ]
    ).T
    # Boxes are in pixels here, so compare with an absolute tolerance of one pixel. A relative tolerance
    # of 1 would accept any value between zero and twice the expectation.
    self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1))
    self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1))
|
||||
|
||||
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->Deta
def test_batched_coco_panoptic_annotations(self):
    """Run the image processor on a two-image COCO panoptic batch and check the padded outputs.

    The batch holds the sample COCO image and an 800x800 resized copy of it. The test asserts the
    padded pixel-value shape, the box and mask shapes, the normalized boxes, and the boxes returned
    when `do_convert_annotations=False`.
    """
    # prepare image, target and masks_path
    image_path = "./tests/fixtures/tests_samples/COCO/000000039769.png"
    image_0 = Image.open(image_path)
    image_1 = Image.open(image_path).resize((800, 800))

    with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
        target = json.load(f)

    # NOTE(review): both dicts reference the same `target` list, so the in-place bbox rescaling
    # below is also visible through `annotation_0` - confirm this is intended.
    annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
    annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}

    # Adjust the bounding boxes for the resized image
    w_0, h_0 = image_0.size
    w_1, h_1 = image_1.size
    for segment in annotation_1["segments_info"]:
        box = segment["bbox"]
        segment["bbox"] = [
            box[0] * w_1 / w_0,
            box[1] * h_1 / h_0,
            box[2] * w_1 / w_0,
            box[3] * h_1 / h_0,
        ]

    masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")

    images = [image_0, image_1]
    annotations = [annotation_0, annotation_1]

    # encode them
    image_processing = DetaImageProcessor(format="coco_panoptic")
    encoding = image_processing(
        images=images,
        annotations=annotations,
        masks_path=masks_path,
        return_tensors="pt",
        return_segmentation_masks=True,
    )

    # Check the pixel values have been padded
    postprocessed_height, postprocessed_width = 800, 1066
    expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
    self.assertEqual(encoding["pixel_values"].shape, expected_shape)

    # Check the bounding boxes have been adjusted for padded images
    for idx in range(2):
        self.assertEqual(encoding["labels"][idx]["boxes"].shape, torch.Size([6, 4]))
    expected_boxes_0 = torch.tensor(
        [
            [0.2625, 0.5437, 0.4688, 0.8625],
            [0.7719, 0.4104, 0.4531, 0.7125],
            [0.5000, 0.4927, 0.9969, 0.9854],
            [0.1688, 0.2000, 0.2063, 0.0917],
            [0.5492, 0.2760, 0.0578, 0.2187],
            [0.4992, 0.4990, 0.9984, 0.9979],
        ]
    )
    expected_boxes_1 = torch.tensor(
        [
            [0.1576, 0.3262, 0.2814, 0.5175],
            [0.4634, 0.2463, 0.2720, 0.4275],
            [0.3002, 0.2956, 0.5985, 0.5913],
            [0.1013, 0.1200, 0.1238, 0.0550],
            [0.3297, 0.1656, 0.0347, 0.1312],
            [0.2997, 0.2994, 0.5994, 0.5987],
        ]
    )
    for idx, expected_boxes in enumerate((expected_boxes_0, expected_boxes_1)):
        self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected_boxes, rtol=1e-3))

    # Check the masks have also been padded
    for idx in range(2):
        self.assertEqual(encoding["labels"][idx]["masks"].shape, torch.Size([6, 800, 1066]))

    # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
    # format and not in the range [0, 1]
    encoding = image_processing(
        images=images,
        annotations=annotations,
        masks_path=masks_path,
        return_segmentation_masks=True,
        do_convert_annotations=False,
        return_tensors="pt",
    )
    for idx in range(2):
        self.assertEqual(encoding["labels"][idx]["boxes"].shape, torch.Size([6, 4]))

    def to_absolute_corners(normalized_boxes):
        # Convert to absolute coordinates using the padded width / height
        centre_x = normalized_boxes[:, 0] * postprocessed_width
        centre_y = normalized_boxes[:, 1] * postprocessed_height
        width = normalized_boxes[:, 2] * postprocessed_width
        height = normalized_boxes[:, 3] * postprocessed_height
        # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
        return torch.vstack(
            [
                centre_x - width / 2,
                centre_y - height / 2,
                centre_x + width / 2,
                centre_y + height / 2,
            ]
        ).T

    expected_boxes_0 = to_absolute_corners(expected_boxes_0)
    expected_boxes_1 = to_absolute_corners(expected_boxes_1)
    # NOTE(review): rtol=1 tolerates a deviation as large as the expected value itself, so this is
    # only a coarse check.
    for idx, expected_boxes in enumerate((expected_boxes_0, expected_boxes_1)):
        self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected_boxes, rtol=1))
|
||||
|
||||
@@ -13,7 +13,6 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import json
|
||||
import pathlib
|
||||
import unittest
|
||||
@@ -308,3 +307,244 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
|
||||
# verify size
|
||||
expected_size = torch.tensor([800, 1066])
|
||||
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
|
||||
|
||||
@slow
def test_batched_coco_detection_annotations(self):
    """Run the image processor on a two-image COCO detection batch and check the padded outputs.

    The batch holds the sample COCO image and an 800x800 resized copy of it. The test asserts the
    padded pixel-value shape, the box and mask shapes, the normalized boxes, and the boxes returned
    when `do_convert_annotations=False`.
    """
    image_path = "./tests/fixtures/tests_samples/COCO/000000039769.png"
    image_0 = Image.open(image_path)
    image_1 = Image.open(image_path).resize((800, 800))

    with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
        target = json.load(f)

    # NOTE(review): both dicts reference the same `target` list, so the in-place bbox rescaling
    # below is also visible through `annotations_0` - confirm this is intended.
    annotations_0 = {"image_id": 39769, "annotations": target}
    annotations_1 = {"image_id": 39769, "annotations": target}

    # Adjust the bounding boxes for the resized image
    w_0, h_0 = image_0.size
    w_1, h_1 = image_1.size
    for annotation in annotations_1["annotations"]:
        box = annotation["bbox"]
        annotation["bbox"] = [
            box[0] * w_1 / w_0,
            box[1] * h_1 / h_0,
            box[2] * w_1 / w_0,
            box[3] * h_1 / h_0,
        ]

    images = [image_0, image_1]
    annotations = [annotations_0, annotations_1]

    image_processing = DetrImageProcessor()
    encoding = image_processing(
        images=images,
        annotations=annotations,
        return_segmentation_masks=True,
        return_tensors="pt",  # do_convert_annotations=True
    )

    # Check the pixel values have been padded
    postprocessed_height, postprocessed_width = 800, 1066
    expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
    self.assertEqual(encoding["pixel_values"].shape, expected_shape)

    # Check the bounding boxes have been adjusted for padded images
    for idx in range(2):
        self.assertEqual(encoding["labels"][idx]["boxes"].shape, torch.Size([6, 4]))
    expected_boxes_0 = torch.tensor(
        [
            [0.6879, 0.4609, 0.0755, 0.3691],
            [0.2118, 0.3359, 0.2601, 0.1566],
            [0.5011, 0.5000, 0.9979, 1.0000],
            [0.5010, 0.5020, 0.9979, 0.9959],
            [0.3284, 0.5944, 0.5884, 0.8112],
            [0.8394, 0.5445, 0.3213, 0.9110],
        ]
    )
    expected_boxes_1 = torch.tensor(
        [
            [0.4130, 0.2765, 0.0453, 0.2215],
            [0.1272, 0.2016, 0.1561, 0.0940],
            [0.3757, 0.4933, 0.7488, 0.9865],
            [0.3759, 0.5002, 0.7492, 0.9955],
            [0.1971, 0.5456, 0.3532, 0.8646],
            [0.5790, 0.4115, 0.3430, 0.7161],
        ]
    )
    for idx, expected_boxes in enumerate((expected_boxes_0, expected_boxes_1)):
        self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected_boxes, rtol=1e-3))

    # Check the masks have also been padded
    for idx in range(2):
        self.assertEqual(encoding["labels"][idx]["masks"].shape, torch.Size([6, 800, 1066]))

    # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
    # format and not in the range [0, 1]
    encoding = image_processing(
        images=images,
        annotations=annotations,
        return_segmentation_masks=True,
        do_convert_annotations=False,
        return_tensors="pt",
    )
    for idx in range(2):
        self.assertEqual(encoding["labels"][idx]["boxes"].shape, torch.Size([6, 4]))

    def to_absolute_corners(normalized_boxes):
        # Convert to absolute coordinates using the padded width / height
        centre_x = normalized_boxes[:, 0] * postprocessed_width
        centre_y = normalized_boxes[:, 1] * postprocessed_height
        width = normalized_boxes[:, 2] * postprocessed_width
        height = normalized_boxes[:, 3] * postprocessed_height
        # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
        return torch.vstack(
            [
                centre_x - width / 2,
                centre_y - height / 2,
                centre_x + width / 2,
                centre_y + height / 2,
            ]
        ).T

    expected_boxes_0 = to_absolute_corners(expected_boxes_0)
    expected_boxes_1 = to_absolute_corners(expected_boxes_1)
    # NOTE(review): rtol=1 tolerates a deviation as large as the expected value itself, so this is
    # only a coarse check.
    for idx, expected_boxes in enumerate((expected_boxes_0, expected_boxes_1)):
        self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected_boxes, rtol=1))
|
||||
|
||||
@slow
def test_batched_coco_panoptic_annotations(self):
    """Run the image processor on a two-image COCO panoptic batch and check the padded outputs.

    The batch holds the sample COCO image and an 800x800 resized copy of it. The test asserts the
    padded pixel-value shape, the box and mask shapes, the normalized boxes, and the boxes returned
    when `do_convert_annotations=False`.
    """
    # prepare image, target and masks_path
    image_path = "./tests/fixtures/tests_samples/COCO/000000039769.png"
    image_0 = Image.open(image_path)
    image_1 = Image.open(image_path).resize((800, 800))

    with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
        target = json.load(f)

    # NOTE(review): both dicts reference the same `target` list, so the in-place bbox rescaling
    # below is also visible through `annotation_0` - confirm this is intended.
    annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
    annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}

    # Adjust the bounding boxes for the resized image
    w_0, h_0 = image_0.size
    w_1, h_1 = image_1.size
    for segment in annotation_1["segments_info"]:
        box = segment["bbox"]
        segment["bbox"] = [
            box[0] * w_1 / w_0,
            box[1] * h_1 / h_0,
            box[2] * w_1 / w_0,
            box[3] * h_1 / h_0,
        ]

    masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")

    images = [image_0, image_1]
    annotations = [annotation_0, annotation_1]

    # encode them
    image_processing = DetrImageProcessor(format="coco_panoptic")
    encoding = image_processing(
        images=images,
        annotations=annotations,
        masks_path=masks_path,
        return_tensors="pt",
        return_segmentation_masks=True,
    )

    # Check the pixel values have been padded
    postprocessed_height, postprocessed_width = 800, 1066
    expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
    self.assertEqual(encoding["pixel_values"].shape, expected_shape)

    # Check the bounding boxes have been adjusted for padded images
    for idx in range(2):
        self.assertEqual(encoding["labels"][idx]["boxes"].shape, torch.Size([6, 4]))
    expected_boxes_0 = torch.tensor(
        [
            [0.2625, 0.5437, 0.4688, 0.8625],
            [0.7719, 0.4104, 0.4531, 0.7125],
            [0.5000, 0.4927, 0.9969, 0.9854],
            [0.1688, 0.2000, 0.2063, 0.0917],
            [0.5492, 0.2760, 0.0578, 0.2187],
            [0.4992, 0.4990, 0.9984, 0.9979],
        ]
    )
    expected_boxes_1 = torch.tensor(
        [
            [0.1576, 0.3262, 0.2814, 0.5175],
            [0.4634, 0.2463, 0.2720, 0.4275],
            [0.3002, 0.2956, 0.5985, 0.5913],
            [0.1013, 0.1200, 0.1238, 0.0550],
            [0.3297, 0.1656, 0.0347, 0.1312],
            [0.2997, 0.2994, 0.5994, 0.5987],
        ]
    )
    for idx, expected_boxes in enumerate((expected_boxes_0, expected_boxes_1)):
        self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected_boxes, rtol=1e-3))

    # Check the masks have also been padded
    for idx in range(2):
        self.assertEqual(encoding["labels"][idx]["masks"].shape, torch.Size([6, 800, 1066]))

    # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
    # format and not in the range [0, 1]
    encoding = image_processing(
        images=images,
        annotations=annotations,
        masks_path=masks_path,
        return_segmentation_masks=True,
        do_convert_annotations=False,
        return_tensors="pt",
    )
    for idx in range(2):
        self.assertEqual(encoding["labels"][idx]["boxes"].shape, torch.Size([6, 4]))

    def to_absolute_corners(normalized_boxes):
        # Convert to absolute coordinates using the padded width / height
        centre_x = normalized_boxes[:, 0] * postprocessed_width
        centre_y = normalized_boxes[:, 1] * postprocessed_height
        width = normalized_boxes[:, 2] * postprocessed_width
        height = normalized_boxes[:, 3] * postprocessed_height
        # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
        return torch.vstack(
            [
                centre_x - width / 2,
                centre_y - height / 2,
                centre_x + width / 2,
                centre_y + height / 2,
            ]
        ).T

    expected_boxes_0 = to_absolute_corners(expected_boxes_0)
    expected_boxes_1 = to_absolute_corners(expected_boxes_1)
    # NOTE(review): rtol=1 tolerates a deviation as large as the expected value itself, so this is
    # only a coarse check.
    for idx, expected_boxes in enumerate((expected_boxes_0, expected_boxes_1)):
        self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected_boxes, rtol=1))
|
||||
|
||||
@@ -287,3 +287,246 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
|
||||
# verify size
|
||||
expected_size = torch.tensor([800, 1056])
|
||||
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
|
||||
|
||||
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->Yolos
def test_batched_coco_detection_annotations(self):
    """Run the image processor on a two-image COCO detection batch and check the padded outputs.

    The batch holds the sample COCO image and an 800x800 resized copy of it. The test asserts the
    padded pixel-value shape, the box and mask shapes, the normalized boxes, and the boxes returned
    when `do_convert_annotations=False`.
    """
    image_path = "./tests/fixtures/tests_samples/COCO/000000039769.png"
    image_0 = Image.open(image_path)
    image_1 = Image.open(image_path).resize((800, 800))

    with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
        target = json.load(f)

    # NOTE(review): both dicts reference the same `target` list, so the in-place bbox rescaling
    # below is also visible through `annotations_0` - confirm this is intended.
    annotations_0 = {"image_id": 39769, "annotations": target}
    annotations_1 = {"image_id": 39769, "annotations": target}

    # Adjust the bounding boxes for the resized image
    w_0, h_0 = image_0.size
    w_1, h_1 = image_1.size
    for annotation in annotations_1["annotations"]:
        box = annotation["bbox"]
        annotation["bbox"] = [
            box[0] * w_1 / w_0,
            box[1] * h_1 / h_0,
            box[2] * w_1 / w_0,
            box[3] * h_1 / h_0,
        ]

    images = [image_0, image_1]
    annotations = [annotations_0, annotations_1]

    image_processing = YolosImageProcessor()
    encoding = image_processing(
        images=images,
        annotations=annotations,
        return_segmentation_masks=True,
        return_tensors="pt",  # do_convert_annotations=True
    )

    # Check the pixel values have been padded
    postprocessed_height, postprocessed_width = 800, 1066
    expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
    self.assertEqual(encoding["pixel_values"].shape, expected_shape)

    # Check the bounding boxes have been adjusted for padded images
    for idx in range(2):
        self.assertEqual(encoding["labels"][idx]["boxes"].shape, torch.Size([6, 4]))
    expected_boxes_0 = torch.tensor(
        [
            [0.6879, 0.4609, 0.0755, 0.3691],
            [0.2118, 0.3359, 0.2601, 0.1566],
            [0.5011, 0.5000, 0.9979, 1.0000],
            [0.5010, 0.5020, 0.9979, 0.9959],
            [0.3284, 0.5944, 0.5884, 0.8112],
            [0.8394, 0.5445, 0.3213, 0.9110],
        ]
    )
    expected_boxes_1 = torch.tensor(
        [
            [0.4130, 0.2765, 0.0453, 0.2215],
            [0.1272, 0.2016, 0.1561, 0.0940],
            [0.3757, 0.4933, 0.7488, 0.9865],
            [0.3759, 0.5002, 0.7492, 0.9955],
            [0.1971, 0.5456, 0.3532, 0.8646],
            [0.5790, 0.4115, 0.3430, 0.7161],
        ]
    )
    for idx, expected_boxes in enumerate((expected_boxes_0, expected_boxes_1)):
        self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected_boxes, rtol=1e-3))

    # Check the masks have also been padded
    for idx in range(2):
        self.assertEqual(encoding["labels"][idx]["masks"].shape, torch.Size([6, 800, 1066]))

    # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
    # format and not in the range [0, 1]
    encoding = image_processing(
        images=images,
        annotations=annotations,
        return_segmentation_masks=True,
        do_convert_annotations=False,
        return_tensors="pt",
    )
    for idx in range(2):
        self.assertEqual(encoding["labels"][idx]["boxes"].shape, torch.Size([6, 4]))

    def to_absolute_corners(normalized_boxes):
        # Convert to absolute coordinates using the padded width / height
        centre_x = normalized_boxes[:, 0] * postprocessed_width
        centre_y = normalized_boxes[:, 1] * postprocessed_height
        width = normalized_boxes[:, 2] * postprocessed_width
        height = normalized_boxes[:, 3] * postprocessed_height
        # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
        return torch.vstack(
            [
                centre_x - width / 2,
                centre_y - height / 2,
                centre_x + width / 2,
                centre_y + height / 2,
            ]
        ).T

    expected_boxes_0 = to_absolute_corners(expected_boxes_0)
    expected_boxes_1 = to_absolute_corners(expected_boxes_1)
    # NOTE(review): rtol=1 tolerates a deviation as large as the expected value itself, so this is
    # only a coarse check.
    for idx, expected_boxes in enumerate((expected_boxes_0, expected_boxes_1)):
        self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected_boxes, rtol=1))
|
||||
|
||||
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->Yolos
def test_batched_coco_panoptic_annotations(self):
    """Run the image processor on a two-image COCO panoptic batch and check the padded outputs.

    The batch holds the sample COCO image and an 800x800 resized copy of it. The test asserts the
    padded pixel-value shape, the box and mask shapes, the normalized boxes, and the boxes returned
    when `do_convert_annotations=False`.
    """
    # prepare image, target and masks_path
    image_path = "./tests/fixtures/tests_samples/COCO/000000039769.png"
    image_0 = Image.open(image_path)
    image_1 = Image.open(image_path).resize((800, 800))

    with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
        target = json.load(f)

    # NOTE(review): both dicts reference the same `target` list, so the in-place bbox rescaling
    # below is also visible through `annotation_0` - confirm this is intended.
    annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
    annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}

    # Adjust the bounding boxes for the resized image
    w_0, h_0 = image_0.size
    w_1, h_1 = image_1.size
    for segment in annotation_1["segments_info"]:
        box = segment["bbox"]
        segment["bbox"] = [
            box[0] * w_1 / w_0,
            box[1] * h_1 / h_0,
            box[2] * w_1 / w_0,
            box[3] * h_1 / h_0,
        ]

    masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")

    images = [image_0, image_1]
    annotations = [annotation_0, annotation_1]

    # encode them
    image_processing = YolosImageProcessor(format="coco_panoptic")
    encoding = image_processing(
        images=images,
        annotations=annotations,
        masks_path=masks_path,
        return_tensors="pt",
        return_segmentation_masks=True,
    )

    # Check the pixel values have been padded
    postprocessed_height, postprocessed_width = 800, 1066
    expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
    self.assertEqual(encoding["pixel_values"].shape, expected_shape)

    # Check the bounding boxes have been adjusted for padded images
    for idx in range(2):
        self.assertEqual(encoding["labels"][idx]["boxes"].shape, torch.Size([6, 4]))
    expected_boxes_0 = torch.tensor(
        [
            [0.2625, 0.5437, 0.4688, 0.8625],
            [0.7719, 0.4104, 0.4531, 0.7125],
            [0.5000, 0.4927, 0.9969, 0.9854],
            [0.1688, 0.2000, 0.2063, 0.0917],
            [0.5492, 0.2760, 0.0578, 0.2187],
            [0.4992, 0.4990, 0.9984, 0.9979],
        ]
    )
    expected_boxes_1 = torch.tensor(
        [
            [0.1576, 0.3262, 0.2814, 0.5175],
            [0.4634, 0.2463, 0.2720, 0.4275],
            [0.3002, 0.2956, 0.5985, 0.5913],
            [0.1013, 0.1200, 0.1238, 0.0550],
            [0.3297, 0.1656, 0.0347, 0.1312],
            [0.2997, 0.2994, 0.5994, 0.5987],
        ]
    )
    for idx, expected_boxes in enumerate((expected_boxes_0, expected_boxes_1)):
        self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected_boxes, rtol=1e-3))

    # Check the masks have also been padded
    for idx in range(2):
        self.assertEqual(encoding["labels"][idx]["masks"].shape, torch.Size([6, 800, 1066]))

    # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
    # format and not in the range [0, 1]
    encoding = image_processing(
        images=images,
        annotations=annotations,
        masks_path=masks_path,
        return_segmentation_masks=True,
        do_convert_annotations=False,
        return_tensors="pt",
    )
    for idx in range(2):
        self.assertEqual(encoding["labels"][idx]["boxes"].shape, torch.Size([6, 4]))

    def to_absolute_corners(normalized_boxes):
        # Convert to absolute coordinates using the padded width / height
        centre_x = normalized_boxes[:, 0] * postprocessed_width
        centre_y = normalized_boxes[:, 1] * postprocessed_height
        width = normalized_boxes[:, 2] * postprocessed_width
        height = normalized_boxes[:, 3] * postprocessed_height
        # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
        return torch.vstack(
            [
                centre_x - width / 2,
                centre_y - height / 2,
                centre_x + width / 2,
                centre_y + height / 2,
            ]
        ).T

    expected_boxes_0 = to_absolute_corners(expected_boxes_0)
    expected_boxes_1 = to_absolute_corners(expected_boxes_1)
    # NOTE(review): rtol=1 tolerates a deviation as large as the expected value itself, so this is
    # only a coarse check.
    for idx, expected_boxes in enumerate((expected_boxes_0, expected_boxes_1)):
        self.assertTrue(torch.allclose(encoding["labels"][idx]["boxes"], expected_boxes, rtol=1))
|
||||
|
||||
Reference in New Issue
Block a user