Add DetrImageProcessorFast (#34063)

* add fully functioning image_processing_detr_fast

* Create tensors on the correct device

* fix copies

* fix doc

* add tests equivalence cpu gpu

* fix doc en

* add relative imports and copied from

* Fix copies and nit
This commit is contained in:
Yoni Gozlan
2024-10-21 09:05:05 -04:00
committed by GitHub
parent 24bdc94da5
commit a4122813d1
16 changed files with 2779 additions and 1047 deletions

View File

@@ -181,6 +181,15 @@ If you're interested in submitting a resource to be included here, please feel f
- post_process_instance_segmentation - post_process_instance_segmentation
- post_process_panoptic_segmentation - post_process_panoptic_segmentation
## DetrImageProcessorFast
[[autodoc]] DetrImageProcessorFast
- preprocess
- post_process_object_detection
- post_process_semantic_segmentation
- post_process_instance_segmentation
- post_process_panoptic_segmentation
## DetrFeatureExtractor ## DetrFeatureExtractor
[[autodoc]] DetrFeatureExtractor [[autodoc]] DetrFeatureExtractor

View File

@@ -184,6 +184,15 @@ DETR の使用を開始するのに役立つ公式 Hugging Face およびコミ
- post_process_instance_segmentation - post_process_instance_segmentation
- post_process_panoptic_segmentation - post_process_panoptic_segmentation
## DetrImageProcessorFast
[[autodoc]] DetrImageProcessorFast
- preprocess
- post_process_object_detection
- post_process_semantic_segmentation
- post_process_instance_segmentation
- post_process_panoptic_segmentation
## DetrFeatureExtractor ## DetrFeatureExtractor
[[autodoc]] DetrFeatureExtractor [[autodoc]] DetrFeatureExtractor

View File

@@ -1191,7 +1191,7 @@ else:
_import_structure["models.deprecated.efficientformer"].append("EfficientFormerImageProcessor") _import_structure["models.deprecated.efficientformer"].append("EfficientFormerImageProcessor")
_import_structure["models.deprecated.tvlt"].append("TvltImageProcessor") _import_structure["models.deprecated.tvlt"].append("TvltImageProcessor")
_import_structure["models.deprecated.vit_hybrid"].extend(["ViTHybridImageProcessor"]) _import_structure["models.deprecated.vit_hybrid"].extend(["ViTHybridImageProcessor"])
_import_structure["models.detr"].extend(["DetrFeatureExtractor", "DetrImageProcessor"]) _import_structure["models.detr"].extend(["DetrFeatureExtractor", "DetrImageProcessor", "DetrImageProcessorFast"])
_import_structure["models.donut"].extend(["DonutFeatureExtractor", "DonutImageProcessor"]) _import_structure["models.donut"].extend(["DonutFeatureExtractor", "DonutImageProcessor"])
_import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"]) _import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"])
_import_structure["models.efficientnet"].append("EfficientNetImageProcessor") _import_structure["models.efficientnet"].append("EfficientNetImageProcessor")
@@ -6090,7 +6090,7 @@ if TYPE_CHECKING:
from .models.deprecated.efficientformer import EfficientFormerImageProcessor from .models.deprecated.efficientformer import EfficientFormerImageProcessor
from .models.deprecated.tvlt import TvltImageProcessor from .models.deprecated.tvlt import TvltImageProcessor
from .models.deprecated.vit_hybrid import ViTHybridImageProcessor from .models.deprecated.vit_hybrid import ViTHybridImageProcessor
from .models.detr import DetrFeatureExtractor, DetrImageProcessor from .models.detr import DetrFeatureExtractor, DetrImageProcessor, DetrImageProcessorFast
from .models.donut import DonutFeatureExtractor, DonutImageProcessor from .models.donut import DonutFeatureExtractor, DonutImageProcessor
from .models.dpt import DPTFeatureExtractor, DPTImageProcessor from .models.dpt import DPTFeatureExtractor, DPTImageProcessor
from .models.efficientnet import EfficientNetImageProcessor from .models.efficientnet import EfficientNetImageProcessor

View File

@@ -32,6 +32,7 @@ from .utils.import_utils import (
is_tf_available, is_tf_available,
is_torch_available, is_torch_available,
is_torchvision_available, is_torchvision_available,
is_torchvision_v2_available,
is_vision_available, is_vision_available,
requires_backends, requires_backends,
) )
@@ -51,7 +52,9 @@ if is_tf_available():
if is_flax_available(): if is_flax_available():
import jax.numpy as jnp import jax.numpy as jnp
if is_torchvision_available(): if is_torchvision_v2_available():
from torchvision.transforms.v2 import functional as F
elif is_torchvision_available():
from torchvision.transforms import functional as F from torchvision.transforms import functional as F

View File

@@ -72,7 +72,7 @@ else:
("deit", ("DeiTImageProcessor",)), ("deit", ("DeiTImageProcessor",)),
("depth_anything", ("DPTImageProcessor",)), ("depth_anything", ("DPTImageProcessor",)),
("deta", ("DetaImageProcessor",)), ("deta", ("DetaImageProcessor",)),
("detr", ("DetrImageProcessor",)), ("detr", ("DetrImageProcessor", "DetrImageProcessorFast")),
("dinat", ("ViTImageProcessor", "ViTImageProcessorFast")), ("dinat", ("ViTImageProcessor", "ViTImageProcessorFast")),
("dinov2", ("BitImageProcessor",)), ("dinov2", ("BitImageProcessor",)),
("donut-swin", ("DonutImageProcessor",)), ("donut-swin", ("DonutImageProcessor",)),

View File

@@ -27,6 +27,7 @@ except OptionalDependencyNotAvailable:
else: else:
_import_structure["feature_extraction_detr"] = ["DetrFeatureExtractor"] _import_structure["feature_extraction_detr"] = ["DetrFeatureExtractor"]
_import_structure["image_processing_detr"] = ["DetrImageProcessor"] _import_structure["image_processing_detr"] = ["DetrImageProcessor"]
_import_structure["image_processing_detr_fast"] = ["DetrImageProcessorFast"]
try: try:
if not is_torch_available(): if not is_torch_available():
@@ -53,6 +54,7 @@ if TYPE_CHECKING:
else: else:
from .feature_extraction_detr import DetrFeatureExtractor from .feature_extraction_detr import DetrFeatureExtractor
from .image_processing_detr import DetrImageProcessor from .image_processing_detr import DetrImageProcessor
from .image_processing_detr_fast import DetrImageProcessorFast
try: try:
if not is_torch_available(): if not is_torch_available():

File diff suppressed because it is too large Load Diff

View File

@@ -225,6 +225,7 @@ from .import_utils import (
is_torchdynamo_available, is_torchdynamo_available,
is_torchdynamo_compiling, is_torchdynamo_compiling,
is_torchvision_available, is_torchvision_available,
is_torchvision_v2_available,
is_training_run_on_sagemaker, is_training_run_on_sagemaker,
is_uroman_available, is_uroman_available,
is_vision_available, is_vision_available,

View File

@@ -191,6 +191,13 @@ class DetrImageProcessor(metaclass=DummyObject):
requires_backends(self, ["vision"]) requires_backends(self, ["vision"])
class DetrImageProcessorFast(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class DonutFeatureExtractor(metaclass=DummyObject): class DonutFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"] _backends = ["vision"]

View File

@@ -186,7 +186,7 @@ _tokenizers_available = _is_package_available("tokenizers")
_torchaudio_available = _is_package_available("torchaudio") _torchaudio_available = _is_package_available("torchaudio")
_torchao_available = _is_package_available("torchao") _torchao_available = _is_package_available("torchao")
_torchdistx_available = _is_package_available("torchdistx") _torchdistx_available = _is_package_available("torchdistx")
_torchvision_available = _is_package_available("torchvision") _torchvision_available, _torchvision_version = _is_package_available("torchvision", return_version=True)
_mlx_available = _is_package_available("mlx") _mlx_available = _is_package_available("mlx")
_hqq_available, _hqq_version = _is_package_available("hqq", return_version=True) _hqq_available, _hqq_version = _is_package_available("hqq", return_version=True)
_tiktoken_available = _is_package_available("tiktoken") _tiktoken_available = _is_package_available("tiktoken")
@@ -362,6 +362,14 @@ def is_torchvision_available():
return _torchvision_available return _torchvision_available
def is_torchvision_v2_available():
if not is_torchvision_available():
return False
# NOTE: We require torchvision>=0.15 as v2 transforms are available from this version: https://pytorch.org/vision/stable/transforms.html#v1-or-v2-which-one-should-i-use
return version.parse(_torchvision_version) >= version.parse("0.15")
def is_galore_torch_available(): def is_galore_torch_available():
return _galore_torch_available return _galore_torch_available

View File

@@ -282,96 +282,97 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess
images = [image_0, image_1] images = [image_0, image_1]
annotations = [annotations_0, annotations_1] annotations = [annotations_0, annotations_1]
image_processing = ConditionalDetrImageProcessor() for image_processing_class in self.image_processor_list:
encoding = image_processing( image_processing = image_processing_class()
images=images, encoding = image_processing(
annotations=annotations, images=images,
return_segmentation_masks=True, annotations=annotations,
return_tensors="pt", # do_convert_annotations=True return_segmentation_masks=True,
) return_tensors="pt", # do_convert_annotations=True
)
# Check the pixel values have been padded # Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066 postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape) self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images # Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor( expected_boxes_0 = torch.tensor(
[ [
[0.6879, 0.4609, 0.0755, 0.3691], [0.6879, 0.4609, 0.0755, 0.3691],
[0.2118, 0.3359, 0.2601, 0.1566], [0.2118, 0.3359, 0.2601, 0.1566],
[0.5011, 0.5000, 0.9979, 1.0000], [0.5011, 0.5000, 0.9979, 1.0000],
[0.5010, 0.5020, 0.9979, 0.9959], [0.5010, 0.5020, 0.9979, 0.9959],
[0.3284, 0.5944, 0.5884, 0.8112], [0.3284, 0.5944, 0.5884, 0.8112],
[0.8394, 0.5445, 0.3213, 0.9110], [0.8394, 0.5445, 0.3213, 0.9110],
] ]
) )
expected_boxes_1 = torch.tensor( expected_boxes_1 = torch.tensor(
[ [
[0.4130, 0.2765, 0.0453, 0.2215], [0.4130, 0.2765, 0.0453, 0.2215],
[0.1272, 0.2016, 0.1561, 0.0940], [0.1272, 0.2016, 0.1561, 0.0940],
[0.3757, 0.4933, 0.7488, 0.9865], [0.3757, 0.4933, 0.7488, 0.9865],
[0.3759, 0.5002, 0.7492, 0.9955], [0.3759, 0.5002, 0.7492, 0.9955],
[0.1971, 0.5456, 0.3532, 0.8646], [0.1971, 0.5456, 0.3532, 0.8646],
[0.5790, 0.4115, 0.3430, 0.7161], [0.5790, 0.4115, 0.3430, 0.7161],
] ]
) )
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded # Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
# format and not in the range [0, 1] # format and not in the range [0, 1]
encoding = image_processing( encoding = image_processing(
images=images, images=images,
annotations=annotations, annotations=annotations,
return_segmentation_masks=True, return_segmentation_masks=True,
do_convert_annotations=False, do_convert_annotations=False,
return_tensors="pt", return_tensors="pt",
) )
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
# Convert to absolute coordinates # Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack( unnormalized_boxes_0 = torch.vstack(
[ [
expected_boxes_0[:, 0] * postprocessed_width, expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height, expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width, expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height, expected_boxes_0[:, 3] * postprocessed_height,
] ]
).T ).T
unnormalized_boxes_1 = torch.vstack( unnormalized_boxes_1 = torch.vstack(
[ [
expected_boxes_1[:, 0] * postprocessed_width, expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height, expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width, expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height, expected_boxes_1[:, 3] * postprocessed_height,
] ]
).T ).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack( expected_boxes_0 = torch.vstack(
[ [
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
] ]
).T ).T
expected_boxes_1 = torch.vstack( expected_boxes_1 = torch.vstack(
[ [
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
] ]
).T ).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->ConditionalDetr # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->ConditionalDetr
def test_batched_coco_panoptic_annotations(self): def test_batched_coco_panoptic_annotations(self):
@@ -402,146 +403,148 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess
images = [image_0, image_1] images = [image_0, image_1]
annotations = [annotation_0, annotation_1] annotations = [annotation_0, annotation_1]
# encode them for image_processing_class in self.image_processor_list:
image_processing = ConditionalDetrImageProcessor(format="coco_panoptic") # encode them
encoding = image_processing( image_processing = image_processing_class(format="coco_panoptic")
images=images, encoding = image_processing(
annotations=annotations, images=images,
masks_path=masks_path, annotations=annotations,
return_tensors="pt", masks_path=masks_path,
return_segmentation_masks=True, return_tensors="pt",
) return_segmentation_masks=True,
)
# Check the pixel values have been padded # Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066 postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape) self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images # Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor( expected_boxes_0 = torch.tensor(
[ [
[0.2625, 0.5437, 0.4688, 0.8625], [0.2625, 0.5437, 0.4688, 0.8625],
[0.7719, 0.4104, 0.4531, 0.7125], [0.7719, 0.4104, 0.4531, 0.7125],
[0.5000, 0.4927, 0.9969, 0.9854], [0.5000, 0.4927, 0.9969, 0.9854],
[0.1688, 0.2000, 0.2063, 0.0917], [0.1688, 0.2000, 0.2063, 0.0917],
[0.5492, 0.2760, 0.0578, 0.2187], [0.5492, 0.2760, 0.0578, 0.2187],
[0.4992, 0.4990, 0.9984, 0.9979], [0.4992, 0.4990, 0.9984, 0.9979],
] ]
) )
expected_boxes_1 = torch.tensor( expected_boxes_1 = torch.tensor(
[ [
[0.1576, 0.3262, 0.2814, 0.5175], [0.1576, 0.3262, 0.2814, 0.5175],
[0.4634, 0.2463, 0.2720, 0.4275], [0.4634, 0.2463, 0.2720, 0.4275],
[0.3002, 0.2956, 0.5985, 0.5913], [0.3002, 0.2956, 0.5985, 0.5913],
[0.1013, 0.1200, 0.1238, 0.0550], [0.1013, 0.1200, 0.1238, 0.0550],
[0.3297, 0.1656, 0.0347, 0.1312], [0.3297, 0.1656, 0.0347, 0.1312],
[0.2997, 0.2994, 0.5994, 0.5987], [0.2997, 0.2994, 0.5994, 0.5987],
] ]
) )
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded # Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
# format and not in the range [0, 1] # format and not in the range [0, 1]
encoding = image_processing( encoding = image_processing(
images=images, images=images,
annotations=annotations, annotations=annotations,
masks_path=masks_path, masks_path=masks_path,
return_segmentation_masks=True, return_segmentation_masks=True,
do_convert_annotations=False, do_convert_annotations=False,
return_tensors="pt", return_tensors="pt",
) )
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
# Convert to absolute coordinates # Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack( unnormalized_boxes_0 = torch.vstack(
[ [
expected_boxes_0[:, 0] * postprocessed_width, expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height, expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width, expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height, expected_boxes_0[:, 3] * postprocessed_height,
] ]
).T ).T
unnormalized_boxes_1 = torch.vstack( unnormalized_boxes_1 = torch.vstack(
[ [
expected_boxes_1[:, 0] * postprocessed_width, expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height, expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width, expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height, expected_boxes_1[:, 3] * postprocessed_height,
] ]
).T ).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack( expected_boxes_0 = torch.vstack(
[ [
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
] ]
).T ).T
expected_boxes_1 = torch.vstack( expected_boxes_1 = torch.vstack(
[ [
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
] ]
).T ).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->ConditionalDetr # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->ConditionalDetr
def test_max_width_max_height_resizing_and_pad_strategy(self): def test_max_width_max_height_resizing_and_pad_strategy(self):
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) for image_processing_class in self.image_processor_list:
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
# do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
image_processor = ConditionalDetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 100, "max_width": 100}, size={"max_height": 100, "max_width": 100},
do_pad=False, do_pad=False,
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
# do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
image_processor = ConditionalDetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 300, "max_width": 100}, size={"max_height": 300, "max_width": 100},
do_pad=False, do_pad=False,
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
# do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
image_processor = ConditionalDetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
# do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100
image_processor = ConditionalDetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 300, "max_width": 100}, size={"max_height": 300, "max_width": 100},
do_pad=True, do_pad=True,
pad_size={"height": 301, "width": 101}, pad_size={"height": 301, "width": 101},
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
### Check for batch ### Check for batch
image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
# do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
image_processor = ConditionalDetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 150, "max_width": 100}, size={"max_height": 150, "max_width": 100},
do_pad=True, do_pad=True,
pad_size={"height": 150, "width": 100}, pad_size={"height": 150, "width": 100},
) )
inputs = image_processor(images=[image_1, image_2], return_tensors="pt") inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
def test_longest_edge_shortest_edge_resizing_strategy(self): def test_longest_edge_shortest_edge_resizing_strategy(self):
image_1 = torch.ones([958, 653, 3], dtype=torch.uint8) image_1 = torch.ones([958, 653, 3], dtype=torch.uint8)

View File

@@ -284,96 +284,97 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
images = [image_0, image_1] images = [image_0, image_1]
annotations = [annotations_0, annotations_1] annotations = [annotations_0, annotations_1]
image_processing = DeformableDetrImageProcessor() for image_processing_class in self.image_processor_list:
encoding = image_processing( image_processing = image_processing_class()
images=images, encoding = image_processing(
annotations=annotations, images=images,
return_segmentation_masks=True, annotations=annotations,
return_tensors="pt", # do_convert_annotations=True return_segmentation_masks=True,
) return_tensors="pt", # do_convert_annotations=True
)
# Check the pixel values have been padded # Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066 postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape) self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images # Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor( expected_boxes_0 = torch.tensor(
[ [
[0.6879, 0.4609, 0.0755, 0.3691], [0.6879, 0.4609, 0.0755, 0.3691],
[0.2118, 0.3359, 0.2601, 0.1566], [0.2118, 0.3359, 0.2601, 0.1566],
[0.5011, 0.5000, 0.9979, 1.0000], [0.5011, 0.5000, 0.9979, 1.0000],
[0.5010, 0.5020, 0.9979, 0.9959], [0.5010, 0.5020, 0.9979, 0.9959],
[0.3284, 0.5944, 0.5884, 0.8112], [0.3284, 0.5944, 0.5884, 0.8112],
[0.8394, 0.5445, 0.3213, 0.9110], [0.8394, 0.5445, 0.3213, 0.9110],
] ]
) )
expected_boxes_1 = torch.tensor( expected_boxes_1 = torch.tensor(
[ [
[0.4130, 0.2765, 0.0453, 0.2215], [0.4130, 0.2765, 0.0453, 0.2215],
[0.1272, 0.2016, 0.1561, 0.0940], [0.1272, 0.2016, 0.1561, 0.0940],
[0.3757, 0.4933, 0.7488, 0.9865], [0.3757, 0.4933, 0.7488, 0.9865],
[0.3759, 0.5002, 0.7492, 0.9955], [0.3759, 0.5002, 0.7492, 0.9955],
[0.1971, 0.5456, 0.3532, 0.8646], [0.1971, 0.5456, 0.3532, 0.8646],
[0.5790, 0.4115, 0.3430, 0.7161], [0.5790, 0.4115, 0.3430, 0.7161],
] ]
) )
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded # Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
# format and not in the range [0, 1] # format and not in the range [0, 1]
encoding = image_processing( encoding = image_processing(
images=images, images=images,
annotations=annotations, annotations=annotations,
return_segmentation_masks=True, return_segmentation_masks=True,
do_convert_annotations=False, do_convert_annotations=False,
return_tensors="pt", return_tensors="pt",
) )
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
# Convert to absolute coordinates # Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack( unnormalized_boxes_0 = torch.vstack(
[ [
expected_boxes_0[:, 0] * postprocessed_width, expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height, expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width, expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height, expected_boxes_0[:, 3] * postprocessed_height,
] ]
).T ).T
unnormalized_boxes_1 = torch.vstack( unnormalized_boxes_1 = torch.vstack(
[ [
expected_boxes_1[:, 0] * postprocessed_width, expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height, expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width, expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height, expected_boxes_1[:, 3] * postprocessed_height,
] ]
).T ).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack( expected_boxes_0 = torch.vstack(
[ [
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
] ]
).T ).T
expected_boxes_1 = torch.vstack( expected_boxes_1 = torch.vstack(
[ [
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
] ]
).T ).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->DeformableDetr # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->DeformableDetr
def test_batched_coco_panoptic_annotations(self): def test_batched_coco_panoptic_annotations(self):
@@ -404,146 +405,148 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
images = [image_0, image_1] images = [image_0, image_1]
annotations = [annotation_0, annotation_1] annotations = [annotation_0, annotation_1]
# encode them for image_processing_class in self.image_processor_list:
image_processing = DeformableDetrImageProcessor(format="coco_panoptic") # encode them
encoding = image_processing( image_processing = image_processing_class(format="coco_panoptic")
images=images, encoding = image_processing(
annotations=annotations, images=images,
masks_path=masks_path, annotations=annotations,
return_tensors="pt", masks_path=masks_path,
return_segmentation_masks=True, return_tensors="pt",
) return_segmentation_masks=True,
)
# Check the pixel values have been padded # Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066 postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape) self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images # Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor( expected_boxes_0 = torch.tensor(
[ [
[0.2625, 0.5437, 0.4688, 0.8625], [0.2625, 0.5437, 0.4688, 0.8625],
[0.7719, 0.4104, 0.4531, 0.7125], [0.7719, 0.4104, 0.4531, 0.7125],
[0.5000, 0.4927, 0.9969, 0.9854], [0.5000, 0.4927, 0.9969, 0.9854],
[0.1688, 0.2000, 0.2063, 0.0917], [0.1688, 0.2000, 0.2063, 0.0917],
[0.5492, 0.2760, 0.0578, 0.2187], [0.5492, 0.2760, 0.0578, 0.2187],
[0.4992, 0.4990, 0.9984, 0.9979], [0.4992, 0.4990, 0.9984, 0.9979],
] ]
) )
expected_boxes_1 = torch.tensor( expected_boxes_1 = torch.tensor(
[ [
[0.1576, 0.3262, 0.2814, 0.5175], [0.1576, 0.3262, 0.2814, 0.5175],
[0.4634, 0.2463, 0.2720, 0.4275], [0.4634, 0.2463, 0.2720, 0.4275],
[0.3002, 0.2956, 0.5985, 0.5913], [0.3002, 0.2956, 0.5985, 0.5913],
[0.1013, 0.1200, 0.1238, 0.0550], [0.1013, 0.1200, 0.1238, 0.0550],
[0.3297, 0.1656, 0.0347, 0.1312], [0.3297, 0.1656, 0.0347, 0.1312],
[0.2997, 0.2994, 0.5994, 0.5987], [0.2997, 0.2994, 0.5994, 0.5987],
] ]
) )
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded # Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
# format and not in the range [0, 1] # format and not in the range [0, 1]
encoding = image_processing( encoding = image_processing(
images=images, images=images,
annotations=annotations, annotations=annotations,
masks_path=masks_path, masks_path=masks_path,
return_segmentation_masks=True, return_segmentation_masks=True,
do_convert_annotations=False, do_convert_annotations=False,
return_tensors="pt", return_tensors="pt",
) )
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
# Convert to absolute coordinates # Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack( unnormalized_boxes_0 = torch.vstack(
[ [
expected_boxes_0[:, 0] * postprocessed_width, expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height, expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width, expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height, expected_boxes_0[:, 3] * postprocessed_height,
] ]
).T ).T
unnormalized_boxes_1 = torch.vstack( unnormalized_boxes_1 = torch.vstack(
[ [
expected_boxes_1[:, 0] * postprocessed_width, expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height, expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width, expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height, expected_boxes_1[:, 3] * postprocessed_height,
] ]
).T ).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack( expected_boxes_0 = torch.vstack(
[ [
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
] ]
).T ).T
expected_boxes_1 = torch.vstack( expected_boxes_1 = torch.vstack(
[ [
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
] ]
).T ).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->DeformableDetr # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->DeformableDetr
def test_max_width_max_height_resizing_and_pad_strategy(self): def test_max_width_max_height_resizing_and_pad_strategy(self):
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) for image_processing_class in self.image_processor_list:
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
# do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
image_processor = DeformableDetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 100, "max_width": 100}, size={"max_height": 100, "max_width": 100},
do_pad=False, do_pad=False,
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
# do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
image_processor = DeformableDetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 300, "max_width": 100}, size={"max_height": 300, "max_width": 100},
do_pad=False, do_pad=False,
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
# do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
image_processor = DeformableDetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
# do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100
image_processor = DeformableDetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 300, "max_width": 100}, size={"max_height": 300, "max_width": 100},
do_pad=True, do_pad=True,
pad_size={"height": 301, "width": 101}, pad_size={"height": 301, "width": 101},
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
### Check for batch ### Check for batch
image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
# do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
image_processor = DeformableDetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 150, "max_width": 100}, size={"max_height": 150, "max_width": 100},
do_pad=True, do_pad=True,
pad_size={"height": 150, "width": 100}, pad_size={"height": 150, "width": 100},
) )
inputs = image_processor(images=[image_1, image_2], return_tensors="pt") inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
def test_longest_edge_shortest_edge_resizing_strategy(self): def test_longest_edge_shortest_edge_resizing_strategy(self):
image_1 = torch.ones([958, 653, 3], dtype=torch.uint8) image_1 = torch.ones([958, 653, 3], dtype=torch.uint8)

View File

@@ -19,8 +19,8 @@ import unittest
import numpy as np import numpy as np
from transformers.testing_utils import require_torch, require_vision, slow from transformers.testing_utils import require_torch, require_torch_gpu, require_vision, slow
from transformers.utils import is_torch_available, is_vision_available from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs
@@ -33,6 +33,9 @@ if is_vision_available():
from transformers import DetrImageProcessor from transformers import DetrImageProcessor
if is_torchvision_available():
from transformers import DetrImageProcessorFast
class DetrImageProcessingTester(unittest.TestCase): class DetrImageProcessingTester(unittest.TestCase):
def __init__( def __init__(
@@ -51,6 +54,7 @@ class DetrImageProcessingTester(unittest.TestCase):
image_std=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5],
do_pad=True, do_pad=True,
): ):
super().__init__()
# by setting size["longest_edge"] > max_resolution we're effectively not testing this :p # by setting size["longest_edge"] > max_resolution we're effectively not testing this :p
size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333} size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333}
self.parent = parent self.parent = parent
@@ -132,6 +136,7 @@ class DetrImageProcessingTester(unittest.TestCase):
@require_vision @require_vision
class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase): class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = DetrImageProcessor if is_vision_available() else None image_processing_class = DetrImageProcessor if is_vision_available() else None
fast_image_processing_class = DetrImageProcessorFast if is_torchvision_available() else None
def setUp(self): def setUp(self):
super().setUp() super().setUp()
@@ -142,26 +147,28 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
return self.image_processor_tester.prepare_image_processor_dict() return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self): def test_image_processor_properties(self):
image_processing = self.image_processing_class(**self.image_processor_dict) for image_processing_class in self.image_processor_list:
self.assertTrue(hasattr(image_processing, "image_mean")) image_processing = image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "image_std")) self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "do_normalize")) self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_rescale")) self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "rescale_factor")) self.assertTrue(hasattr(image_processing, "do_rescale"))
self.assertTrue(hasattr(image_processing, "do_resize")) self.assertTrue(hasattr(image_processing, "rescale_factor"))
self.assertTrue(hasattr(image_processing, "size")) self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "do_pad")) self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "do_pad"))
def test_image_processor_from_dict_with_kwargs(self): def test_image_processor_from_dict_with_kwargs(self):
image_processor = self.image_processing_class.from_dict(self.image_processor_dict) for image_processing_class in self.image_processor_list:
self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333}) image_processor = image_processing_class.from_dict(self.image_processor_dict)
self.assertEqual(image_processor.do_pad, True) self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333})
self.assertEqual(image_processor.do_pad, True)
image_processor = self.image_processing_class.from_dict( image_processor = image_processing_class.from_dict(
self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False
) )
self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84}) self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84})
self.assertEqual(image_processor.do_pad, False) self.assertEqual(image_processor.do_pad, False)
def test_should_raise_if_annotation_format_invalid(self): def test_should_raise_if_annotation_format_invalid(self):
image_processor_dict = self.image_processor_tester.prepare_image_processor_dict() image_processor_dict = self.image_processor_tester.prepare_image_processor_dict()
@@ -178,12 +185,13 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
} }
image_processor_params = {**image_processor_dict, **{"format": "_INVALID_FORMAT_"}} image_processor_params = {**image_processor_dict, **{"format": "_INVALID_FORMAT_"}}
image_processor = self.image_processing_class(**image_processor_params) for image_processing_class in self.image_processor_list:
image_processor = image_processing_class(**image_processor_params)
with self.assertRaises(ValueError) as e: with self.assertRaises(ValueError) as e:
image_processor(**params) image_processor(**params)
self.assertTrue(str(e.exception).startswith("_INVALID_FORMAT_ is not a valid AnnotationFormat")) self.assertTrue(str(e.exception).startswith("_INVALID_FORMAT_ is not a valid AnnotationFormat"))
def test_valid_coco_detection_annotations(self): def test_valid_coco_detection_annotations(self):
# prepare image and target # prepare image and target
@@ -193,32 +201,33 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
params = {"image_id": 39769, "annotations": target} params = {"image_id": 39769, "annotations": target}
# encode them for image_processing_class in self.image_processor_list:
image_processing = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50") # encode them
image_processing = image_processing_class.from_pretrained("facebook/detr-resnet-50")
# legal encodings (single image) # legal encodings (single image)
_ = image_processing(images=image, annotations=params, return_tensors="pt") _ = image_processing(images=image, annotations=params, return_tensors="pt")
_ = image_processing(images=image, annotations=[params], return_tensors="pt") _ = image_processing(images=image, annotations=[params], return_tensors="pt")
# legal encodings (batch of one image) # legal encodings (batch of one image)
_ = image_processing(images=[image], annotations=params, return_tensors="pt") _ = image_processing(images=[image], annotations=params, return_tensors="pt")
_ = image_processing(images=[image], annotations=[params], return_tensors="pt") _ = image_processing(images=[image], annotations=[params], return_tensors="pt")
# legal encoding (batch of more than one image) # legal encoding (batch of more than one image)
n = 5 n = 5
_ = image_processing(images=[image] * n, annotations=[params] * n, return_tensors="pt") _ = image_processing(images=[image] * n, annotations=[params] * n, return_tensors="pt")
# example of an illegal encoding (missing the 'image_id' key) # example of an illegal encoding (missing the 'image_id' key)
with self.assertRaises(ValueError) as e: with self.assertRaises(ValueError) as e:
image_processing(images=image, annotations={"annotations": target}, return_tensors="pt") image_processing(images=image, annotations={"annotations": target}, return_tensors="pt")
self.assertTrue(str(e.exception).startswith("Invalid COCO detection annotations")) self.assertTrue(str(e.exception).startswith("Invalid COCO detection annotations"))
# example of an illegal encoding (unequal lengths of images and annotations) # example of an illegal encoding (unequal lengths of images and annotations)
with self.assertRaises(ValueError) as e: with self.assertRaises(ValueError) as e:
image_processing(images=[image] * n, annotations=[params] * (n - 1), return_tensors="pt") image_processing(images=[image] * n, annotations=[params] * (n - 1), return_tensors="pt")
self.assertTrue(str(e.exception) == "The number of images (5) and annotations (4) do not match.") self.assertTrue(str(e.exception) == "The number of images (5) and annotations (4) do not match.")
@slow @slow
def test_call_pytorch_with_coco_detection_annotations(self): def test_call_pytorch_with_coco_detection_annotations(self):
@@ -229,40 +238,41 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
target = {"image_id": 39769, "annotations": target} target = {"image_id": 39769, "annotations": target}
# encode them for image_processing_class in self.image_processor_list:
image_processing = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50") # encode them
encoding = image_processing(images=image, annotations=target, return_tensors="pt") image_processing = image_processing_class.from_pretrained("facebook/detr-resnet-50")
encoding = image_processing(images=image, annotations=target, return_tensors="pt")
# verify pixel values # verify pixel values
expected_shape = torch.Size([1, 3, 800, 1066]) expected_shape = torch.Size([1, 3, 800, 1066])
self.assertEqual(encoding["pixel_values"].shape, expected_shape) self.assertEqual(encoding["pixel_values"].shape, expected_shape)
expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4))
# verify area # verify area
expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438])
self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area))
# verify boxes # verify boxes
expected_boxes_shape = torch.Size([6, 4]) expected_boxes_shape = torch.Size([6, 4])
self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215])
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3))
# verify image_id # verify image_id
expected_image_id = torch.tensor([39769]) expected_image_id = torch.tensor([39769])
self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id))
# verify is_crowd # verify is_crowd
expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd))
# verify class_labels # verify class_labels
expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17])
self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels))
# verify orig_size # verify orig_size
expected_orig_size = torch.tensor([480, 640]) expected_orig_size = torch.tensor([480, 640])
self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size))
# verify size # verify size
expected_size = torch.tensor([800, 1066]) expected_size = torch.tensor([800, 1066])
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
@slow @slow
def test_call_pytorch_with_coco_panoptic_annotations(self): def test_call_pytorch_with_coco_panoptic_annotations(self):
@@ -275,43 +285,45 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
# encode them for image_processing_class in self.image_processor_list:
image_processing = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50-panoptic") # encode them
encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") image_processing = image_processing_class.from_pretrained("facebook/detr-resnet-50-panoptic")
encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt")
# verify pixel values # verify pixel values
expected_shape = torch.Size([1, 3, 800, 1066]) expected_shape = torch.Size([1, 3, 800, 1066])
self.assertEqual(encoding["pixel_values"].shape, expected_shape) self.assertEqual(encoding["pixel_values"].shape, expected_shape)
expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4))
# verify area # verify area
expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147])
self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area))
# verify boxes # verify boxes
expected_boxes_shape = torch.Size([6, 4]) expected_boxes_shape = torch.Size([6, 4])
self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625]) expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625])
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3))
# verify image_id # verify image_id
expected_image_id = torch.tensor([39769]) expected_image_id = torch.tensor([39769])
self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id))
# verify is_crowd # verify is_crowd
expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd))
# verify class_labels # verify class_labels
expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93]) expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93])
self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels))
# verify masks # verify masks
expected_masks_sum = 822873 expected_masks_sum = 822873
self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum) relative_error = torch.abs(encoding["labels"][0]["masks"].sum() - expected_masks_sum) / expected_masks_sum
# verify orig_size self.assertTrue(relative_error < 1e-3)
expected_orig_size = torch.tensor([480, 640]) # verify orig_size
self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) expected_orig_size = torch.tensor([480, 640])
# verify size self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size))
expected_size = torch.tensor([800, 1066]) # verify size
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) expected_size = torch.tensor([800, 1066])
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
@slow @slow
def test_batched_coco_detection_annotations(self): def test_batched_coco_detection_annotations(self):
@@ -340,96 +352,97 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
images = [image_0, image_1] images = [image_0, image_1]
annotations = [annotations_0, annotations_1] annotations = [annotations_0, annotations_1]
image_processing = DetrImageProcessor() for image_processing_class in self.image_processor_list:
encoding = image_processing( image_processing = image_processing_class()
images=images, encoding = image_processing(
annotations=annotations, images=images,
return_segmentation_masks=True, annotations=annotations,
return_tensors="pt", # do_convert_annotations=True return_segmentation_masks=True,
) return_tensors="pt", # do_convert_annotations=True
)
# Check the pixel values have been padded # Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066 postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape) self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images # Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor( expected_boxes_0 = torch.tensor(
[ [
[0.6879, 0.4609, 0.0755, 0.3691], [0.6879, 0.4609, 0.0755, 0.3691],
[0.2118, 0.3359, 0.2601, 0.1566], [0.2118, 0.3359, 0.2601, 0.1566],
[0.5011, 0.5000, 0.9979, 1.0000], [0.5011, 0.5000, 0.9979, 1.0000],
[0.5010, 0.5020, 0.9979, 0.9959], [0.5010, 0.5020, 0.9979, 0.9959],
[0.3284, 0.5944, 0.5884, 0.8112], [0.3284, 0.5944, 0.5884, 0.8112],
[0.8394, 0.5445, 0.3213, 0.9110], [0.8394, 0.5445, 0.3213, 0.9110],
] ]
) )
expected_boxes_1 = torch.tensor( expected_boxes_1 = torch.tensor(
[ [
[0.4130, 0.2765, 0.0453, 0.2215], [0.4130, 0.2765, 0.0453, 0.2215],
[0.1272, 0.2016, 0.1561, 0.0940], [0.1272, 0.2016, 0.1561, 0.0940],
[0.3757, 0.4933, 0.7488, 0.9865], [0.3757, 0.4933, 0.7488, 0.9865],
[0.3759, 0.5002, 0.7492, 0.9955], [0.3759, 0.5002, 0.7492, 0.9955],
[0.1971, 0.5456, 0.3532, 0.8646], [0.1971, 0.5456, 0.3532, 0.8646],
[0.5790, 0.4115, 0.3430, 0.7161], [0.5790, 0.4115, 0.3430, 0.7161],
] ]
) )
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded # Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
# format and not in the range [0, 1] # format and not in the range [0, 1]
encoding = image_processing( encoding = image_processing(
images=images, images=images,
annotations=annotations, annotations=annotations,
return_segmentation_masks=True, return_segmentation_masks=True,
do_convert_annotations=False, do_convert_annotations=False,
return_tensors="pt", return_tensors="pt",
) )
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
# Convert to absolute coordinates # Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack( unnormalized_boxes_0 = torch.vstack(
[ [
expected_boxes_0[:, 0] * postprocessed_width, expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height, expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width, expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height, expected_boxes_0[:, 3] * postprocessed_height,
] ]
).T ).T
unnormalized_boxes_1 = torch.vstack( unnormalized_boxes_1 = torch.vstack(
[ [
expected_boxes_1[:, 0] * postprocessed_width, expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height, expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width, expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height, expected_boxes_1[:, 3] * postprocessed_height,
] ]
).T ).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack( expected_boxes_0 = torch.vstack(
[ [
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
] ]
).T ).T
expected_boxes_1 = torch.vstack( expected_boxes_1 = torch.vstack(
[ [
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
] ]
).T ).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
def test_batched_coco_panoptic_annotations(self): def test_batched_coco_panoptic_annotations(self):
# prepare image, target and masks_path # prepare image, target and masks_path
@@ -459,194 +472,318 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
images = [image_0, image_1] images = [image_0, image_1]
annotations = [annotation_0, annotation_1] annotations = [annotation_0, annotation_1]
# encode them for image_processing_class in self.image_processor_list:
image_processing = DetrImageProcessor(format="coco_panoptic") # encode them
encoding = image_processing( image_processing = image_processing_class(format="coco_panoptic")
images=images, encoding = image_processing(
annotations=annotations, images=images,
masks_path=masks_path, annotations=annotations,
return_tensors="pt", masks_path=masks_path,
return_segmentation_masks=True, return_tensors="pt",
) return_segmentation_masks=True,
)
# Check the pixel values have been padded # Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066 postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape) self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images # Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor( expected_boxes_0 = torch.tensor(
[ [
[0.2625, 0.5437, 0.4688, 0.8625], [0.2625, 0.5437, 0.4688, 0.8625],
[0.7719, 0.4104, 0.4531, 0.7125], [0.7719, 0.4104, 0.4531, 0.7125],
[0.5000, 0.4927, 0.9969, 0.9854], [0.5000, 0.4927, 0.9969, 0.9854],
[0.1688, 0.2000, 0.2063, 0.0917], [0.1688, 0.2000, 0.2063, 0.0917],
[0.5492, 0.2760, 0.0578, 0.2187], [0.5492, 0.2760, 0.0578, 0.2187],
[0.4992, 0.4990, 0.9984, 0.9979], [0.4992, 0.4990, 0.9984, 0.9979],
] ]
) )
expected_boxes_1 = torch.tensor( expected_boxes_1 = torch.tensor(
[ [
[0.1576, 0.3262, 0.2814, 0.5175], [0.1576, 0.3262, 0.2814, 0.5175],
[0.4634, 0.2463, 0.2720, 0.4275], [0.4634, 0.2463, 0.2720, 0.4275],
[0.3002, 0.2956, 0.5985, 0.5913], [0.3002, 0.2956, 0.5985, 0.5913],
[0.1013, 0.1200, 0.1238, 0.0550], [0.1013, 0.1200, 0.1238, 0.0550],
[0.3297, 0.1656, 0.0347, 0.1312], [0.3297, 0.1656, 0.0347, 0.1312],
[0.2997, 0.2994, 0.5994, 0.5987], [0.2997, 0.2994, 0.5994, 0.5987],
] ]
) )
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded # Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
# format and not in the range [0, 1] # format and not in the range [0, 1]
encoding = image_processing( encoding = image_processing(
images=images, images=images,
annotations=annotations, annotations=annotations,
masks_path=masks_path, masks_path=masks_path,
return_segmentation_masks=True, return_segmentation_masks=True,
do_convert_annotations=False, do_convert_annotations=False,
return_tensors="pt", return_tensors="pt",
) )
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
# Convert to absolute coordinates # Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack( unnormalized_boxes_0 = torch.vstack(
[ [
expected_boxes_0[:, 0] * postprocessed_width, expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height, expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width, expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height, expected_boxes_0[:, 3] * postprocessed_height,
] ]
).T ).T
unnormalized_boxes_1 = torch.vstack( unnormalized_boxes_1 = torch.vstack(
[ [
expected_boxes_1[:, 0] * postprocessed_width, expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height, expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width, expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height, expected_boxes_1[:, 3] * postprocessed_height,
] ]
).T ).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack( expected_boxes_0 = torch.vstack(
[ [
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
] ]
).T ).T
expected_boxes_1 = torch.vstack( expected_boxes_1 = torch.vstack(
[ [
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
] ]
).T ).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
def test_max_width_max_height_resizing_and_pad_strategy(self):
    """Check the ``max_height``/``max_width`` resize strategy, with and without padding.

    Every class in ``self.image_processor_list`` is instantiated with a
    ``size={"max_height": ..., "max_width": ...}`` dict and run on synthetic
    ``uint8`` images. Only the shape of the returned ``pixel_values`` is
    checked; shapes in the comments below are written as height x width.
    """
    for image_processing_class in self.image_processor_list:
        image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)

        # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
        image_processor = image_processing_class(
            size={"max_height": 100, "max_width": 100},
            do_pad=False,
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))

        # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
        # (the width already sits at max_width, so the image keeps its size)
        image_processor = image_processing_class(
            size={"max_height": 300, "max_width": 100},
            do_pad=False,
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        # This case previously computed `inputs` without checking anything.
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))

        # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
        image_processor = image_processing_class(
            size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))

        # do_pad=True, max_height=300, max_width=100, pad_size=301x101,
        # image=200x100 -> 301x101 (the output follows pad_size, not max_height/max_width)
        image_processor = image_processing_class(
            size={"max_height": 300, "max_width": 100},
            do_pad=True,
            pad_size={"height": 301, "width": 101},
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))

        ### Check for batch
        image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)

        # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
        image_processor = image_processing_class(
            size={"max_height": 150, "max_width": 100},
            do_pad=True,
            pad_size={"height": 150, "width": 100},
        )
        inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
def test_longest_edge_shortest_edge_resizing_strategy(self):
    """Check the ``longest_edge``/``shortest_edge`` resize strategy without padding.

    Each class in ``self.image_processor_list`` is run on synthetic ``uint8``
    images with ``do_pad=False``; only the shape of the returned
    ``pixel_values`` is asserted. Shapes in the comments are height x width.
    """
    # Each entry: (input image shape, `size` argument, expected `pixel_values` shape).
    resize_cases = (
        # width < height, both above the limits: 958x653 -> 640x436
        ((958, 653, 3), {"longest_edge": 640, "shortest_edge": 640}, (1, 3, 640, 436)),
        # height < width, both above the limits: 653x958 -> 436x640
        ((653, 958, 3), {"longest_edge": 640, "shortest_edge": 640}, (1, 3, 436, 640)),
        # shorter side equals shortest_edge, longer side exceeds longest_edge: 100x120 -> 98x118
        ((100, 120, 3), {"longest_edge": 118, "shortest_edge": 100}, (1, 3, 98, 118)),
        # shorter side equals shortest_edge, longer side below longest_edge: 128x50 -> 128x50
        ((128, 50, 3), {"longest_edge": 256, "shortest_edge": 50}, (1, 3, 128, 50)),
        # square image already at shortest_edge: 50x50 -> 50x50
        ((50, 50, 3), {"longest_edge": 117, "shortest_edge": 50}, (1, 3, 50, 50)),
    )

    for image_processing_class in self.image_processor_list:
        for image_shape, size, expected_shape in resize_cases:
            image = torch.ones(list(image_shape), dtype=torch.uint8)
            # Pass a fresh dict on every call so no state is shared between processors.
            image_processor = image_processing_class(size=dict(size), do_pad=False)
            inputs = image_processor(images=[image], return_tensors="pt")
            self.assertEqual(inputs["pixel_values"].shape, torch.Size(expected_shape))
@slow
@require_torch_gpu
def test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations(self):
    """Run one processor on CPU and on CUDA with COCO detection annotations and compare.

    The processor class is taken from ``self.image_processor_list[1]``
    (presumably the fast processor, going by the test name; verify against
    the class setup). GPU results are moved to the CPU before comparison.
    """
    # prepare image and target
    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
        annotations = json.load(f)
    target = {"image_id": 39769, "annotations": annotations}

    processor = self.image_processor_list[1].from_pretrained("facebook/detr-resnet-50")

    # Same inputs, two devices.
    encoding_cpu = processor(images=image, annotations=target, return_tensors="pt", device="cpu")
    encoding_gpu = processor(images=image, annotations=target, return_tensors="pt", device="cuda")

    # verify pixel values (shape, plus the first three values of the first row)
    pixels_cpu = encoding_cpu["pixel_values"]
    pixels_gpu = encoding_gpu["pixel_values"]
    self.assertEqual(pixels_cpu.shape, pixels_gpu.shape)
    self.assertTrue(torch.allclose(pixels_cpu[0, 0, 0, :3], pixels_gpu[0, 0, 0, :3].to("cpu"), atol=1e-4))

    labels_cpu = encoding_cpu["labels"][0]
    labels_gpu = encoding_gpu["labels"][0]

    # verify area
    self.assertTrue(torch.allclose(labels_cpu["area"], labels_gpu["area"].to("cpu")))

    # verify boxes (full shape, then the first box with a looser tolerance)
    self.assertEqual(labels_cpu["boxes"].shape, labels_gpu["boxes"].shape)
    self.assertTrue(torch.allclose(labels_cpu["boxes"][0], labels_gpu["boxes"][0].to("cpu"), atol=1e-3))

    # verify image_id, is_crowd, class_labels, orig_size and size
    for key in ("image_id", "iscrowd", "class_labels", "orig_size", "size"):
        self.assertTrue(torch.allclose(labels_cpu[key], labels_gpu[key].to("cpu")))
@slow
@require_torch_gpu
def test_fast_processor_equivalence_cpu_gpu_coco_panoptic_annotations(self):
    """Run one processor on CPU and on CUDA with COCO panoptic annotations and compare.

    The processor class is taken from ``self.image_processor_list[1]``
    (presumably the fast processor, going by the test name; verify against
    the class setup). GPU results are moved to the CPU before comparison,
    except for the mask sums, which are compared through a relative error.
    """
    # prepare image, target and masks_path
    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
        segments_info = json.load(f)
    target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": segments_info}
    masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")

    processor = self.image_processor_list[1].from_pretrained("facebook/detr-resnet-50-panoptic")

    # Same inputs, two devices.
    encoding_cpu = processor(
        images=image, annotations=target, masks_path=masks_path, return_tensors="pt", device="cpu"
    )
    encoding_gpu = processor(
        images=image, annotations=target, masks_path=masks_path, return_tensors="pt", device="cuda"
    )

    # verify pixel values (shape, plus the first three values of the first row)
    pixels_cpu = encoding_cpu["pixel_values"]
    pixels_gpu = encoding_gpu["pixel_values"]
    self.assertEqual(pixels_cpu.shape, pixels_gpu.shape)
    self.assertTrue(torch.allclose(pixels_cpu[0, 0, 0, :3], pixels_gpu[0, 0, 0, :3].to("cpu"), atol=1e-4))

    labels_cpu = encoding_cpu["labels"][0]
    labels_gpu = encoding_gpu["labels"][0]

    # verify area
    self.assertTrue(torch.allclose(labels_cpu["area"], labels_gpu["area"].to("cpu")))

    # verify boxes (full shape, then the first box with a looser tolerance)
    self.assertEqual(labels_cpu["boxes"].shape, labels_gpu["boxes"].shape)
    self.assertTrue(torch.allclose(labels_cpu["boxes"][0], labels_gpu["boxes"][0].to("cpu"), atol=1e-3))

    # verify image_id, is_crowd and class_labels
    for key in ("image_id", "iscrowd", "class_labels"):
        self.assertTrue(torch.allclose(labels_cpu[key], labels_gpu[key].to("cpu")))

    # verify masks: only the total sums are compared, within 0.1% relative error
    masks_sum_cpu = labels_cpu["masks"].sum()
    masks_sum_gpu = labels_gpu["masks"].sum()
    relative_error = torch.abs(masks_sum_cpu - masks_sum_gpu) / masks_sum_cpu
    self.assertTrue(relative_error < 1e-3)

    # verify orig_size and size
    for key in ("orig_size", "size"):
        self.assertTrue(torch.allclose(labels_cpu[key], labels_gpu[key].to("cpu")))

View File

@@ -269,96 +269,97 @@ class GroundingDinoImageProcessingTest(AnnotationFormatTestMixin, ImageProcessin
images = [image_0, image_1] images = [image_0, image_1]
annotations = [annotations_0, annotations_1] annotations = [annotations_0, annotations_1]
image_processing = GroundingDinoImageProcessor() for image_processing_class in self.image_processor_list:
encoding = image_processing( image_processing = image_processing_class()
images=images, encoding = image_processing(
annotations=annotations, images=images,
return_segmentation_masks=True, annotations=annotations,
return_tensors="pt", # do_convert_annotations=True return_segmentation_masks=True,
) return_tensors="pt", # do_convert_annotations=True
)
# Check the pixel values have been padded # Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066 postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape) self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images # Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor( expected_boxes_0 = torch.tensor(
[ [
[0.6879, 0.4609, 0.0755, 0.3691], [0.6879, 0.4609, 0.0755, 0.3691],
[0.2118, 0.3359, 0.2601, 0.1566], [0.2118, 0.3359, 0.2601, 0.1566],
[0.5011, 0.5000, 0.9979, 1.0000], [0.5011, 0.5000, 0.9979, 1.0000],
[0.5010, 0.5020, 0.9979, 0.9959], [0.5010, 0.5020, 0.9979, 0.9959],
[0.3284, 0.5944, 0.5884, 0.8112], [0.3284, 0.5944, 0.5884, 0.8112],
[0.8394, 0.5445, 0.3213, 0.9110], [0.8394, 0.5445, 0.3213, 0.9110],
] ]
) )
expected_boxes_1 = torch.tensor( expected_boxes_1 = torch.tensor(
[ [
[0.4130, 0.2765, 0.0453, 0.2215], [0.4130, 0.2765, 0.0453, 0.2215],
[0.1272, 0.2016, 0.1561, 0.0940], [0.1272, 0.2016, 0.1561, 0.0940],
[0.3757, 0.4933, 0.7488, 0.9865], [0.3757, 0.4933, 0.7488, 0.9865],
[0.3759, 0.5002, 0.7492, 0.9955], [0.3759, 0.5002, 0.7492, 0.9955],
[0.1971, 0.5456, 0.3532, 0.8646], [0.1971, 0.5456, 0.3532, 0.8646],
[0.5790, 0.4115, 0.3430, 0.7161], [0.5790, 0.4115, 0.3430, 0.7161],
] ]
) )
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded # Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
# format and not in the range [0, 1] # format and not in the range [0, 1]
encoding = image_processing( encoding = image_processing(
images=images, images=images,
annotations=annotations, annotations=annotations,
return_segmentation_masks=True, return_segmentation_masks=True,
do_convert_annotations=False, do_convert_annotations=False,
return_tensors="pt", return_tensors="pt",
) )
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
# Convert to absolute coordinates # Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack( unnormalized_boxes_0 = torch.vstack(
[ [
expected_boxes_0[:, 0] * postprocessed_width, expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height, expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width, expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height, expected_boxes_0[:, 3] * postprocessed_height,
] ]
).T ).T
unnormalized_boxes_1 = torch.vstack( unnormalized_boxes_1 = torch.vstack(
[ [
expected_boxes_1[:, 0] * postprocessed_width, expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height, expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width, expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height, expected_boxes_1[:, 3] * postprocessed_height,
] ]
).T ).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack( expected_boxes_0 = torch.vstack(
[ [
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
] ]
).T ).T
expected_boxes_1 = torch.vstack( expected_boxes_1 = torch.vstack(
[ [
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
] ]
).T ).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
@slow @slow
# Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTest.test_call_pytorch_with_coco_panoptic_annotations with DeformableDetr->GroundingDino # Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTest.test_call_pytorch_with_coco_panoptic_annotations with DeformableDetr->GroundingDino
@@ -440,146 +441,148 @@ class GroundingDinoImageProcessingTest(AnnotationFormatTestMixin, ImageProcessin
images = [image_0, image_1] images = [image_0, image_1]
annotations = [annotation_0, annotation_1] annotations = [annotation_0, annotation_1]
# encode them for image_processing_class in self.image_processor_list:
image_processing = GroundingDinoImageProcessor(format="coco_panoptic") # encode them
encoding = image_processing( image_processing = image_processing_class(format="coco_panoptic")
images=images, encoding = image_processing(
annotations=annotations, images=images,
masks_path=masks_path, annotations=annotations,
return_tensors="pt", masks_path=masks_path,
return_segmentation_masks=True, return_tensors="pt",
) return_segmentation_masks=True,
)
# Check the pixel values have been padded # Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066 postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape) self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images # Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor( expected_boxes_0 = torch.tensor(
[ [
[0.2625, 0.5437, 0.4688, 0.8625], [0.2625, 0.5437, 0.4688, 0.8625],
[0.7719, 0.4104, 0.4531, 0.7125], [0.7719, 0.4104, 0.4531, 0.7125],
[0.5000, 0.4927, 0.9969, 0.9854], [0.5000, 0.4927, 0.9969, 0.9854],
[0.1688, 0.2000, 0.2063, 0.0917], [0.1688, 0.2000, 0.2063, 0.0917],
[0.5492, 0.2760, 0.0578, 0.2187], [0.5492, 0.2760, 0.0578, 0.2187],
[0.4992, 0.4990, 0.9984, 0.9979], [0.4992, 0.4990, 0.9984, 0.9979],
] ]
) )
expected_boxes_1 = torch.tensor( expected_boxes_1 = torch.tensor(
[ [
[0.1576, 0.3262, 0.2814, 0.5175], [0.1576, 0.3262, 0.2814, 0.5175],
[0.4634, 0.2463, 0.2720, 0.4275], [0.4634, 0.2463, 0.2720, 0.4275],
[0.3002, 0.2956, 0.5985, 0.5913], [0.3002, 0.2956, 0.5985, 0.5913],
[0.1013, 0.1200, 0.1238, 0.0550], [0.1013, 0.1200, 0.1238, 0.0550],
[0.3297, 0.1656, 0.0347, 0.1312], [0.3297, 0.1656, 0.0347, 0.1312],
[0.2997, 0.2994, 0.5994, 0.5987], [0.2997, 0.2994, 0.5994, 0.5987],
] ]
) )
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded # Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
# format and not in the range [0, 1] # format and not in the range [0, 1]
encoding = image_processing( encoding = image_processing(
images=images, images=images,
annotations=annotations, annotations=annotations,
masks_path=masks_path, masks_path=masks_path,
return_segmentation_masks=True, return_segmentation_masks=True,
do_convert_annotations=False, do_convert_annotations=False,
return_tensors="pt", return_tensors="pt",
) )
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
# Convert to absolute coordinates # Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack( unnormalized_boxes_0 = torch.vstack(
[ [
expected_boxes_0[:, 0] * postprocessed_width, expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height, expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width, expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height, expected_boxes_0[:, 3] * postprocessed_height,
] ]
).T ).T
unnormalized_boxes_1 = torch.vstack( unnormalized_boxes_1 = torch.vstack(
[ [
expected_boxes_1[:, 0] * postprocessed_width, expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height, expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width, expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height, expected_boxes_1[:, 3] * postprocessed_height,
] ]
).T ).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack( expected_boxes_0 = torch.vstack(
[ [
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
] ]
).T ).T
expected_boxes_1 = torch.vstack( expected_boxes_1 = torch.vstack(
[ [
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
] ]
).T ).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->GroundingDino # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->GroundingDino
def test_max_width_max_height_resizing_and_pad_strategy(self): def test_max_width_max_height_resizing_and_pad_strategy(self):
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) for image_processing_class in self.image_processor_list:
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
# do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
image_processor = GroundingDinoImageProcessor( image_processor = image_processing_class(
size={"max_height": 100, "max_width": 100}, size={"max_height": 100, "max_width": 100},
do_pad=False, do_pad=False,
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
# do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
image_processor = GroundingDinoImageProcessor( image_processor = image_processing_class(
size={"max_height": 300, "max_width": 100}, size={"max_height": 300, "max_width": 100},
do_pad=False, do_pad=False,
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
# do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
image_processor = GroundingDinoImageProcessor( image_processor = image_processing_class(
size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
# do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100
image_processor = GroundingDinoImageProcessor( image_processor = image_processing_class(
size={"max_height": 300, "max_width": 100}, size={"max_height": 300, "max_width": 100},
do_pad=True, do_pad=True,
pad_size={"height": 301, "width": 101}, pad_size={"height": 301, "width": 101},
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
### Check for batch ### Check for batch
image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
# do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
image_processor = GroundingDinoImageProcessor( image_processor = image_processing_class(
size={"max_height": 150, "max_width": 100}, size={"max_height": 150, "max_width": 100},
do_pad=True, do_pad=True,
pad_size={"height": 150, "width": 100}, pad_size={"height": 150, "width": 100},
) )
inputs = image_processor(images=[image_1, image_2], return_tensors="pt") inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
def test_longest_edge_shortest_edge_resizing_strategy(self): def test_longest_edge_shortest_edge_resizing_strategy(self):
image_1 = torch.ones([958, 653, 3], dtype=torch.uint8) image_1 = torch.ones([958, 653, 3], dtype=torch.uint8)

View File

@@ -553,47 +553,48 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->Yolos # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->Yolos
def test_max_width_max_height_resizing_and_pad_strategy(self): def test_max_width_max_height_resizing_and_pad_strategy(self):
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) for image_processing_class in self.image_processor_list:
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
# do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
image_processor = YolosImageProcessor( image_processor = image_processing_class(
size={"max_height": 100, "max_width": 100}, size={"max_height": 100, "max_width": 100},
do_pad=False, do_pad=False,
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
# do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
image_processor = YolosImageProcessor( image_processor = image_processing_class(
size={"max_height": 300, "max_width": 100}, size={"max_height": 300, "max_width": 100},
do_pad=False, do_pad=False,
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
# do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
image_processor = YolosImageProcessor( image_processor = image_processing_class(
size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
# do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100
image_processor = YolosImageProcessor( image_processor = image_processing_class(
size={"max_height": 300, "max_width": 100}, size={"max_height": 300, "max_width": 100},
do_pad=True, do_pad=True,
pad_size={"height": 301, "width": 101}, pad_size={"height": 301, "width": 101},
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
### Check for batch ### Check for batch
image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
# do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
image_processor = YolosImageProcessor( image_processor = image_processing_class(
size={"max_height": 150, "max_width": 100}, size={"max_height": 150, "max_width": 100},
do_pad=True, do_pad=True,
pad_size={"height": 150, "width": 100}, pad_size={"height": 150, "width": 100},
) )
inputs = image_processor(images=[image_1, image_2], return_tensors="pt") inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))

View File

@@ -191,7 +191,7 @@ class ImageProcessingTestMixin:
dummy_images = torch.randint(0, 255, (4, 3, 224, 224), dtype=torch.uint8) dummy_images = torch.randint(0, 255, (4, 3, 224, 224), dtype=torch.uint8)
image_processor_slow = self.image_processing_class(**self.image_processor_dict) image_processor_slow = self.image_processing_class(**self.image_processor_dict)
image_processor_fast = self.fast_image_processing_class() image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
fast_time = measure_time(image_processor_fast, dummy_images) fast_time = measure_time(image_processor_fast, dummy_images)
slow_time = measure_time(image_processor_slow, dummy_images) slow_time = measure_time(image_processor_slow, dummy_images)