Add DetrImageProcessorFast (#34063)

* add fully functioning image_processing_detr_fast

* Create tensors on the correct device

* fix copies

* fix doc

* add CPU/GPU equivalence tests

* fix doc en

* add relative imports and copied from

* Fix copies and nit
This commit is contained in:
Yoni Gozlan
2024-10-21 09:05:05 -04:00
committed by GitHub
parent 24bdc94da5
commit a4122813d1
16 changed files with 2779 additions and 1047 deletions

View File

@@ -181,6 +181,15 @@ If you're interested in submitting a resource to be included here, please feel f
- post_process_instance_segmentation - post_process_instance_segmentation
- post_process_panoptic_segmentation - post_process_panoptic_segmentation
## DetrImageProcessorFast
[[autodoc]] DetrImageProcessorFast
- preprocess
- post_process_object_detection
- post_process_semantic_segmentation
- post_process_instance_segmentation
- post_process_panoptic_segmentation
## DetrFeatureExtractor ## DetrFeatureExtractor
[[autodoc]] DetrFeatureExtractor [[autodoc]] DetrFeatureExtractor

View File

@@ -184,6 +184,15 @@ DETR の使用を開始するのに役立つ公式 Hugging Face およびコミ
- post_process_instance_segmentation - post_process_instance_segmentation
- post_process_panoptic_segmentation - post_process_panoptic_segmentation
## DetrImageProcessorFast
[[autodoc]] DetrImageProcessorFast
- preprocess
- post_process_object_detection
- post_process_semantic_segmentation
- post_process_instance_segmentation
- post_process_panoptic_segmentation
## DetrFeatureExtractor ## DetrFeatureExtractor
[[autodoc]] DetrFeatureExtractor [[autodoc]] DetrFeatureExtractor

View File

@@ -1191,7 +1191,7 @@ else:
_import_structure["models.deprecated.efficientformer"].append("EfficientFormerImageProcessor") _import_structure["models.deprecated.efficientformer"].append("EfficientFormerImageProcessor")
_import_structure["models.deprecated.tvlt"].append("TvltImageProcessor") _import_structure["models.deprecated.tvlt"].append("TvltImageProcessor")
_import_structure["models.deprecated.vit_hybrid"].extend(["ViTHybridImageProcessor"]) _import_structure["models.deprecated.vit_hybrid"].extend(["ViTHybridImageProcessor"])
_import_structure["models.detr"].extend(["DetrFeatureExtractor", "DetrImageProcessor"]) _import_structure["models.detr"].extend(["DetrFeatureExtractor", "DetrImageProcessor", "DetrImageProcessorFast"])
_import_structure["models.donut"].extend(["DonutFeatureExtractor", "DonutImageProcessor"]) _import_structure["models.donut"].extend(["DonutFeatureExtractor", "DonutImageProcessor"])
_import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"]) _import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"])
_import_structure["models.efficientnet"].append("EfficientNetImageProcessor") _import_structure["models.efficientnet"].append("EfficientNetImageProcessor")
@@ -6090,7 +6090,7 @@ if TYPE_CHECKING:
from .models.deprecated.efficientformer import EfficientFormerImageProcessor from .models.deprecated.efficientformer import EfficientFormerImageProcessor
from .models.deprecated.tvlt import TvltImageProcessor from .models.deprecated.tvlt import TvltImageProcessor
from .models.deprecated.vit_hybrid import ViTHybridImageProcessor from .models.deprecated.vit_hybrid import ViTHybridImageProcessor
from .models.detr import DetrFeatureExtractor, DetrImageProcessor from .models.detr import DetrFeatureExtractor, DetrImageProcessor, DetrImageProcessorFast
from .models.donut import DonutFeatureExtractor, DonutImageProcessor from .models.donut import DonutFeatureExtractor, DonutImageProcessor
from .models.dpt import DPTFeatureExtractor, DPTImageProcessor from .models.dpt import DPTFeatureExtractor, DPTImageProcessor
from .models.efficientnet import EfficientNetImageProcessor from .models.efficientnet import EfficientNetImageProcessor

View File

@@ -32,6 +32,7 @@ from .utils.import_utils import (
is_tf_available, is_tf_available,
is_torch_available, is_torch_available,
is_torchvision_available, is_torchvision_available,
is_torchvision_v2_available,
is_vision_available, is_vision_available,
requires_backends, requires_backends,
) )
@@ -51,7 +52,9 @@ if is_tf_available():
if is_flax_available(): if is_flax_available():
import jax.numpy as jnp import jax.numpy as jnp
if is_torchvision_available(): if is_torchvision_v2_available():
from torchvision.transforms.v2 import functional as F
elif is_torchvision_available():
from torchvision.transforms import functional as F from torchvision.transforms import functional as F

View File

@@ -72,7 +72,7 @@ else:
("deit", ("DeiTImageProcessor",)), ("deit", ("DeiTImageProcessor",)),
("depth_anything", ("DPTImageProcessor",)), ("depth_anything", ("DPTImageProcessor",)),
("deta", ("DetaImageProcessor",)), ("deta", ("DetaImageProcessor",)),
("detr", ("DetrImageProcessor",)), ("detr", ("DetrImageProcessor", "DetrImageProcessorFast")),
("dinat", ("ViTImageProcessor", "ViTImageProcessorFast")), ("dinat", ("ViTImageProcessor", "ViTImageProcessorFast")),
("dinov2", ("BitImageProcessor",)), ("dinov2", ("BitImageProcessor",)),
("donut-swin", ("DonutImageProcessor",)), ("donut-swin", ("DonutImageProcessor",)),

View File

@@ -27,6 +27,7 @@ except OptionalDependencyNotAvailable:
else: else:
_import_structure["feature_extraction_detr"] = ["DetrFeatureExtractor"] _import_structure["feature_extraction_detr"] = ["DetrFeatureExtractor"]
_import_structure["image_processing_detr"] = ["DetrImageProcessor"] _import_structure["image_processing_detr"] = ["DetrImageProcessor"]
_import_structure["image_processing_detr_fast"] = ["DetrImageProcessorFast"]
try: try:
if not is_torch_available(): if not is_torch_available():
@@ -53,6 +54,7 @@ if TYPE_CHECKING:
else: else:
from .feature_extraction_detr import DetrFeatureExtractor from .feature_extraction_detr import DetrFeatureExtractor
from .image_processing_detr import DetrImageProcessor from .image_processing_detr import DetrImageProcessor
from .image_processing_detr_fast import DetrImageProcessorFast
try: try:
if not is_torch_available(): if not is_torch_available():

File diff suppressed because it is too large Load Diff

View File

@@ -225,6 +225,7 @@ from .import_utils import (
is_torchdynamo_available, is_torchdynamo_available,
is_torchdynamo_compiling, is_torchdynamo_compiling,
is_torchvision_available, is_torchvision_available,
is_torchvision_v2_available,
is_training_run_on_sagemaker, is_training_run_on_sagemaker,
is_uroman_available, is_uroman_available,
is_vision_available, is_vision_available,

View File

@@ -191,6 +191,13 @@ class DetrImageProcessor(metaclass=DummyObject):
requires_backends(self, ["vision"]) requires_backends(self, ["vision"])
# Stand-in for `DetrImageProcessorFast` that declares "vision" as its required backend.
# NOTE(review): presumably exported in place of the real class when that backend is missing,
# with `requires_backends` raising an informative error on instantiation — confirm against
# the `DummyObject` / `requires_backends` definitions, which are not visible here.
class DetrImageProcessorFast(metaclass=DummyObject):
    # Backends this placeholder reports as required.
    _backends = ["vision"]

    def __init__(self, *args, **kwargs):
        # Accepts any arguments and immediately delegates to the backend check.
        requires_backends(self, ["vision"])
class DonutFeatureExtractor(metaclass=DummyObject): class DonutFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"] _backends = ["vision"]

View File

@@ -186,7 +186,7 @@ _tokenizers_available = _is_package_available("tokenizers")
_torchaudio_available = _is_package_available("torchaudio") _torchaudio_available = _is_package_available("torchaudio")
_torchao_available = _is_package_available("torchao") _torchao_available = _is_package_available("torchao")
_torchdistx_available = _is_package_available("torchdistx") _torchdistx_available = _is_package_available("torchdistx")
_torchvision_available = _is_package_available("torchvision") _torchvision_available, _torchvision_version = _is_package_available("torchvision", return_version=True)
_mlx_available = _is_package_available("mlx") _mlx_available = _is_package_available("mlx")
_hqq_available, _hqq_version = _is_package_available("hqq", return_version=True) _hqq_available, _hqq_version = _is_package_available("hqq", return_version=True)
_tiktoken_available = _is_package_available("tiktoken") _tiktoken_available = _is_package_available("tiktoken")
@@ -362,6 +362,14 @@ def is_torchvision_available():
return _torchvision_available return _torchvision_available
def is_torchvision_v2_available():
    """
    Tell whether the installed torchvision is recent enough to provide the v2 transforms.

    Returns:
        `bool`: `False` when torchvision is not available, otherwise whether the detected
        torchvision version is at least 0.15.
    """
    if is_torchvision_available():
        # NOTE: We require torchvision>=0.15 as v2 transforms are available from this version: https://pytorch.org/vision/stable/transforms.html#v1-or-v2-which-one-should-i-use
        installed_version = version.parse(_torchvision_version)
        minimum_version = version.parse("0.15")
        return installed_version >= minimum_version
    return False
def is_galore_torch_available(): def is_galore_torch_available():
return _galore_torch_available return _galore_torch_available

View File

@@ -282,7 +282,8 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess
images = [image_0, image_1] images = [image_0, image_1]
annotations = [annotations_0, annotations_1] annotations = [annotations_0, annotations_1]
image_processing = ConditionalDetrImageProcessor() for image_processing_class in self.image_processor_list:
image_processing = image_processing_class()
encoding = image_processing( encoding = image_processing(
images=images, images=images,
annotations=annotations, annotations=annotations,
@@ -402,8 +403,9 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess
images = [image_0, image_1] images = [image_0, image_1]
annotations = [annotation_0, annotation_1] annotations = [annotation_0, annotation_1]
for image_processing_class in self.image_processor_list:
# encode them # encode them
image_processing = ConditionalDetrImageProcessor(format="coco_panoptic") image_processing = image_processing_class(format="coco_panoptic")
encoding = image_processing( encoding = image_processing(
images=images, images=images,
annotations=annotations, annotations=annotations,
@@ -498,10 +500,11 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->ConditionalDetr # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->ConditionalDetr
def test_max_width_max_height_resizing_and_pad_strategy(self): def test_max_width_max_height_resizing_and_pad_strategy(self):
for image_processing_class in self.image_processor_list:
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
# do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
image_processor = ConditionalDetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 100, "max_width": 100}, size={"max_height": 100, "max_width": 100},
do_pad=False, do_pad=False,
) )
@@ -509,21 +512,21 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
# do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
image_processor = ConditionalDetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 300, "max_width": 100}, size={"max_height": 300, "max_width": 100},
do_pad=False, do_pad=False,
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
# do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
image_processor = ConditionalDetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
# do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100
image_processor = ConditionalDetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 300, "max_width": 100}, size={"max_height": 300, "max_width": 100},
do_pad=True, do_pad=True,
pad_size={"height": 301, "width": 101}, pad_size={"height": 301, "width": 101},
@@ -535,7 +538,7 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess
image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
# do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
image_processor = ConditionalDetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 150, "max_width": 100}, size={"max_height": 150, "max_width": 100},
do_pad=True, do_pad=True,
pad_size={"height": 150, "width": 100}, pad_size={"height": 150, "width": 100},

View File

@@ -284,7 +284,8 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
images = [image_0, image_1] images = [image_0, image_1]
annotations = [annotations_0, annotations_1] annotations = [annotations_0, annotations_1]
image_processing = DeformableDetrImageProcessor() for image_processing_class in self.image_processor_list:
image_processing = image_processing_class()
encoding = image_processing( encoding = image_processing(
images=images, images=images,
annotations=annotations, annotations=annotations,
@@ -404,8 +405,9 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
images = [image_0, image_1] images = [image_0, image_1]
annotations = [annotation_0, annotation_1] annotations = [annotation_0, annotation_1]
for image_processing_class in self.image_processor_list:
# encode them # encode them
image_processing = DeformableDetrImageProcessor(format="coco_panoptic") image_processing = image_processing_class(format="coco_panoptic")
encoding = image_processing( encoding = image_processing(
images=images, images=images,
annotations=annotations, annotations=annotations,
@@ -500,10 +502,11 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->DeformableDetr # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->DeformableDetr
def test_max_width_max_height_resizing_and_pad_strategy(self): def test_max_width_max_height_resizing_and_pad_strategy(self):
for image_processing_class in self.image_processor_list:
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
# do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
image_processor = DeformableDetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 100, "max_width": 100}, size={"max_height": 100, "max_width": 100},
do_pad=False, do_pad=False,
) )
@@ -511,21 +514,21 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
# do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
image_processor = DeformableDetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 300, "max_width": 100}, size={"max_height": 300, "max_width": 100},
do_pad=False, do_pad=False,
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
# do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
image_processor = DeformableDetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
# do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100
image_processor = DeformableDetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 300, "max_width": 100}, size={"max_height": 300, "max_width": 100},
do_pad=True, do_pad=True,
pad_size={"height": 301, "width": 101}, pad_size={"height": 301, "width": 101},
@@ -537,7 +540,7 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
# do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
image_processor = DeformableDetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 150, "max_width": 100}, size={"max_height": 150, "max_width": 100},
do_pad=True, do_pad=True,
pad_size={"height": 150, "width": 100}, pad_size={"height": 150, "width": 100},

View File

@@ -19,8 +19,8 @@ import unittest
import numpy as np import numpy as np
from transformers.testing_utils import require_torch, require_vision, slow from transformers.testing_utils import require_torch, require_torch_gpu, require_vision, slow
from transformers.utils import is_torch_available, is_vision_available from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs
@@ -33,6 +33,9 @@ if is_vision_available():
from transformers import DetrImageProcessor from transformers import DetrImageProcessor
if is_torchvision_available():
from transformers import DetrImageProcessorFast
class DetrImageProcessingTester(unittest.TestCase): class DetrImageProcessingTester(unittest.TestCase):
def __init__( def __init__(
@@ -51,6 +54,7 @@ class DetrImageProcessingTester(unittest.TestCase):
image_std=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5],
do_pad=True, do_pad=True,
): ):
super().__init__()
# by setting size["longest_edge"] > max_resolution we're effectively not testing this :p # by setting size["longest_edge"] > max_resolution we're effectively not testing this :p
size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333} size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333}
self.parent = parent self.parent = parent
@@ -132,6 +136,7 @@ class DetrImageProcessingTester(unittest.TestCase):
@require_vision @require_vision
class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase): class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = DetrImageProcessor if is_vision_available() else None image_processing_class = DetrImageProcessor if is_vision_available() else None
fast_image_processing_class = DetrImageProcessorFast if is_torchvision_available() else None
def setUp(self): def setUp(self):
super().setUp() super().setUp()
@@ -142,7 +147,8 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
return self.image_processor_tester.prepare_image_processor_dict() return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self): def test_image_processor_properties(self):
image_processing = self.image_processing_class(**self.image_processor_dict) for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "image_mean")) self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std")) self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_normalize")) self.assertTrue(hasattr(image_processing, "do_normalize"))
@@ -153,11 +159,12 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
self.assertTrue(hasattr(image_processing, "do_pad")) self.assertTrue(hasattr(image_processing, "do_pad"))
def test_image_processor_from_dict_with_kwargs(self): def test_image_processor_from_dict_with_kwargs(self):
image_processor = self.image_processing_class.from_dict(self.image_processor_dict) for image_processing_class in self.image_processor_list:
image_processor = image_processing_class.from_dict(self.image_processor_dict)
self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333}) self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333})
self.assertEqual(image_processor.do_pad, True) self.assertEqual(image_processor.do_pad, True)
image_processor = self.image_processing_class.from_dict( image_processor = image_processing_class.from_dict(
self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False
) )
self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84}) self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84})
@@ -178,7 +185,8 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
} }
image_processor_params = {**image_processor_dict, **{"format": "_INVALID_FORMAT_"}} image_processor_params = {**image_processor_dict, **{"format": "_INVALID_FORMAT_"}}
image_processor = self.image_processing_class(**image_processor_params) for image_processing_class in self.image_processor_list:
image_processor = image_processing_class(**image_processor_params)
with self.assertRaises(ValueError) as e: with self.assertRaises(ValueError) as e:
image_processor(**params) image_processor(**params)
@@ -193,8 +201,9 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
params = {"image_id": 39769, "annotations": target} params = {"image_id": 39769, "annotations": target}
for image_processing_class in self.image_processor_list:
# encode them # encode them
image_processing = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50") image_processing = image_processing_class.from_pretrained("facebook/detr-resnet-50")
# legal encodings (single image) # legal encodings (single image)
_ = image_processing(images=image, annotations=params, return_tensors="pt") _ = image_processing(images=image, annotations=params, return_tensors="pt")
@@ -229,8 +238,9 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
target = {"image_id": 39769, "annotations": target} target = {"image_id": 39769, "annotations": target}
for image_processing_class in self.image_processor_list:
# encode them # encode them
image_processing = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50") image_processing = image_processing_class.from_pretrained("facebook/detr-resnet-50")
encoding = image_processing(images=image, annotations=target, return_tensors="pt") encoding = image_processing(images=image, annotations=target, return_tensors="pt")
# verify pixel values # verify pixel values
@@ -275,8 +285,9 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
for image_processing_class in self.image_processor_list:
# encode them # encode them
image_processing = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50-panoptic") image_processing = image_processing_class.from_pretrained("facebook/detr-resnet-50-panoptic")
encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt")
# verify pixel values # verify pixel values
@@ -305,7 +316,8 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels))
# verify masks # verify masks
expected_masks_sum = 822873 expected_masks_sum = 822873
self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum) relative_error = torch.abs(encoding["labels"][0]["masks"].sum() - expected_masks_sum) / expected_masks_sum
self.assertTrue(relative_error < 1e-3)
# verify orig_size # verify orig_size
expected_orig_size = torch.tensor([480, 640]) expected_orig_size = torch.tensor([480, 640])
self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size))
@@ -340,7 +352,8 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
images = [image_0, image_1] images = [image_0, image_1]
annotations = [annotations_0, annotations_1] annotations = [annotations_0, annotations_1]
image_processing = DetrImageProcessor() for image_processing_class in self.image_processor_list:
image_processing = image_processing_class()
encoding = image_processing( encoding = image_processing(
images=images, images=images,
annotations=annotations, annotations=annotations,
@@ -459,8 +472,9 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
images = [image_0, image_1] images = [image_0, image_1]
annotations = [annotation_0, annotation_1] annotations = [annotation_0, annotation_1]
for image_processing_class in self.image_processor_list:
# encode them # encode them
image_processing = DetrImageProcessor(format="coco_panoptic") image_processing = image_processing_class(format="coco_panoptic")
encoding = image_processing( encoding = image_processing(
images=images, images=images,
annotations=annotations, annotations=annotations,
@@ -554,10 +568,11 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
def test_max_width_max_height_resizing_and_pad_strategy(self): def test_max_width_max_height_resizing_and_pad_strategy(self):
for image_processing_class in self.image_processor_list:
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
# do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
image_processor = DetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 100, "max_width": 100}, size={"max_height": 100, "max_width": 100},
do_pad=False, do_pad=False,
) )
@@ -565,21 +580,21 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
# do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
image_processor = DetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 300, "max_width": 100}, size={"max_height": 300, "max_width": 100},
do_pad=False, do_pad=False,
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
# do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
image_processor = DetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
# do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100
image_processor = DetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 300, "max_width": 100}, size={"max_height": 300, "max_width": 100},
do_pad=True, do_pad=True,
pad_size={"height": 301, "width": 101}, pad_size={"height": 301, "width": 101},
@@ -591,7 +606,7 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
# do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
image_processor = DetrImageProcessor( image_processor = image_processing_class(
size={"max_height": 150, "max_width": 100}, size={"max_height": 150, "max_width": 100},
do_pad=True, do_pad=True,
pad_size={"height": 150, "width": 100}, pad_size={"height": 150, "width": 100},
@@ -600,11 +615,12 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
def test_longest_edge_shortest_edge_resizing_strategy(self): def test_longest_edge_shortest_edge_resizing_strategy(self):
for image_processing_class in self.image_processor_list:
image_1 = torch.ones([958, 653, 3], dtype=torch.uint8) image_1 = torch.ones([958, 653, 3], dtype=torch.uint8)
# max size is set; width < height; # max size is set; width < height;
# do_pad=False, longest_edge=640, shortest_edge=640, image=958x653 -> 640x436 # do_pad=False, longest_edge=640, shortest_edge=640, image=958x653 -> 640x436
image_processor = DetrImageProcessor( image_processor = image_processing_class(
size={"longest_edge": 640, "shortest_edge": 640}, size={"longest_edge": 640, "shortest_edge": 640},
do_pad=False, do_pad=False,
) )
@@ -614,7 +630,7 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
image_2 = torch.ones([653, 958, 3], dtype=torch.uint8) image_2 = torch.ones([653, 958, 3], dtype=torch.uint8)
# max size is set; height < width; # max size is set; height < width;
# do_pad=False, longest_edge=640, shortest_edge=640, image=653x958 -> 436x640 # do_pad=False, longest_edge=640, shortest_edge=640, image=653x958 -> 436x640
image_processor = DetrImageProcessor( image_processor = image_processing_class(
size={"longest_edge": 640, "shortest_edge": 640}, size={"longest_edge": 640, "shortest_edge": 640},
do_pad=False, do_pad=False,
) )
@@ -624,7 +640,7 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
image_3 = torch.ones([100, 120, 3], dtype=torch.uint8) image_3 = torch.ones([100, 120, 3], dtype=torch.uint8)
# max size is set; width == size; height > max_size; # max size is set; width == size; height > max_size;
# do_pad=False, longest_edge=118, shortest_edge=100, image=120x100 -> 118x98 # do_pad=False, longest_edge=118, shortest_edge=100, image=120x100 -> 118x98
image_processor = DetrImageProcessor( image_processor = image_processing_class(
size={"longest_edge": 118, "shortest_edge": 100}, size={"longest_edge": 118, "shortest_edge": 100},
do_pad=False, do_pad=False,
) )
@@ -634,7 +650,7 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
image_4 = torch.ones([128, 50, 3], dtype=torch.uint8) image_4 = torch.ones([128, 50, 3], dtype=torch.uint8)
# max size is set; height == size; width < max_size; # max size is set; height == size; width < max_size;
# do_pad=False, longest_edge=256, shortest_edge=50, image=50x128 -> 50x128 # do_pad=False, longest_edge=256, shortest_edge=50, image=50x128 -> 50x128
image_processor = DetrImageProcessor( image_processor = image_processing_class(
size={"longest_edge": 256, "shortest_edge": 50}, size={"longest_edge": 256, "shortest_edge": 50},
do_pad=False, do_pad=False,
) )
@@ -644,9 +660,130 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
image_5 = torch.ones([50, 50, 3], dtype=torch.uint8) image_5 = torch.ones([50, 50, 3], dtype=torch.uint8)
# max size is set; height == width; width < max_size; # max size is set; height == width; width < max_size;
# do_pad=False, longest_edge=117, shortest_edge=50, image=50x50 -> 50x50 # do_pad=False, longest_edge=117, shortest_edge=50, image=50x50 -> 50x50
image_processor = DetrImageProcessor( image_processor = image_processing_class(
size={"longest_edge": 117, "shortest_edge": 50}, size={"longest_edge": 117, "shortest_edge": 50},
do_pad=False, do_pad=False,
) )
inputs = image_processor(images=[image_5], return_tensors="pt") inputs = image_processor(images=[image_5], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50]))
@slow
@require_torch_gpu
def test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations(self):
    # Runs the same processor on one COCO detection sample twice, once with
    # device="cpu" and once with device="cuda", and checks that the two
    # encodings agree on pixel values and on every label field.

    # Load the sample image and wrap its raw annotations in the COCO detection target format.
    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
        raw_annotations = json.loads(f.read())
    target = {"image_id": 39769, "annotations": raw_annotations}

    # Index 1 of image_processor_list is presumably the fast processor class
    # (per the test name) -- confirm against the test class setup.
    processor = self.image_processor_list[1].from_pretrained("facebook/detr-resnet-50")

    # Same inputs, only the requested device differs.
    encoding_cpu = processor(images=image, annotations=target, return_tensors="pt", device="cpu")
    encoding_gpu = processor(images=image, annotations=target, return_tensors="pt", device="cuda")

    # Pixel values: identical shape, and the first three values of the first row match.
    pixels_cpu = encoding_cpu["pixel_values"]
    pixels_gpu = encoding_gpu["pixel_values"]
    self.assertEqual(pixels_cpu.shape, pixels_gpu.shape)
    self.assertTrue(
        torch.allclose(
            pixels_cpu[0, 0, 0, :3],
            pixels_gpu[0, 0, 0, :3].to("cpu"),
            atol=1e-4,
        )
    )

    # Labels of the single image in the batch.
    labels_cpu = encoding_cpu["labels"][0]
    labels_gpu = encoding_gpu["labels"][0]

    # Area is compared with the default tolerances.
    self.assertTrue(torch.allclose(labels_cpu["area"], labels_gpu["area"].to("cpu")))

    # Boxes: same shape, and the first box matches within a looser tolerance.
    self.assertEqual(labels_cpu["boxes"].shape, labels_gpu["boxes"].shape)
    self.assertTrue(torch.allclose(labels_cpu["boxes"][0], labels_gpu["boxes"][0].to("cpu"), atol=1e-3))

    # Remaining label fields are compared with the default tolerances, in the original order.
    for field in ("image_id", "iscrowd", "class_labels", "orig_size", "size"):
        self.assertTrue(torch.allclose(labels_cpu[field], labels_gpu[field].to("cpu")))
@slow
@require_torch_gpu
def test_fast_processor_equivalence_cpu_gpu_coco_panoptic_annotations(self):
    # Runs the same processor on one COCO panoptic sample twice, once with
    # device="cpu" and once with device="cuda", and checks that the two
    # encodings agree on pixel values, label fields and the total mask sum.

    # Load the sample image, its panoptic segment info and the directory holding the mask files.
    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
        segments_info = json.loads(f.read())
    target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": segments_info}
    masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")

    # Index 1 of image_processor_list is presumably the fast processor class
    # (per the test name) -- confirm against the test class setup.
    processor = self.image_processor_list[1].from_pretrained("facebook/detr-resnet-50-panoptic")

    # Same inputs for both runs; only the requested device differs.
    shared_kwargs = {
        "images": image,
        "annotations": target,
        "masks_path": masks_path,
        "return_tensors": "pt",
    }
    encoding_cpu = processor(**shared_kwargs, device="cpu")
    encoding_gpu = processor(**shared_kwargs, device="cuda")

    # Pixel values: identical shape, and the first three values of the first row match.
    pixels_cpu = encoding_cpu["pixel_values"]
    pixels_gpu = encoding_gpu["pixel_values"]
    self.assertEqual(pixels_cpu.shape, pixels_gpu.shape)
    self.assertTrue(
        torch.allclose(
            pixels_cpu[0, 0, 0, :3],
            pixels_gpu[0, 0, 0, :3].to("cpu"),
            atol=1e-4,
        )
    )

    # Labels of the single image in the batch.
    labels_cpu = encoding_cpu["labels"][0]
    labels_gpu = encoding_gpu["labels"][0]

    # Area is compared with the default tolerances.
    self.assertTrue(torch.allclose(labels_cpu["area"], labels_gpu["area"].to("cpu")))

    # Boxes: same shape, and the first box matches within a looser tolerance.
    self.assertEqual(labels_cpu["boxes"].shape, labels_gpu["boxes"].shape)
    self.assertTrue(torch.allclose(labels_cpu["boxes"][0], labels_gpu["boxes"][0].to("cpu"), atol=1e-3))

    # Identifier-like label fields are compared with the default tolerances.
    for field in ("image_id", "iscrowd", "class_labels"):
        self.assertTrue(torch.allclose(labels_cpu[field], labels_gpu[field].to("cpu")))

    # Masks are compared through their total sum, allowing a 0.1% relative difference.
    # NOTE(review): the GPU-side sum is not moved to CPU before the subtraction;
    # this presumably relies on torch accepting 0-dim operands across devices -- confirm.
    masks_sum_cpu = labels_cpu["masks"].sum()
    masks_sum_gpu = labels_gpu["masks"].sum()
    relative_error = torch.abs(masks_sum_cpu - masks_sum_gpu) / masks_sum_cpu
    self.assertTrue(relative_error < 1e-3)

    # Size bookkeeping fields are compared with the default tolerances.
    for field in ("orig_size", "size"):
        self.assertTrue(torch.allclose(labels_cpu[field], labels_gpu[field].to("cpu")))

View File

@@ -269,7 +269,8 @@ class GroundingDinoImageProcessingTest(AnnotationFormatTestMixin, ImageProcessin
images = [image_0, image_1] images = [image_0, image_1]
annotations = [annotations_0, annotations_1] annotations = [annotations_0, annotations_1]
image_processing = GroundingDinoImageProcessor() for image_processing_class in self.image_processor_list:
image_processing = image_processing_class()
encoding = image_processing( encoding = image_processing(
images=images, images=images,
annotations=annotations, annotations=annotations,
@@ -440,8 +441,9 @@ class GroundingDinoImageProcessingTest(AnnotationFormatTestMixin, ImageProcessin
images = [image_0, image_1] images = [image_0, image_1]
annotations = [annotation_0, annotation_1] annotations = [annotation_0, annotation_1]
for image_processing_class in self.image_processor_list:
# encode them # encode them
image_processing = GroundingDinoImageProcessor(format="coco_panoptic") image_processing = image_processing_class(format="coco_panoptic")
encoding = image_processing( encoding = image_processing(
images=images, images=images,
annotations=annotations, annotations=annotations,
@@ -536,10 +538,11 @@ class GroundingDinoImageProcessingTest(AnnotationFormatTestMixin, ImageProcessin
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->GroundingDino # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->GroundingDino
def test_max_width_max_height_resizing_and_pad_strategy(self): def test_max_width_max_height_resizing_and_pad_strategy(self):
for image_processing_class in self.image_processor_list:
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
# do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
image_processor = GroundingDinoImageProcessor( image_processor = image_processing_class(
size={"max_height": 100, "max_width": 100}, size={"max_height": 100, "max_width": 100},
do_pad=False, do_pad=False,
) )
@@ -547,21 +550,21 @@ class GroundingDinoImageProcessingTest(AnnotationFormatTestMixin, ImageProcessin
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
# do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
image_processor = GroundingDinoImageProcessor( image_processor = image_processing_class(
size={"max_height": 300, "max_width": 100}, size={"max_height": 300, "max_width": 100},
do_pad=False, do_pad=False,
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
# do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
image_processor = GroundingDinoImageProcessor( image_processor = image_processing_class(
size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
# do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100
image_processor = GroundingDinoImageProcessor( image_processor = image_processing_class(
size={"max_height": 300, "max_width": 100}, size={"max_height": 300, "max_width": 100},
do_pad=True, do_pad=True,
pad_size={"height": 301, "width": 101}, pad_size={"height": 301, "width": 101},
@@ -573,7 +576,7 @@ class GroundingDinoImageProcessingTest(AnnotationFormatTestMixin, ImageProcessin
image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
# do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
image_processor = GroundingDinoImageProcessor( image_processor = image_processing_class(
size={"max_height": 150, "max_width": 100}, size={"max_height": 150, "max_width": 100},
do_pad=True, do_pad=True,
pad_size={"height": 150, "width": 100}, pad_size={"height": 150, "width": 100},

View File

@@ -553,10 +553,11 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->Yolos # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->Yolos
def test_max_width_max_height_resizing_and_pad_strategy(self): def test_max_width_max_height_resizing_and_pad_strategy(self):
for image_processing_class in self.image_processor_list:
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
# do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
image_processor = YolosImageProcessor( image_processor = image_processing_class(
size={"max_height": 100, "max_width": 100}, size={"max_height": 100, "max_width": 100},
do_pad=False, do_pad=False,
) )
@@ -564,21 +565,21 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
# do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
image_processor = YolosImageProcessor( image_processor = image_processing_class(
size={"max_height": 300, "max_width": 100}, size={"max_height": 300, "max_width": 100},
do_pad=False, do_pad=False,
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
# do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
image_processor = YolosImageProcessor( image_processor = image_processing_class(
size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
) )
inputs = image_processor(images=[image_1], return_tensors="pt") inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
# do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100
image_processor = YolosImageProcessor( image_processor = image_processing_class(
size={"max_height": 300, "max_width": 100}, size={"max_height": 300, "max_width": 100},
do_pad=True, do_pad=True,
pad_size={"height": 301, "width": 101}, pad_size={"height": 301, "width": 101},
@@ -590,7 +591,7 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
# do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
image_processor = YolosImageProcessor( image_processor = image_processing_class(
size={"max_height": 150, "max_width": 100}, size={"max_height": 150, "max_width": 100},
do_pad=True, do_pad=True,
pad_size={"height": 150, "width": 100}, pad_size={"height": 150, "width": 100},

View File

@@ -191,7 +191,7 @@ class ImageProcessingTestMixin:
dummy_images = torch.randint(0, 255, (4, 3, 224, 224), dtype=torch.uint8) dummy_images = torch.randint(0, 255, (4, 3, 224, 224), dtype=torch.uint8)
image_processor_slow = self.image_processing_class(**self.image_processor_dict) image_processor_slow = self.image_processing_class(**self.image_processor_dict)
image_processor_fast = self.fast_image_processing_class() image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
fast_time = measure_time(image_processor_fast, dummy_images) fast_time = measure_time(image_processor_fast, dummy_images)
slow_time = measure_time(image_processor_slow, dummy_images) slow_time = measure_time(image_processor_slow, dummy_images)