Add Idefics2/3 and SmolVLM Fast image processors + improvements for fast image processors (#38157)

* add working idefics2 fast and improvements for fast nested images processing

* add fast image processors idefics 3 and smolvlm

* cleanup tests

* fic doc idefics2

* PR review and fix issues after merge

* Force providing disable_grouping to group_images_by_shape

* simplify group_images_by_shape

* fix modular

* Fix nits after review
This commit is contained in:
Yoni Gozlan
2025-06-23 10:17:25 -04:00
committed by GitHub
parent 1a96127e46
commit d29482cc91
61 changed files with 2023 additions and 425 deletions

View File

@@ -298,11 +298,10 @@ class BeitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_encoding_slow = image_processor_slow(dummy_image, segmentation_maps=dummy_map, return_tensors="pt")
image_encoding_fast = image_processor_fast(dummy_image, segmentation_maps=dummy_map, return_tensors="pt")
self.assertTrue(torch.allclose(image_encoding_slow.pixel_values, image_encoding_fast.pixel_values, atol=1e-1))
self.assertLessEqual(
torch.mean(torch.abs(image_encoding_slow.pixel_values - image_encoding_fast.pixel_values)).item(), 1e-3
self._assert_slow_fast_tensors_equivalence(image_encoding_slow.pixel_values, image_encoding_fast.pixel_values)
self._assert_slow_fast_tensors_equivalence(
image_encoding_slow.labels.float(), image_encoding_fast.labels.float()
)
self.assertTrue(torch.allclose(image_encoding_slow.labels, image_encoding_fast.labels, atol=1e-1))
def test_slow_fast_equivalence_batched(self):
if not self.test_slow_image_processor or not self.test_fast_image_processor:
@@ -324,7 +323,5 @@ class BeitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
encoding_slow = image_processor_slow(dummy_images, segmentation_maps=dummy_maps, return_tensors="pt")
encoding_fast = image_processor_fast(dummy_images, segmentation_maps=dummy_maps, return_tensors="pt")
self.assertTrue(torch.allclose(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=1e-1))
self.assertLessEqual(
torch.mean(torch.abs(encoding_slow.pixel_values - encoding_fast.pixel_values)).item(), 1e-3
)
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
self._assert_slow_fast_tensors_equivalence(encoding_slow.labels.float(), encoding_fast.labels.float())

View File

@@ -19,14 +19,11 @@ from typing import Optional, Union
import requests
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from transformers.utils import is_torchvision_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
@@ -124,10 +121,6 @@ class BridgeTowerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "size_divisor"))
def _assertEquivalence(self, a, b):
self.assertTrue(torch.allclose(a, b, atol=1e-1))
self.assertLessEqual(torch.mean(torch.abs(a - b)).item(), 1e-3)
@require_vision
@require_torch
def test_slow_fast_equivalence(self):
@@ -146,8 +139,8 @@ class BridgeTowerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
encoding_slow = image_processor_slow(dummy_image, return_tensors="pt")
encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")
self._assertEquivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
self._assertEquivalence(encoding_slow.pixel_mask.float(), encoding_fast.pixel_mask.float())
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_mask.float(), encoding_fast.pixel_mask.float())
@require_vision
@require_torch
@@ -170,5 +163,5 @@ class BridgeTowerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
encoding_slow = image_processor_slow(dummy_images, return_tensors="pt")
encoding_fast = image_processor_fast(dummy_images, return_tensors="pt")
self._assertEquivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
self._assertEquivalence(encoding_slow.pixel_mask.float(), encoding_fast.pixel_mask.float())
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_mask.float(), encoding_fast.pixel_mask.float())

View File

@@ -418,15 +418,8 @@ class FlavaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
encoding_fast = image_processor_fast(
dummy_image, return_tensors="pt", return_codebook_pixels=True, return_image_mask=True
)
self.assertTrue(torch.allclose(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=1e-1))
self.assertLessEqual(
torch.mean(torch.abs(encoding_slow.pixel_values - encoding_fast.pixel_values)).item(), 1e-3
)
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
self.assertTrue(
torch.allclose(encoding_slow.codebook_pixel_values, encoding_fast.codebook_pixel_values, atol=1e-1)
)
self.assertLessEqual(
torch.mean(torch.abs(encoding_slow.codebook_pixel_values - encoding_fast.codebook_pixel_values)).item(),
1e-3,
self._assert_slow_fast_tensors_equivalence(
encoding_slow.codebook_pixel_values, encoding_fast.codebook_pixel_values
)

View File

@@ -286,7 +286,4 @@ class Gemma3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
encoding_fast = image_processor_fast(dummy_images, return_tensors="pt")
torch.testing.assert_close(encoding_slow.num_crops, encoding_fast.num_crops)
self.assertTrue(torch.allclose(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=1e-1))
self.assertLessEqual(
torch.mean(torch.abs(encoding_slow.pixel_values - encoding_fast.pixel_values)).item(), 1e-3
)
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)

View File

@@ -125,10 +125,7 @@ class GotOcr2ProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")
torch.testing.assert_close(encoding_slow.num_patches, encoding_fast.num_patches)
self.assertTrue(torch.allclose(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=1e-1))
self.assertLessEqual(
torch.mean(torch.abs(encoding_slow.pixel_values - encoding_fast.pixel_values)).item(), 1e-3
)
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
def test_slow_fast_equivalence_batched_crop_to_patches(self):
# Prepare image inputs so that we have two groups of images with equal resolution with a group of images with
@@ -144,10 +141,7 @@ class GotOcr2ProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
encoding_fast = image_processor_fast(dummy_images, return_tensors="pt")
torch.testing.assert_close(encoding_slow.num_patches, encoding_fast.num_patches)
self.assertTrue(torch.allclose(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=1e-1))
self.assertLessEqual(
torch.mean(torch.abs(encoding_slow.pixel_values - encoding_fast.pixel_values)).item(), 1e-3
)
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
def test_crop_to_patches(self):
# test slow image processor

View File

@@ -1,4 +1,5 @@
# Copyright 2024 HuggingFace Inc.
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -12,13 +13,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin
@@ -28,6 +28,8 @@ if is_vision_available():
from transformers import Idefics2ImageProcessor
if is_torchvision_available():
from transformers import Idefics2ImageProcessorFast
if is_torch_available():
import torch
@@ -88,10 +90,6 @@ class Idefics2ImageProcessingTester:
}
def get_expected_values(self, image_inputs, batched=False):
"""
This function computes the expected height and width when providing images to BridgeTowerImageProcessor,
assuming do_resize is set to True with a scalar size and size_divisor.
"""
if not batched:
shortest_edge = self.size["shortest_edge"]
longest_edge = self.size["longest_edge"]
@@ -142,11 +140,6 @@ class Idefics2ImageProcessingTester:
numpify=False,
torchify=False,
):
"""This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
or a list of PyTorch tensors if one specifies torchify=True.
One can specify whether the images are of the same resolution or not.
"""
assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
batch_size = batch_size if batch_size is not None else self.batch_size
@@ -162,23 +155,19 @@ class Idefics2ImageProcessingTester:
if equal_resolution:
width = height = max_resolution
else:
# To avoid getting image width/height 0
if size_divisor is not None:
# If `size_divisor` is defined, the image needs to have width/size >= `size_divisor`
min_resolution = max(size_divisor, min_resolution)
width, height = np.random.choice(np.arange(min_resolution, max_resolution), 2)
images.append(np.random.randint(255, size=(num_channels, width, height), dtype=np.uint8))
images_list.append(images)
if not numpify and not torchify:
# PIL expects the channel dimension as last dimension
images_list = [[Image.fromarray(np.moveaxis(image, 0, -1)) for image in images] for images in images_list]
if torchify:
images_list = [[torch.from_numpy(image) for image in images] for images in images_list]
if numpify:
# Numpy images are typically in channels last format
images_list = [[image.transpose(1, 2, 0) for image in images] for images in images_list]
return images_list
@@ -188,6 +177,7 @@ class Idefics2ImageProcessingTester:
@require_vision
class Idefics2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = Idefics2ImageProcessor if is_vision_available() else None
fast_image_processing_class = Idefics2ImageProcessorFast if is_torchvision_available() else None
def setUp(self):
super().setUp()
@@ -198,22 +188,23 @@ class Idefics2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
image_processing = self.image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "do_rescale"))
self.assertTrue(hasattr(image_processing, "rescale_factor"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_pad"))
self.assertTrue(hasattr(image_processing, "do_image_splitting"))
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "do_rescale"))
self.assertTrue(hasattr(image_processing, "rescale_factor"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_pad"))
self.assertTrue(hasattr(image_processing, "do_image_splitting"))
def test_call_numpy(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
image_processing = image_processing_class(**self.image_processor_dict)
# create random numpy tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
for sample_images in image_inputs:
@@ -238,7 +229,7 @@ class Idefics2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processor_dict = self.image_processor_dict
image_processor_dict["image_mean"] = [0.5, 0.5, 0.5, 0.5]
image_processor_dict["image_std"] = [0.5, 0.5, 0.5, 0.5]
image_processing = self.image_processing_class(**image_processor_dict)
image_processing = image_processing_class(**image_processor_dict)
# create random numpy tensors
self.image_processor_tester.num_channels = 4
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
@@ -266,7 +257,7 @@ class Idefics2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
def test_call_pil(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
image_processing = image_processing_class(**self.image_processor_dict)
# create random PIL images
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
for images in image_inputs:
@@ -288,7 +279,7 @@ class Idefics2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
def test_call_pytorch(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
image_processing = image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
@@ -308,3 +299,104 @@ class Idefics2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
tuple(encoded_images.shape),
(self.image_processor_tester.batch_size, *expected_output_image_shape),
)
def test_image_splitting(self):
for image_processing_class in self.image_processor_list:
image_processor_dict = self.image_processor_dict.copy()
image_processor_dict["do_image_splitting"] = True
image_processing = image_processing_class(**image_processor_dict)
image_inputs = self.image_processor_tester.prepare_image_inputs(
equal_resolution=True, torchify=True, num_images=1
)
result = image_processing(image_inputs[0], return_tensors="pt")
self.assertEqual(result.pixel_values.shape[1], 5)
image_processor_dict["do_image_splitting"] = False
image_processing = image_processing_class(**image_processor_dict)
result = image_processing(image_inputs[0], return_tensors="pt")
if len(result.pixel_values.shape) == 5:
self.assertEqual(result.pixel_values.shape[1], 1)
else:
self.assertEqual(result.pixel_values.shape[1], self.image_processor_tester.num_channels)
def test_pixel_attention_mask(self):
for image_processing_class in self.image_processor_list:
image_processor_dict = self.image_processor_dict.copy()
image_processor_dict["do_pad"] = True
image_processing = image_processing_class(**image_processor_dict)
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
result = image_processing(image_inputs, return_tensors="pt")
self.assertIn("pixel_attention_mask", result)
self.assertEqual(result.pixel_attention_mask.shape[-2:], result.pixel_values.shape[-2:])
image_processor_dict["do_pad"] = False
image_processor_dict["do_image_splitting"] = False
image_processing = image_processing_class(**image_processor_dict)
equal_size_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)
result = image_processing(equal_size_inputs, return_tensors="pt")
self.assertNotIn("pixel_attention_mask", result)
def test_convert_rgb(self):
for image_processing_class in self.image_processor_list:
rgba_image = Image.new("RGBA", (100, 100), (255, 0, 0, 128))
# Test with do_convert_rgb=True - this should work for all processors
image_processor_dict = self.image_processor_dict.copy()
image_processor_dict["do_convert_rgb"] = True
image_processing = image_processing_class(**image_processor_dict)
result = image_processing([rgba_image], return_tensors="pt")
self.assertIsNotNone(result.pixel_values)
rgb_image = rgba_image.convert("RGB")
image_processor_dict["do_convert_rgb"] = False
image_processing = image_processing_class(**image_processor_dict)
# Use the RGB image instead of RGBA when do_convert_rgb=False
result = image_processing([rgb_image], return_tensors="pt")
self.assertIsNotNone(result.pixel_values)
# Additional test: verifying proper handling of regular RGB images
rgb_image = Image.new("RGB", (100, 100), (255, 0, 0))
result = image_processing([rgb_image], return_tensors="pt")
self.assertIsNotNone(result.pixel_values)
def test_slow_fast_equivalence_batched(self):
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest(reason="Skipping slow/fast equivalence test")
if self.image_processing_class is None or self.fast_image_processing_class is None:
self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop:
self.skipTest(
reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
)
dummy_images = self.image_processor_tester.prepare_image_inputs(
equal_resolution=False, num_images=5, torchify=True
)
# pop some images to have non homogenous batches:
indices_to_pop = [i if np.random.random() < 0.5 else None for i in range(len(dummy_images))]
for i in indices_to_pop:
if i is not None:
dummy_images[i].pop()
image_processor_slow = self.image_processing_class(**self.image_processor_dict)
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
encoding_slow = image_processor_slow(dummy_images, return_tensors="pt")
encoding_fast = image_processor_fast(dummy_images, return_tensors="pt")
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
self._assert_slow_fast_tensors_equivalence(
encoding_slow.pixel_attention_mask.float(), encoding_fast.pixel_attention_mask.float()
)

View File

@@ -1,3 +1,4 @@
# coding=utf-8
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,10 +17,11 @@
import unittest
import numpy as np
import requests
from transformers.image_utils import PILImageResampling
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin
@@ -29,6 +31,9 @@ if is_vision_available():
from transformers import Idefics3ImageProcessor
if is_torchvision_available():
from transformers import Idefics3ImageProcessorFast
if is_torch_available():
import torch
@@ -164,6 +169,7 @@ class Idefics3ImageProcessingTester:
@require_vision
class Idefics3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = Idefics3ImageProcessor if is_vision_available() else None
fast_image_processing_class = Idefics3ImageProcessorFast if is_torchvision_available() else None
def setUp(self):
super().setUp()
@@ -174,25 +180,26 @@ class Idefics3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
image_processing = self.image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "resample"))
self.assertTrue(hasattr(image_processing, "do_image_splitting"))
self.assertTrue(hasattr(image_processing, "max_image_size"))
self.assertTrue(hasattr(image_processing, "do_rescale"))
self.assertTrue(hasattr(image_processing, "rescale_factor"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_pad"))
self.assertTrue(hasattr(image_processing, "do_image_splitting"))
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "resample"))
self.assertTrue(hasattr(image_processing, "do_image_splitting"))
self.assertTrue(hasattr(image_processing, "max_image_size"))
self.assertTrue(hasattr(image_processing, "do_rescale"))
self.assertTrue(hasattr(image_processing, "rescale_factor"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_pad"))
self.assertTrue(hasattr(image_processing, "do_image_splitting"))
def test_call_numpy(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
image_processing = image_processing_class(**self.image_processor_dict)
# create random numpy tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
for sample_images in image_inputs:
@@ -216,7 +223,7 @@ class Idefics3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processor_dict = self.image_processor_dict
image_processing = self.image_processing_class(**image_processor_dict)
image_processing = image_processing_class(**image_processor_dict)
# create random numpy tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
@@ -239,7 +246,7 @@ class Idefics3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
def test_call_pil(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
image_processing = image_processing_class(**self.image_processor_dict)
# create random PIL images
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
for images in image_inputs:
@@ -261,7 +268,7 @@ class Idefics3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
def test_call_pytorch(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
image_processing = image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
@@ -281,3 +288,73 @@ class Idefics3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
tuple(encoded_images.shape),
(self.image_processor_tester.batch_size, *expected_output_image_shape),
)
@require_vision
@require_torch
def test_slow_fast_equivalence(self):
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest(reason="Skipping slow/fast equivalence test")
if self.image_processing_class is None or self.fast_image_processing_class is None:
self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
dummy_image = Image.open(
requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw
)
dummy_image = dummy_image.resize((100, 150))
image_processor_slow = self.image_processing_class(
**self.image_processor_dict, resample=PILImageResampling.BICUBIC
)
image_processor_fast = self.fast_image_processing_class(
**self.image_processor_dict, resample=PILImageResampling.BICUBIC
)
encoding_slow = image_processor_slow(dummy_image, return_tensors="pt", return_row_col_info=True)
encoding_fast = image_processor_fast(dummy_image, return_tensors="pt", return_row_col_info=True)
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
self._assert_slow_fast_tensors_equivalence(
encoding_slow.pixel_attention_mask.float(), encoding_fast.pixel_attention_mask.float()
)
self.assertEqual(encoding_slow.rows, encoding_fast.rows)
self.assertEqual(encoding_slow.cols, encoding_fast.cols)
@require_vision
@require_torch
def test_slow_fast_equivalence_batched(self):
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest(reason="Skipping slow/fast equivalence test")
if self.image_processing_class is None or self.fast_image_processing_class is None:
self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop:
self.skipTest(
reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
)
dummy_images = self.image_processor_tester.prepare_image_inputs(
equal_resolution=False, num_images=5, torchify=True
)
# pop some images to have non homogenous batches:
indices_to_pop = [i if np.random.random() < 0.5 else None for i in range(len(dummy_images))]
for i in indices_to_pop:
if i is not None:
dummy_images[i].pop()
image_processor_slow = self.image_processing_class(
**self.image_processor_dict, resample=PILImageResampling.BICUBIC
)
image_processor_fast = self.fast_image_processing_class(
**self.image_processor_dict, resample=PILImageResampling.BICUBIC
)
encoding_slow = image_processor_slow(dummy_images, return_tensors="pt", return_row_col_info=True)
encoding_fast = image_processor_fast(dummy_images, return_tensors="pt", return_row_col_info=True)
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=3e-1)
self._assert_slow_fast_tensors_equivalence(
encoding_slow.pixel_attention_mask.float(), encoding_fast.pixel_attention_mask.float()
)
self.assertEqual(encoding_slow.rows, encoding_fast.rows)
self.assertEqual(encoding_slow.cols, encoding_fast.cols)

View File

@@ -15,9 +15,21 @@
import unittest
import requests
from packaging import version
from transformers.testing_utils import require_pytesseract, require_torch, require_vision
from transformers.utils import is_pytesseract_available, is_torch_available, is_torchvision_available
from transformers.testing_utils import (
require_pytesseract,
require_torch,
require_torch_accelerator,
require_vision,
slow,
torch_device,
)
from transformers.utils import (
is_pytesseract_available,
is_torch_available,
is_torchvision_available,
)
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
@@ -157,16 +169,8 @@ class LayoutLMv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
encoding_slow = image_processor_slow(dummy_image, return_tensors="pt")
encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")
self.assertTrue(
torch.allclose(
encoding_slow.pixel_values.float() / 255, encoding_fast.pixel_values.float() / 255, atol=1e-1
)
)
self.assertLessEqual(
torch.mean(
torch.abs(encoding_slow.pixel_values.float() - encoding_fast.pixel_values.float()) / 255
).item(),
1e-3,
self._assert_slow_fast_tensors_equivalence(
encoding_slow.pixel_values.float() / 255, encoding_fast.pixel_values.float() / 255
)
@require_vision
@@ -190,14 +194,28 @@ class LayoutLMv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
encoding_slow = image_processor_slow(dummy_images, return_tensors="pt")
encoding_fast = image_processor_fast(dummy_images, return_tensors="pt")
self.assertTrue(
torch.allclose(
encoding_slow.pixel_values.float() / 255, encoding_fast.pixel_values.float() / 255, atol=1e-1
)
self._assert_slow_fast_tensors_equivalence(
encoding_slow.pixel_values.float() / 255, encoding_fast.pixel_values.float() / 255
)
self.assertLessEqual(
torch.mean(
torch.abs(encoding_slow.pixel_values.float() - encoding_fast.pixel_values.float()) / 255
).item(),
1e-3,
# Overriding as we can't use torch.testing.assert_close on int8 tensors
@slow
@require_torch_accelerator
@require_vision
def test_can_compile_fast_image_processor(self):
if self.fast_image_processing_class is None:
self.skipTest("Skipping compilation test as fast image processor is not defined")
if version.parse(torch.__version__) < version.parse("2.3"):
self.skipTest(reason="This test requires torch >= 2.3 to run.")
torch.compiler.reset()
input_image = torch.randint(0, 255, (3, 224, 224), dtype=torch.uint8)
image_processor = self.fast_image_processing_class(**self.image_processor_dict)
output_eager = image_processor(input_image, device=torch_device, return_tensors="pt")
image_processor = torch.compile(image_processor, mode="reduce-overhead")
output_compiled = image_processor(input_image, device=torch_device, return_tensors="pt")
self._assert_slow_fast_tensors_equivalence(
output_eager.pixel_values.float() / 255, output_compiled.pixel_values.float() / 255
)

View File

@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import unittest
import numpy as np
@@ -214,29 +213,6 @@ class PixtralImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs_list)
self.assertEqual(tuple(batch_encoded_images.shape), expected_output_image_shape)
@require_vision
@require_torch
def test_fast_is_faster_than_slow(self):
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest(reason="Skipping speed test")
if self.image_processing_class is None or self.fast_image_processing_class is None:
self.skipTest(reason="Skipping speed test as one of the image processors is not defined")
def measure_time(image_processor, image):
start = time.time()
_ = image_processor(image, return_tensors="pt")
return time.time() - start
image_inputs_list = self.image_processor_tester.prepare_image_inputs(torchify=True)
image_processor_slow = self.image_processing_class(**self.image_processor_dict)
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
fast_time = measure_time(image_processor_fast, image_inputs_list)
slow_time = measure_time(image_processor_slow, image_inputs_list)
self.assertLessEqual(fast_time, slow_time)
@require_vision
@require_torch
def test_slow_fast_equivalence(self):
@@ -255,9 +231,7 @@ class PixtralImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
encoding_slow = image_processor_slow(dummy_image, return_tensors="pt")
encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")
torch.testing.assert_close(
encoding_slow.pixel_values[0][0], encoding_fast.pixel_values[0][0], rtol=100, atol=1e-1
)
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values[0][0], encoding_fast.pixel_values[0][0])
@require_vision
@require_torch
@@ -282,14 +256,8 @@ class PixtralImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
encoding_fast = image_processor_fast(dummy_images, return_tensors="pt")
for i in range(len(encoding_slow.pixel_values)):
self.assertTrue(
torch.allclose(encoding_slow.pixel_values[i][0], encoding_fast.pixel_values[i][0], atol=1e-1)
)
self.assertLessEqual(
torch.mean(torch.abs(encoding_slow.pixel_values[i][0] - encoding_fast.pixel_values[i][0])).item(), 1e-3
)
torch.testing.assert_close(
encoding_slow.pixel_values[0][0], encoding_fast.pixel_values[0][0], rtol=100, atol=1e-1
self._assert_slow_fast_tensors_equivalence(
encoding_slow.pixel_values[i][0], encoding_fast.pixel_values[i][0]
)
@slow
@@ -309,8 +277,8 @@ class PixtralImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processor = torch.compile(image_processor, mode="reduce-overhead")
output_compiled = image_processor(input_image, device=torch_device, return_tensors="pt")
torch.testing.assert_close(
output_eager.pixel_values[0][0], output_compiled.pixel_values[0][0], rtol=1e-4, atol=1e-4
self._assert_slow_fast_tensors_equivalence(
output_eager.pixel_values[0][0], output_compiled.pixel_values[0][0], atol=1e-4, rtol=1e-4, mean_atol=1e-5
)
@unittest.skip(reason="PixtralImageProcessor doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy

View File

@@ -362,6 +362,4 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
encoding_slow = image_processor_slow(dummy_image, return_tensors="pt")
encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")
torch.testing.assert_close(
encoding_slow.pixel_values, encoding_fast.pixel_values, rtol=100, atol=1e-2
) # @yoni bit weird that we have such diffs
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)

View File

@@ -18,14 +18,11 @@ import unittest
import requests
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from transformers.utils import is_torchvision_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
@@ -150,7 +147,6 @@ class Siglip2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
def test_call_numpy_4_channels(self):
pass
# increase mean tolerance to 1e-3 -> 2e-3
# Ignore copy
def test_slow_fast_equivalence(self):
if not self.test_slow_image_processor or not self.test_fast_image_processor:
@@ -167,10 +163,7 @@ class Siglip2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
encoding_slow = image_processor_slow(dummy_image, return_tensors="pt")
encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")
torch.testing.assert_close(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=1e-1, rtol=1e-1)
self.assertLessEqual(
torch.mean(torch.abs(encoding_slow.pixel_values - encoding_fast.pixel_values)).item(), 2e-3
)
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
# increase mean tolerance to 1e-3 -> 2e-3
# Ignore copy
@@ -193,7 +186,4 @@ class Siglip2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
encoding_slow = image_processor_slow(dummy_images, return_tensors="pt")
encoding_fast = image_processor_fast(dummy_images, return_tensors="pt")
torch.testing.assert_close(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=1e-1, rtol=1e-1)
self.assertLessEqual(
torch.mean(torch.abs(encoding_slow.pixel_values - encoding_fast.pixel_values)).item(), 2e-3
)
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)

View File

@@ -1,3 +1,4 @@
# coding=utf-8
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,10 +17,11 @@
import unittest
import numpy as np
import requests
from transformers.image_utils import PILImageResampling
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin
@@ -29,6 +31,9 @@ if is_vision_available():
from transformers import SmolVLMImageProcessor
if is_torchvision_available():
from transformers import SmolVLMImageProcessorFast
if is_torch_available():
import torch
@@ -164,6 +169,7 @@ class SmolVLMImageProcessingTester:
@require_vision
class SmolVLMImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = SmolVLMImageProcessor if is_vision_available() else None
fast_image_processing_class = SmolVLMImageProcessorFast if is_torchvision_available() else None
def setUp(self):
super().setUp()
@@ -174,25 +180,26 @@ class SmolVLMImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
image_processing = self.image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "resample"))
self.assertTrue(hasattr(image_processing, "do_image_splitting"))
self.assertTrue(hasattr(image_processing, "max_image_size"))
self.assertTrue(hasattr(image_processing, "do_rescale"))
self.assertTrue(hasattr(image_processing, "rescale_factor"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_pad"))
self.assertTrue(hasattr(image_processing, "do_image_splitting"))
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "resample"))
self.assertTrue(hasattr(image_processing, "do_image_splitting"))
self.assertTrue(hasattr(image_processing, "max_image_size"))
self.assertTrue(hasattr(image_processing, "do_rescale"))
self.assertTrue(hasattr(image_processing, "rescale_factor"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_pad"))
self.assertTrue(hasattr(image_processing, "do_image_splitting"))
def test_call_numpy(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
image_processing = image_processing_class(**self.image_processor_dict)
# create random numpy tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
for sample_images in image_inputs:
@@ -216,7 +223,7 @@ class SmolVLMImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processor_dict = self.image_processor_dict
image_processing = self.image_processing_class(**image_processor_dict)
image_processing = image_processing_class(**image_processor_dict)
# create random numpy tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
@@ -239,7 +246,7 @@ class SmolVLMImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
def test_call_pil(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
image_processing = image_processing_class(**self.image_processor_dict)
# create random PIL images
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
for images in image_inputs:
@@ -261,7 +268,7 @@ class SmolVLMImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
def test_call_pytorch(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
image_processing = image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
@@ -281,3 +288,73 @@ class SmolVLMImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
tuple(encoded_images.shape),
(self.image_processor_tester.batch_size, *expected_output_image_shape),
)
@require_vision
@require_torch
def test_slow_fast_equivalence(self):
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest(reason="Skipping slow/fast equivalence test")
if self.image_processing_class is None or self.fast_image_processing_class is None:
self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
dummy_image = Image.open(
requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw
)
dummy_image = dummy_image.resize((100, 150))
image_processor_slow = self.image_processing_class(
**self.image_processor_dict, resample=PILImageResampling.BICUBIC
)
image_processor_fast = self.fast_image_processing_class(
**self.image_processor_dict, resample=PILImageResampling.BICUBIC
)
encoding_slow = image_processor_slow(dummy_image, return_tensors="pt", return_row_col_info=True)
encoding_fast = image_processor_fast(dummy_image, return_tensors="pt", return_row_col_info=True)
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
self._assert_slow_fast_tensors_equivalence(
encoding_slow.pixel_attention_mask.float(), encoding_fast.pixel_attention_mask.float()
)
self.assertEqual(encoding_slow.rows, encoding_fast.rows)
self.assertEqual(encoding_slow.cols, encoding_fast.cols)
@require_vision
@require_torch
def test_slow_fast_equivalence_batched(self):
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest(reason="Skipping slow/fast equivalence test")
if self.image_processing_class is None or self.fast_image_processing_class is None:
self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop:
self.skipTest(
reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
)
dummy_images = self.image_processor_tester.prepare_image_inputs(
equal_resolution=False, num_images=5, torchify=True
)
# pop some images to have non homogenous batches:
indices_to_pop = [i if np.random.random() < 0.5 else None for i in range(len(dummy_images))]
for i in indices_to_pop:
if i is not None:
dummy_images[i].pop()
image_processor_slow = self.image_processing_class(
**self.image_processor_dict, resample=PILImageResampling.BICUBIC
)
image_processor_fast = self.fast_image_processing_class(
**self.image_processor_dict, resample=PILImageResampling.BICUBIC
)
encoding_slow = image_processor_slow(dummy_images, return_tensors="pt", return_row_col_info=True)
encoding_fast = image_processor_fast(dummy_images, return_tensors="pt", return_row_col_info=True)
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=3e-1)
self._assert_slow_fast_tensors_equivalence(
encoding_slow.pixel_attention_mask.float(), encoding_fast.pixel_attention_mask.float()
)
self.assertEqual(encoding_slow.rows, encoding_fast.rows)
self.assertEqual(encoding_slow.cols, encoding_fast.cols)

View File

@@ -197,7 +197,7 @@ class Swin2SRImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processor_slow = self.image_processing_class(**self.image_processor_dict)
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
encoded_slow = image_processor_slow(image_inputs, return_tensors="pt").pixel_values
encoded_fast = image_processor_fast(image_inputs, return_tensors="pt").pixel_values
encoded_slow = image_processor_slow(image_inputs, return_tensors="pt")
encoded_fast = image_processor_fast(image_inputs, return_tensors="pt")
self.assertTrue(torch.allclose(encoded_slow, encoded_fast, atol=1e-1))
self._assert_slow_fast_tensors_equivalence(encoded_slow.pixel_values, encoded_fast.pixel_values)

View File

@@ -312,10 +312,7 @@ class VitMatteImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
encoding_slow = image_processor_slow(dummy_image, trimaps=dummy_trimap, return_tensors="pt")
encoding_fast = image_processor_fast(dummy_image, trimaps=dummy_trimap, return_tensors="pt")
self.assertTrue(torch.allclose(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=1e-1))
self.assertLessEqual(
torch.mean(torch.abs(encoding_slow.pixel_values - encoding_fast.pixel_values)).item(), 1e-3
)
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
def test_slow_fast_equivalence_batched(self):
# this only checks on equal resolution, since the slow processor doesn't work otherwise
@@ -338,10 +335,7 @@ class VitMatteImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
encoding_slow = image_processor_slow(dummy_images, trimaps=dummy_trimaps, return_tensors="pt")
encoding_fast = image_processor_fast(dummy_images, trimaps=dummy_trimaps, return_tensors="pt")
self.assertTrue(torch.allclose(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=1e-1))
self.assertLessEqual(
torch.mean(torch.abs(encoding_slow.pixel_values - encoding_fast.pixel_values)).item(), 1e-3
)
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
@slow
@require_torch_accelerator