Add Idefics2/3 and SmolVLM Fast image processors + improvements for fast image processors (#38157)

* add working idefics2 fast and improvements for fast nested images processing

* add fast image processors idefics 3 and smolvlm

* cleanup tests

* fix doc idefics2

* PR review and fix issues after merge

* Force providing disable_grouping to group_images_by_shape

* simplify group_images_by_shape

* fix modular

* Fix nits after review
This commit is contained in:
Yoni Gozlan
2025-06-23 10:17:25 -04:00
committed by GitHub
parent 1a96127e46
commit d29482cc91
61 changed files with 2023 additions and 425 deletions

View File

@@ -1,4 +1,5 @@
# Copyright 2024 HuggingFace Inc.
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -12,13 +13,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin
@@ -28,6 +28,8 @@ if is_vision_available():
from transformers import Idefics2ImageProcessor
if is_torchvision_available():
from transformers import Idefics2ImageProcessorFast
if is_torch_available():
import torch
@@ -88,10 +90,6 @@ class Idefics2ImageProcessingTester:
}
def get_expected_values(self, image_inputs, batched=False):
"""
This function computes the expected height and width when providing images to BridgeTowerImageProcessor,
assuming do_resize is set to True with a scalar size and size_divisor.
"""
if not batched:
shortest_edge = self.size["shortest_edge"]
longest_edge = self.size["longest_edge"]
@@ -142,11 +140,6 @@ class Idefics2ImageProcessingTester:
numpify=False,
torchify=False,
):
"""This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
or a list of PyTorch tensors if one specifies torchify=True.
One can specify whether the images are of the same resolution or not.
"""
assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
batch_size = batch_size if batch_size is not None else self.batch_size
@@ -162,23 +155,19 @@ class Idefics2ImageProcessingTester:
if equal_resolution:
width = height = max_resolution
else:
# To avoid getting image width/height 0
if size_divisor is not None:
# If `size_divisor` is defined, the image needs to have width/size >= `size_divisor`
min_resolution = max(size_divisor, min_resolution)
width, height = np.random.choice(np.arange(min_resolution, max_resolution), 2)
images.append(np.random.randint(255, size=(num_channels, width, height), dtype=np.uint8))
images_list.append(images)
if not numpify and not torchify:
# PIL expects the channel dimension as last dimension
images_list = [[Image.fromarray(np.moveaxis(image, 0, -1)) for image in images] for images in images_list]
if torchify:
images_list = [[torch.from_numpy(image) for image in images] for images in images_list]
if numpify:
# Numpy images are typically in channels last format
images_list = [[image.transpose(1, 2, 0) for image in images] for images in images_list]
return images_list
@@ -188,6 +177,7 @@ class Idefics2ImageProcessingTester:
@require_vision
class Idefics2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = Idefics2ImageProcessor if is_vision_available() else None
fast_image_processing_class = Idefics2ImageProcessorFast if is_torchvision_available() else None
def setUp(self):
super().setUp()
@@ -198,22 +188,23 @@ class Idefics2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
image_processing = self.image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "do_rescale"))
self.assertTrue(hasattr(image_processing, "rescale_factor"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_pad"))
self.assertTrue(hasattr(image_processing, "do_image_splitting"))
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "do_rescale"))
self.assertTrue(hasattr(image_processing, "rescale_factor"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_pad"))
self.assertTrue(hasattr(image_processing, "do_image_splitting"))
def test_call_numpy(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
image_processing = image_processing_class(**self.image_processor_dict)
# create random numpy tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
for sample_images in image_inputs:
@@ -238,7 +229,7 @@ class Idefics2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processor_dict = self.image_processor_dict
image_processor_dict["image_mean"] = [0.5, 0.5, 0.5, 0.5]
image_processor_dict["image_std"] = [0.5, 0.5, 0.5, 0.5]
image_processing = self.image_processing_class(**image_processor_dict)
image_processing = image_processing_class(**image_processor_dict)
# create random numpy tensors
self.image_processor_tester.num_channels = 4
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
@@ -266,7 +257,7 @@ class Idefics2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
def test_call_pil(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
image_processing = image_processing_class(**self.image_processor_dict)
# create random PIL images
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
for images in image_inputs:
@@ -288,7 +279,7 @@ class Idefics2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
def test_call_pytorch(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
image_processing = image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
@@ -308,3 +299,104 @@ class Idefics2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
tuple(encoded_images.shape),
(self.image_processor_tester.batch_size, *expected_output_image_shape),
)
def test_image_splitting(self):
    """Check the size of dimension 1 of `pixel_values` with and without image splitting.

    For every class in `self.image_processor_list`, one sample containing a
    single equal-resolution torch image is processed twice: once with
    `do_image_splitting=True` and once with it set to `False`.
    """
    for processor_cls in self.image_processor_list:
        base_kwargs = self.image_processor_dict.copy()
        split_kwargs = {**base_kwargs, "do_image_splitting": True}
        no_split_kwargs = {**base_kwargs, "do_image_splitting": False}

        splitting_processor = processor_cls(**split_kwargs)
        samples = self.image_processor_tester.prepare_image_inputs(
            equal_resolution=True,
            torchify=True,
            num_images=1,
        )
        first_sample = samples[0]

        # With splitting enabled, dimension 1 must be 5
        # (presumably the original image plus four crops -- confirm against the processor).
        split_output = splitting_processor(first_sample, return_tensors="pt")
        self.assertEqual(split_output.pixel_values.shape[1], 5)

        plain_processor = processor_cls(**no_split_kwargs)
        pixel_values = plain_processor(first_sample, return_tensors="pt").pixel_values

        # A 5-dimensional output must hold exactly one image in dimension 1;
        # otherwise dimension 1 is compared to the tester's channel count.
        if len(pixel_values.shape) == 5:
            expected_dim_1 = 1
        else:
            expected_dim_1 = self.image_processor_tester.num_channels
        self.assertEqual(pixel_values.shape[1], expected_dim_1)
def test_pixel_attention_mask(self):
    """Check that `pixel_attention_mask` is returned only when `do_pad` is enabled.

    Runs for every class in `self.image_processor_list`. With padding on, the
    mask's last two dimensions must equal those of `pixel_values`. With padding
    and image splitting both off, the key must be absent from the output.
    """
    make_inputs = self.image_processor_tester.prepare_image_inputs

    for processor_cls in self.image_processor_list:
        padded_kwargs = {**self.image_processor_dict.copy(), "do_pad": True}
        padding_processor = processor_cls(**padded_kwargs)

        # Mixed-resolution inputs are used for the padded run.
        mixed_inputs = make_inputs(equal_resolution=False, torchify=True)
        padded_output = padding_processor(mixed_inputs, return_tensors="pt")
        self.assertIn("pixel_attention_mask", padded_output)
        mask_hw = padded_output.pixel_attention_mask.shape[-2:]
        pixels_hw = padded_output.pixel_values.shape[-2:]
        self.assertEqual(mask_hw, pixels_hw)

        unpadded_kwargs = {**padded_kwargs, "do_pad": False, "do_image_splitting": False}
        plain_processor = processor_cls(**unpadded_kwargs)

        # Equal-resolution inputs are used for the run without padding.
        uniform_inputs = make_inputs(equal_resolution=True, torchify=True)
        unpadded_output = plain_processor(uniform_inputs, return_tensors="pt")
        self.assertNotIn("pixel_attention_mask", unpadded_output)
def test_convert_rgb(self):
    """Check that each processor returns `pixel_values` for RGBA and RGB PIL inputs.

    For every class in `self.image_processor_list`:
      * an RGBA image is processed with `do_convert_rgb=True`;
      * the same image, converted to RGB beforehand, is processed with
        `do_convert_rgb=False`;
      * a freshly created RGB image is processed with `do_convert_rgb=False`.
    Each call must produce a non-None `pixel_values`.
    """
    for processor_cls in self.image_processor_list:
        translucent_red = Image.new("RGBA", (100, 100), (255, 0, 0, 128))

        # The RGBA input is only ever given to a processor with do_convert_rgb=True.
        converting_kwargs = {**self.image_processor_dict.copy(), "do_convert_rgb": True}
        converting_processor = processor_cls(**converting_kwargs)
        output = converting_processor([translucent_red], return_tensors="pt")
        self.assertIsNotNone(output.pixel_values)

        pre_converted = translucent_red.convert("RGB")
        passthrough_kwargs = {**converting_kwargs, "do_convert_rgb": False}
        passthrough_processor = processor_cls(**passthrough_kwargs)

        # With conversion disabled, the processor receives the already-converted RGB image.
        output = passthrough_processor([pre_converted], return_tensors="pt")
        self.assertIsNotNone(output.pixel_values)

        # A plain RGB image goes through the same processor with conversion disabled.
        solid_red = Image.new("RGB", (100, 100), (255, 0, 0))
        output = passthrough_processor([solid_red], return_tensors="pt")
        self.assertIsNotNone(output.pixel_values)
def test_slow_fast_equivalence_batched(self):
    """Compare slow and fast processor outputs on a batch of unevenly sized samples.

    The test is skipped when either processor is disabled or undefined, or when
    the tester enables `do_center_crop`. Otherwise both processors encode the
    same nested torch inputs, and their `pixel_values` and (float-cast)
    `pixel_attention_mask` tensors are compared with
    `_assert_slow_fast_tensors_equivalence`.
    """
    if not (self.test_slow_image_processor and self.test_fast_image_processor):
        self.skipTest(reason="Skipping slow/fast equivalence test")

    slow_cls = self.image_processing_class
    fast_cls = self.fast_image_processing_class
    if slow_cls is None or fast_cls is None:
        self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")

    if getattr(self.image_processor_tester, "do_center_crop", False):
        self.skipTest(
            reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
        )

    batch = self.image_processor_tester.prepare_image_inputs(
        equal_resolution=False,
        num_images=5,
        torchify=True,
    )

    # Drop the last image from roughly half of the samples (one random draw per
    # sample, in order) so the samples do not all hold the same number of images.
    for sample in batch:
        if np.random.random() < 0.5:
            sample.pop()

    slow_processor = slow_cls(**self.image_processor_dict)
    fast_processor = fast_cls(**self.image_processor_dict)

    slow_encoding = slow_processor(batch, return_tensors="pt")
    fast_encoding = fast_processor(batch, return_tensors="pt")

    self._assert_slow_fast_tensors_equivalence(slow_encoding.pixel_values, fast_encoding.pixel_values)
    # Both masks are cast to float before being compared.
    self._assert_slow_fast_tensors_equivalence(
        slow_encoding.pixel_attention_mask.float(),
        fast_encoding.pixel_attention_mask.float(),
    )