add Qwen2-VL image processor fast (#35733)

* add qwen2_vl image processor fast

* add device to ImagesKwargs

* remove automatic fix copies

* fix fast_is_faster_than_slow

* remove unnecessary import
This commit is contained in:
Yoni Gozlan
2025-01-21 11:49:05 -05:00
committed by GitHub
parent 3df90103b8
commit 107f9f5127
9 changed files with 584 additions and 127 deletions

View File

@@ -20,7 +20,7 @@ import numpy as np
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs, prepare_video_inputs
@@ -33,6 +33,9 @@ if is_vision_available():
from transformers import Qwen2VLImageProcessor
if is_torchvision_available():
from transformers import Qwen2VLImageProcessorFast
class Qwen2VLImageProcessingTester:
def __init__(
@@ -114,6 +117,7 @@ class Qwen2VLImageProcessingTester:
@require_vision
class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = Qwen2VLImageProcessor if is_vision_available() else None
fast_image_processing_class = Qwen2VLImageProcessorFast if is_torchvision_available() else None
def setUp(self):
super().setUp()
@@ -124,28 +128,30 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
image_processing = self.image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "min_pixels"))
self.assertTrue(hasattr(image_processing, "max_pixels"))
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
self.assertTrue(hasattr(image_processing, "patch_size"))
self.assertTrue(hasattr(image_processing, "temporal_patch_size"))
self.assertTrue(hasattr(image_processing, "merge_size"))
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "min_pixels"))
self.assertTrue(hasattr(image_processing, "max_pixels"))
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
self.assertTrue(hasattr(image_processing, "patch_size"))
self.assertTrue(hasattr(image_processing, "temporal_patch_size"))
self.assertTrue(hasattr(image_processing, "merge_size"))
def test_image_processor_from_dict_with_kwargs(self):
image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
self.assertEqual(image_processor.min_pixels, 56 * 56)
self.assertEqual(image_processor.max_pixels, 28 * 28 * 1280)
for image_processing_class in self.image_processor_list:
image_processor = image_processing_class.from_dict(self.image_processor_dict)
self.assertEqual(image_processor.min_pixels, 56 * 56)
self.assertEqual(image_processor.max_pixels, 28 * 28 * 1280)
image_processor = self.image_processing_class.from_dict(
self.image_processor_dict, min_pixels=256 * 256, max_pixels=640 * 640
)
self.assertEqual(image_processor.min_pixels, 256 * 256)
self.assertEqual(image_processor.max_pixels, 640 * 640)
image_processor = image_processing_class.from_dict(
self.image_processor_dict, min_pixels=256 * 256, max_pixels=640 * 640
)
self.assertEqual(image_processor.min_pixels, 256 * 256)
self.assertEqual(image_processor.max_pixels, 640 * 640)
def test_select_best_resolution(self):
# Test with a final resize resolution
@@ -153,134 +159,140 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertEqual(best_resolution, (560, 280))
def test_call_pil(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PIL images
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
for image in image_inputs:
self.assertIsInstance(image[0], Image.Image)
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = image_processing_class(**self.image_processor_dict)
# create random PIL images
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
for image in image_inputs:
self.assertIsInstance(image[0], Image.Image)
# Test not batched input
prcocess_out = image_processing(image_inputs[0], return_tensors="pt")
encoded_images = prcocess_out.pixel_values
image_grid_thws = prcocess_out.image_grid_thw
expected_output_image_shape = (4900, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]])
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
# Test not batched input
prcocess_out = image_processing(image_inputs[0], return_tensors="pt")
encoded_images = prcocess_out.pixel_values
image_grid_thws = prcocess_out.image_grid_thw
expected_output_image_shape = (4900, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]])
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
# Test batched
prcocess_out = image_processing(image_inputs, return_tensors="pt")
encoded_images = prcocess_out.pixel_values
image_grid_thws = prcocess_out.image_grid_thw
expected_output_image_shape = (34300, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
# Test batched
prcocess_out = image_processing(image_inputs, return_tensors="pt")
encoded_images = prcocess_out.pixel_values
image_grid_thws = prcocess_out.image_grid_thw
expected_output_image_shape = (34300, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
def test_call_numpy(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random numpy tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True)
for image in image_inputs:
self.assertIsInstance(image[0], np.ndarray)
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = image_processing_class(**self.image_processor_dict)
# create random numpy tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True)
for image in image_inputs:
self.assertIsInstance(image[0], np.ndarray)
# Test not batched input
prcocess_out = image_processing(image_inputs[0], return_tensors="pt")
encoded_images = prcocess_out.pixel_values
image_grid_thws = prcocess_out.image_grid_thw
expected_output_image_shape = (4900, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]])
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
# Test not batched input
prcocess_out = image_processing(image_inputs[0], return_tensors="pt")
encoded_images = prcocess_out.pixel_values
image_grid_thws = prcocess_out.image_grid_thw
expected_output_image_shape = (4900, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]])
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
# Test batched
prcocess_out = image_processing(image_inputs, return_tensors="pt")
encoded_images = prcocess_out.pixel_values
image_grid_thws = prcocess_out.image_grid_thw
expected_output_image_shape = (34300, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
# Test batched
prcocess_out = image_processing(image_inputs, return_tensors="pt")
encoded_images = prcocess_out.pixel_values
image_grid_thws = prcocess_out.image_grid_thw
expected_output_image_shape = (34300, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
def test_call_pytorch(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)
for image in image_inputs:
self.assertIsInstance(image[0], torch.Tensor)
for image in image_inputs:
self.assertIsInstance(image[0], torch.Tensor)
# Test not batched input
prcocess_out = image_processing(image_inputs[0], return_tensors="pt")
encoded_images = prcocess_out.pixel_values
image_grid_thws = prcocess_out.image_grid_thw
expected_output_image_shape = (4900, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]])
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
# Test not batched input
prcocess_out = image_processing(image_inputs[0], return_tensors="pt")
encoded_images = prcocess_out.pixel_values
image_grid_thws = prcocess_out.image_grid_thw
expected_output_image_shape = (4900, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]])
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
# Test batched
prcocess_out = image_processing(image_inputs, return_tensors="pt")
encoded_images = prcocess_out.pixel_values
image_grid_thws = prcocess_out.image_grid_thw
expected_output_image_shape = (34300, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
# Test batched
prcocess_out = image_processing(image_inputs, return_tensors="pt")
encoded_images = prcocess_out.pixel_values
image_grid_thws = prcocess_out.image_grid_thw
expected_output_image_shape = (34300, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
@unittest.skip(reason="Qwen2VLImageProcessor doesn't treat 4 channel PIL and numpy consistently yet")
def test_call_numpy_4_channels(self):
pass
def test_nested_input(self):
image_processing = self.image_processing_class(**self.image_processor_dict)
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
# Test batched as a list of images
prcocess_out = image_processing(image_inputs, return_tensors="pt")
encoded_images = prcocess_out.pixel_values
image_grid_thws = prcocess_out.image_grid_thw
expected_output_image_shape = (34300, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
# Test batched as a list of images
prcocess_out = image_processing(image_inputs, return_tensors="pt")
encoded_images = prcocess_out.pixel_values
image_grid_thws = prcocess_out.image_grid_thw
expected_output_image_shape = (34300, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
# Test batched as a nested list of images, where each sublist is one batch
image_inputs_nested = image_inputs[:3] + image_inputs[3:]
prcocess_out = image_processing(image_inputs_nested, return_tensors="pt")
encoded_images_nested = prcocess_out.pixel_values
image_grid_thws_nested = prcocess_out.image_grid_thw
expected_output_image_shape = (34300, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7)
self.assertEqual(tuple(encoded_images_nested.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
# Test batched as a nested list of images, where each sublist is one batch
image_inputs_nested = image_inputs[:3] + image_inputs[3:]
prcocess_out = image_processing(image_inputs_nested, return_tensors="pt")
encoded_images_nested = prcocess_out.pixel_values
image_grid_thws_nested = prcocess_out.image_grid_thw
expected_output_image_shape = (34300, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7)
self.assertEqual(tuple(encoded_images_nested.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
# Image processor should return same pixel values, independently of input format
self.assertTrue((encoded_images_nested == encoded_images).all())
self.assertTrue((image_grid_thws_nested == expected_image_grid_thws).all())
# Image processor should return same pixel values, independently of input format
self.assertTrue((encoded_images_nested == encoded_images).all())
self.assertTrue((image_grid_thws_nested == expected_image_grid_thws).all())
def test_video_inputs(self):
image_processing = self.image_processing_class(**self.image_processor_dict)
expected_dims_by_frames = {1: 34300, 2: 34300, 3: 68600, 4: 68600, 5: 102900, 6: 102900}
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
expected_dims_by_frames = {1: 34300, 2: 34300, 3: 68600, 4: 68600, 5: 102900, 6: 102900}
for num_frames, expected_dims in expected_dims_by_frames.items():
image_processor_tester = Qwen2VLImageProcessingTester(self, num_frames=num_frames)
video_inputs = image_processor_tester.prepare_video_inputs(equal_resolution=True)
prcocess_out = image_processing(None, videos=video_inputs, return_tensors="pt")
encoded_video = prcocess_out.pixel_values_videos
expected_output_video_shape = (expected_dims, 1176)
self.assertEqual(tuple(encoded_video.shape), expected_output_video_shape)
for num_frames, expected_dims in expected_dims_by_frames.items():
image_processor_tester = Qwen2VLImageProcessingTester(self, num_frames=num_frames)
video_inputs = image_processor_tester.prepare_video_inputs(equal_resolution=True)
prcocess_out = image_processing(None, videos=video_inputs, return_tensors="pt")
encoded_video = prcocess_out.pixel_values_videos
expected_output_video_shape = (expected_dims, 1176)
self.assertEqual(tuple(encoded_video.shape), expected_output_video_shape)
def test_custom_patch_size(self):
image_processing = self.image_processing_class(**self.image_processor_dict)
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
for patch_size in (1, 3, 5, 7):
image_processor_tester = Qwen2VLImageProcessingTester(self, patch_size=patch_size)
video_inputs = image_processor_tester.prepare_video_inputs(equal_resolution=True)
prcocess_out = image_processing(None, videos=video_inputs, return_tensors="pt")
encoded_video = prcocess_out.pixel_values_videos
expected_output_video_shape = (171500, 1176)
self.assertEqual(tuple(encoded_video.shape), expected_output_video_shape)
for patch_size in (1, 3, 5, 7):
image_processor_tester = Qwen2VLImageProcessingTester(self, patch_size=patch_size)
video_inputs = image_processor_tester.prepare_video_inputs(equal_resolution=True)
prcocess_out = image_processing(None, videos=video_inputs, return_tensors="pt")
encoded_video = prcocess_out.pixel_values_videos
expected_output_video_shape = (171500, 1176)
self.assertEqual(tuple(encoded_video.shape), expected_output_video_shape)