From 41b9b92b52215bed472c9a534a06abbc3a9a95cd Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Thu, 3 Apr 2025 19:48:56 +0200 Subject: [PATCH] [qwen-vl] fix image processor (#37258) * fix * add test --- .../models/qwen2_vl/image_processing_qwen2_vl.py | 11 ++++++----- .../qwen2_vl/image_processing_qwen2_vl_fast.py | 11 ++++++----- .../qwen2_vl/test_image_processing_qwen2_vl.py | 15 +++++++++++++++ 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py index 10bc8bc69a..732d44d53b 100644 --- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py @@ -379,17 +379,18 @@ class Qwen2VLImageProcessor(BaseImageProcessor): - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. """ + min_pixels = min_pixels if min_pixels is not None else self.min_pixels + max_pixels = max_pixels if max_pixels is not None else self.max_pixels + if size is not None: if "shortest_edge" not in size or "longest_edge" not in size: raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.") min_pixels = size["shortest_edge"] + elif min_pixels is not None and max_pixels is not None: + # backward compatibility: override size with min_pixels and max_pixels if they are provided + size = {"shortest_edge": min_pixels, "longest_edge": max_pixels} else: size = {**self.size} - # backward compatibility: override size with min_pixels and max_pixels if they are provided - if min_pixels is not None: - size["shortest_edge"] = min_pixels - if max_pixels is not None: - size["longest_edge"] = max_pixels do_resize = do_resize if do_resize is not None else self.do_resize diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py index 661f5ed8b1..60b62449d3 100644 --- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py @@ -334,17 +334,18 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast): device (`torch.device`, *optional*): The device to process the images on. If unset, the device is inferred from the input images. """ + min_pixels = min_pixels if min_pixels is not None else self.min_pixels + max_pixels = max_pixels if max_pixels is not None else self.max_pixels + if size is not None: if "shortest_edge" not in size or "longest_edge" not in size: raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.") min_pixels = size["shortest_edge"] + elif min_pixels is not None and max_pixels is not None: + # backward compatibility: override size with min_pixels and max_pixels if they are provided + size = {"shortest_edge": min_pixels, "longest_edge": max_pixels} else: size = {**self.size} - # backward compatibility: override size with min_pixels and max_pixels if they are provided - if min_pixels is not None: - size["shortest_edge"] = min_pixels - if max_pixels is not None: - size["longest_edge"] = max_pixels do_resize = do_resize if do_resize is not None else self.do_resize size = size if size is not None else self.size diff --git a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py index bfa4dca85e..95d758f438 100644 --- a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import tempfile import unittest import numpy as np @@ -298,6 +299,20 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): expected_output_video_shape = (171500, 1176) self.assertEqual(tuple(encoded_video.shape), expected_output_video_shape) + def test_custom_image_size(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + with tempfile.TemporaryDirectory() as tmpdirname: + image_processing.save_pretrained(tmpdirname) + image_processor_loaded = image_processing_class.from_pretrained( + tmpdirname, max_pixels=56 * 56, min_pixels=28 * 28 + ) + + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True) + prcocess_out = image_processor_loaded(image_inputs, return_tensors="pt") + expected_output_video_shape = [112, 1176] + self.assertListEqual(list(prcocess_out.pixel_values.shape), expected_output_video_shape) + @require_vision @require_torch def test_slow_fast_equivalence(self):