[qwen-vl] fix image processor (#37258)

* fix

* add test
This commit is contained in:
Raushan Turganbay
2025-04-03 19:48:56 +02:00
committed by GitHub
parent 8dd0a2b89c
commit 41b9b92b52
3 changed files with 27 additions and 10 deletions

View File

@@ -379,17 +379,18 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
""" """
min_pixels = min_pixels if min_pixels is not None else self.min_pixels
max_pixels = max_pixels if max_pixels is not None else self.max_pixels
if size is not None: if size is not None:
if "shortest_edge" not in size or "longest_edge" not in size: if "shortest_edge" not in size or "longest_edge" not in size:
raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.") raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
min_pixels = size["shortest_edge"] min_pixels = size["shortest_edge"]
elif min_pixels is not None and max_pixels is not None:
# backward compatibility: override size with min_pixels and max_pixels if they are provided
size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
else: else:
size = {**self.size} size = {**self.size}
# backward compatibility: override size with min_pixels and max_pixels if they are provided
if min_pixels is not None:
size["shortest_edge"] = min_pixels
if max_pixels is not None:
size["longest_edge"] = max_pixels
do_resize = do_resize if do_resize is not None else self.do_resize do_resize = do_resize if do_resize is not None else self.do_resize

View File

@@ -334,17 +334,18 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
device (`torch.device`, *optional*): device (`torch.device`, *optional*):
The device to process the images on. If unset, the device is inferred from the input images. The device to process the images on. If unset, the device is inferred from the input images.
""" """
min_pixels = min_pixels if min_pixels is not None else self.min_pixels
max_pixels = max_pixels if max_pixels is not None else self.max_pixels
if size is not None: if size is not None:
if "shortest_edge" not in size or "longest_edge" not in size: if "shortest_edge" not in size or "longest_edge" not in size:
raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.") raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
min_pixels = size["shortest_edge"] min_pixels = size["shortest_edge"]
elif min_pixels is not None and max_pixels is not None:
# backward compatibility: override size with min_pixels and max_pixels if they are provided
size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
else: else:
size = {**self.size} size = {**self.size}
# backward compatibility: override size with min_pixels and max_pixels if they are provided
if min_pixels is not None:
size["shortest_edge"] = min_pixels
if max_pixels is not None:
size["longest_edge"] = max_pixels
do_resize = do_resize if do_resize is not None else self.do_resize do_resize = do_resize if do_resize is not None else self.do_resize
size = size if size is not None else self.size size = size if size is not None else self.size

View File

@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import tempfile
import unittest import unittest
import numpy as np import numpy as np
@@ -298,6 +299,20 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
expected_output_video_shape = (171500, 1176) expected_output_video_shape = (171500, 1176)
self.assertEqual(tuple(encoded_video.shape), expected_output_video_shape) self.assertEqual(tuple(encoded_video.shape), expected_output_video_shape)
def test_custom_image_size(self):
    """Check that pixel bounds given to `from_pretrained` shape the output.

    Each image processor class is saved to a temporary directory and
    reloaded with explicit `min_pixels` / `max_pixels` keyword arguments.
    The reloaded processor is then run on equal-resolution inputs and the
    shape of `pixel_values` is compared with the expected `[112, 1176]`.
    """
    # Pixel bounds passed as overrides when reloading the saved processor.
    min_pixels = 28 * 28
    max_pixels = 56 * 56
    expected_pixel_values_shape = [112, 1176]

    for processor_cls in self.image_processor_list:
        original_processor = processor_cls(**self.image_processor_dict)

        # The temporary directory is only needed for the save/reload round trip.
        with tempfile.TemporaryDirectory() as save_dir:
            original_processor.save_pretrained(save_dir)
            reloaded_processor = processor_cls.from_pretrained(
                save_dir, max_pixels=max_pixels, min_pixels=min_pixels
            )

        images = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
        process_out = reloaded_processor(images, return_tensors="pt")
        actual_shape = list(process_out.pixel_values.shape)
        self.assertListEqual(actual_shape, expected_pixel_values_shape)
@require_vision @require_vision
@require_torch @require_torch
def test_slow_fast_equivalence(self): def test_slow_fast_equivalence(self):