From 41b9b92b52215bed472c9a534a06abbc3a9a95cd Mon Sep 17 00:00:00 2001
From: Raushan Turganbay <raushan@huggingface.co>
Date: Thu, 3 Apr 2025 19:48:56 +0200
Subject: [PATCH] [qwen-vl] fix image processor (#37258)

* fix

* add test
---
 .../models/qwen2_vl/image_processing_qwen2_vl.py  | 11 ++++++-----
 .../qwen2_vl/image_processing_qwen2_vl_fast.py    | 11 ++++++-----
 .../qwen2_vl/test_image_processing_qwen2_vl.py    | 15 +++++++++++++++
 3 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
index 10bc8bc69a..732d44d53b 100644
--- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
@@ -379,17 +379,18 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
 
         """
+        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
+        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
+
         if size is not None:
             if "shortest_edge" not in size or "longest_edge" not in size:
                 raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
             min_pixels = size["shortest_edge"]
+        elif min_pixels is not None and max_pixels is not None:
+            # backward compatibility: override size with min_pixels and max_pixels if they are provided
+            size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
         else:
             size = {**self.size}
-        # backward compatibility: override size with min_pixels and max_pixels if they are provided
-        if min_pixels is not None:
-            size["shortest_edge"] = min_pixels
-        if max_pixels is not None:
-            size["longest_edge"] = max_pixels
 
         do_resize = do_resize if do_resize is not None else self.do_resize
 
diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py
index 661f5ed8b1..60b62449d3 100644
--- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py
+++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py
@@ -334,17 +334,18 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
             device (`torch.device`, *optional*):
                 The device to process the images on. If unset, the device is inferred from the input images.
         """
+        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
+        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
+
         if size is not None:
             if "shortest_edge" not in size or "longest_edge" not in size:
                 raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
             min_pixels = size["shortest_edge"]
+        elif min_pixels is not None and max_pixels is not None:
+            # backward compatibility: override size with min_pixels and max_pixels if they are provided
+            size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
         else:
             size = {**self.size}
-        # backward compatibility: override size with min_pixels and max_pixels if they are provided
-        if min_pixels is not None:
-            size["shortest_edge"] = min_pixels
-        if max_pixels is not None:
-            size["longest_edge"] = max_pixels
 
         do_resize = do_resize if do_resize is not None else self.do_resize
         size = size if size is not None else self.size
diff --git a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
index bfa4dca85e..95d758f438 100644
--- a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import tempfile
 import unittest
 
 import numpy as np
@@ -298,6 +299,20 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
                 expected_output_video_shape = (171500, 1176)
                 self.assertEqual(tuple(encoded_video.shape), expected_output_video_shape)
 
+    def test_custom_image_size(self):
+        for image_processing_class in self.image_processor_list:
+            image_processing = image_processing_class(**self.image_processor_dict)
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                image_processing.save_pretrained(tmpdirname)
+                image_processor_loaded = image_processing_class.from_pretrained(
+                    tmpdirname, max_pixels=56 * 56, min_pixels=28 * 28
+                )
+
+            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
+            prcocess_out = image_processor_loaded(image_inputs, return_tensors="pt")
+            expected_output_video_shape = [112, 1176]
+            self.assertListEqual(list(prcocess_out.pixel_values.shape), expected_output_video_shape)
+
     @require_vision
     @require_torch
     def test_slow_fast_equivalence(self):