From afdb821318e06e670c7238a9059e7e031065e319 Mon Sep 17 00:00:00 2001
From: rdonggroq <rdong@groq.com>
Date: Tue, 10 Jun 2025 04:59:22 -0400
Subject: [PATCH] Fix smart resize (#38706)

* Fix smart_resize bug

* Add smart_resize test

* Remove unnecessary error checking

* Fix smart_resize tests

---------

Co-authored-by: Richard Dong <rdong@rdong.c.groq-143208.internal>
---
 .../models/emu3/image_processing_emu3.py      |  8 +-
 .../qwen2_vl/image_processing_qwen2_vl.py     |  8 +-
 .../test_image_processing_qwen2_vl.py         | 73 +++++++++++--------
 3 files changed, 49 insertions(+), 40 deletions(-)

diff --git a/src/transformers/models/emu3/image_processing_emu3.py b/src/transformers/models/emu3/image_processing_emu3.py
index be57f8f21e..c82f2dc42a 100644
--- a/src/transformers/models/emu3/image_processing_emu3.py
+++ b/src/transformers/models/emu3/image_processing_emu3.py
@@ -81,9 +81,7 @@ def smart_resize(
     3. The aspect ratio of the image is maintained as closely as possible.
 
     """
-    if height < factor or width < factor:
-        raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
-    elif max(height, width) / min(height, width) > 200:
+    if max(height, width) / min(height, width) > 200:
         raise ValueError(
             f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
         )
@@ -91,8 +89,8 @@ def smart_resize(
     w_bar = round(width / factor) * factor
     if h_bar * w_bar > max_pixels:
         beta = math.sqrt((height * width) / max_pixels)
-        h_bar = math.floor(height / beta / factor) * factor
-        w_bar = math.floor(width / beta / factor) * factor
+        h_bar = max(factor, math.floor(height / beta / factor) * factor)
+        w_bar = max(factor, math.floor(width / beta / factor) * factor)
     elif h_bar * w_bar < min_pixels:
         beta = math.sqrt(min_pixels / (height * width))
         h_bar = math.ceil(height * beta / factor) * factor
diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
index 48e8594b12..a4826428ac 100644
--- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
@@ -64,9 +64,7 @@ def smart_resize(
     3. The aspect ratio of the image is maintained as closely as possible.
 
     """
-    if height < factor or width < factor:
-        raise ValueError(f"height:{height} and width:{width} must be larger than factor:{factor}")
-    elif max(height, width) / min(height, width) > 200:
+    if max(height, width) / min(height, width) > 200:
         raise ValueError(
             f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
         )
@@ -74,8 +72,8 @@ def smart_resize(
     w_bar = round(width / factor) * factor
     if h_bar * w_bar > max_pixels:
         beta = math.sqrt((height * width) / max_pixels)
-        h_bar = math.floor(height / beta / factor) * factor
-        w_bar = math.floor(width / beta / factor) * factor
+        h_bar = max(factor, math.floor(height / beta / factor) * factor)
+        w_bar = max(factor, math.floor(width / beta / factor) * factor)
     elif h_bar * w_bar < min_pixels:
         beta = math.sqrt(min_pixels / (height * width))
         h_bar = math.ceil(height * beta / factor) * factor
diff --git a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
index 5e600338b3..2171a7ddb6 100644
--- a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import itertools
 import tempfile
 import unittest
 
@@ -169,18 +170,18 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
                 self.assertIsInstance(image[0], Image.Image)
 
             # Test not batched input
-            prcocess_out = image_processing(image_inputs[0], return_tensors="pt")
-            encoded_images = prcocess_out.pixel_values
-            image_grid_thws = prcocess_out.image_grid_thw
+            process_out = image_processing(image_inputs[0], return_tensors="pt")
+            encoded_images = process_out.pixel_values
+            image_grid_thws = process_out.image_grid_thw
             expected_output_image_shape = (4900, 1176)
             expected_image_grid_thws = torch.Tensor([[1, 70, 70]])
             self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
             self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
 
             # Test batched
-            prcocess_out = image_processing(image_inputs, return_tensors="pt")
-            encoded_images = prcocess_out.pixel_values
-            image_grid_thws = prcocess_out.image_grid_thw
+            process_out = image_processing(image_inputs, return_tensors="pt")
+            encoded_images = process_out.pixel_values
+            image_grid_thws = process_out.image_grid_thw
             expected_output_image_shape = (34300, 1176)
             expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7)
             self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
@@ -196,18 +197,18 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
                 self.assertIsInstance(image[0], np.ndarray)
 
             # Test not batched input
-            prcocess_out = image_processing(image_inputs[0], return_tensors="pt")
-            encoded_images = prcocess_out.pixel_values
-            image_grid_thws = prcocess_out.image_grid_thw
+            process_out = image_processing(image_inputs[0], return_tensors="pt")
+            encoded_images = process_out.pixel_values
+            image_grid_thws = process_out.image_grid_thw
             expected_output_image_shape = (4900, 1176)
             expected_image_grid_thws = torch.Tensor([[1, 70, 70]])
             self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
             self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
 
             # Test batched
-            prcocess_out = image_processing(image_inputs, return_tensors="pt")
-            encoded_images = prcocess_out.pixel_values
-            image_grid_thws = prcocess_out.image_grid_thw
+            process_out = image_processing(image_inputs, return_tensors="pt")
+            encoded_images = process_out.pixel_values
+            image_grid_thws = process_out.image_grid_thw
             expected_output_image_shape = (34300, 1176)
             expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7)
             self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
@@ -224,18 +225,18 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
                 self.assertIsInstance(image[0], torch.Tensor)
 
             # Test not batched input
-            prcocess_out = image_processing(image_inputs[0], return_tensors="pt")
-            encoded_images = prcocess_out.pixel_values
-            image_grid_thws = prcocess_out.image_grid_thw
+            process_out = image_processing(image_inputs[0], return_tensors="pt")
+            encoded_images = process_out.pixel_values
+            image_grid_thws = process_out.image_grid_thw
             expected_output_image_shape = (4900, 1176)
             expected_image_grid_thws = torch.Tensor([[1, 70, 70]])
             self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
             self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
 
             # Test batched
-            prcocess_out = image_processing(image_inputs, return_tensors="pt")
-            encoded_images = prcocess_out.pixel_values
-            image_grid_thws = prcocess_out.image_grid_thw
+            process_out = image_processing(image_inputs, return_tensors="pt")
+            encoded_images = process_out.pixel_values
+            image_grid_thws = process_out.image_grid_thw
             expected_output_image_shape = (34300, 1176)
             expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7)
             self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
@@ -251,9 +252,9 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
             image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
 
             # Test batched as a list of images
-            prcocess_out = image_processing(image_inputs, return_tensors="pt")
-            encoded_images = prcocess_out.pixel_values
-            image_grid_thws = prcocess_out.image_grid_thw
+            process_out = image_processing(image_inputs, return_tensors="pt")
+            encoded_images = process_out.pixel_values
+            image_grid_thws = process_out.image_grid_thw
             expected_output_image_shape = (34300, 1176)
             expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7)
             self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
@@ -261,9 +262,9 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
 
             # Test batched as a nested list of images, where each sublist is one batch
             image_inputs_nested = image_inputs[:3] + image_inputs[3:]
-            prcocess_out = image_processing(image_inputs_nested, return_tensors="pt")
-            encoded_images_nested = prcocess_out.pixel_values
-            image_grid_thws_nested = prcocess_out.image_grid_thw
+            process_out = image_processing(image_inputs_nested, return_tensors="pt")
+            encoded_images_nested = process_out.pixel_values
+            image_grid_thws_nested = process_out.image_grid_thw
             expected_output_image_shape = (34300, 1176)
             expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7)
             self.assertEqual(tuple(encoded_images_nested.shape), expected_output_image_shape)
@@ -281,8 +282,8 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
             for num_frames, expected_dims in expected_dims_by_frames.items():
                 image_processor_tester = Qwen2VLImageProcessingTester(self, num_frames=num_frames)
                 video_inputs = image_processor_tester.prepare_video_inputs(equal_resolution=True)
-                prcocess_out = image_processing(None, videos=video_inputs, return_tensors="pt")
-                encoded_video = prcocess_out.pixel_values_videos
+                process_out = image_processing(None, videos=video_inputs, return_tensors="pt")
+                encoded_video = process_out.pixel_values_videos
                 expected_output_video_shape = (expected_dims, 1176)
                 self.assertEqual(tuple(encoded_video.shape), expected_output_video_shape)
 
@@ -293,8 +294,8 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
             for patch_size in (1, 3, 5, 7):
                 image_processor_tester = Qwen2VLImageProcessingTester(self, patch_size=patch_size)
                 video_inputs = image_processor_tester.prepare_video_inputs(equal_resolution=True)
-                prcocess_out = image_processing(None, videos=video_inputs, return_tensors="pt")
-                encoded_video = prcocess_out.pixel_values_videos
+                process_out = image_processing(None, videos=video_inputs, return_tensors="pt")
+                encoded_video = process_out.pixel_values_videos
                 expected_output_video_shape = (171500, 1176)
                 self.assertEqual(tuple(encoded_video.shape), expected_output_video_shape)
 
@@ -308,9 +309,21 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
                 )
 
             image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
-            prcocess_out = image_processor_loaded(image_inputs, return_tensors="pt")
+            process_out = image_processor_loaded(image_inputs, return_tensors="pt")
             expected_output_video_shape = [112, 1176]
-            self.assertListEqual(list(prcocess_out.pixel_values.shape), expected_output_video_shape)
+            self.assertListEqual(list(process_out.pixel_values.shape), expected_output_video_shape)
+
+    def test_custom_pixels(self):
+        pixel_choices = frozenset(itertools.product((100, 150, 200, 20000), (100, 150, 200, 20000)))
+        for image_processing_class in self.image_processor_list:
+            image_processor_dict = self.image_processor_dict.copy()
+            for a_pixels, b_pixels in pixel_choices:
+                image_processor_dict["min_pixels"] = min(a_pixels, b_pixels)
+                image_processor_dict["max_pixels"] = max(a_pixels, b_pixels)
+                image_processor = image_processing_class(**image_processor_dict)
+                image_inputs = self.image_processor_tester.prepare_image_inputs()
+                # Smoke test: no output is asserted; the call only has to complete without raising
+                image_processor(image_inputs, return_tensors="pt")
 
     def test_temporal_padding(self):
         for image_processing_class in self.image_processor_list: