Add Idefics2/3 and SmolVLM Fast image processors + improvements for fast image processors (#38157)

* add working idefics2 fast and improvements for fast nested images processing * add fast image processors idefics 3 and smolvlm * cleanup tests * fic doc idefics2 * PR review and fix issues after merge * Force providing disable_grouping to group_images_by_shape * simplify group_images_by_shape * fix modular * Fix nits after review
2025-06-23 10:17:25 -04:00
parent 1a96127e46
commit d29482cc91
61 changed files with 2023 additions and 425 deletions
--- a/tests/models/siglip2/test_image_processing_siglip2.py
+++ b/tests/models/siglip2/test_image_processing_siglip2.py
@@ -18,14 +18,11 @@ import unittest
 import requests

 from transformers.testing_utils import require_torch, require_vision
-from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
+from transformers.utils import is_torchvision_available, is_vision_available

 from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs


-if is_torch_available():
-    import torch
-
 if is_vision_available():
    from PIL import Image

@@ -150,7 +147,6 @@ class Siglip2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    def test_call_numpy_4_channels(self):
        pass

-    # increase mean tolerance to 1e-3 -> 2e-3
    # Ignore copy
    def test_slow_fast_equivalence(self):
        if not self.test_slow_image_processor or not self.test_fast_image_processor:
@@ -167,10 +163,7 @@ class Siglip2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):

        encoding_slow = image_processor_slow(dummy_image, return_tensors="pt")
        encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")
-        torch.testing.assert_close(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=1e-1, rtol=1e-1)
-        self.assertLessEqual(
-            torch.mean(torch.abs(encoding_slow.pixel_values - encoding_fast.pixel_values)).item(), 2e-3
-        )
+        self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)

    # increase mean tolerance to 1e-3 -> 2e-3
    # Ignore copy
@@ -193,7 +186,4 @@ class Siglip2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
        encoding_slow = image_processor_slow(dummy_images, return_tensors="pt")
        encoding_fast = image_processor_fast(dummy_images, return_tensors="pt")

-        torch.testing.assert_close(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=1e-1, rtol=1e-1)
-        self.assertLessEqual(
-            torch.mean(torch.abs(encoding_slow.pixel_values - encoding_fast.pixel_values)).item(), 2e-3
-        )
+        self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)