Add Idefics2/3 and SmolVLM Fast image processors + improvements for fast image processors (#38157)
* add working idefics2 fast and improvements for fast nested images processing * add fast image processors idefics 3 and smolvlm * cleanup tests * fix doc idefics2 * PR review and fix issues after merge * Force providing disable_grouping to group_images_by_shape * simplify group_images_by_shape * fix modular * Fix nits after review
This commit is contained in:
@@ -12,7 +12,6 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import time
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
@@ -214,29 +213,6 @@ class PixtralImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs_list)
|
||||
self.assertEqual(tuple(batch_encoded_images.shape), expected_output_image_shape)
|
||||
|
||||
@require_vision
|
||||
@require_torch
|
||||
def test_fast_is_faster_than_slow(self):
|
||||
if not self.test_slow_image_processor or not self.test_fast_image_processor:
|
||||
self.skipTest(reason="Skipping speed test")
|
||||
|
||||
if self.image_processing_class is None or self.fast_image_processing_class is None:
|
||||
self.skipTest(reason="Skipping speed test as one of the image processors is not defined")
|
||||
|
||||
def measure_time(image_processor, image):
|
||||
start = time.time()
|
||||
_ = image_processor(image, return_tensors="pt")
|
||||
return time.time() - start
|
||||
|
||||
image_inputs_list = self.image_processor_tester.prepare_image_inputs(torchify=True)
|
||||
image_processor_slow = self.image_processing_class(**self.image_processor_dict)
|
||||
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
|
||||
|
||||
fast_time = measure_time(image_processor_fast, image_inputs_list)
|
||||
slow_time = measure_time(image_processor_slow, image_inputs_list)
|
||||
|
||||
self.assertLessEqual(fast_time, slow_time)
|
||||
|
||||
@require_vision
|
||||
@require_torch
|
||||
def test_slow_fast_equivalence(self):
|
||||
@@ -255,9 +231,7 @@ class PixtralImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
|
||||
encoding_slow = image_processor_slow(dummy_image, return_tensors="pt")
|
||||
encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")
|
||||
torch.testing.assert_close(
|
||||
encoding_slow.pixel_values[0][0], encoding_fast.pixel_values[0][0], rtol=100, atol=1e-1
|
||||
)
|
||||
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values[0][0], encoding_fast.pixel_values[0][0])
|
||||
|
||||
@require_vision
|
||||
@require_torch
|
||||
@@ -282,14 +256,8 @@ class PixtralImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
encoding_fast = image_processor_fast(dummy_images, return_tensors="pt")
|
||||
|
||||
for i in range(len(encoding_slow.pixel_values)):
|
||||
self.assertTrue(
|
||||
torch.allclose(encoding_slow.pixel_values[i][0], encoding_fast.pixel_values[i][0], atol=1e-1)
|
||||
)
|
||||
self.assertLessEqual(
|
||||
torch.mean(torch.abs(encoding_slow.pixel_values[i][0] - encoding_fast.pixel_values[i][0])).item(), 1e-3
|
||||
)
|
||||
torch.testing.assert_close(
|
||||
encoding_slow.pixel_values[0][0], encoding_fast.pixel_values[0][0], rtol=100, atol=1e-1
|
||||
self._assert_slow_fast_tensors_equivalence(
|
||||
encoding_slow.pixel_values[i][0], encoding_fast.pixel_values[i][0]
|
||||
)
|
||||
|
||||
@slow
|
||||
@@ -309,8 +277,8 @@ class PixtralImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processor = torch.compile(image_processor, mode="reduce-overhead")
|
||||
output_compiled = image_processor(input_image, device=torch_device, return_tensors="pt")
|
||||
|
||||
torch.testing.assert_close(
|
||||
output_eager.pixel_values[0][0], output_compiled.pixel_values[0][0], rtol=1e-4, atol=1e-4
|
||||
self._assert_slow_fast_tensors_equivalence(
|
||||
output_eager.pixel_values[0][0], output_compiled.pixel_values[0][0], atol=1e-4, rtol=1e-4, mean_atol=1e-5
|
||||
)
|
||||
|
||||
@unittest.skip(reason="PixtralImageProcessor doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy
|
||||
|
||||
Reference in New Issue
Block a user