From e6a7981711304474355dfa41884e78b63b8318c8 Mon Sep 17 00:00:00 2001 From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Date: Thu, 13 Feb 2025 17:14:30 -0500 Subject: [PATCH] Fix make_batched_videos and add tests (#36143) * add support for initial shift in video processing and other fixes * revert modifications video loading functions --- src/transformers/image_utils.py | 2 +- tests/utils/test_image_utils.py | 38 ++++++++++++++++++++++++--------- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 2cec08ae9c..ad439b5d9f 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -314,7 +314,7 @@ def make_batched_videos(videos) -> VideoInput: if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): # case 1: nested batch of videos so we flatten it if not is_pil_image(videos[0][0]) and videos[0][0].ndim == 4: - videos = [video for batch_list in videos for video in batch_list] + videos = [[video for batch_list in batched_videos for video in batch_list] for batched_videos in videos] # case 2: list of videos represented as list of video frames return videos diff --git a/tests/utils/test_image_utils.py b/tests/utils/test_image_utils.py index d4ce1435a1..1d2682a85b 100644 --- a/tests/utils/test_image_utils.py +++ b/tests/utils/test_image_utils.py @@ -424,14 +424,14 @@ class ImageFeatureExtractionTester(unittest.TestCase): def test_make_batched_videos_numpy(self): # Test a single image is converted to a list of 1 video with 1 frame images = np.random.randint(0, 256, (16, 32, 3)) - videos_list = make_nested_list_of_images(images) + videos_list = make_batched_videos(images) self.assertIsInstance(videos_list[0], list) self.assertEqual(len(videos_list), 1) self.assertTrue(np.array_equal(videos_list[0][0], images)) # Test a 4d array of images is converted to a a list of 1 video images = np.random.randint(0, 256, (4, 16, 32, 3)) - videos_list = make_nested_list_of_images(images) + videos_list = make_batched_videos(images) self.assertIsInstance(videos_list[0], list) self.assertIsInstance(videos_list[0][0], np.ndarray) self.assertEqual(len(videos_list), 1) @@ -440,7 +440,7 @@ class ImageFeatureExtractionTester(unittest.TestCase): # Test a list of images is converted to a list of videos images = [np.random.randint(0, 256, (16, 32, 3)) for _ in range(4)] - videos_list = make_nested_list_of_images(images) + videos_list = make_batched_videos(images) self.assertIsInstance(videos_list[0], list) self.assertEqual(len(videos_list), 1) self.assertEqual(len(videos_list[0]), 4) @@ -448,7 +448,7 @@ class ImageFeatureExtractionTester(unittest.TestCase): # Test a nested list of images is left unchanged images = [[np.random.randint(0, 256, (16, 32, 3)) for _ in range(2)] for _ in range(2)] - videos_list = make_nested_list_of_images(images) + videos_list = make_batched_videos(images) self.assertIsInstance(videos_list[0], list) self.assertEqual(len(videos_list), 2) self.assertEqual(len(videos_list[0]), 2) @@ -456,25 +456,34 @@ class ImageFeatureExtractionTester(unittest.TestCase): # Test a list of 4d array images is converted to a list of videos images = [np.random.randint(0, 256, (4, 16, 32, 3)) for _ in range(2)] - videos_list = make_nested_list_of_images(images) + videos_list = make_batched_videos(images) self.assertIsInstance(videos_list[0], list) self.assertIsInstance(videos_list[0][0], np.ndarray) self.assertEqual(len(videos_list), 2) self.assertEqual(len(videos_list[0]), 4) self.assertTrue(np.array_equal(videos_list[0][0], images[0][0])) + # Test a batch of list of 4d array images is converted to a list of videos + images = [[np.random.randint(0, 256, (4, 16, 32, 3)) for _ in range(2)] for _ in range(2)] + videos_list = make_batched_videos(images) + self.assertIsInstance(videos_list[0], list) + self.assertIsInstance(videos_list[0][0], np.ndarray) + self.assertEqual(len(videos_list), 2) + self.assertEqual(len(videos_list[0]), 8) + self.assertTrue(np.array_equal(videos_list[0][0], images[0][0][0])) + @require_torch def test_make_batched_videos_torch(self): # Test a single image is converted to a list of 1 video with 1 frame images = torch.randint(0, 256, (16, 32, 3)) - videos_list = make_nested_list_of_images(images) + videos_list = make_batched_videos(images) self.assertIsInstance(videos_list[0], list) self.assertEqual(len(videos_list[0]), 1) self.assertTrue(np.array_equal(videos_list[0][0], images)) # Test a 4d tensor of images is converted to a list of 1 video images = torch.randint(0, 256, (4, 16, 32, 3)) - videos_list = make_nested_list_of_images(images) + videos_list = make_batched_videos(images) self.assertIsInstance(videos_list[0], list) self.assertIsInstance(videos_list[0][0], torch.Tensor) self.assertEqual(len(videos_list), 1) @@ -483,7 +492,7 @@ class ImageFeatureExtractionTester(unittest.TestCase): # Test a list of images is converted to a list of videos images = [torch.randint(0, 256, (16, 32, 3)) for _ in range(4)] - videos_list = make_nested_list_of_images(images) + videos_list = make_batched_videos(images) self.assertIsInstance(videos_list[0], list) self.assertEqual(len(videos_list), 1) self.assertEqual(len(videos_list[0]), 4) @@ -491,7 +500,7 @@ class ImageFeatureExtractionTester(unittest.TestCase): # Test a nested list of images is left unchanged images = [[torch.randint(0, 256, (16, 32, 3)) for _ in range(2)] for _ in range(2)] - videos_list = make_nested_list_of_images(images) + videos_list = make_batched_videos(images) self.assertIsInstance(videos_list[0], list) self.assertEqual(len(videos_list), 2) self.assertEqual(len(videos_list[0]), 2) @@ -499,13 +508,22 @@ class ImageFeatureExtractionTester(unittest.TestCase): # Test a list of 4d tensor images is converted to a list of videos images = [torch.randint(0, 256, (4, 16, 32, 3)) for _ in range(2)] - videos_list = make_nested_list_of_images(images) + videos_list = make_batched_videos(images) self.assertIsInstance(videos_list[0], list) self.assertIsInstance(videos_list[0][0], torch.Tensor) self.assertEqual(len(videos_list), 2) self.assertEqual(len(videos_list[0]), 4) self.assertTrue(np.array_equal(videos_list[0][0], images[0][0])) + # Test a batch of list of 4d tensor images is converted to a list of videos + images = [[torch.randint(0, 256, (4, 16, 32, 3)) for _ in range(2)] for _ in range(2)] + videos_list = make_batched_videos(images) + self.assertIsInstance(videos_list[0], list) + self.assertIsInstance(videos_list[0][0], torch.Tensor) + self.assertEqual(len(videos_list), 2) + self.assertEqual(len(videos_list[0]), 8) + self.assertTrue(np.array_equal(videos_list[0][0], images[0][0][0])) + @require_torch def test_conversion_torch_to_array(self): feature_extractor = ImageFeatureExtractionMixin()