Add VideoMAE (#17821)

* First draft * Add VideoMAEForVideoClassification * Improve conversion script * Add VideoMAEForPreTraining * Add VideoMAEFeatureExtractor * Improve VideoMAEFeatureExtractor * Improve docs * Add first draft of model tests * Improve VideoMAEForPreTraining * Fix base_model_prefix * Make model take pixel_values of shape (B, T, C, H, W) * Add loss computation of VideoMAEForPreTraining * Improve tests * Improve model tests * Make all tests pass * Add VideoMAE to main README * Add tests for VideoMAEFeatureExtractor * Add integration test * Improve conversion script * Rename patch embedding class * Remove VideoMAELayer from init * Update design of patch embeddings * Improve comments * Improve conversion script * Improve conversion script * Add conversion of pretrained model * Add loss verification of pretrained model * Add loss verification of unnormalized targets * Add integration test for pretraining model * Apply suggestions from code review * Fix bug to make feature extractor resize only shorter edge * Address more comments * Improve normalization of videos * Add doc examples * Move constants to dedicated script * Remove scripts * Transfer checkpoints, fix docs * Update script * Update image mean and std * Fix doc tests * Set return_tensors to NumPy by default * Revert the previous change Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
2022-08-04 18:02:55 +02:00
parent 672b66262a
commit f9a0008d2d
29 changed files with 2596 additions and 33 deletions
--- a/tests/test_feature_extraction_common.py
+++ b/tests/test_feature_extraction_common.py
@@ -48,49 +48,91 @@ SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR = get_tests_dir("fixtures")
 def prepare_image_inputs(feature_extract_tester, equal_resolution=False, numpify=False, torchify=False):
    """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
    or a list of PyTorch tensors if one specifies torchify=True.
+
+    Set equal_resolution=True to give every image size max_resolution; otherwise sizes are sampled at random.
    """

    assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"

-    if equal_resolution:
-        image_inputs = []
-        for i in range(feature_extract_tester.batch_size):
-            image_inputs.append(
-                np.random.randint(
-                    255,
-                    size=(
-                        feature_extract_tester.num_channels,
-                        feature_extract_tester.max_resolution,
-                        feature_extract_tester.max_resolution,
-                    ),
-                    dtype=np.uint8,
-                )
-            )
-    else:
-        image_inputs = []
-
-        # To avoid getting image width/height 0
-        min_resolution = feature_extract_tester.min_resolution
-        if getattr(feature_extract_tester, "size_divisor", None):
-            # If `size_divisor` is defined, the image needs to have width/size >= `size_divisor`
-            min_resolution = max(feature_extract_tester.size_divisor, min_resolution)
-
-        for i in range(feature_extract_tester.batch_size):
+    image_inputs = []
+    for i in range(feature_extract_tester.batch_size):
+        if equal_resolution:
+            width = height = feature_extract_tester.max_resolution
+        else:
+            # To avoid getting image width/height 0
+            min_resolution = feature_extract_tester.min_resolution
+            if getattr(feature_extract_tester, "size_divisor", None):
+                # If `size_divisor` is defined, the image needs to have width/height >= `size_divisor`
+                min_resolution = max(feature_extract_tester.size_divisor, min_resolution)
            width, height = np.random.choice(np.arange(min_resolution, feature_extract_tester.max_resolution), 2)
-            image_inputs.append(
-                np.random.randint(255, size=(feature_extract_tester.num_channels, width, height), dtype=np.uint8)
+        # Each image is generated channels-first as (num_channels, width, height) with dtype uint8
+        image_inputs.append(
+            np.random.randint(
+                255,
+                size=(
+                    feature_extract_tester.num_channels,
+                    width,
+                    height,
+                ),
+                dtype=np.uint8,
            )
+        )

    if not numpify and not torchify:
        # PIL expects the channel dimension as last dimension
-        image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs]
+        image_inputs = [Image.fromarray(np.moveaxis(image, 0, -1)) for image in image_inputs]

    if torchify:
-        image_inputs = [torch.from_numpy(x) for x in image_inputs]
+        image_inputs = [torch.from_numpy(image) for image in image_inputs]

    return image_inputs


+def prepare_video(feature_extract_tester, width=10, height=10, numpify=False, torchify=False):
+    """Build one random video as a list of `feature_extract_tester.num_frames` frames.
+
+    Every frame is generated as a uint8 NumPy array of shape (num_channels, width, height). The frames are returned
+    as PyTorch tensors if torchify=True, as the raw NumPy arrays if numpify=True, and as PIL images otherwise.
+    """
+
+    num_frames = feature_extract_tester.num_frames
+    frames = [
+        np.random.randint(255, size=(feature_extract_tester.num_channels, width, height), dtype=np.uint8)
+        for _ in range(num_frames)
+    ]
+
+    if torchify:
+        # torchify is checked first, so it wins even if numpify is also set
+        return [torch.from_numpy(frame) for frame in frames]
+
+    if numpify:
+        return frames
+
+    # PIL expects the channel dimension as last dimension
+    return [Image.fromarray(np.moveaxis(frame, 0, -1)) for frame in frames]
+
+
+def prepare_video_inputs(feature_extract_tester, equal_resolution=False, numpify=False, torchify=False):
+    """This function prepares a batch of videos: a list of list of PIL images, or a list of list of numpy arrays if
+    one specifies numpify=True, or a list of list of PyTorch tensors if one specifies torchify=True.
+
+    One can specify whether the videos are of the same resolution or not: with equal_resolution=True every video is
+    generated at `max_resolution` x `max_resolution`, otherwise each video gets its own random width and height
+    drawn from [min_resolution, max_resolution).
+
+    The batch contains `feature_extract_tester.batch_size` videos. An AssertionError is raised if both numpify and
+    torchify are set.
+    """
+
+    assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
+
+    video_inputs = []
+    for _ in range(feature_extract_tester.batch_size):
+        if equal_resolution:
+            width = height = feature_extract_tester.max_resolution
+        else:
+            width, height = np.random.choice(
+                np.arange(feature_extract_tester.min_resolution, feature_extract_tester.max_resolution), 2
+            )
+        # Build the video for both branches: this call must not be nested under `else`, otherwise `video` is
+        # never assigned when equal_resolution=True and the append below fails.
+        video = prepare_video(
+            feature_extract_tester=feature_extract_tester,
+            width=width,
+            height=height,
+            numpify=numpify,
+            torchify=torchify,
+        )
+        video_inputs.append(video)
+
+    return video_inputs
+
+
 class FeatureExtractionSavingTestMixin:
    def test_feat_extract_to_json_string(self):
        feat_extract = self.feature_extraction_class(**self.feat_extract_dict)