Fix ViT-MAE decoder interpolate (#33330)

* Fix ViT-MAE decoder interpolate
* Add unit test for `interpolate_pos_encoding` w/ custom sizes
* [run_slow] vit_mae
2024-09-30 18:47:13 +02:00
parent 1dba608df9
commit 18c5b216f1
2 changed files with 43 additions and 16 deletions
--- a/tests/models/vit_mae/test_modeling_vit_mae.py
+++ b/tests/models/vit_mae/test_modeling_vit_mae.py
@@ -298,12 +298,16 @@ class ViTMAEModelIntegrationTest(unittest.TestCase):
    def default_image_processor(self):
        return ViTImageProcessor.from_pretrained("facebook/vit-mae-base")

+    @cached_property
+    def default_model(self):
+        return ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base").to(torch_device)
+
    @slow
    def test_inference_for_pretraining(self):
        # make random mask reproducible across the PT and TF model
        np.random.seed(2)

-        model = ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base").to(torch_device)
+        model = self.default_model

        image_processor = self.default_image_processor
        image = prepare_img()
@@ -313,11 +317,11 @@ class ViTMAEModelIntegrationTest(unittest.TestCase):
        # (this way we can ensure that the PT and TF models operate on the same inputs)
        vit_mae_config = ViTMAEConfig()
        num_patches = int((vit_mae_config.image_size // vit_mae_config.patch_size) ** 2)
-        noise = np.random.uniform(size=(1, num_patches))
+        noise = torch.from_numpy(np.random.uniform(size=(1, num_patches))).to(device=torch_device)

        # forward pass
        with torch.no_grad():
-            outputs = model(**inputs, noise=torch.from_numpy(noise).to(device=torch_device))
+            outputs = model(**inputs, noise=noise)

        # verify the logits
        expected_shape = torch.Size((1, 196, 768))
@@ -339,7 +343,7 @@ class ViTMAEModelIntegrationTest(unittest.TestCase):
        # make random mask reproducible across the PT and TF model
        np.random.seed(2)

-        model = ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base").to(torch_device)
+        model = self.default_model

        image_processor = self.default_image_processor
        image = prepare_img()
@@ -349,14 +353,38 @@ class ViTMAEModelIntegrationTest(unittest.TestCase):
        # (this way we can ensure that the PT and TF models operate on the same inputs)
        vit_mae_config = ViTMAEConfig()
        num_patches = (image.height // vit_mae_config.patch_size) * (image.width // vit_mae_config.patch_size)
-        noise = np.random.uniform(size=(1, num_patches))
+        noise = torch.from_numpy(np.random.uniform(size=(1, num_patches))).to(device=torch_device)

        # forward pass
        with torch.no_grad():
-            outputs = model(
-                **inputs, noise=torch.from_numpy(noise).to(device=torch_device), interpolate_pos_encoding=True
-            )
+            outputs = model(**inputs, noise=noise, interpolate_pos_encoding=True)

        # verify the logits
        expected_shape = torch.Size((1, 1200, 768))
        self.assertEqual(outputs.logits.shape, expected_shape)
+
+    @slow
+    def test_inference_interpolate_pos_encoding_custom_sizes(self):
+        # Ensure custom sizes are correctly handled when interpolating the position embeddings
+
+        # seed NumPy like the sibling tests (NOTE(review): no `noise` is drawn here; confirm it is needed)
+        np.random.seed(2)
+
+        model = self.default_model
+        image_processor = self.default_image_processor
+
+        image = prepare_img()
+        inputs = image_processor(images=image, return_tensors="pt", size={"height": 256, "width": 256}).to(
+            torch_device
+        )
+
+        # forward pass
+        with torch.no_grad():
+            outputs = model(
+                **inputs,
+                interpolate_pos_encoding=True,
+            )
+
+        # verify the logits
+        expected_shape = torch.Size((1, 256, 768))
+        self.assertEqual(outputs.logits.shape, expected_shape)