adding positional encoder changes and tests (#32600)

* adding positional encoder changes and tests * adding ruff suggestions * changes added by python utils/check_copies.py --fix_and_overwrite * removing pos_encoding added by script * adding interpolation to clipseg * formatting * adding further testing to altclip and better documentation to kosmos2 * skipping test_inputs_embeds_matches_input_ids_with_generate in git model * fixing clipseg comment suggestions * [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip * fixing bridgetower test * fixing altclip tensor output POS test * adding ruff formatting * fixing several tests * formatting with ruff * adding positional encoder changes and tests * adding ruff suggestions * changes added by python utils/check_copies.py --fix_and_overwrite * removing pos_encoding added by script * adding interpolation to clipseg * formatting * adding further testing to altclip and better documentation to kosmos2 * skipping test_inputs_embeds_matches_input_ids_with_generate in git model * fixing clipseg comment suggestions * fixing bridgetower test * fixing altclip tensor output POS test * adding ruff formatting * fixing several tests * formatting with ruff * adding right pretrained model * [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip * fixing test_inference_image_segmentation * [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip * fixing test_inference_interpolate_pos_encoding for the git model as there is no vision_model_output * [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip * adding ruff formatting * [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip * adding new interpolate_pos_encoding function * [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip * fixing interpolate_POS funciton * adapting output tensor in teests * [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip * modifying output tensor * [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip * adding the correct tensor * [run_slow] clipseg * fixing spaces * [run_slow] clipseg * [run_slow] clipseg --------- Co-authored-by: Manuel Sanchez Hernandez <manuel.sanchez.hernandez@schibsted.com>
2024-09-25 20:05:01 +02:00
parent 19d58d31f1
commit a55adee890
16 changed files with 828 additions and 65 deletions
--- a/tests/models/altclip/test_modeling_altclip.py
+++ b/tests/models/altclip/test_modeling_altclip.py
@@ -597,3 +597,44 @@ class AltCLIPModelIntegrationTest(unittest.TestCase):
        expected_probs = torch.tensor([[9.9942e-01, 5.7805e-04]], device=torch_device)

        self.assertTrue(torch.allclose(probs, expected_probs, atol=5e-3))
+
+    @slow
+    def test_inference_interpolate_pos_encoding(self):
+        # ViT models have an `interpolate_pos_encoding` argument in their forward method,
+        # allowing to interpolate the pre-trained position embeddings in order to use
+        # the model on higher resolutions. The DINO model by Facebook AI leverages this
+        # to visualize self-attention on higher resolution images.
+        model_name = "BAAI/AltCLIP"
+        model = AltCLIPModel.from_pretrained(model_name).to(torch_device)
+
+        image_processor = AltCLIPProcessor.from_pretrained(
+            model_name, size={"shortest_edge": 180}, crop_size={"height": 180, "width": 180}
+        )
+
+        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+        inputs = image_processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device)
+
+        # interpolate_pos_encodiung false should return value error
+        with self.assertRaises(ValueError, msg="doesn't match model"):
+            with torch.no_grad():
+                model(**inputs, interpolate_pos_encoding=False)
+
+        # forward pass
+        with torch.no_grad():
+            outputs = model(**inputs, interpolate_pos_encoding=True)
+
+        # verify the logits
+        expected_shape = torch.Size((1, 145, 1024))
+        print("nilesh ")
+        print(outputs.vision_model_output.last_hidden_state.shape)
+        print(outputs.vision_model_output.last_hidden_state[0, :3, :3])
+
+        self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[-0.3589, -0.5939, 0.3534], [0.4346, 0.1647, 0.7071], [1.1404, -0.4716, 0.1664]]
+        ).to(torch_device)
+
+        self.assertTrue(
+            torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)
+        )