interpolation added for TVP. (#30863)

* Update TVP model to interpolate pre-trained image pad prompter encodings * feat: Add 2D positional embeddings interpolation in TvpVisualInputEmbedding * added required comments * Update TVP model to interpolate pre-trained image pad prompter encodings * feat: Add 2D positional embeddings interpolation in TvpVisualInputEmbedding * added required comments * docstring and argument fix * doc fixes and test case fix suggested in review. * variable typo fix * styling and name fixes for padding interpolation flag.
2024-06-07 23:14:16 +05:30
parent ea50b64bea
commit 3b9174f248
2 changed files with 144 additions and 23 deletions
--- a/tests/models/tvp/test_modeling_tvp.py
+++ b/tests/models/tvp/test_modeling_tvp.py
@@ -256,7 +256,7 @@ def prepare_img():
 class TvpModelIntegrationTests(unittest.TestCase):
    @cached_property
    def default_image_processor(self):
-        return TvpImageProcessor.from_pretrained("Jiqing/tiny-random-tvp") if is_vision_available() else None
+        return TvpImageProcessor.from_pretrained("Jiqing/tiny-random-tvp")

    def test_inference_no_head(self):
        model = TvpModel.from_pretrained("Jiqing/tiny-random-tvp").to(torch_device)
@@ -297,3 +297,41 @@ class TvpModelIntegrationTests(unittest.TestCase):
        assert outputs.logits.shape == expected_shape
        expected_slice = torch.tensor([[0.5061, 0.4988]]).to(torch_device)
        self.assertTrue(torch.allclose(outputs.logits, expected_slice, atol=1e-4))
+
+    def test_interpolate_inference_no_head(self):
+        model = TvpModel.from_pretrained("Jiqing/tiny-random-tvp").to(torch_device)
+
+        image_processor = self.default_image_processor
+        image = prepare_img()  # 480X640
+        encoding = image_processor(
+            images=image, return_tensors="pt", do_resize=False, do_pad=False, do_center_crop=False
+        )
+        input_ids = torch.tensor([[1, 2]])
+        attention_mask = torch.tensor([[1, 1]])
+        encoding.update({"input_ids": input_ids, "attention_mask": attention_mask})
+        encoding.to(torch_device)
+
+        with torch.no_grad():
+            outputs = model(**encoding, interpolate_pos_encoding=True)
+
+        expected_shape = torch.Size((1, 1212, 128))
+        assert outputs.last_hidden_state.shape == expected_shape
+
+    def test_interpolate_inference_with_head(self):
+        model = TvpForVideoGrounding.from_pretrained("Jiqing/tiny-random-tvp").to(torch_device)
+
+        image_processor = self.default_image_processor
+        image = prepare_img()  # 480X640
+        encoding = image_processor(
+            images=image, return_tensors="pt", do_resize=False, do_pad=False, do_center_crop=False
+        )
+        input_ids = torch.tensor([[1, 2]])
+        attention_mask = torch.tensor([[1, 1]])
+        encoding.update({"input_ids": input_ids, "attention_mask": attention_mask})
+        encoding.to(torch_device)
+
+        with torch.no_grad():
+            outputs = model(**encoding, interpolate_pos_encoding=True, output_hidden_states=True)
+
+        expected_shape = torch.Size((1, 1212, 128))
+        assert outputs.hidden_states[-1].shape == expected_shape