interpolation added for TVP. (#30863)

* Update TVP model to interpolate pre-trained image pad prompter encodings

* feat: Add 2D positional embeddings interpolation in TvpVisualInputEmbedding

* added required comments

* Update TVP model to interpolate pre-trained image pad prompter encodings

* feat: Add 2D positional embeddings interpolation in TvpVisualInputEmbedding

* added required comments

* docstring and argument fix

* doc fixes and test case fix suggested in review.

* variable typo fix

* styling and name fixes for padding interpolation flag.
This commit is contained in:
BHUVAN M
2024-06-07 23:14:16 +05:30
committed by GitHub
parent ea50b64bea
commit 3b9174f248
2 changed files with 144 additions and 23 deletions

View File

@@ -256,7 +256,7 @@ def prepare_img():
class TvpModelIntegrationTests(unittest.TestCase):
@cached_property
def default_image_processor(self):
return TvpImageProcessor.from_pretrained("Jiqing/tiny-random-tvp") if is_vision_available() else None
return TvpImageProcessor.from_pretrained("Jiqing/tiny-random-tvp")
def test_inference_no_head(self):
model = TvpModel.from_pretrained("Jiqing/tiny-random-tvp").to(torch_device)
@@ -297,3 +297,41 @@ class TvpModelIntegrationTests(unittest.TestCase):
assert outputs.logits.shape == expected_shape
expected_slice = torch.tensor([[0.5061, 0.4988]]).to(torch_device)
self.assertTrue(torch.allclose(outputs.logits, expected_slice, atol=1e-4))
def test_interpolate_inference_no_head(self):
    """Check the output shape of `TvpModel` when `interpolate_pos_encoding=True`.

    The image processor is called with resizing, padding and center-cropping
    all disabled, so the pixel values fed to the model keep the sample image's
    own resolution. Only the shape of `last_hidden_state` is verified.
    """
    model = TvpModel.from_pretrained("Jiqing/tiny-random-tvp").to(torch_device)
    image_processor = self.default_image_processor
    image = prepare_img()  # 480X640
    # Skip every size-altering preprocessing step so the interpolation path
    # receives the image at its native size.
    encoding = image_processor(
        images=image, return_tensors="pt", do_resize=False, do_pad=False, do_center_crop=False
    )
    input_ids = torch.tensor([[1, 2]])
    attention_mask = torch.tensor([[1, 1]])
    encoding.update({"input_ids": input_ids, "attention_mask": attention_mask})
    # Move the text tensors together with the pixel values, after the update.
    encoding.to(torch_device)

    with torch.no_grad():
        outputs = model(**encoding, interpolate_pos_encoding=True)

    expected_shape = torch.Size((1, 1212, 128))
    # unittest assertion: still runs under `python -O` and reports both shapes
    # on failure, unlike a bare `assert`.
    self.assertEqual(outputs.last_hidden_state.shape, expected_shape)
def test_interpolate_inference_with_head(self):
    """Check the final hidden-state shape of `TvpForVideoGrounding` with interpolation.

    The image processor is called with resizing, padding and center-cropping
    all disabled, so the pixel values fed to the model keep the sample image's
    own resolution. The model is run with `interpolate_pos_encoding=True` and
    `output_hidden_states=True`; only the shape of the last entry of
    `hidden_states` is verified.
    """
    model = TvpForVideoGrounding.from_pretrained("Jiqing/tiny-random-tvp").to(torch_device)
    image_processor = self.default_image_processor
    image = prepare_img()  # 480X640
    # Skip every size-altering preprocessing step so the interpolation path
    # receives the image at its native size.
    encoding = image_processor(
        images=image, return_tensors="pt", do_resize=False, do_pad=False, do_center_crop=False
    )
    input_ids = torch.tensor([[1, 2]])
    attention_mask = torch.tensor([[1, 1]])
    encoding.update({"input_ids": input_ids, "attention_mask": attention_mask})
    # Move the text tensors together with the pixel values, after the update.
    encoding.to(torch_device)

    with torch.no_grad():
        outputs = model(**encoding, interpolate_pos_encoding=True, output_hidden_states=True)

    expected_shape = torch.Size((1, 1212, 128))
    # unittest assertion: still runs under `python -O` and reports both shapes
    # on failure, unlike a bare `assert`.
    self.assertEqual(outputs.hidden_states[-1].shape, expected_shape)