adding positional encoder changes and tests (#32600)
* adding positional encoder changes and tests * adding ruff suggestions * changes added by python utils/check_copies.py --fix_and_overwrite * removing pos_encoding added by script * adding interpolation to clipseg * formatting * adding further testing to altclip and better documentation to kosmos2 * skipping test_inputs_embeds_matches_input_ids_with_generate in git model * fixing clipseg comment suggestions * [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip * fixing bridgetower test * fixing altclip tensor output POS test * adding ruff formatting * fixing several tests * formatting with ruff * adding positional encoder changes and tests * adding ruff suggestions * changes added by python utils/check_copies.py --fix_and_overwrite * removing pos_encoding added by script * adding interpolation to clipseg * formatting * adding further testing to altclip and better documentation to kosmos2 * skipping test_inputs_embeds_matches_input_ids_with_generate in git model * fixing clipseg comment suggestions * fixing bridgetower test * fixing altclip tensor output POS test * adding ruff formatting * fixing several tests * formatting with ruff * adding right pretrained model * [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip * fixing test_inference_image_segmentation * [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip * fixing test_inference_interpolate_pos_encoding for the git model as there is no vision_model_output * [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip * adding ruff formatting * [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip * adding new interpolate_pos_encoding function * [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip * fixing interpolate_POS funciton * adapting output tensor in teests * [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, 
kosmos2, x_clip * modifying output tensor * [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip * adding the correct tensor * [run_slow] clipseg * fixing spaces * [run_slow] clipseg * [run_slow] clipseg --------- Co-authored-by: Manuel Sanchez Hernandez <manuel.sanchez.hernandez@schibsted.com>
This commit is contained in:
@@ -597,3 +597,44 @@ class AltCLIPModelIntegrationTest(unittest.TestCase):
|
||||
expected_probs = torch.tensor([[9.9942e-01, 5.7805e-04]], device=torch_device)
|
||||
|
||||
self.assertTrue(torch.allclose(probs, expected_probs, atol=5e-3))
|
||||
|
||||
@slow
def test_inference_interpolate_pos_encoding(self):
    # AltCLIP exposes an `interpolate_pos_encoding` argument in its forward method.
    # This test feeds 180x180 crops (presumably not the checkpoint's native
    # resolution - confirm against the model config) and checks that the forward
    # pass is rejected without interpolation and yields the pinned vision hidden
    # states with it.
    model_name = "BAAI/AltCLIP"
    model = AltCLIPModel.from_pretrained(model_name).to(torch_device)

    image_processor = AltCLIPProcessor.from_pretrained(
        model_name, size={"shortest_edge": 180}, crop_size={"height": 180, "width": 180}
    )

    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    inputs = image_processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device)

    # interpolate_pos_encoding=False should raise a ValueError.
    # NOTE: `msg` is only the failure text reported if no ValueError is raised;
    # it is not matched against the exception message.
    with self.assertRaises(ValueError, msg="doesn't match model"):
        with torch.no_grad():
            model(**inputs, interpolate_pos_encoding=False)

    # forward pass with interpolation enabled
    with torch.no_grad():
        outputs = model(**inputs, interpolate_pos_encoding=True)

    # verify the shape of the vision tower's last hidden state
    expected_shape = torch.Size((1, 145, 1024))
    self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)

    # verify the top-left 3x3 slice of the first sample against reference values
    expected_slice = torch.tensor(
        [[-0.3589, -0.5939, 0.3534], [0.4346, 0.1647, 0.7071], [1.1404, -0.4716, 0.1664]]
    ).to(torch_device)

    self.assertTrue(
        torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)
    )
|
||||
|
||||
@@ -656,3 +656,37 @@ class BridgeTowerModelTrainingTest(unittest.TestCase):
|
||||
for name, param in model.named_parameters():
|
||||
if self._is_layer_used(model_class, name):
|
||||
self.assertIsNotNone(param.grad, f"Gradients should not be None - got {param.grad} for {name}")
|
||||
|
||||
@slow
def test_inference_interpolate_pos_encoding(self):
    # BridgeTower accepts `interpolate_pos_encoding` in its forward method. The
    # processor below resizes to a shortest edge of 180; the test expects the
    # forward pass to fail without interpolation and to reproduce the pinned
    # image features with it.
    checkpoint = "BridgeTower/bridgetower-base"
    model = BridgeTowerModel.from_pretrained(checkpoint).to(torch_device)
    processor = BridgeTowerProcessor.from_pretrained(checkpoint, size={"shortest_edge": 180})

    sample = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    batch = processor(text="what's in the image", images=sample, return_tensors="pt").to(torch_device)

    # Without interpolation a ValueError is expected. `msg` is only the failure
    # text shown if nothing is raised; it is not matched against the exception.
    with self.assertRaises(ValueError, msg="doesn't match model"), torch.no_grad():
        model(**batch, interpolate_pos_encoding=False)

    # Forward pass with interpolation enabled.
    with torch.no_grad():
        result = model(**batch, interpolate_pos_encoding=True)

    # Shape of the image features.
    image_features = result.image_features
    self.assertEqual(image_features.shape, torch.Size((1, 122, 768)))

    # Top-left 3x3 slice of the first sample against reference values.
    reference = torch.tensor(
        [[-0.6518, 0.4978, -0.4544], [-2.6672, -0.0843, -0.4210], [-2.4510, -0.1002, -0.3458]]
    ).to(torch_device)
    self.assertTrue(torch.allclose(image_features[0, :3, :3], reference, atol=1e-4))
|
||||
|
||||
@@ -740,3 +740,41 @@ class ChineseCLIPModelIntegrationTest(unittest.TestCase):
|
||||
expected_probs = torch.tensor([[1.2686e-03, 5.4499e-02, 6.7968e-04, 9.4355e-01]], device=torch_device)
|
||||
|
||||
self.assertTrue(torch.allclose(probs, expected_probs, atol=5e-3))
|
||||
|
||||
@slow
def test_inference_interpolate_pos_encoding(self):
    # ChineseCLIP accepts `interpolate_pos_encoding` in its forward method. The
    # processor is configured for 180x180 inputs; the test expects a ValueError
    # without interpolation and the pinned vision hidden states with it.
    checkpoint = "OFA-Sys/chinese-clip-vit-base-patch16"
    model = ChineseCLIPModel.from_pretrained(checkpoint).to(torch_device)

    target_size = {"height": 180, "width": 180}
    processor = ChineseCLIPProcessor.from_pretrained(
        checkpoint, size={"height": 180, "width": 180}, crop_size=target_size
    )

    sample = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    batch = processor(text="what's in the image", images=sample, return_tensors="pt").to(torch_device)

    # Without interpolation a ValueError is expected. `msg` is only the failure
    # text shown if nothing is raised; it is not matched against the exception.
    with self.assertRaises(ValueError, msg="doesn't match model"), torch.no_grad():
        model(**batch, interpolate_pos_encoding=False)

    # Forward pass with interpolation enabled.
    with torch.no_grad():
        result = model(**batch, interpolate_pos_encoding=True)

    # Shape of the vision tower's last hidden state.
    vision_hidden = result.vision_model_output.last_hidden_state
    self.assertEqual(vision_hidden.shape, torch.Size((1, 122, 768)))

    # Top-left 3x3 slice of the first sample against reference values.
    reference = torch.tensor(
        [[-0.3990, 0.2983, -0.1239], [-0.1452, -0.2759, 0.0403], [-0.3149, -0.4763, 0.8555]]
    ).to(torch_device)
    self.assertTrue(torch.allclose(vision_hidden[0, :3, :3], reference, atol=1e-4))
|
||||
|
||||
@@ -1182,3 +1182,40 @@ class CLIPModelIntegrationTest(unittest.TestCase):
|
||||
expected_logits = torch.tensor([[24.5701, 19.3049]], device=torch_device)
|
||||
|
||||
self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3))
|
||||
|
||||
@slow
def test_inference_interpolate_pos_encoding(self):
    # CLIP models accept `interpolate_pos_encoding` in their forward method. The
    # processor is configured for 180x180 inputs; the test expects a ValueError
    # without interpolation and the pinned vision hidden states with it.
    checkpoint = "openai/clip-vit-base-patch32"
    model = CLIPModel.from_pretrained(checkpoint).to(torch_device)
    processor = CLIPProcessor.from_pretrained(
        checkpoint, size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180}
    )

    sample = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    batch = processor(text="what's in the image", images=sample, return_tensors="pt").to(torch_device)

    # Without interpolation a ValueError is expected. `msg` is only the failure
    # text shown if nothing is raised; it is not matched against the exception.
    with self.assertRaises(ValueError, msg="doesn't match model"), torch.no_grad():
        model(**batch, interpolate_pos_encoding=False)

    # Forward pass with interpolation enabled.
    with torch.no_grad():
        result = model(**batch, interpolate_pos_encoding=True)

    # Shape of the vision tower's last hidden state.
    vision_hidden = result.vision_model_output.last_hidden_state
    self.assertEqual(vision_hidden.shape, torch.Size((1, 26, 768)))

    # Top-left 3x3 slice of the first sample against reference values.
    reference = torch.tensor(
        [[-0.1538, 0.0322, -0.3235], [0.2893, 0.1135, -0.5708], [0.0461, 0.1540, -0.6018]]
    ).to(torch_device)
    self.assertTrue(torch.allclose(vision_hidden[0, :3, :3], reference, atol=1e-4))
|
||||
|
||||
@@ -796,7 +796,7 @@ class CLIPSegModelIntegrationTest(unittest.TestCase):
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
outputs = model(**inputs)
|
||||
outputs = model(**inputs, interpolate_pos_encoding=True)
|
||||
|
||||
# verify the predicted masks
|
||||
self.assertEqual(
|
||||
@@ -804,8 +804,9 @@ class CLIPSegModelIntegrationTest(unittest.TestCase):
|
||||
torch.Size((3, 352, 352)),
|
||||
)
|
||||
expected_masks_slice = torch.tensor(
|
||||
[[-7.4613, -7.4785, -7.3628], [-7.3268, -7.0899, -7.1333], [-6.9838, -6.7900, -6.8913]]
|
||||
[[-7.4613, -7.4785, -7.3627], [-7.3268, -7.0898, -7.1333], [-6.9838, -6.7900, -6.8913]]
|
||||
).to(torch_device)
|
||||
|
||||
self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_masks_slice, atol=1e-3))
|
||||
|
||||
# verify conditional and pooled output
|
||||
@@ -813,3 +814,40 @@ class CLIPSegModelIntegrationTest(unittest.TestCase):
|
||||
expected_pooled_output = torch.tensor([0.5036, -0.2681, -0.2644]).to(torch_device)
|
||||
self.assertTrue(torch.allclose(outputs.conditional_embeddings[0, :3], expected_conditional, atol=1e-3))
|
||||
self.assertTrue(torch.allclose(outputs.pooled_output[0, :3], expected_pooled_output, atol=1e-3))
|
||||
|
||||
@slow
def test_inference_interpolate_pos_encoding(self):
    # CLIPSeg accepts `interpolate_pos_encoding` in its forward method. The
    # processor is configured for 180x180 inputs; the test expects a ValueError
    # without interpolation and the pinned vision hidden states with it.
    # NOTE(review): this loads a plain CLIP checkpoint into CLIPSegModel and the
    # reference values below depend on that choice - confirm this is intended.
    checkpoint = "openai/clip-vit-base-patch32"
    model = CLIPSegModel.from_pretrained(checkpoint).to(torch_device)
    processor = CLIPSegProcessor.from_pretrained(
        checkpoint, size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180}
    )

    sample = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    batch = processor(text="what's in the image", images=sample, return_tensors="pt").to(torch_device)

    # Without interpolation a ValueError is expected. `msg` is only the failure
    # text shown if nothing is raised; it is not matched against the exception.
    with self.assertRaises(ValueError, msg="doesn't match model"), torch.no_grad():
        model(**batch, interpolate_pos_encoding=False)

    # Forward pass with interpolation enabled.
    with torch.no_grad():
        result = model(**batch, interpolate_pos_encoding=True)

    # Shape of the vision tower's last hidden state.
    vision_hidden = result.vision_model_output.last_hidden_state
    self.assertEqual(vision_hidden.shape, torch.Size((1, 26, 768)))

    # Top-left 3x3 slice of the first sample against reference values.
    reference = torch.tensor(
        [[-0.1538, 0.0322, -0.3235], [0.2893, 0.1135, -0.5708], [0.0461, 0.1540, -0.6018]]
    ).to(torch_device)
    self.assertTrue(torch.allclose(vision_hidden[0, :3, :3], reference, atol=1e-4))
|
||||
|
||||
@@ -614,3 +614,38 @@ class GitModelIntegrationTest(unittest.TestCase):
|
||||
generated_captions = processor.batch_decode(generated_ids, skip_special_tokens=True)
|
||||
|
||||
self.assertEqual(generated_captions, ["two cats sleeping on a pink blanket next to remotes."] * 2)
|
||||
|
||||
@slow
def test_inference_interpolate_pos_encoding(self):
    # GIT accepts `interpolate_pos_encoding` in its forward method. The processor
    # is configured for 180x180 inputs; the test expects a ValueError without
    # interpolation and the pinned hidden states with it. The checks read
    # `last_hidden_state` directly from the model output.
    checkpoint = "microsoft/git-base"
    model = GitModel.from_pretrained(checkpoint).to(torch_device)
    processor = GitProcessor.from_pretrained(
        checkpoint, size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180}
    )

    sample = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    batch = processor(text="what's in the image", images=sample, return_tensors="pt").to(torch_device)

    # Without interpolation a ValueError is expected. `msg` is only the failure
    # text shown if nothing is raised; it is not matched against the exception.
    with self.assertRaises(ValueError, msg="doesn't match model"), torch.no_grad():
        model(**batch, interpolate_pos_encoding=False)

    # Forward pass with interpolation enabled.
    with torch.no_grad():
        result = model(**batch, interpolate_pos_encoding=True)

    # Shape of the model's last hidden state.
    hidden = result.last_hidden_state
    self.assertEqual(hidden.shape, torch.Size((1, 130, 768)))

    # Top-left 3x3 slice of the first sample against reference values.
    reference = torch.tensor(
        [[-1.0296, 2.5960, 0.8703], [1.7027, 1.3302, -0.4543], [-1.4932, -0.1084, 0.0502]]
    ).to(torch_device)
    self.assertTrue(torch.allclose(hidden[0, :3, :3], reference, atol=1e-4))
|
||||
|
||||
@@ -762,3 +762,40 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase):
|
||||
self.assertEqual(processed_text[0], EXPECTED_PROCESSED_TEXT_0)
|
||||
self.assertEqual(all_final_text[0], EXPECTED_FINAL_TEXT_0)
|
||||
self.assertListEqual(all_entities[0], EXPECTED_ENTITIES_0)
|
||||
|
||||
@slow
def test_inference_interpolate_pos_encoding(self):
    # Kosmos-2 accepts `interpolate_pos_encoding` in its forward method. The
    # processor is configured for 180x180 crops; the test expects a ValueError
    # without interpolation and the pinned vision hidden states with it.
    checkpoint = "microsoft/kosmos-2-patch14-224"
    model = Kosmos2Model.from_pretrained(checkpoint).to(torch_device)
    processor = AutoProcessor.from_pretrained(
        checkpoint, size={"shortest_edge": 180}, crop_size={"height": 180, "width": 180}
    )

    sample = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    batch = processor(text="what's in the image", images=sample, return_tensors="pt").to(torch_device)

    # Without interpolation a ValueError is expected. `msg` is only the failure
    # text shown if nothing is raised; it is not matched against the exception.
    with self.assertRaises(ValueError, msg="doesn't match model"), torch.no_grad():
        model(**batch, interpolate_pos_encoding=False)

    # Forward pass with interpolation enabled.
    with torch.no_grad():
        result = model(**batch, interpolate_pos_encoding=True)

    # Shape of the vision tower's last hidden state.
    vision_hidden = result.vision_model_output.last_hidden_state
    self.assertEqual(vision_hidden.shape, torch.Size((1, 145, 1024)))

    # Top-left 3x3 slice of the first sample against reference values.
    reference = torch.tensor(
        [[1.0022, -1.1901, 3.2887], [2.6164, 0.0515, -0.8270], [1.8315, 0.1272, -0.8590]]
    ).to(torch_device)
    self.assertTrue(torch.allclose(vision_hidden[0, :3, :3], reference, atol=1e-4))
|
||||
|
||||
@@ -731,3 +731,39 @@ class XCLIPModelIntegrationTest(unittest.TestCase):
|
||||
expected_logits = torch.tensor([[14.0181, 20.2771, 14.4776]], device=torch_device)
|
||||
|
||||
self.assertTrue(torch.allclose(outputs.logits_per_video, expected_logits, atol=1e-3))
|
||||
|
||||
@slow
def test_inference_interpolate_pos_encoding(self):
    # X-CLIP accepts `interpolate_pos_encoding` in its forward method. The
    # processor is configured for size 180 with 180x180 crops; the test expects a
    # ValueError without interpolation and the pinned vision hidden states with it.
    checkpoint = "microsoft/xclip-base-patch32"
    model = XCLIPModel.from_pretrained(checkpoint).to(torch_device)
    processor = XCLIPProcessor.from_pretrained(checkpoint, size=180, crop_size={"height": 180, "width": 180})

    clip = prepare_video()
    batch = processor(text="what's in the video", videos=clip, return_tensors="pt").to(torch_device)

    # Without interpolation a ValueError is expected. `msg` is only the failure
    # text shown if nothing is raised; it is not matched against the exception.
    with self.assertRaises(ValueError, msg="doesn't match model"), torch.no_grad():
        model(**batch, interpolate_pos_encoding=False)

    # Forward pass with interpolation enabled.
    with torch.no_grad():
        result = model(**batch, interpolate_pos_encoding=True)

    # Shape of the vision tower's last hidden state.
    vision_hidden = result.vision_model_output.last_hidden_state
    self.assertEqual(vision_hidden.shape, torch.Size((8, 26, 768)))

    # Top-left 3x3 slice of the first entry against reference values.
    reference = torch.tensor(
        [[0.0126, 0.2109, 0.0609], [0.0448, 0.5862, -0.1688], [-0.0881, 0.8525, -0.3044]]
    ).to(torch_device)
    self.assertTrue(torch.allclose(vision_hidden[0, :3, :3], reference, atol=1e-4))
|
||||
|
||||
Reference in New Issue
Block a user