adding positional encoder changes and tests (#32600)

* adding positional encoder changes and tests

* adding ruff suggestions

* changes added by python utils/check_copies.py --fix_and_overwrite

* removing pos_encoding added by script

* adding interpolation to clipseg

* formatting

* adding further testing to altclip and better documentation to kosmos2

* skipping test_inputs_embeds_matches_input_ids_with_generate in git model

* fixing clipseg comment suggestions

* [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip

* fixing bridgetower test

* fixing altclip tensor output POS test

* adding ruff formatting

* fixing several tests

* formatting with ruff

* adding positional encoder changes and tests

* adding ruff suggestions

* changes added by python utils/check_copies.py --fix_and_overwrite

* removing pos_encoding added by script

* adding interpolation to clipseg

* formatting

* adding further testing to altclip and better documentation to kosmos2

* skipping test_inputs_embeds_matches_input_ids_with_generate in git model

* fixing clipseg comment suggestions

* fixing bridgetower test

* fixing altclip tensor output POS test

* adding ruff formatting

* fixing several tests

* formatting with ruff

* adding right pretrained model

* [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip

* fixing test_inference_image_segmentation

* [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip

* fixing test_inference_interpolate_pos_encoding for the git model as there is no vision_model_output

* [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip

* adding ruff formatting

* [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip

* adding new interpolate_pos_encoding function

* [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip

* fixing interpolate_POS funciton

* adapting output tensor in teests

* [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip

* modifying output tensor

* [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip

* adding the correct tensor

* [run_slow]  clipseg

* fixing spaces

* [run_slow]  clipseg

* [run_slow]  clipseg

---------

Co-authored-by: Manuel Sanchez Hernandez <manuel.sanchez.hernandez@schibsted.com>
This commit is contained in:
Manuel
2024-09-25 20:05:01 +02:00
committed by GitHub
parent 19d58d31f1
commit a55adee890
16 changed files with 828 additions and 65 deletions

View File

@@ -796,7 +796,7 @@ class CLIPSegModelIntegrationTest(unittest.TestCase):
# forward pass
with torch.no_grad():
outputs = model(**inputs)
outputs = model(**inputs, interpolate_pos_encoding=True)
# verify the predicted masks
self.assertEqual(
@@ -804,8 +804,9 @@ class CLIPSegModelIntegrationTest(unittest.TestCase):
torch.Size((3, 352, 352)),
)
expected_masks_slice = torch.tensor(
[[-7.4613, -7.4785, -7.3628], [-7.3268, -7.0899, -7.1333], [-6.9838, -6.7900, -6.8913]]
[[-7.4613, -7.4785, -7.3627], [-7.3268, -7.0898, -7.1333], [-6.9838, -6.7900, -6.8913]]
).to(torch_device)
self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_masks_slice, atol=1e-3))
# verify conditional and pooled output
@@ -813,3 +814,40 @@ class CLIPSegModelIntegrationTest(unittest.TestCase):
expected_pooled_output = torch.tensor([0.5036, -0.2681, -0.2644]).to(torch_device)
self.assertTrue(torch.allclose(outputs.conditional_embeddings[0, :3], expected_conditional, atol=1e-3))
self.assertTrue(torch.allclose(outputs.pooled_output[0, :3], expected_pooled_output, atol=1e-3))
@slow
def test_inference_interpolate_pos_encoding(self):
# ViT models have an `interpolate_pos_encoding` argument in their forward method,
# allowing to interpolate the pre-trained position embeddings in order to use
# the model on higher resolutions. The DINO model by Facebook AI leverages this
# to visualize self-attention on higher resolution images.
model = CLIPSegModel.from_pretrained("openai/clip-vit-base-patch32").to(torch_device)
processor = CLIPSegProcessor.from_pretrained(
"openai/clip-vit-base-patch32", size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180}
)
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
inputs = processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device)
# interpolate_pos_encodiung false should return value error
with self.assertRaises(ValueError, msg="doesn't match model"):
with torch.no_grad():
model(**inputs, interpolate_pos_encoding=False)
# forward pass
with torch.no_grad():
outputs = model(**inputs, interpolate_pos_encoding=True)
# verify the logits
expected_shape = torch.Size((1, 26, 768))
self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)
expected_slice = torch.tensor(
[[-0.1538, 0.0322, -0.3235], [0.2893, 0.1135, -0.5708], [0.0461, 0.1540, -0.6018]]
).to(torch_device)
self.assertTrue(
torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)
)