Fix tests for vision models (#35654)

* Trigger tests

* [run-slow] beit, detr, dinov2, vit, textnet

* Fix BEiT interpolate_pos_encoding

* Fix DETR test

* Update DINOv2 test

* Fix textnet

* Fix vit

* Fix DPT

* Fix data2vec test

* Fix textnet test

* Update interpolation check

* Fix ZoeDepth tests

* Update interpolate embeddings for BEiT

* Apply suggestions from code review
This commit is contained in:
Pavel Iakubovskii
2025-02-13 10:28:37 +00:00
committed by GitHub
parent e60ae0d078
commit d419862889
9 changed files with 55 additions and 79 deletions

View File

@@ -565,17 +565,12 @@ class Data2VecVisionModelIntegrationTest(unittest.TestCase):
inputs = processor(images=image, return_tensors="pt", size={"height": 480, "width": 480})
pixel_values = inputs.pixel_values.to(torch_device)
# with interpolate_pos_encoding being False an exception should be raised with higher resolution
# images than what the model supports.
self.assertFalse(processor.do_center_crop)
with torch.no_grad():
with self.assertRaises(ValueError, msg="doesn't match model"):
model(pixel_values, interpolate_pos_encoding=False)
# with interpolate_pos_encoding being True the model should process the higher resolution image
# successfully and produce the expected output.
with torch.no_grad():
outputs = model(pixel_values, interpolate_pos_encoding=True)
expected_shape = torch.Size((1, 1801, 768))
# num_cls_tokens + (height / patch_size) * (width / patch_size)
# 1 + (480 / 16) * (480 / 16) = 901
expected_shape = torch.Size((1, 901, 768))
self.assertEqual(outputs.last_hidden_state.shape, expected_shape)