Fix tests for vision models (#35654)

* Trigger tests

* [run-slow] beit, detr, dinov2, vit, textnet

* Fix BEiT interpolate_pos_encoding

* Fix DETR test

* Update DINOv2 test

* Fix TextNet

* Fix ViT

* Fix DPT

* Fix Data2Vec test

* Fix TextNet test

* Update interpolation check

* Fix ZoeDepth tests

* Update interpolate embeddings for BEiT

* Apply suggestions from code review
This commit is contained in:
Pavel Iakubovskii
2025-02-13 10:28:37 +00:00
committed by GitHub
parent e60ae0d078
commit d419862889
9 changed files with 55 additions and 79 deletions

View File

@@ -774,7 +774,9 @@ class BeitModelIntegrationTest(unittest.TestCase):
with torch.no_grad():
outputs = model(pixel_values, interpolate_pos_encoding=True)
expected_shape = torch.Size((1, 1801, 768))
# num_cls_tokens + (height / patch_size) * (width / patch_size)
# 1 + (480 / 16) * (480 / 16) = 1 + 30 * 30 = 901
expected_shape = torch.Size((1, 901, 768))
self.assertEqual(outputs.last_hidden_state.shape, expected_shape)

View File

@@ -565,17 +565,12 @@ class Data2VecVisionModelIntegrationTest(unittest.TestCase):
inputs = processor(images=image, return_tensors="pt", size={"height": 480, "width": 480})
pixel_values = inputs.pixel_values.to(torch_device)
# with interpolate_pos_encoding being False an exception should be raised with higher resolution
# images than what the model supports.
self.assertFalse(processor.do_center_crop)
with torch.no_grad():
with self.assertRaises(ValueError, msg="doesn't match model"):
model(pixel_values, interpolate_pos_encoding=False)
# with interpolate_pos_encoding being True the model should process the higher resolution image
# successfully and produce the expected output.
with torch.no_grad():
outputs = model(pixel_values, interpolate_pos_encoding=True)
expected_shape = torch.Size((1, 1801, 768))
# num_cls_tokens + (height / patch_size) * (width / patch_size)
# 1 + (480 / 16) * (480 / 16) = 901
expected_shape = torch.Size((1, 901, 768))
self.assertEqual(outputs.last_hidden_state.shape, expected_shape)

View File

@@ -684,7 +684,12 @@ class DetrModelIntegrationTestsTimmBackbone(unittest.TestCase):
self.assertTrue(results["segmentation"].shape, expected_shape)
torch.testing.assert_close(results["segmentation"][:3, :3], expected_slice_segmentation, rtol=1e-4, atol=1e-4)
self.assertTrue(len(results["segments_info"]), expected_number_of_segments)
self.assertDictEqual(results["segments_info"][0], expected_first_segment)
predicted_first_segment = results["segments_info"][0]
self.assertEqual(predicted_first_segment["id"], expected_first_segment["id"])
self.assertEqual(predicted_first_segment["label_id"], expected_first_segment["label_id"])
self.assertEqual(predicted_first_segment["was_fused"], expected_first_segment["was_fused"])
self.assertAlmostEqual(predicted_first_segment["score"], expected_first_segment["score"], places=3)
@require_vision

View File

@@ -329,10 +329,10 @@ class Dinov2ModelIntegrationTest(unittest.TestCase):
self.assertEqual(outputs.last_hidden_state.shape, expected_shape)
expected_slice = torch.tensor(
[[-2.1747, -0.4729, 1.0936], [-3.2780, -0.8269, -0.9210], [-2.9129, 1.1284, -0.7306]],
[[-2.2005, -0.4495, 1.0964], [-3.3959, -0.8942, -1.0315], [-2.9355, 1.1564, -0.7656]],
device=torch_device,
)
torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)
torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-3, atol=1e-3)
@require_torch

View File

@@ -328,14 +328,18 @@ class TextNetModelIntegrationTest(unittest.TestCase):
with torch.no_grad():
output = model(**inputs)
# verify logits
self.assertEqual(output.logits.shape, torch.Size([1, 2]))
# verify output
self.assertEqual(output.last_hidden_state.shape, torch.Size([1, 512, 20, 27]))
expected_slice_backbone = torch.tensor(
[0.9210, 0.6099, 0.0000, 0.0000, 0.0000, 0.0000, 3.2207, 2.6602, 1.8925, 0.0000],
[
[0.0000, 1.7415, 1.2660],
[0.0000, 1.0084, 1.9692],
[0.0000, 1.7464, 1.7892],
],
device=torch_device,
)
torch.testing.assert_close(
output.feature_maps[-1][0][10][12][:10], expected_slice_backbone, rtol=1e-3, atol=1e-3
output.last_hidden_state[0, 12, :3, :3], expected_slice_backbone, rtol=1e-2, atol=1e-2
)

View File

@@ -310,10 +310,10 @@ class ViTModelIntegrationTest(unittest.TestCase):
self.assertEqual(outputs.last_hidden_state.shape, expected_shape)
expected_slice = torch.tensor(
[[4.2340, 4.3906, -6.6692], [4.5463, 1.8928, -6.7257], [4.4429, 0.8496, -5.8585]]
[[4.2325, 4.3882, -6.6678], [4.5372, 1.8933, -6.7355], [4.4454, 0.8514, -5.8747]]
).to(torch_device)
torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)
torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-3, atol=1e-3)
@slow
@require_accelerate

View File

@@ -301,8 +301,8 @@ class ZoeDepthModelIntegrationTest(unittest.TestCase):
out_l_reduced = torch.nn.functional.interpolate(
out_l.unsqueeze(0).unsqueeze(1), size=img.size[::-1], mode="bicubic", align_corners=False
)
self.assertTrue((np.array(out_l.shape)[::-1] == np.array(img.size) * 2).all())
torch.testing.assert_close(out, out_l_reduced, rtol=2e-2)
out_l_reduced = out_l_reduced.squeeze(0).squeeze(0)
torch.testing.assert_close(out, out_l_reduced, rtol=2e-2, atol=2e-2)
def check_post_processing_test(self, image_processor, images, model, pad_input=True, flip_aug=True):
inputs = image_processor(images=images, return_tensors="pt", do_pad=pad_input).to(torch_device)
@@ -324,7 +324,7 @@ class ZoeDepthModelIntegrationTest(unittest.TestCase):
for img, out, expected_slice in zip(images, outputs, expected_slices):
out = out["predicted_depth"]
self.assertTrue(img.size == out.shape[::-1])
torch.testing.assert_close(expected_slice, out[:3, :3], atol=1e-3, rtol=1e-3)
torch.testing.assert_close(expected_slice, out[:3, :3], rtol=1e-3, atol=1e-3)
self.check_target_size(image_processor, pad_input, images, outputs, raw_outputs, raw_outputs_flipped)