[Fix] ViViT interpolate_pos_encoding (#33815)

* fix: test_inference_interpolate_pos_encoding * style: make style; make fixup * test: add suggestion to test_modeling_vivit * chore: add suggestions * style: make style * [run_slow] vivit * ci: slow test fix * [run_slow] vivit
2024-10-02 00:44:35 +05:30
parent 8635802af9
commit 68a2b50069
2 changed files with 7 additions and 6 deletions
--- a/tests/models/vivit/test_modeling_vivit.py
+++ b/tests/models/vivit/test_modeling_vivit.py
@@ -359,12 +359,12 @@ class VivitModelIntegrationTest(unittest.TestCase):
        # allowing to interpolate the pre-trained position embeddings in order to use
        # the model on higher resolutions. The DINO model by Facebook AI leverages this
        # to visualize self-attention on higher resolution images.
-        model = VivitModel.from_pretrained("google/vivit-b-16x2").to(torch_device)
+        model = VivitModel.from_pretrained("google/vivit-b-16x2-kinetics400").to(torch_device)

-        image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2")
+        image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
        video = prepare_video()
        inputs = image_processor(
-            video, size={"shortest_edge": 480}, crop_size={"height": 480, "width": 480}, return_tensors="pt"
+            video, size={"shortest_edge": 480}, crop_size={"height": 232, "width": 232}, return_tensors="pt"
        )
        pixel_values = inputs.pixel_values.to(torch_device)