From a25fc3592eec7a18aa20fe5d85bd335477896cbc Mon Sep 17 00:00:00 2001
From: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Date: Thu, 3 Jul 2025 15:13:06 +0200
Subject: [PATCH] Update expected values (after switching to A10) - part 4
 (#39189)

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
---
 tests/models/align/test_modeling_align.py     |  3 +
 .../test_modeling_fastspeech2_conformer.py    | 53 +++++++++----
 .../models/focalnet/test_modeling_focalnet.py | 14 +++-
 tests/models/glpn/test_modeling_glpn.py       |  3 +
 tests/models/hiera/test_modeling_hiera.py     |  3 +
 tests/models/levit/test_modeling_levit.py     | 13 ++-
 .../lightglue/test_modeling_lightglue.py      |  9 ++-
 tests/models/mgp_str/test_modeling_mgp_str.py |  3 +
 tests/models/minimax/test_modeling_minimax.py | 12 ++-
 tests/models/mixtral/test_modeling_mixtral.py | 23 +++---
 .../moonshine/test_modeling_moonshine.py      | 32 ++++----
 tests/models/mpt/test_modeling_mpt.py         | 20 +++--
 .../models/musicgen/test_modeling_musicgen.py | 35 ++++----
 .../test_modeling_musicgen_melody.py          | 14 ++--
 tests/models/sam/test_modeling_sam.py         | 15 +++-
 tests/models/sam_hq/test_modeling_sam_hq.py   | 79 +++++++++++++------
 .../timesformer/test_modeling_timesformer.py  | 13 ++-
 .../test_modeling_timm_wrapper.py             | 27 +++++--
 .../models/videomae/test_modeling_videomae.py | 12 ++-
 tests/models/vitpose/test_modeling_vitpose.py |  3 +
 .../test_modeling_vitpose_backbone.py         |  3 +
 tests/models/vivit/test_modeling_vivit.py     | 14 ++--
 .../test_modeling_wav2vec2_bert.py            |  5 ++
 .../test_modeling_wav2vec2_conformer.py       |  4 +-
 tests/models/x_clip/test_modeling_x_clip.py   | 24 ++++--
 25 files changed, 298 insertions(+), 138 deletions(-)

diff --git a/tests/models/align/test_modeling_align.py b/tests/models/align/test_modeling_align.py
index 5f147220da..15c520d1d2 100644
--- a/tests/models/align/test_modeling_align.py
+++ b/tests/models/align/test_modeling_align.py
@@ -462,6 +462,9 @@ class AlignModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     def test_config(self):
         self.config_tester.run_common_tests()
 
+    def test_batching_equivalence(self, atol=3e-4, rtol=3e-4):
+        super().test_batching_equivalence(atol=atol, rtol=rtol)
+
     @unittest.skip(reason="Start to fail after using torch `cu118`.")
     def test_multi_gpu_data_parallel_forward(self):
         super().test_multi_gpu_data_parallel_forward()
diff --git a/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py b/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py
index 22201f42b0..ccd88576f8 100644
--- a/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py
+++ b/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py
@@ -24,7 +24,14 @@ from transformers import (
     FastSpeech2ConformerWithHifiGanConfig,
     is_torch_available,
 )
-from transformers.testing_utils import require_g2p_en, require_torch, require_torch_accelerator, slow, torch_device
+from transformers.testing_utils import (
+    Expectations,
+    require_g2p_en,
+    require_torch,
+    require_torch_accelerator,
+    slow,
+    torch_device,
+)
 
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor
@@ -373,24 +380,38 @@ class FastSpeech2ConformerModelIntegrationTest(unittest.TestCase):
 
         # mel-spectrogram is too large (1, 205, 80), so only check top-left 100 elements
         # fmt: off
-        expected_mel_spectrogram = torch.tensor(
-            [
-                [-1.2426, -1.7286, -1.6754, -1.7451, -1.6402, -1.5219, -1.4480, -1.3345, -1.4031, -1.4497],
-                [-0.7858, -1.4966, -1.3602, -1.4876, -1.2949, -1.0723, -1.0021, -0.7553, -0.6521, -0.6929],
-                [-0.7298, -1.3908, -1.0369, -1.2656, -1.0342, -0.7883, -0.7420, -0.5249, -0.3734, -0.3977],
-                [-0.4784, -1.3508, -1.1558, -1.4678, -1.2820, -1.0252, -1.0868, -0.9006, -0.8947, -0.8448],
-                [-0.3963, -1.2895, -1.2813, -1.6147, -1.4658, -1.2560, -1.4134, -1.2650, -1.3255, -1.1715],
-                [-1.4914, -1.3097, -0.3821, -0.3898, -0.5748, -0.9040, -1.0755, -1.0575, -1.2205, -1.0572],
-                [0.0197, -0.0582, 0.9147, 1.1512, 1.1651, 0.6628, -0.1010, -0.3085, -0.2285, 0.2650],
-                [1.1780, 0.1803, 0.7251, 1.5728, 1.6678, 0.4542, -0.1572, -0.1787, 0.0744, 0.8168],
-                [-0.2078, -0.3211, 1.1096, 1.5085, 1.4632, 0.6299, -0.0515, 0.0589, 0.8609, 1.4429],
-                [0.7831, -0.2663, 1.0352, 1.4489, 0.9088, 0.0247, -0.3995, 0.0078, 1.2446, 1.6998],
-            ],
-            device=torch_device,
+        expectations = Expectations(
+            {
+                (None, None): [
+                    [-1.2426, -1.7286, -1.6754, -1.7451, -1.6402, -1.5219, -1.4480, -1.3345, -1.4031, -1.4497],
+                    [-0.7858, -1.4966, -1.3602, -1.4876, -1.2949, -1.0723, -1.0021, -0.7553, -0.6521, -0.6929],
+                    [-0.7298, -1.3908, -1.0369, -1.2656, -1.0342, -0.7883, -0.7420, -0.5249, -0.3734, -0.3977],
+                    [-0.4784, -1.3508, -1.1558, -1.4678, -1.2820, -1.0252, -1.0868, -0.9006, -0.8947, -0.8448],
+                    [-0.3963, -1.2895, -1.2813, -1.6147, -1.4658, -1.2560, -1.4134, -1.2650, -1.3255, -1.1715],
+                    [-1.4914, -1.3097, -0.3821, -0.3898, -0.5748, -0.9040, -1.0755, -1.0575, -1.2205, -1.0572],
+                    [0.0197, -0.0582, 0.9147, 1.1512, 1.1651, 0.6628, -0.1010, -0.3085, -0.2285, 0.2650],
+                    [1.1780, 0.1803, 0.7251, 1.5728, 1.6678, 0.4542, -0.1572, -0.1787, 0.0744, 0.8168],
+                    [-0.2078, -0.3211, 1.1096, 1.5085, 1.4632, 0.6299, -0.0515, 0.0589, 0.8609, 1.4429],
+                    [0.7831, -0.2663, 1.0352, 1.4489, 0.9088, 0.0247, -0.3995, 0.0078, 1.2446, 1.6998],
+                ],
+                ("cuda", 8): [
+                    [-1.2425, -1.7282, -1.6750, -1.7448, -1.6400, -1.5217, -1.4478, -1.3341, -1.4026, -1.4493],
+                    [-0.7858, -1.4967, -1.3601, -1.4875, -1.2950, -1.0725, -1.0021, -0.7553, -0.6522, -0.6929],
+                    [-0.7303, -1.3911, -1.0370, -1.2656, -1.0345, -0.7888, -0.7423, -0.5251, -0.3737, -0.3979],
+                    [-0.4784, -1.3506, -1.1556, -1.4677, -1.2820, -1.0253, -1.0868, -0.9006, -0.8949, -0.8448],
+                    [-0.3968, -1.2896, -1.2811, -1.6145, -1.4660, -1.2564, -1.4135, -1.2652, -1.3258, -1.1716],
+                    [-1.4912, -1.3092, -0.3812, -0.3886, -0.5737, -0.9034, -1.0749, -1.0571, -1.2202, -1.0567],
+                    [0.0200, -0.0577, 0.9151, 1.1516, 1.1656, 0.6628, -0.1012, -0.3086, -0.2283, 0.2658],
+                    [1.1778, 0.1805, 0.7255, 1.5732, 1.6680, 0.4539, -0.1572, -0.1785, 0.0751, 0.8175],
+                    [-0.2088, -0.3212, 1.1101, 1.5085, 1.4625, 0.6293, -0.0522, 0.0587, 0.8615, 1.4432],
+                    [0.7834, -0.2659, 1.0355, 1.4486, 0.9080, 0.0244, -0.3995, 0.0083, 1.2452, 1.6998],
+                ],
+            }
         )
+        expected_mel_spectrogram = torch.tensor(expectations.get_expectation()).to(torch_device)
         # fmt: on
 
-        torch.testing.assert_close(spectrogram[0, :10, :10], expected_mel_spectrogram, rtol=1e-4, atol=1e-4)
+        torch.testing.assert_close(spectrogram[0, :10, :10], expected_mel_spectrogram, rtol=2e-4, atol=2e-4)
         self.assertEqual(spectrogram.shape, (1, 205, model.config.num_mel_bins))
 
     def test_training_integration(self):
diff --git a/tests/models/focalnet/test_modeling_focalnet.py b/tests/models/focalnet/test_modeling_focalnet.py
index d272f25891..893d9ed1ee 100644
--- a/tests/models/focalnet/test_modeling_focalnet.py
+++ b/tests/models/focalnet/test_modeling_focalnet.py
@@ -17,7 +17,7 @@ import collections
 import unittest
 
 from transformers import FocalNetConfig
-from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.testing_utils import Expectations, require_torch, require_vision, slow, torch_device
 from transformers.utils import cached_property, is_torch_available, is_vision_available
 
 from ...test_backbone_common import BackboneTesterMixin
@@ -425,8 +425,16 @@ class FocalNetModelIntegrationTest(unittest.TestCase):
         # verify the logits
         expected_shape = torch.Size((1, 1000))
         self.assertEqual(outputs.logits.shape, expected_shape)
-        expected_slice = torch.tensor([0.2166, -0.4368, 0.2191]).to(torch_device)
-        torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4)
+
+        expectations = Expectations(
+            {
+                (None, None): [0.2166, -0.4368, 0.2191],
+                ("cuda", 8): [0.2168, -0.4367, 0.2190],
+            }
+        )
+        expected_slice = torch.tensor(expectations.get_expectation()).to(torch_device)
+
+        torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=2e-4, atol=2e-4)
         self.assertTrue(outputs.logits.argmax(dim=-1).item(), 281)
 
 
diff --git a/tests/models/glpn/test_modeling_glpn.py b/tests/models/glpn/test_modeling_glpn.py
index b3e1852373..b98743de35 100644
--- a/tests/models/glpn/test_modeling_glpn.py
+++ b/tests/models/glpn/test_modeling_glpn.py
@@ -164,6 +164,9 @@ class GLPNModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
 
+    def test_batching_equivalence(self, atol=3e-4, rtol=3e-4):
+        super().test_batching_equivalence(atol=atol, rtol=rtol)
+
     def test_for_depth_estimation(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs)
diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py
index dfbec4a4b8..1e3ed8e795 100644
--- a/tests/models/hiera/test_modeling_hiera.py
+++ b/tests/models/hiera/test_modeling_hiera.py
@@ -262,6 +262,9 @@ class HieraModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         self.config_tester.check_config_can_be_init_without_params()
         self.config_tester.check_config_arguments_init()
 
+    def test_batching_equivalence(self, atol=3e-4, rtol=3e-4):
+        super().test_batching_equivalence(atol=atol, rtol=rtol)
+
     # Overriding as Hiera `get_input_embeddings` returns HieraPatchEmbeddings
     def test_model_get_set_embeddings(self):
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/models/levit/test_modeling_levit.py b/tests/models/levit/test_modeling_levit.py
index f6226be1f8..80f8f822c5 100644
--- a/tests/models/levit/test_modeling_levit.py
+++ b/tests/models/levit/test_modeling_levit.py
@@ -19,7 +19,7 @@ from math import ceil, floor
 
 from transformers import LevitConfig
 from transformers.file_utils import cached_property, is_torch_available, is_vision_available
-from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.testing_utils import Expectations, require_torch, require_vision, slow, torch_device
 
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
@@ -406,6 +406,11 @@ class LevitModelIntegrationTest(unittest.TestCase):
         expected_shape = torch.Size((1, 1000))
         self.assertEqual(outputs.logits.shape, expected_shape)
 
-        expected_slice = torch.tensor([1.0448, -0.3745, -1.8317]).to(torch_device)
-
-        torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4)
+        expectations = Expectations(
+            {
+                (None, None): [1.0448, -0.3745, -1.8317],
+                ("cuda", 8): [1.0453, -0.3739, -1.8314],
+            }
+        )
+        expected_slice = torch.tensor(expectations.get_expectation()).to(torch_device)
+        torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=2e-4, atol=2e-4)
diff --git a/tests/models/lightglue/test_modeling_lightglue.py b/tests/models/lightglue/test_modeling_lightglue.py
index 20d9f2ef61..7f36469bf5 100644
--- a/tests/models/lightglue/test_modeling_lightglue.py
+++ b/tests/models/lightglue/test_modeling_lightglue.py
@@ -17,7 +17,7 @@ import unittest
 from datasets import load_dataset
 
 from transformers.models.lightglue.configuration_lightglue import LightGlueConfig
-from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.testing_utils import get_device_properties, require_torch, require_vision, slow, torch_device
 from transformers.utils import cached_property, is_torch_available, is_vision_available
 
 from ...test_configuration_common import ConfigTester
@@ -143,6 +143,13 @@ class LightGlueModelTest(ModelTesterMixin, unittest.TestCase):
         self.config_tester.check_config_can_be_init_without_params()
         self.config_tester.check_config_arguments_init()
 
+    def test_batching_equivalence(self, atol=1e-5, rtol=1e-5):
+        device_properties = get_device_properties()
+        if device_properties[0] == "cuda" and device_properties[1] == 8:
+            # TODO: (ydshieh) fix this
+            self.skipTest(reason="After switching to A10, this test always fails, but pass on CPU or T4.")
+        super().test_batching_equivalence(atol=atol, rtol=rtol)
+
     @unittest.skip(reason="LightGlueForKeypointMatching does not use inputs_embeds")
     def test_inputs_embeds(self):
         pass
diff --git a/tests/models/mgp_str/test_modeling_mgp_str.py b/tests/models/mgp_str/test_modeling_mgp_str.py
index 586e9f0bc4..1ff9927f89 100644
--- a/tests/models/mgp_str/test_modeling_mgp_str.py
+++ b/tests/models/mgp_str/test_modeling_mgp_str.py
@@ -140,6 +140,9 @@ class MgpstrModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
 
+    def test_batching_equivalence(self, atol=1e-4, rtol=1e-4):
+        super().test_batching_equivalence(atol=atol, rtol=rtol)
+
     @unittest.skip(reason="MgpstrModel does not use inputs_embeds")
     def test_inputs_embeds(self):
         pass
diff --git a/tests/models/minimax/test_modeling_minimax.py b/tests/models/minimax/test_modeling_minimax.py
index b9ae9d4515..0e36c7219b 100644
--- a/tests/models/minimax/test_modeling_minimax.py
+++ b/tests/models/minimax/test_modeling_minimax.py
@@ -20,6 +20,7 @@ import pytest
 from transformers import MiniMaxConfig, is_torch_available
 from transformers.cache_utils import Cache
 from transformers.testing_utils import (
+    Expectations,
     require_flash_attn,
     require_torch,
     require_torch_accelerator,
@@ -250,15 +251,20 @@ class MiniMaxIntegrationTest(unittest.TestCase):
             model_id,
             torch_dtype=torch.bfloat16,
         ).to(torch_device)
-        expected_slice = torch.tensor(
-            [[1.0312, -0.5156, -0.3262], [-0.1152, 0.4336, 0.2412], [1.2188, -0.5898, -0.0381]]
-        ).to(torch_device)
 
         with torch.no_grad():
             logits = model(dummy_input).logits
 
         logits = logits.float()
 
+        expectations = Expectations(
+            {
+                (None, None): [[1.0312, -0.5156, -0.3262], [-0.1152, 0.4336, 0.2412], [1.2188, -0.5898, -0.0381]],
+                ("cuda", 8): [[1.0312, -0.5156, -0.3203], [-0.1201, 0.4375, 0.2402], [1.2188, -0.5898, -0.0396]],
+            }
+        )
+        expected_slice = torch.tensor(expectations.get_expectation()).to(torch_device)
+
         torch.testing.assert_close(logits[0, :3, :3], expected_slice, atol=1e-3, rtol=1e-3)
         torch.testing.assert_close(logits[1, :3, :3], expected_slice, atol=1e-3, rtol=1e-3)
 
diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py
index 94ceb0e4a7..56c71fe735 100644
--- a/tests/models/mixtral/test_modeling_mixtral.py
+++ b/tests/models/mixtral/test_modeling_mixtral.py
@@ -191,27 +191,26 @@ class MixtralIntegrationTest(unittest.TestCase):
         # ("cuda", 8) for A100/A10, and ("cuda", 7) for T4.
         #
         # considering differences in hardware processing and potential deviations in generated text.
-        # fmt: off
+
         EXPECTED_LOGITS_LEFT_UNPADDED = Expectations(
             {
-                ("xpu", 3): torch.Tensor([[0.2236, 0.5195, -0.3828], [0.8203, -0.2295, 0.6055], [0.2676, -0.7070, 0.2461]]).to(torch_device),
-                ("cuda", 7): torch.Tensor([[0.2236, 0.5195, -0.3828], [0.8203, -0.2275, 0.6054], [0.2656, -0.7070, 0.2460]]).to(torch_device),
-                ("cuda", 8): torch.Tensor([[0.2207, 0.5234, -0.3828], [0.8203, -0.2285, 0.6055], [0.2656, -0.7109, 0.2451]]).to(torch_device),
-                ("rocm", 9): torch.Tensor([[0.2236, 0.5195, -0.3828], [0.8203, -0.2285, 0.6055], [0.2637, -0.7109, 0.2451]]).to(torch_device),
+                ("xpu", 3): [[0.2236, 0.5195, -0.3828], [0.8203, -0.2295, 0.6055], [0.2676, -0.7070, 0.2461]],
+                ("cuda", 7): [[0.2236, 0.5195, -0.3828], [0.8203, -0.2275, 0.6054], [0.2656, -0.7070, 0.2460]],
+                ("cuda", 8): [[0.2217, 0.5195, -0.3828], [0.8203, -0.2295, 0.6055], [0.2676, -0.7109, 0.2461]],
+                ("rocm", 9): [[0.2236, 0.5195, -0.3828], [0.8203, -0.2285, 0.6055], [0.2637, -0.7109, 0.2451]],
             }
         )
-        expected_left_unpadded = EXPECTED_LOGITS_LEFT_UNPADDED.get_expectation()
+        expected_left_unpadded = torch.tensor(EXPECTED_LOGITS_LEFT_UNPADDED.get_expectation(), device=torch_device)
 
         EXPECTED_LOGITS_RIGHT_UNPADDED = Expectations(
             {
-                ("xpu", 3): torch.Tensor([[0.2178, 0.1270, -0.1641], [-0.3496, 0.2988, -1.0312], [0.0693, 0.7930, 0.8008]]).to(torch_device),
-                ("cuda", 7): torch.Tensor([[0.2167, 0.1269, -0.1640], [-0.3496, 0.2988, -1.0312], [0.0688, 0.7929, 0.8007]]).to(torch_device),
-                ("cuda", 8): torch.Tensor([[0.2178, 0.1270, -0.1621], [-0.3496, 0.3008, -1.0312], [0.0693, 0.7930, 0.7969]]).to(torch_device),
-                ("rocm", 9): torch.Tensor([[0.2197, 0.1250, -0.1611], [-0.3516, 0.3008, -1.0312], [0.0684, 0.7930, 0.8008]]).to(torch_device),
+                ("xpu", 3): [[0.2178, 0.1270, -0.1641], [-0.3496, 0.2988, -1.0312], [0.0693, 0.7930, 0.8008]],
+                ("cuda", 7): [[0.2167, 0.1269, -0.1640], [-0.3496, 0.2988, -1.0312], [0.0688, 0.7929, 0.8007]],
+                ("cuda", 8): [[0.2178, 0.1260, -0.1621], [-0.3496, 0.2988, -1.0312], [0.0693, 0.7930, 0.8008]],
+                ("rocm", 9): [[0.2197, 0.1250, -0.1611], [-0.3516, 0.3008, -1.0312], [0.0684, 0.7930, 0.8008]],
             }
         )
-        expected_right_unpadded = EXPECTED_LOGITS_RIGHT_UNPADDED.get_expectation()
-        # fmt: on
+        expected_right_unpadded = torch.tensor(EXPECTED_LOGITS_RIGHT_UNPADDED.get_expectation(), device=torch_device)
 
         with torch.no_grad():
             logits = model(dummy_input, attention_mask=attention_mask).logits
diff --git a/tests/models/moonshine/test_modeling_moonshine.py b/tests/models/moonshine/test_modeling_moonshine.py
index 99573cff09..a551244a6e 100644
--- a/tests/models/moonshine/test_modeling_moonshine.py
+++ b/tests/models/moonshine/test_modeling_moonshine.py
@@ -17,7 +17,7 @@ import copy
 import unittest
 
 from transformers import MoonshineConfig, is_torch_available
-from transformers.testing_utils import cleanup, require_torch, slow, torch_device
+from transformers.testing_utils import Expectations, cleanup, require_torch, slow, torch_device
 
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
@@ -457,13 +457,15 @@ class MoonshineModelIntegrationTests(unittest.TestCase):
         outputs = model.generate(**inputs, max_new_tokens=1, return_dict_in_generate=True, output_logits=True)
 
         # fmt: off
-        EXPECTED_LOGITS = torch.tensor([
-            -9.1106,  4.5542,  6.3892, -6.8139, -7.2456, -7.9074, -7.2839, -7.6043, -8.0384, -7.8351,
-            -7.3867, -7.2450, -7.7420, -7.3912, -7.3866, -7.6979, -7.6420, -7.0504, -7.3979, -7.2483,
-            -8.0796, -7.3300, -7.3672, -6.8765, -7.6876, -7.2682, -6.9866, -6.7457, -7.6855, -7.3050,
-        ])
+        expectations = Expectations(
+            {
+                (None, None): [-9.1106, 4.5542, 6.3892, -6.8139, -7.2456, -7.9074, -7.2839, -7.6043, -8.0384, -7.8351, -7.3867, -7.2450, -7.7420, -7.3912, -7.3866, -7.6979, -7.6420, -7.0504, -7.3979, -7.2483, -8.0796, -7.3300, -7.3672, -6.8765, -7.6876, -7.2682, -6.9866, -6.7457, -7.6855, -7.3050],
+                ("cuda", 8): [-9.1107, 4.5538, 6.3902, -6.8141, -7.2459, -7.9076, -7.2842, -7.6045, -8.0387, -7.8354, -7.3869, -7.2453, -7.7423, -7.3914, -7.3869, -7.6982, -7.6422, -7.0507, -7.3982, -7.2486, -8.0798, -7.3302, -7.3675, -6.8769, -7.6878, -7.2684, -6.9868, -6.7459, -7.6858, -7.3052],
+            }
+        )
+        EXPECTED_LOGITS = torch.tensor(expectations.get_expectation()).to(torch_device)
         # fmt: on
-        torch.testing.assert_close(outputs.logits[0][0, :30].cpu(), EXPECTED_LOGITS, rtol=1e-4, atol=1e-4)
+        torch.testing.assert_close(outputs.logits[0][0, :30], EXPECTED_LOGITS, rtol=2e-4, atol=2e-4)
 
     @slow
     def test_base_logits_single(self):
@@ -476,7 +478,7 @@ class MoonshineModelIntegrationTests(unittest.TestCase):
 
         # fmt: off
         EXPECTED_LOGITS = torch.tensor([
-            -6.7336,  1.9482,  5.2448, -8.0277, -7.9167, -7.8956, -7.9649, -7.9348, -8.1312, -8.0616,
+            -6.7336, 1.9482, 5.2448, -8.0277, -7.9167, -7.8956, -7.9649, -7.9348, -8.1312, -8.0616,
             -8.1070, -7.7696, -7.8809, -7.9450, -8.1013, -7.8177, -7.8598, -7.8257, -7.8729, -7.9657,
             -7.9310, -8.1024, -7.8699, -7.8231, -8.0752, -7.9764, -7.8127, -8.0536, -7.9492, -7.9290,
         ])
@@ -493,9 +495,9 @@ class MoonshineModelIntegrationTests(unittest.TestCase):
         outputs = model.generate(**inputs, max_new_tokens=1, return_dict_in_generate=True, output_logits=True)
         # fmt: off
         EXPECTED_LOGITS = torch.tensor([
-            [-8.0109,  5.0241,  4.5979, -6.8125, -7.1675, -7.8783, -7.2152, -7.5188, -7.9077, -7.7394],
-            [-4.4399, -1.4422,  6.6710, -6.8929, -7.3751, -7.0969, -6.5257, -7.0257, -7.2585, -7.0008],
-            [-10.0086, 3.2859,  0.7345, -6.5557, -6.8514, -6.5308, -6.4172, -6.9484, -6.6214, -6.6229],
+            [-8.0109, 5.0241, 4.5979, -6.8125, -7.1675, -7.8783, -7.2152, -7.5188, -7.9077, -7.7394],
+            [-4.4399, -1.4422, 6.6710, -6.8929, -7.3751, -7.0969, -6.5257, -7.0257, -7.2585, -7.0008],
+            [-10.0086, 3.2859, 0.7345, -6.5557, -6.8514, -6.5308, -6.4172, -6.9484, -6.6214, -6.6229],
             [-10.8078, 4.0030, -0.0633, -5.0505, -5.3906, -5.4590, -5.2420, -5.4746, -5.2665, -5.3158]
         ])
         # fmt: on
@@ -512,10 +514,10 @@ class MoonshineModelIntegrationTests(unittest.TestCase):
 
         # fmt: off
         EXPECTED_LOGITS = torch.tensor([
-            [-7.7272,  1.4630,  5.2294, -7.7313, -7.6252, -7.6011, -7.6788, -7.6441, -7.8452, -7.7549],
-            [-6.2173, -0.5891,  7.9493, -7.0694, -6.9997, -6.9982, -7.0953, -7.0831, -7.1686, -7.0137],
-            [-7.3184,  3.1192,  3.8937, -5.7206, -5.8428, -5.7609, -5.9996, -5.8212, -5.8615, -5.8719],
-            [-9.5475,  1.0146,  4.1179, -5.9971, -6.0614, -6.0329, -6.2103, -6.0318, -6.0789, -6.0873]
+            [-7.7272, 1.4630, 5.2294, -7.7313, -7.6252, -7.6011, -7.6788, -7.6441, -7.8452, -7.7549],
+            [-6.2173, -0.5891, 7.9493, -7.0694, -6.9997, -6.9982, -7.0953, -7.0831, -7.1686, -7.0137],
+            [-7.3184, 3.1192, 3.8937, -5.7206, -5.8428, -5.7609, -5.9996, -5.8212, -5.8615, -5.8719],
+            [-9.5475, 1.0146, 4.1179, -5.9971, -6.0614, -6.0329, -6.2103, -6.0318, -6.0789, -6.0873]
         ])
 
         # fmt: on
diff --git a/tests/models/mpt/test_modeling_mpt.py b/tests/models/mpt/test_modeling_mpt.py
index 449ddfbc2a..15d8fddb9f 100644
--- a/tests/models/mpt/test_modeling_mpt.py
+++ b/tests/models/mpt/test_modeling_mpt.py
@@ -446,7 +446,8 @@ class MptIntegrationTests(unittest.TestCase):
 
         input_text = "Hello"
         expected_outputs = Expectations({
-            ("cuda", None): "Hello, I'm a new user of the forum. I have a question about the \"Solaris",
+            (None, None): "Hello, I'm a new user of the forum. I have a question about the \"Solaris",
+            ("cuda", 8): "Hello, I'm a new user of the forum. I have a question. I have a problem with",
             ("rocm", (9, 5)): "Hello, I'm a newbie to the forum. I have a question about the \"B\" in",
         })  # fmt: off
         expected_output = expected_outputs.get_expectation()
@@ -468,10 +469,10 @@ class MptIntegrationTests(unittest.TestCase):
 
         input_text = "Hello"
         expected_outputs = Expectations({
+            (None, None): "Hello and welcome to the first episode of the new podcast, The Frugal Feminist.\n",
             ("rocm", (9, 5)): "Hello and welcome to the first day of the new release at The Stamp Man!\nToday we are",
             ("xpu", 3): "Hello and welcome to the first ever episode of the new and improved, and hopefully improved, podcast.\n",
-            ("cuda", 7): "Hello and welcome to the first episode of the new podcast, The Frugal Feminist.\n",
-            ("cuda", 8): "Hello and welcome to the first day of the new release countdown for the month of May!\nToday",
+            ("cuda", 8): "Hello and welcome to the first ever episode of the new and improved, and hopefully improved, podcast.\n",
         })  # fmt: off
         expected_output = expected_outputs.get_expectation()
 
@@ -499,13 +500,17 @@ class MptIntegrationTests(unittest.TestCase):
 
         expected_outputs = Expectations(
             {
+                (None, None): [
+                    "Hello my name is Tiffany and I am a mother of two beautiful children. I have been a nanny for the",
+                    "Today I am going at the gym and then I am going to go to the grocery store. I am going to buy some food and some",
+                ],
                 ("xpu", 3): [
                     "Hello my name is Tiffany. I am a mother of two beautiful children. I have been a nanny for over",
                     "Today I am going at the gym and then I am going to go to the mall with my mom. I am going to go to the",
                 ],
-                ("cuda", 7): [
-                    "Hello my name is Tiffany and I am a mother of two beautiful children. I have been a nanny for the",
-                    "Today I am going at the gym and then I am going to go to the grocery store. I am going to buy some food and some",
+                ("cuda", 8): [
+                    "Hello my name is Tiffany and I am a mother of two beautiful children. I have been a nanny for over",
+                    "Today I am going at the gym and then I am going to go to the grocery store. I am going to make a list of things",
                 ],
                 ("rocm", (9, 5)): [
                     "Hello my name is Jasmine and I am a very sweet and loving dog. I am a very playful dog and I",
@@ -534,8 +539,9 @@ class MptIntegrationTests(unittest.TestCase):
 
         expected_slices = Expectations(
             {
+                (None, None): torch.Tensor([-0.2520, -0.2178, -0.1953]),
                 ("xpu", 3): torch.Tensor([-0.2090, -0.2061, -0.1465]),
-                ("cuda", 7): torch.Tensor([-0.2520, -0.2178, -0.1953]),
+                ("cuda", 8): torch.Tensor([-0.2559, -0.2227, -0.2217]),
                 # TODO: This is quite a bit off, check BnB
                 ("rocm", (9, 5)): torch.Tensor([-0.3008, -0.1309, -0.1562]),
             }
diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py
index 3de8b482d7..9356ddf92e 100644
--- a/tests/models/musicgen/test_modeling_musicgen.py
+++ b/tests/models/musicgen/test_modeling_musicgen.py
@@ -31,6 +31,7 @@ from transformers import (
     T5Config,
 )
 from transformers.testing_utils import (
+    Expectations,
     get_device_properties,
     is_torch_available,
     require_flash_attn,
@@ -1377,16 +1378,17 @@ class MusicgenIntegrationTests(unittest.TestCase):
         output_values = model.generate(**unconditional_inputs, do_sample=True, max_new_tokens=10)
 
         # fmt: off
-        EXPECTED_VALUES = torch.tensor(
-            [
-                -0.0099, -0.0140, 0.0079, 0.0080, -0.0046,  0.0065, -0.0068, -0.0185,
-                 0.0105,  0.0059, 0.0329, 0.0249, -0.0204, -0.0341, -0.0465,  0.0053,
-            ]
+        expectations = Expectations(
+            {
+                (None, None): [-0.0099, -0.0140, 0.0079, 0.0080, -0.0046, 0.0065, -0.0068, -0.0185, 0.0105, 0.0059, 0.0329, 0.0249, -0.0204, -0.0341, -0.0465, 0.0053],
+                ("cuda", 8): [-0.0099, -0.0140, 0.0079, 0.0080, -0.0046, 0.0065, -0.0068, -0.0185, 0.0105, 0.0058, 0.0328, 0.0249, -0.0205, -0.0342, -0.0466, 0.0052],
+            }
         )
+        EXPECTED_VALUES = torch.tensor(expectations.get_expectation()).to(torch_device)
         # fmt: on
 
         self.assertTrue(output_values.shape == (2, 1, 4480))
-        torch.testing.assert_close(output_values[0, 0, :16].cpu(), EXPECTED_VALUES, rtol=1e-4, atol=1e-4)
+        torch.testing.assert_close(output_values[0, 0, :16], EXPECTED_VALUES, rtol=2e-4, atol=2e-4)
 
     @slow
     def test_generate_text_prompt_greedy(self):
@@ -1459,16 +1461,17 @@ class MusicgenIntegrationTests(unittest.TestCase):
         )
 
         # fmt: off
-        EXPECTED_VALUES = torch.tensor(
-            [
-                -0.0111, -0.0154, 0.0047, 0.0058, -0.0068,  0.0012, -0.0109, -0.0229,
-                 0.0010, -0.0038, 0.0167, 0.0042, -0.0421, -0.0610, -0.0764, -0.0326,
-            ]
+        expectations = Expectations(
+            {
+                (None, None): [-0.0111, -0.0154, 0.0047, 0.0058, -0.0068, 0.0012, -0.0109, -0.0229, 0.0010, -0.0038, 0.0167, 0.0042, -0.0421, -0.0610, -0.0764, -0.0326],
+                ("cuda", 8): [-0.0110, -0.0153, 0.0048, 0.0058, -0.0068, 0.0012, -0.0109, -0.0229, 0.0010, -0.0037, 0.0168, 0.0042, -0.0420, -0.0609, -0.0763, -0.0326],
+            }
         )
+        EXPECTED_VALUES = torch.tensor(expectations.get_expectation()).to(torch_device)
         # fmt: on
 
         self.assertTrue(output_values.shape == (2, 1, 4480))
-        torch.testing.assert_close(output_values[0, 0, :16].cpu(), EXPECTED_VALUES, rtol=1e-4, atol=1e-4)
+        torch.testing.assert_close(output_values[0, 0, :16], EXPECTED_VALUES, rtol=2e-4, atol=2e-4)
 
     @slow
     def test_generate_text_audio_prompt(self):
@@ -1521,13 +1524,13 @@ class MusicgenStereoIntegrationTests(unittest.TestCase):
         # fmt: off
         EXPECTED_VALUES_LEFT = torch.tensor(
             [
-                 0.0017,  0.0004,  0.0004,  0.0005,  0.0002,  0.0002, -0.0002, -0.0013,
+                 0.0017, 0.0004, 0.0004, 0.0005, 0.0002, 0.0002, -0.0002, -0.0013,
                 -0.0010, -0.0015, -0.0018, -0.0032, -0.0060, -0.0082, -0.0096, -0.0099,
             ]
         )
         EXPECTED_VALUES_RIGHT = torch.tensor(
             [
-                0.0038, 0.0028, 0.0031,  0.0032,  0.0031,  0.0032,  0.0030,  0.0019,
+                0.0038, 0.0028, 0.0031, 0.0032, 0.0031, 0.0032, 0.0030, 0.0019,
                 0.0021, 0.0015, 0.0009, -0.0008, -0.0040, -0.0067, -0.0087, -0.0096,
             ]
         )
@@ -1555,13 +1558,13 @@ class MusicgenStereoIntegrationTests(unittest.TestCase):
         # fmt: off
         EXPECTED_VALUES_LEFT = torch.tensor(
             [
-                 0.2535,  0.2008,  0.1471,  0.0896,  0.0306, -0.0200, -0.0501, -0.0728,
+                 0.2535, 0.2008, 0.1471, 0.0896, 0.0306, -0.0200, -0.0501, -0.0728,
                 -0.0832, -0.0856, -0.0867, -0.0884, -0.0864, -0.0866, -0.0744, -0.0430,
             ]
         )
         EXPECTED_VALUES_RIGHT = torch.tensor(
             [
-                 0.1695,  0.1213,  0.0732,  0.0239, -0.0264, -0.0705, -0.0935, -0.1103,
+                 0.1695, 0.1213, 0.0732, 0.0239, -0.0264, -0.0705, -0.0935, -0.1103,
                 -0.1163, -0.1139, -0.1104, -0.1082, -0.1027, -0.1004, -0.0900, -0.0614,
             ]
         )
diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
index eef833750b..4aa812a0ae 100644
--- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
+++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
@@ -30,6 +30,7 @@ from transformers import (
     T5Config,
 )
 from transformers.testing_utils import (
+    Expectations,
     get_device_properties,
     is_torch_available,
     is_torchaudio_available,
@@ -1472,16 +1473,17 @@ class MusicgenMelodyIntegrationTests(unittest.TestCase):
         )
 
         # fmt: off
-        EXPECTED_VALUES = torch.tensor(
-            [
-                -0.0165, -0.0222, -0.0041, -0.0058, -0.0145, -0.0023, -0.0160, -0.0310,
-                -0.0055, -0.0127,  0.0104,  0.0105, -0.0326, -0.0611, -0.0744, -0.0083
-            ]
+        expectations = Expectations(
+            {
+                (None, None): [-0.0165, -0.0222, -0.0041, -0.0058, -0.0145, -0.0023, -0.0160, -0.0310, -0.0055, -0.0127,  0.0104,  0.0105, -0.0326, -0.0611, -0.0744, -0.0083],
+                ("cuda", 8): [-0.0165, -0.0221, -0.0040, -0.0058, -0.0145, -0.0024, -0.0160, -0.0310, -0.0055, -0.0127,  0.0104,  0.0105, -0.0326, -0.0612, -0.0744, -0.0082],
+            }
         )
+        EXPECTED_VALUES = torch.tensor(expectations.get_expectation()).to(torch_device)
         # fmt: on
 
         self.assertTrue(output_values.shape == (2, 1, 4480))
-        torch.testing.assert_close(output_values[0, 0, :16].cpu(), EXPECTED_VALUES, rtol=1e-4, atol=1e-4)
+        torch.testing.assert_close(output_values[0, 0, :16], EXPECTED_VALUES, rtol=2e-4, atol=2e-4)
 
     @slow
     def test_generate_text_audio_prompt(self):
diff --git a/tests/models/sam/test_modeling_sam.py b/tests/models/sam/test_modeling_sam.py
index fa1ada4f61..660d529dc9 100644
--- a/tests/models/sam/test_modeling_sam.py
+++ b/tests/models/sam/test_modeling_sam.py
@@ -19,7 +19,7 @@ import unittest
 import requests
 
 from transformers import SamConfig, SamMaskDecoderConfig, SamPromptEncoderConfig, SamVisionConfig, pipeline
-from transformers.testing_utils import cleanup, require_torch, require_torch_sdpa, slow, torch_device
+from transformers.testing_utils import Expectations, cleanup, require_torch, require_torch_sdpa, slow, torch_device
 from transformers.utils import is_torch_available, is_vision_available
 
 from ...test_configuration_common import ConfigTester
@@ -771,9 +771,18 @@ class SamModelIntegrationTest(unittest.TestCase):
         with torch.no_grad():
             outputs = model(**inputs)
         scores = outputs.iou_scores.squeeze().cpu()
-        masks = outputs.pred_masks[0, 0, 0, 0, :3].cpu()
+        masks = outputs.pred_masks[0, 0, 0, 0, :3]
+
+        expectations = Expectations(
+            {
+                (None, None): [-12.7729, -12.3665, -12.6061],
+                ("cuda", 8): [-12.7657, -12.3683, -12.5983],
+            }
+        )
+        expected_masks = torch.tensor(expectations.get_expectation()).to(torch_device)
+
         torch.testing.assert_close(scores[-1], torch.tensor(0.9566), rtol=2e-4, atol=2e-4)
-        torch.testing.assert_close(masks, torch.tensor([-12.7729, -12.3665, -12.6061]), rtol=2e-4, atol=2e-4)
+        torch.testing.assert_close(masks, expected_masks, rtol=2e-4, atol=2e-4)
 
     def test_inference_mask_generation_batched_points_batched_images(self):
         model = SamModel.from_pretrained("facebook/sam-vit-base")
diff --git a/tests/models/sam_hq/test_modeling_sam_hq.py b/tests/models/sam_hq/test_modeling_sam_hq.py
index 830b537031..b4701fa975 100644
--- a/tests/models/sam_hq/test_modeling_sam_hq.py
+++ b/tests/models/sam_hq/test_modeling_sam_hq.py
@@ -27,7 +27,7 @@ from transformers import (
     SamHQVisionModel,
     pipeline,
 )
-from transformers.testing_utils import cleanup, require_torch, require_torch_sdpa, slow, torch_device
+from transformers.testing_utils import Expectations, cleanup, require_torch, require_torch_sdpa, slow, torch_device
 from transformers.utils import is_torch_available, is_vision_available
 
 from ...test_configuration_common import ConfigTester
@@ -802,9 +802,15 @@ class SamHQModelIntegrationTest(unittest.TestCase):
 
         masks = outputs.pred_masks[0, 0, 0, 0, :3]
         self.assertTrue(torch.allclose(scores[0][0][-1], torch.tensor(0.4482), atol=2e-4))
-        self.assertTrue(
-            torch.allclose(masks, torch.tensor([-13.1695, -14.6201, -14.8989]).to(torch_device), atol=2e-3)
+
+        expectations = Expectations(
+            {
+                (None, None): [-13.1695, -14.6201, -14.8989],
+                ("cuda", 8): [-13.1668, -14.6182, -14.8970],
+            }
         )
+        EXPECTED_MASKS = torch.tensor(expectations.get_expectation()).to(torch_device)
+        torch.testing.assert_close(masks, EXPECTED_MASKS, atol=2e-3, rtol=2e-3)
 
     def test_inference_mask_generation_one_point_one_bb(self):
         model = SamHQModel.from_pretrained("syscv-community/sam-hq-vit-base")
@@ -849,28 +855,53 @@ class SamHQModelIntegrationTest(unittest.TestCase):
 
         with torch.no_grad():
             outputs = model(**inputs)
-        scores = outputs.iou_scores.squeeze().cpu()
-        masks = outputs.pred_masks[0, 0, 0, 0, :3].cpu()
-        EXPECTED_SCORES = torch.tensor(
-            [
-                [
-                    [0.9195, 0.8316, 0.6614],
-                    [0.9195, 0.8316, 0.6614],
-                    [0.9195, 0.8316, 0.6614],
-                    [0.9195, 0.8316, 0.6614],
-                ],
-                [
-                    [0.7598, 0.7388, 0.3110],
-                    [0.9195, 0.8317, 0.6614],
-                    [0.9195, 0.8317, 0.6614],
-                    [0.9195, 0.8317, 0.6614],
-                ],
-            ]
-        )
-        EXPECTED_MASKS = torch.tensor([-40.2445, -37.4300, -38.1577])
+        scores = outputs.iou_scores.squeeze()
+        masks = outputs.pred_masks[0, 0, 0, 0, :3]
 
-        self.assertTrue(torch.allclose(scores, EXPECTED_SCORES, atol=1e-3))
-        self.assertTrue(torch.allclose(masks, EXPECTED_MASKS, atol=9e-3))
+        expectations = Expectations(
+            {
+                (None, None): [
+                    [
+                        [0.9195, 0.8316, 0.6614],
+                        [0.9195, 0.8316, 0.6614],
+                        [0.9195, 0.8316, 0.6614],
+                        [0.9195, 0.8316, 0.6614],
+                    ],
+                    [
+                        [0.7598, 0.7388, 0.3110],
+                        [0.9195, 0.8317, 0.6614],
+                        [0.9195, 0.8317, 0.6614],
+                        [0.9195, 0.8317, 0.6614],
+                    ],
+                ],
+                ("cuda", 8): [
+                    [
+                        [0.9195, 0.8316, 0.6614],
+                        [0.9195, 0.8316, 0.6614],
+                        [0.9195, 0.8316, 0.6614],
+                        [0.9195, 0.8316, 0.6614],
+                    ],
+                    [
+                        [0.7597, 0.7387, 0.3110],
+                        [0.9195, 0.8316, 0.6614],
+                        [0.9195, 0.8316, 0.6614],
+                        [0.9195, 0.8316, 0.6614],
+                    ],
+                ],
+            }
+        )
+        EXPECTED_SCORES = torch.tensor(expectations.get_expectation()).to(torch_device)
+
+        expectations = Expectations(
+            {
+                (None, None): [-40.2445, -37.4300, -38.1577],
+                ("cuda", 8): [-40.2351, -37.4334, -38.1526],
+            }
+        )
+        EXPECTED_MASKS = torch.tensor(expectations.get_expectation()).to(torch_device)
+
+        torch.testing.assert_close(scores, EXPECTED_SCORES, atol=1e-3, rtol=1e-3)
+        torch.testing.assert_close(masks, EXPECTED_MASKS, atol=9e-3, rtol=9e-3)
 
     def test_inference_mask_generation_one_point_one_bb_zero(self):
         model = SamHQModel.from_pretrained("syscv-community/sam-hq-vit-base")
diff --git a/tests/models/timesformer/test_modeling_timesformer.py b/tests/models/timesformer/test_modeling_timesformer.py
index 247d0a5fba..5b74a8507a 100644
--- a/tests/models/timesformer/test_modeling_timesformer.py
+++ b/tests/models/timesformer/test_modeling_timesformer.py
@@ -21,7 +21,7 @@ from huggingface_hub import hf_hub_download
 
 from transformers import TimesformerConfig
 from transformers.models.auto import get_values
-from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.testing_utils import Expectations, require_torch, require_vision, slow, torch_device
 from transformers.utils import cached_property, is_torch_available, is_vision_available
 
 from ...test_configuration_common import ConfigTester
@@ -350,6 +350,11 @@ class TimesformerModelIntegrationTest(unittest.TestCase):
         expected_shape = torch.Size((1, 400))
         self.assertEqual(outputs.logits.shape, expected_shape)
 
-        expected_slice = torch.tensor([-0.3016, -0.7713, -0.4205]).to(torch_device)
-
-        torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4)
+        expectations = Expectations(
+            {
+                (None, None): [-0.3016, -0.7713, -0.4205],
+                ("cuda", 8): [-0.3004, -0.7708, -0.4205],
+            }
+        )
+        expected_slice = torch.tensor(expectations.get_expectation()).to(torch_device)
+        torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=2e-4, atol=2e-4)
diff --git a/tests/models/timm_wrapper/test_modeling_timm_wrapper.py b/tests/models/timm_wrapper/test_modeling_timm_wrapper.py
index 3f103309a0..a37f10d381 100644
--- a/tests/models/timm_wrapper/test_modeling_timm_wrapper.py
+++ b/tests/models/timm_wrapper/test_modeling_timm_wrapper.py
@@ -18,6 +18,7 @@ import unittest
 
 from transformers import pipeline
 from transformers.testing_utils import (
+    Expectations,
     require_bitsandbytes,
     require_timm,
     require_torch,
@@ -304,10 +305,16 @@ class TimmWrapperModelIntegrationTest(unittest.TestCase):
         expected_label = 281  # tabby cat
         self.assertEqual(torch.argmax(outputs.logits).item(), expected_label)
 
-        expected_slice = torch.tensor([-11.2618, -9.6192, -10.3205]).to(torch_device)
+        expectations = Expectations(
+            {
+                (None, None): [-11.2618, -9.6192, -10.3205],
+                ("cuda", 8): [-11.2634, -9.6208, -10.3199],
+            }
+        )
+        expected_slice = torch.tensor(expectations.get_expectation()).to(torch_device)
+
         resulted_slice = outputs.logits[0, :3]
-        is_close = torch.allclose(resulted_slice, expected_slice, atol=1e-3)
-        self.assertTrue(is_close, f"Expected {expected_slice}, but got {resulted_slice}")
+        torch.testing.assert_close(resulted_slice, expected_slice, atol=1e-3, rtol=1e-3)
 
     @slow
     def test_inference_with_pipeline(self):
@@ -349,10 +356,16 @@ class TimmWrapperModelIntegrationTest(unittest.TestCase):
         expected_label = 281  # tabby cat
         self.assertEqual(torch.argmax(outputs.logits).item(), expected_label)
 
-        expected_slice = torch.tensor([-2.4043, 1.4492, -0.5127]).to(outputs.logits.dtype)
-        resulted_slice = outputs.logits[0, :3].cpu()
-        is_close = torch.allclose(resulted_slice, expected_slice, atol=0.1)
-        self.assertTrue(is_close, f"Expected {expected_slice}, but got {resulted_slice}")
+        expectations = Expectations(
+            {
+                (None, None): [-2.4043, 1.4492, -0.5127],
+                ("cuda", 8): [-2.2676, 1.5303, -0.4409],
+            }
+        )
+        expected_slice = torch.tensor(expectations.get_expectation()).to(torch_device)
+
+        resulted_slice = outputs.logits[0, :3].to(dtype=torch.float32)
+        torch.testing.assert_close(resulted_slice, expected_slice, atol=0.1, rtol=0.1)
 
     @slow
     def test_transformers_model_for_classification_is_equivalent_to_timm(self):
diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py
index 0b85c31f8a..2c592290a6 100644
--- a/tests/models/videomae/test_modeling_videomae.py
+++ b/tests/models/videomae/test_modeling_videomae.py
@@ -24,6 +24,7 @@ from pytest import mark
 from transformers import VideoMAEConfig
 from transformers.models.auto import get_values
 from transformers.testing_utils import (
+    Expectations,
     is_flaky,
     require_flash_attn,
     require_torch,
@@ -442,9 +443,14 @@ class VideoMAEModelIntegrationTest(unittest.TestCase):
         expected_shape = torch.Size((1, 400))
         self.assertEqual(outputs.logits.shape, expected_shape)
 
-        expected_slice = torch.tensor([0.3669, -0.0688, -0.2421]).to(torch_device)
-
-        torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4)
+        expectations = Expectations(
+            {
+                (None, None): [0.3669, -0.0688, -0.2421],
+                ("cuda", 8): [0.3668, -0.0690, -0.2421],
+            }
+        )
+        expected_slice = torch.tensor(expectations.get_expectation()).to(torch_device)
+        torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=2e-4, atol=2e-4)
 
     @slow
     def test_inference_for_pretraining(self):
diff --git a/tests/models/vitpose/test_modeling_vitpose.py b/tests/models/vitpose/test_modeling_vitpose.py
index 6f4ac62132..e9bce2d4c6 100644
--- a/tests/models/vitpose/test_modeling_vitpose.py
+++ b/tests/models/vitpose/test_modeling_vitpose.py
@@ -169,6 +169,9 @@ class VitPoseModelTest(ModelTesterMixin, unittest.TestCase):
         self.config_tester.check_config_can_be_init_without_params()
         self.config_tester.check_config_arguments_init()
 
+    def test_batching_equivalence(self, atol=3e-4, rtol=3e-4):
+        super().test_batching_equivalence(atol=atol, rtol=rtol)
+
     @unittest.skip(reason="VitPose does not support input and output embeddings")
     def test_model_common_attributes(self):
         pass
diff --git a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py
index 64ff79a68e..a95d6ca1fa 100644
--- a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py
+++ b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py
@@ -137,6 +137,9 @@ class VitPoseBackboneModelTest(ModelTesterMixin, unittest.TestCase):
     def test_config(self):
         self.config_tester.run_common_tests()
 
+    def test_batching_equivalence(self, atol=3e-4, rtol=3e-4):
+        super().test_batching_equivalence(atol=atol, rtol=rtol)
+
     # TODO: @Pavel
     @unittest.skip(reason="currently failing")
     def test_initialization(self):
diff --git a/tests/models/vivit/test_modeling_vivit.py b/tests/models/vivit/test_modeling_vivit.py
index d4d3efe374..f2866febb7 100644
--- a/tests/models/vivit/test_modeling_vivit.py
+++ b/tests/models/vivit/test_modeling_vivit.py
@@ -22,7 +22,7 @@ from huggingface_hub import hf_hub_download
 
 from transformers import VivitConfig
 from transformers.models.auto import get_values
-from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.testing_utils import Expectations, require_torch, require_vision, slow, torch_device
 from transformers.utils import cached_property, is_torch_available, is_vision_available
 
 from ...test_configuration_common import ConfigTester
@@ -355,10 +355,14 @@ class VivitModelIntegrationTest(unittest.TestCase):
         expected_shape = torch.Size((1, 400))
         self.assertEqual(outputs.logits.shape, expected_shape)
 
-        # taken from original model
-        expected_slice = torch.tensor([-0.9498, 2.7971, -1.4049, 0.1024, -1.8353]).to(torch_device)
-
-        torch.testing.assert_close(outputs.logits[0, :5], expected_slice, rtol=1e-4, atol=1e-4)
+        expectations = Expectations(
+            {
+                (None, None): [-0.9498, 2.7971, -1.4049, 0.1024, -1.8353],
+                ("cuda", 8): [-0.9502, 2.7967, -1.4046, 0.1027, -1.8345],
+            }
+        )
+        expected_slice = torch.tensor(expectations.get_expectation()).to(torch_device)
+        torch.testing.assert_close(outputs.logits[0, :5], expected_slice, rtol=2e-4, atol=2e-4)
 
     @slow
     def test_inference_interpolate_pos_encoding(self):
diff --git a/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py b/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py
index 1408e09744..a8e2c4843e 100644
--- a/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py
+++ b/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py
@@ -20,6 +20,7 @@ from datasets import load_dataset
 
 from transformers import Wav2Vec2BertConfig, is_torch_available
 from transformers.testing_utils import (
+    is_flaky,
     require_torch,
     require_torch_accelerator,
     require_torch_fp16,
@@ -434,6 +435,10 @@ class Wav2Vec2BertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Test
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
 
+    @is_flaky(description="Get lager difference with A10 and even with the new `5e-4` still flaky")
+    def test_batching_equivalence(self, atol=5e-4, rtol=5e-4):
+        super().test_batching_equivalence(atol=atol, rtol=rtol)
+
     def test_model_with_relative(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs(position_embeddings_type="relative")
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py
index f980582024..430653721d 100644
--- a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py
+++ b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py
@@ -428,8 +428,8 @@ class Wav2Vec2ConformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest
     @is_flaky(
         description="The `codevector_idx` computed with `argmax()` in `Wav2Vec2ConformerGumbelVectorQuantizer.forward` is not stable."
     )
-    def test_batching_equivalence(self):
-        super().test_batching_equivalence()
+    def test_batching_equivalence(self, atol=1e-4, rtol=1e-4):
+        super().test_batching_equivalence(atol=atol, rtol=rtol)
 
     def test_model_with_relative(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs(position_embeddings_type="relative")
diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py
index 1a0c7dda6e..8c5134fc6d 100644
--- a/tests/models/x_clip/test_modeling_x_clip.py
+++ b/tests/models/x_clip/test_modeling_x_clip.py
@@ -22,7 +22,14 @@ import numpy as np
 from huggingface_hub import hf_hub_download
 
 from transformers import XCLIPConfig, XCLIPTextConfig, XCLIPVisionConfig
-from transformers.testing_utils import require_torch, require_torch_multi_gpu, require_vision, slow, torch_device
+from transformers.testing_utils import (
+    Expectations,
+    require_torch,
+    require_torch_multi_gpu,
+    require_vision,
+    slow,
+    torch_device,
+)
 from transformers.utils import is_torch_available, is_vision_available
 
 from ...test_configuration_common import ConfigTester
@@ -751,10 +758,13 @@ class XCLIPModelIntegrationTest(unittest.TestCase):
 
         self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)
 
-        expected_slice = torch.tensor(
-            [[0.0126, 0.2109, 0.0609], [0.0448, 0.5862, -0.1688], [-0.0881, 0.8525, -0.3044]]
-        ).to(torch_device)
-
-        torch.testing.assert_close(
-            outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4
+        expectations = Expectations(
+            {
+                (None, None): [[0.0126, 0.2109, 0.0609], [0.0448, 0.5862, -0.1688], [-0.0881, 0.8525, -0.3044]],
+                ("cuda", 8): [[0.0141, 0.2114, 0.0599], [0.0446, 0.5866, -0.1674], [-0.0876, 0.8592, -0.3025]],
+            }
+        )
+        expected_slice = torch.tensor(expectations.get_expectation()).to(torch_device)
+        torch.testing.assert_close(
+            outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, rtol=2e-4, atol=2e-4
         )