Byebye test_batching_equivalence's flakiness (#35729)

* fix

* fix

* skip

* better error message

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-01-21 13:11:33 +01:00
parent 78f5ee0217
commit fd8d61fdb2
18 changed files with 92 additions and 50 deletions
--- a/tests/models/autoformer/test_modeling_autoformer.py
+++ b/tests/models/autoformer/test_modeling_autoformer.py
@@ -217,6 +217,13 @@ class AutoformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
        self.model_tester = AutoformerModelTester(self)
        self.config_tester = ConfigTester(self, config_class=AutoformerConfig, has_text_modality=False)

+    # TODO: (ydshieh) Fix the wrong logic for `tmp_delay` if possible
+    @unittest.skip(
+        reason="The computation of `tmp_delay` in `AutoformerAttention.forward` seems wrong, see PR #12345. Also `topk` is used to compute indices which is not stable."
+    )
+    def test_batching_equivalence(self):
+        super().test_batching_equivalence()
+
    def test_config(self):
        self.config_tester.run_common_tests()

--- a/tests/models/dac/test_modeling_dac.py
+++ b/tests/models/dac/test_modeling_dac.py
@@ -146,6 +146,11 @@ class DacModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model_forward(*config_and_inputs)

+    # TODO (ydshieh): Although we have a potential cause, it's still strange that this test fails all the time with large differences
+    @unittest.skip(reason="Might be caused by `indices` computed with `max()` in `decode_latents`")
+    def test_batching_equivalence(self):
+        super().test_batching_equivalence()
+
    def test_forward_signature(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

--- a/tests/models/dpt/test_modeling_dpt_hybrid.py
+++ b/tests/models/dpt/test_modeling_dpt_hybrid.py
@@ -18,7 +18,7 @@ import unittest

 from transformers import DPTConfig
 from transformers.file_utils import is_torch_available, is_vision_available
-from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device
+from transformers.testing_utils import require_torch, require_vision, slow, torch_device

 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
@@ -304,10 +304,6 @@ class DPTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
        with self.assertRaises(ValueError):
            _ = DPTForDepthEstimation(config)

-    @is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516")
-    def test_batching_equivalence(self):
-        super().test_batching_equivalence()
-

 # We will verify our results on an image of cute cats
 def prepare_img():
--- a/tests/models/esm/test_modeling_esmfold.py
+++ b/tests/models/esm/test_modeling_esmfold.py
@@ -17,7 +17,7 @@
 import unittest

 from transformers import EsmConfig, is_torch_available
-from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device
+from transformers.testing_utils import TestCasePlus, is_flaky, require_torch, slow, torch_device

 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
@@ -184,6 +184,12 @@ class EsmFoldModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

+    @is_flaky(
+        description="The computed `s = s / norm_denom` in `EsmFoldAngleResnet` is numerically instable if `norm_denom` is very small."
+    )
+    def test_batching_equivalence(self):
+        super().test_batching_equivalence()
+
    @unittest.skip(reason="Does not support attention outputs")
    def test_attention_outputs(self):
        pass
--- a/tests/models/groupvit/test_modeling_groupvit.py
+++ b/tests/models/groupvit/test_modeling_groupvit.py
@@ -24,7 +24,7 @@ import numpy as np
 import requests

 from transformers import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig
-from transformers.testing_utils import is_pt_tf_cross_test, require_torch, require_vision, slow, torch_device
+from transformers.testing_utils import is_flaky, is_pt_tf_cross_test, require_torch, require_vision, slow, torch_device
 from transformers.utils import is_torch_available, is_vision_available

 from ...test_configuration_common import ConfigTester
@@ -162,6 +162,10 @@ class GroupViTVisionModelTest(ModelTesterMixin, unittest.TestCase):
    def test_inputs_embeds(self):
        pass

+    @is_flaky(description="The `index` computed with `max()` in `hard_softmax` is not stable.")
+    def test_batching_equivalence(self):
+        super().test_batching_equivalence()
+
    @is_pt_tf_cross_test
    def test_pt_tf_model_equivalence(self):
        import tensorflow as tf
@@ -571,6 +575,10 @@ class GroupViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
    def test_config(self):
        self.config_tester.run_common_tests()

+    @is_flaky(description="The `index` computed with `max()` in `hard_softmax` is not stable.")
+    def test_batching_equivalence(self):
+        super().test_batching_equivalence()
+
    @unittest.skip(reason="hidden_states are tested in individual model tests")
    def test_hidden_states_output(self):
        pass
--- a/tests/models/mimi/test_modeling_mimi.py
+++ b/tests/models/mimi/test_modeling_mimi.py
@@ -734,10 +734,6 @@ class MimiModelTest(ModelTesterMixin, unittest.TestCase):
    def test_sdpa_can_compile_dynamic(self):
        pass

-    @is_flaky()
-    def test_batching_equivalence(self):
-        super().test_batching_equivalence()
-

 # Copied from transformers.tests.encodec.test_modeling_encodec.normalize
 def normalize(arr):
--- a/tests/models/mobilenet_v1/test_modeling_mobilenet_v1.py
+++ b/tests/models/mobilenet_v1/test_modeling_mobilenet_v1.py
@@ -17,7 +17,7 @@
 import unittest

 from transformers import MobileNetV1Config
-from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device
+from transformers.testing_utils import require_torch, require_vision, slow, torch_device
 from transformers.utils import cached_property, is_torch_available, is_vision_available

 from ...test_configuration_common import ConfigTester
@@ -214,12 +214,6 @@ class MobileNetV1ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
        model = MobileNetV1Model.from_pretrained(model_name)
        self.assertIsNotNone(model)

-    # TODO: ydshieh
-    @unittest.skip("skip for now as #35564 fails this test more frequently for this model")
-    @is_flaky(description="is_flaky https://github.com/huggingface/transformers/pull/31258")
-    def test_batching_equivalence(self):
-        super().test_batching_equivalence()
-

 # We will verify our results on an image of cute cats
 def prepare_img():
--- a/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py
+++ b/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py
@@ -17,7 +17,7 @@
 import unittest

 from transformers import MobileNetV2Config
-from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device
+from transformers.testing_utils import require_torch, require_vision, slow, torch_device
 from transformers.utils import cached_property, is_torch_available, is_vision_available

 from ...test_configuration_common import ConfigTester
@@ -269,10 +269,6 @@ class MobileNetV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
        model = MobileNetV2Model.from_pretrained(model_name)
        self.assertIsNotNone(model)

-    @is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516")
-    def test_batching_equivalence(self):
-        super().test_batching_equivalence()
-

 # We will verify our results on an image of cute cats
 def prepare_img():
--- a/tests/models/mobilevit/test_modeling_mobilevit.py
+++ b/tests/models/mobilevit/test_modeling_mobilevit.py
@@ -17,7 +17,7 @@
 import unittest

 from transformers import MobileViTConfig
-from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device
+from transformers.testing_utils import require_torch, require_vision, slow, torch_device
 from transformers.utils import cached_property, is_torch_available, is_vision_available

 from ...test_configuration_common import ConfigTester
@@ -274,10 +274,6 @@ class MobileViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
        model = MobileViTModel.from_pretrained(model_name)
        self.assertIsNotNone(model)

-    @is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516")
-    def test_batching_equivalence(self):
-        super().test_batching_equivalence()
-

 # We will verify our results on an image of cute cats
 def prepare_img():
--- a/tests/models/oneformer/test_modeling_oneformer.py
+++ b/tests/models/oneformer/test_modeling_oneformer.py
@@ -23,6 +23,7 @@ import numpy as np
 from tests.test_modeling_common import floats_tensor
 from transformers import OneFormerConfig, is_torch_available, is_vision_available
 from transformers.testing_utils import (
+    is_flaky,
    require_timm,
    require_torch,
    require_torch_accelerator,
@@ -268,6 +269,12 @@ class OneFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
    def test_config(self):
        self.config_tester.run_common_tests()

+    @is_flaky(
+        description="The `attention_mask` computed with `< 0.5` in `OneFormerTransformerDecoder.forward_prediction_heads` is sensitive to input values."
+    )
+    def test_batching_equivalence(self):
+        super().test_batching_equivalence()
+
    def test_oneformer_model(self):
        config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
        self.model_tester.create_and_check_oneformer_model(config, **inputs, output_hidden_states=False)
--- a/tests/models/superpoint/test_modeling_superpoint.py
+++ b/tests/models/superpoint/test_modeling_superpoint.py
@@ -16,7 +16,7 @@ import unittest
 from typing import List

 from transformers.models.superpoint.configuration_superpoint import SuperPointConfig
-from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device
 from transformers.utils import cached_property, is_torch_available, is_vision_available

 from ...test_configuration_common import ConfigTester
@@ -135,6 +135,10 @@ class SuperPointModelTest(ModelTesterMixin, unittest.TestCase):
    def test_config(self):
        self.config_tester.run_common_tests()

+    @is_flaky(description="The `indices` computed with `topk()` in `top_k_keypoints` is not stable.")
+    def test_batching_equivalence(self):
+        super().test_batching_equivalence()
+
    @unittest.skip(reason="SuperPointForKeypointDetection does not use inputs_embeds")
    def test_inputs_embeds(self):
        pass
--- a/tests/models/timm_backbone/test_modeling_timm_backbone.py
+++ b/tests/models/timm_backbone/test_modeling_timm_backbone.py
@@ -18,7 +18,7 @@ import inspect
 import unittest

 from transformers import AutoBackbone
-from transformers.testing_utils import require_timm, require_torch, torch_device
+from transformers.testing_utils import is_flaky, require_timm, require_torch, torch_device
 from transformers.utils.import_utils import is_torch_available

 from ...test_backbone_common import BackboneTesterMixin
@@ -115,6 +115,12 @@ class TimmBackboneModelTest(ModelTesterMixin, BackboneTesterMixin, PipelineTeste
    def test_config(self):
        self.config_tester.run_common_tests()

+    @is_flaky(
+        description="`TimmBackbone` has no `_init_weights`. Timm's way of weight init. seems to give larger magnitude in the intermediate values during `forward`."
+    )
+    def test_batching_equivalence(self):
+        super().test_batching_equivalence()
+
    def test_timm_transformer_backbone_equivalence(self):
        timm_checkpoint = "resnet18"
        transformers_checkpoint = "microsoft/resnet-18"
--- a/tests/models/unispeech/test_modeling_unispeech.py
+++ b/tests/models/unispeech/test_modeling_unispeech.py
@@ -22,7 +22,7 @@ import pytest
 from datasets import load_dataset

 from transformers import UniSpeechConfig, is_torch_available
-from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
+from transformers.testing_utils import is_flaky, require_soundfile, require_torch, slow, torch_device

 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
@@ -329,6 +329,12 @@ class UniSpeechRobustModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.T
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

+    @is_flaky(
+        description="The `codevector_idx` computed with `argmax()` in `UniSpeechGumbelVectorQuantizer.forward` is not stable."
+    )
+    def test_batching_equivalence(self):
+        super().test_batching_equivalence()
+
    def test_batched_inference(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_batch_inference(*config_and_inputs)
--- a/tests/models/wav2vec2/test_modeling_wav2vec2.py
+++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py
@@ -30,6 +30,7 @@ from transformers import Wav2Vec2Config, is_torch_available
 from transformers.testing_utils import (
    CaptureLogger,
    cleanup,
+    is_flaky,
    is_pt_flax_cross_test,
    is_pyctcdecode_available,
    is_torchaudio_available,
@@ -863,6 +864,12 @@ class Wav2Vec2RobustModelTest(ModelTesterMixin, unittest.TestCase):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

+    @is_flaky(
+        description="The `codevector_idx` computed with `argmax()` in `Wav2Vec2GumbelVectorQuantizer.forward` is not stable."
+    )
+    def test_batching_equivalence(self):
+        super().test_batching_equivalence()
+
    def test_model_with_adapter(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model_with_adapter(*config_and_inputs)
--- a/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py
+++ b/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py
@@ -423,7 +423,6 @@ class Wav2Vec2BertModelTester:


@require_torch
-# Copied from tests.models.wav2vec2_conformer.test_modeling_wav2vec2_conformer.Wav2Vec2ConformerModelTest with Conformer->Bert, input_values->input_features
 class Wav2Vec2BertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    # Ignore copy
    all_model_classes = (
--- a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py
+++ b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py
@@ -23,6 +23,7 @@ from datasets import load_dataset

 from transformers import Wav2Vec2ConformerConfig, is_torch_available
 from transformers.testing_utils import (
+    is_flaky,
    is_pt_flax_cross_test,
    require_torch,
    require_torch_accelerator,
@@ -452,6 +453,12 @@ class Wav2Vec2ConformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

+    @is_flaky(
+        description="The `codevector_idx` computed with `argmax()` in `Wav2Vec2ConformerGumbelVectorQuantizer.forward` is not stable."
+    )
+    def test_batching_equivalence(self):
+        super().test_batching_equivalence()
+
    def test_model_with_relative(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs(position_embeddings_type="relative")
        self.model_tester.create_and_check_model(*config_and_inputs)
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -770,15 +770,6 @@ class ModelTesterMixin:
        different results: https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535)
        """

-        def get_tensor_equivalence_function(batched_input):
-            # models operating on continuous spaces have higher abs difference than LMs
-            # instead, we can rely on cos distance for image/speech models, similar to `diffusers`
-            if "input_ids" not in batched_input:
-                return lambda tensor1, tensor2: (
-                    1.0 - F.cosine_similarity(tensor1.float().flatten(), tensor2.float().flatten(), dim=0, eps=1e-38)
-                )
-            return lambda tensor1, tensor2: torch.max(torch.abs(tensor1 - tensor2))
-
        def recursive_check(batched_object, single_row_object, model_name, key):
            if isinstance(batched_object, (list, tuple)):
                for batched_object_value, single_row_object_value in zip(batched_object, single_row_object):
@@ -793,6 +784,10 @@ class ModelTesterMixin:
                return
            elif batched_object.dim() == 0:
                return
+            # do not compare int or bool outputs as they are mostly computed with max/argmax/topk methods which are
+            # very sensitive to the inputs (e.g. tiny differences may give totally different results)
+            elif not torch.is_floating_point(batched_object):
+                return
            else:
                # indexing the first element does not always work
                # e.g. models that output similarity scores of size (N, M) would need to index [0, 0]
@@ -810,19 +805,17 @@ class ModelTesterMixin:
                self.assertFalse(
                    torch.isinf(single_row_object).any(), f"Single row output has `inf` in {model_name} for key={key}"
                )
-                self.assertTrue(
-                    (equivalence(batched_row, single_row_object)) <= 1e-03,
-                    msg=(
-                        f"Batched and Single row outputs are not equal in {model_name} for key={key}. "
-                        f"Difference={equivalence(batched_row, single_row_object)}."
-                    ),
-                )
+                try:
+                    torch.testing.assert_close(batched_row, single_row_object, atol=1e-5, rtol=1e-5)
+                except AssertionError as e:
+                    msg = f"Batched and Single row outputs are not equal in {model_name} for key={key}.\n\n"
+                    msg += str(e)
+                    raise AssertionError(msg)

        set_model_tester_for_less_flaky_test(self)

        config, batched_input = self.model_tester.prepare_config_and_inputs_for_common()
        set_config_for_less_flaky_test(config)
-        equivalence = get_tensor_equivalence_function(batched_input)

        for model_class in self.all_model_classes:
            config.output_hidden_states = True