From fd8d61fdb261d878576f8ed74179640242ff72d8 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Tue, 21 Jan 2025 13:11:33 +0100 Subject: [PATCH] Byebye `test_batching_equivalence`'s flakiness (#35729) * fix * fix * skip * better error message --------- Co-authored-by: ydshieh --- src/transformers/testing_utils.py | 11 +++++++- .../autoformer/test_modeling_autoformer.py | 7 +++++ tests/models/dac/test_modeling_dac.py | 5 ++++ tests/models/dpt/test_modeling_dpt_hybrid.py | 6 +---- tests/models/esm/test_modeling_esmfold.py | 8 +++++- .../models/groupvit/test_modeling_groupvit.py | 10 ++++++- tests/models/mimi/test_modeling_mimi.py | 4 --- .../test_modeling_mobilenet_v1.py | 8 +----- .../test_modeling_mobilenet_v2.py | 6 +---- .../mobilevit/test_modeling_mobilevit.py | 6 +---- .../oneformer/test_modeling_oneformer.py | 7 +++++ .../superpoint/test_modeling_superpoint.py | 6 ++++- .../test_modeling_timm_backbone.py | 8 +++++- .../unispeech/test_modeling_unispeech.py | 8 +++++- .../models/wav2vec2/test_modeling_wav2vec2.py | 7 +++++ .../test_modeling_wav2vec2_bert.py | 1 - .../test_modeling_wav2vec2_conformer.py | 7 +++++ tests/test_modeling_common.py | 27 +++++++------------ 18 files changed, 92 insertions(+), 50 deletions(-) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index da2a39f462..8e687724fa 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -1498,7 +1498,16 @@ def set_config_for_less_flaky_test(config): def set_model_for_less_flaky_test(model): # Another way to make sure norm layers have desired epsilon. (Some models don't set it from its config.) 
- target_names = ("LayerNorm", "GroupNorm", "BatchNorm", "RMSNorm", "BatchNorm2d", "BatchNorm1d") + target_names = ( + "LayerNorm", + "GroupNorm", + "BatchNorm", + "RMSNorm", + "BatchNorm2d", + "BatchNorm1d", + "BitGroupNormActivation", + "WeightStandardizedConv2d", + ) target_attrs = ["eps", "epsilon", "variance_epsilon"] if is_torch_available() and isinstance(model, torch.nn.Module): for module in model.modules(): diff --git a/tests/models/autoformer/test_modeling_autoformer.py b/tests/models/autoformer/test_modeling_autoformer.py index f0cd5dad37..489e872e65 100644 --- a/tests/models/autoformer/test_modeling_autoformer.py +++ b/tests/models/autoformer/test_modeling_autoformer.py @@ -217,6 +217,13 @@ class AutoformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa self.model_tester = AutoformerModelTester(self) self.config_tester = ConfigTester(self, config_class=AutoformerConfig, has_text_modality=False) + # TODO: (ydshieh) Fix the wrong logic for `tmp_delay` if possible + @unittest.skip( + reason="The computation of `tmp_delay` in `AutoformerAttention.forward` seems wrong, see PR #12345. Also `topk` is used to compute indices which is not stable." 
+ ) + def test_batching_equivalence(self): + super().test_batching_equivalence() + def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/dac/test_modeling_dac.py b/tests/models/dac/test_modeling_dac.py index 55a17ab1e0..62e2241796 100644 --- a/tests/models/dac/test_modeling_dac.py +++ b/tests/models/dac/test_modeling_dac.py @@ -146,6 +146,11 @@ class DacModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_forward(*config_and_inputs) + # TODO (ydshieh): Although we have a potential cause, it's still strange that this test fails all the time with large differences + @unittest.skip(reason="Might be caused by `indices` computed with `max()` in `decode_latents`") + def test_batching_equivalence(self): + super().test_batching_equivalence() + def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/dpt/test_modeling_dpt_hybrid.py b/tests/models/dpt/test_modeling_dpt_hybrid.py index ab117c1441..dbdb5aa9e9 100644 --- a/tests/models/dpt/test_modeling_dpt_hybrid.py +++ b/tests/models/dpt/test_modeling_dpt_hybrid.py @@ -18,7 +18,7 @@ import unittest from transformers import DPTConfig from transformers.file_utils import is_torch_available, is_vision_available -from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device +from transformers.testing_utils import require_torch, require_vision, slow, torch_device from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor @@ -304,10 +304,6 @@ class DPTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): with self.assertRaises(ValueError): _ = DPTForDepthEstimation(config) - @is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516") - 
def test_batching_equivalence(self): - super().test_batching_equivalence() - # We will verify our results on an image of cute cats def prepare_img(): diff --git a/tests/models/esm/test_modeling_esmfold.py b/tests/models/esm/test_modeling_esmfold.py index 5c05efb03f..7c461bdc46 100644 --- a/tests/models/esm/test_modeling_esmfold.py +++ b/tests/models/esm/test_modeling_esmfold.py @@ -17,7 +17,7 @@ import unittest from transformers import EsmConfig, is_torch_available -from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device +from transformers.testing_utils import TestCasePlus, is_flaky, require_torch, slow, torch_device from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask @@ -184,6 +184,12 @@ class EsmFoldModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @is_flaky( + description="The computed `s = s / norm_denom` in `EsmFoldAngleResnet` is numerically instable if `norm_denom` is very small." 
+ ) + def test_batching_equivalence(self): + super().test_batching_equivalence() + @unittest.skip(reason="Does not support attention outputs") def test_attention_outputs(self): pass diff --git a/tests/models/groupvit/test_modeling_groupvit.py b/tests/models/groupvit/test_modeling_groupvit.py index 88b55ec56d..a4b4f3543a 100644 --- a/tests/models/groupvit/test_modeling_groupvit.py +++ b/tests/models/groupvit/test_modeling_groupvit.py @@ -24,7 +24,7 @@ import numpy as np import requests from transformers import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig -from transformers.testing_utils import is_pt_tf_cross_test, require_torch, require_vision, slow, torch_device +from transformers.testing_utils import is_flaky, is_pt_tf_cross_test, require_torch, require_vision, slow, torch_device from transformers.utils import is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester @@ -162,6 +162,10 @@ class GroupViTVisionModelTest(ModelTesterMixin, unittest.TestCase): def test_inputs_embeds(self): pass + @is_flaky(description="The `index` computed with `max()` in `hard_softmax` is not stable.") + def test_batching_equivalence(self): + super().test_batching_equivalence() + @is_pt_tf_cross_test def test_pt_tf_model_equivalence(self): import tensorflow as tf @@ -571,6 +575,10 @@ class GroupViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase def test_config(self): self.config_tester.run_common_tests() + @is_flaky(description="The `index` computed with `max()` in `hard_softmax` is not stable.") + def test_batching_equivalence(self): + super().test_batching_equivalence() + @unittest.skip(reason="hidden_states are tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/mimi/test_modeling_mimi.py b/tests/models/mimi/test_modeling_mimi.py index 4f6cfaff7e..bc3c663b46 100644 --- a/tests/models/mimi/test_modeling_mimi.py +++ b/tests/models/mimi/test_modeling_mimi.py @@ -734,10 
+734,6 @@ class MimiModelTest(ModelTesterMixin, unittest.TestCase): def test_sdpa_can_compile_dynamic(self): pass - @is_flaky() - def test_batching_equivalence(self): - super().test_batching_equivalence() - # Copied from transformers.tests.encodec.test_modeling_encodec.normalize def normalize(arr): diff --git a/tests/models/mobilenet_v1/test_modeling_mobilenet_v1.py b/tests/models/mobilenet_v1/test_modeling_mobilenet_v1.py index 31153173d2..d272347991 100644 --- a/tests/models/mobilenet_v1/test_modeling_mobilenet_v1.py +++ b/tests/models/mobilenet_v1/test_modeling_mobilenet_v1.py @@ -17,7 +17,7 @@ import unittest from transformers import MobileNetV1Config -from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device +from transformers.testing_utils import require_torch, require_vision, slow, torch_device from transformers.utils import cached_property, is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester @@ -214,12 +214,6 @@ class MobileNetV1ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC model = MobileNetV1Model.from_pretrained(model_name) self.assertIsNotNone(model) - # TODO: ydshieh - @unittest.skip("skip for now as #35564 fails this test more frequently for this model") - @is_flaky(description="is_flaky https://github.com/huggingface/transformers/pull/31258") - def test_batching_equivalence(self): - super().test_batching_equivalence() - # We will verify our results on an image of cute cats def prepare_img(): diff --git a/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py b/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py index 7df6cbd119..2f8fb55554 100644 --- a/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py +++ b/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py @@ -17,7 +17,7 @@ import unittest from transformers import MobileNetV2Config -from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device +from 
transformers.testing_utils import require_torch, require_vision, slow, torch_device from transformers.utils import cached_property, is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester @@ -269,10 +269,6 @@ class MobileNetV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC model = MobileNetV2Model.from_pretrained(model_name) self.assertIsNotNone(model) - @is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516") - def test_batching_equivalence(self): - super().test_batching_equivalence() - # We will verify our results on an image of cute cats def prepare_img(): diff --git a/tests/models/mobilevit/test_modeling_mobilevit.py b/tests/models/mobilevit/test_modeling_mobilevit.py index cd4cfa68e5..9eb5878500 100644 --- a/tests/models/mobilevit/test_modeling_mobilevit.py +++ b/tests/models/mobilevit/test_modeling_mobilevit.py @@ -17,7 +17,7 @@ import unittest from transformers import MobileViTConfig -from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device +from transformers.testing_utils import require_torch, require_vision, slow, torch_device from transformers.utils import cached_property, is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester @@ -274,10 +274,6 @@ class MobileViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas model = MobileViTModel.from_pretrained(model_name) self.assertIsNotNone(model) - @is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516") - def test_batching_equivalence(self): - super().test_batching_equivalence() - # We will verify our results on an image of cute cats def prepare_img(): diff --git a/tests/models/oneformer/test_modeling_oneformer.py b/tests/models/oneformer/test_modeling_oneformer.py index f27302f8e1..d75a76cd4f 100644 --- a/tests/models/oneformer/test_modeling_oneformer.py +++ 
b/tests/models/oneformer/test_modeling_oneformer.py @@ -23,6 +23,7 @@ import numpy as np from tests.test_modeling_common import floats_tensor from transformers import OneFormerConfig, is_torch_available, is_vision_available from transformers.testing_utils import ( + is_flaky, require_timm, require_torch, require_torch_accelerator, @@ -268,6 +269,12 @@ class OneFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas def test_config(self): self.config_tester.run_common_tests() + @is_flaky( + description="The `attention_mask` computed with `< 0.5` in `OneFormerTransformerDecoder.forward_prediction_heads` is sensitive to input values." + ) + def test_batching_equivalence(self): + super().test_batching_equivalence() + def test_oneformer_model(self): config, inputs = self.model_tester.prepare_config_and_inputs_for_common() self.model_tester.create_and_check_oneformer_model(config, **inputs, output_hidden_states=False) diff --git a/tests/models/superpoint/test_modeling_superpoint.py b/tests/models/superpoint/test_modeling_superpoint.py index 8db435502c..e811d3f6b4 100644 --- a/tests/models/superpoint/test_modeling_superpoint.py +++ b/tests/models/superpoint/test_modeling_superpoint.py @@ -16,7 +16,7 @@ import unittest from typing import List from transformers.models.superpoint.configuration_superpoint import SuperPointConfig -from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device from transformers.utils import cached_property, is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester @@ -135,6 +135,10 @@ class SuperPointModelTest(ModelTesterMixin, unittest.TestCase): def test_config(self): self.config_tester.run_common_tests() + @is_flaky(description="The `indices` computed with `topk()` in `top_k_keypoints` is not stable.") + def test_batching_equivalence(self): + 
super().test_batching_equivalence() + @unittest.skip(reason="SuperPointForKeypointDetection does not use inputs_embeds") def test_inputs_embeds(self): pass diff --git a/tests/models/timm_backbone/test_modeling_timm_backbone.py b/tests/models/timm_backbone/test_modeling_timm_backbone.py index 43c511e1ef..296a38c176 100644 --- a/tests/models/timm_backbone/test_modeling_timm_backbone.py +++ b/tests/models/timm_backbone/test_modeling_timm_backbone.py @@ -18,7 +18,7 @@ import inspect import unittest from transformers import AutoBackbone -from transformers.testing_utils import require_timm, require_torch, torch_device +from transformers.testing_utils import is_flaky, require_timm, require_torch, torch_device from transformers.utils.import_utils import is_torch_available from ...test_backbone_common import BackboneTesterMixin @@ -115,6 +115,12 @@ class TimmBackboneModelTest(ModelTesterMixin, BackboneTesterMixin, PipelineTeste def test_config(self): self.config_tester.run_common_tests() + @is_flaky( + description="`TimmBackbone` has no `_init_weights`. Timm's way of weight init. seems to give larger magnitude in the intermediate values during `forward`." 
+ ) + def test_batching_equivalence(self): + super().test_batching_equivalence() + def test_timm_transformer_backbone_equivalence(self): timm_checkpoint = "resnet18" transformers_checkpoint = "microsoft/resnet-18" diff --git a/tests/models/unispeech/test_modeling_unispeech.py b/tests/models/unispeech/test_modeling_unispeech.py index d0a1d35224..4290ac21ab 100644 --- a/tests/models/unispeech/test_modeling_unispeech.py +++ b/tests/models/unispeech/test_modeling_unispeech.py @@ -22,7 +22,7 @@ import pytest from datasets import load_dataset from transformers import UniSpeechConfig, is_torch_available -from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device +from transformers.testing_utils import is_flaky, require_soundfile, require_torch, slow, torch_device from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( @@ -329,6 +329,12 @@ class UniSpeechRobustModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.T config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @is_flaky( + description="The `codevector_idx` computed with `argmax()` in `UniSpeechGumbelVectorQuantizer.forward` is not stable." 
+ ) + def test_batching_equivalence(self): + super().test_batching_equivalence() + def test_batched_inference(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_batch_inference(*config_and_inputs) diff --git a/tests/models/wav2vec2/test_modeling_wav2vec2.py b/tests/models/wav2vec2/test_modeling_wav2vec2.py index b2d90adc79..9e82002f61 100644 --- a/tests/models/wav2vec2/test_modeling_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py @@ -30,6 +30,7 @@ from transformers import Wav2Vec2Config, is_torch_available from transformers.testing_utils import ( CaptureLogger, cleanup, + is_flaky, is_pt_flax_cross_test, is_pyctcdecode_available, is_torchaudio_available, @@ -863,6 +864,12 @@ class Wav2Vec2RobustModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @is_flaky( + description="The `codevector_idx` computed with `argmax()` in `Wav2Vec2GumbelVectorQuantizer.forward` is not stable." 
+ ) + def test_batching_equivalence(self): + super().test_batching_equivalence() + def test_model_with_adapter(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_with_adapter(*config_and_inputs) diff --git a/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py b/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py index 80237fea9d..0fbc18165e 100644 --- a/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py +++ b/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py @@ -423,7 +423,6 @@ class Wav2Vec2BertModelTester: @require_torch -# Copied from tests.models.wav2vec2_conformer.test_modeling_wav2vec2_conformer.Wav2Vec2ConformerModelTest with Conformer->Bert, input_values->input_features class Wav2Vec2BertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): # Ignore copy all_model_classes = ( diff --git a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py index 096d1368ed..2f1e5a8e34 100644 --- a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py +++ b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py @@ -23,6 +23,7 @@ from datasets import load_dataset from transformers import Wav2Vec2ConformerConfig, is_torch_available from transformers.testing_utils import ( + is_flaky, is_pt_flax_cross_test, require_torch, require_torch_accelerator, @@ -452,6 +453,12 @@ class Wav2Vec2ConformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @is_flaky( + description="The `codevector_idx` computed with `argmax()` in `Wav2Vec2ConformerGumbelVectorQuantizer.forward` is not stable." 
+ ) + def test_batching_equivalence(self): + super().test_batching_equivalence() + def test_model_with_relative(self): config_and_inputs = self.model_tester.prepare_config_and_inputs(position_embeddings_type="relative") self.model_tester.create_and_check_model(*config_and_inputs) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 965d759369..cf259fabe3 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -770,15 +770,6 @@ class ModelTesterMixin: different results: https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535) """ - def get_tensor_equivalence_function(batched_input): - # models operating on continuous spaces have higher abs difference than LMs - # instead, we can rely on cos distance for image/speech models, similar to `diffusers` - if "input_ids" not in batched_input: - return lambda tensor1, tensor2: ( - 1.0 - F.cosine_similarity(tensor1.float().flatten(), tensor2.float().flatten(), dim=0, eps=1e-38) - ) - return lambda tensor1, tensor2: torch.max(torch.abs(tensor1 - tensor2)) - def recursive_check(batched_object, single_row_object, model_name, key): if isinstance(batched_object, (list, tuple)): for batched_object_value, single_row_object_value in zip(batched_object, single_row_object): @@ -793,6 +784,10 @@ class ModelTesterMixin: return elif batched_object.dim() == 0: return + # do not compare int or bool outputs as they are mostly computed with max/argmax/topk methods which are + # very sensitive to the inputs (e.g. tiny differences may give totally different results) + elif not torch.is_floating_point(batched_object): + return else: # indexing the first element does not always work # e.g. 
models that output similarity scores of size (N, M) would need to index [0, 0] @@ -810,19 +805,17 @@ class ModelTesterMixin: self.assertFalse( torch.isinf(single_row_object).any(), f"Single row output has `inf` in {model_name} for key={key}" ) - self.assertTrue( - (equivalence(batched_row, single_row_object)) <= 1e-03, - msg=( - f"Batched and Single row outputs are not equal in {model_name} for key={key}. " - f"Difference={equivalence(batched_row, single_row_object)}." - ), - ) + try: + torch.testing.assert_close(batched_row, single_row_object, atol=1e-5, rtol=1e-5) + except AssertionError as e: + msg = f"Batched and Single row outputs are not equal in {model_name} for key={key}.\n\n" + msg += str(e) + raise AssertionError(msg) set_model_tester_for_less_flaky_test(self) config, batched_input = self.model_tester.prepare_config_and_inputs_for_common() set_config_for_less_flaky_test(config) - equivalence = get_tensor_equivalence_function(batched_input) for model_class in self.all_model_classes: config.output_hidden_states = True