Make test_generate_with_static_cache even less flaky (#34995)
* fix * fix * fix * fix * fix * fix * fix * fix * fix --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
@@ -14,6 +14,7 @@
|
|||||||
|
|
||||||
import collections
|
import collections
|
||||||
import contextlib
|
import contextlib
|
||||||
|
import copy
|
||||||
import doctest
|
import doctest
|
||||||
import functools
|
import functools
|
||||||
import gc
|
import gc
|
||||||
@@ -1396,6 +1397,53 @@ def assert_screenout(out, what):
|
|||||||
assert match_str != -1, f"expecting to find {what} in output: f{out_pr}"
|
assert match_str != -1, f"expecting to find {what} in output: f{out_pr}"
|
||||||
|
|
||||||
|
|
||||||
|
def set_model_tester_for_less_flaky_test(test_case):
    """Reduce the model tester attached to `test_case` to a single hidden layer.

    `num_hidden_layers` is set to 1 on `test_case.model_tester` when the tester has that attribute. The same is
    done for the `vision_config` and `text_config` attributes when they exist and contain a `"num_hidden_layers"`
    key. Each such sub-config is deep-copied before being modified, so the object previously held by the tester
    is not mutated.

    Args:
        test_case: An object exposing a `model_tester` attribute. It is modified in place.
    """
    tester = test_case.model_tester

    if hasattr(tester, "num_hidden_layers"):
        tester.num_hidden_layers = 1

    for sub_config_name in ("vision_config", "text_config"):
        if not hasattr(tester, sub_config_name):
            continue
        if "num_hidden_layers" not in getattr(tester, sub_config_name):
            continue
        # Work on a private copy so the original sub-config object stays untouched.
        duplicate = copy.deepcopy(getattr(tester, sub_config_name))
        setattr(tester, sub_config_name, duplicate)
        duplicate["num_hidden_layers"] = 1
|
||||||
|
|
||||||
|
|
||||||
|
def set_config_for_less_flaky_test(config):
    """Set every known normalization-epsilon attribute to 1.0 on `config` and on its sub-configs.

    Norm layers (layer/group norm, etc.) could cause flaky tests when the tensors have very small variance.
    (We don't need the original epsilon values to check eager/sdpa matches.)

    The epsilon attributes are assigned unconditionally, i.e. they are created when the target object did not
    define them before. Sub-configs are looked up under the names `text_config`, `vision_config`, `text_encoder`,
    `audio_encoder` and `decoder`; a sub-config attribute that is missing or set to `None` is skipped.

    Args:
        config: The configuration object to modify in place.
    """
    target_attrs = (
        "rms_norm_eps",
        "layer_norm_eps",
        "norm_eps",
        "norm_epsilon",
        "layer_norm_epsilon",
        "batch_norm_eps",
    )
    sub_config_names = ("text_config", "vision_config", "text_encoder", "audio_encoder", "decoder")

    def _force_epsilons(target):
        # Overwrite (or create) each epsilon attribute on `target`.
        for target_attr in target_attrs:
            setattr(target, target_attr, 1.0)

    _force_epsilons(config)

    for sub_config_name in sub_config_names:
        sub_config = getattr(config, sub_config_name, None)
        # The attribute may exist but be unset (`None`); `setattr` on `None` would raise `AttributeError`.
        if sub_config is not None:
            _force_epsilons(sub_config)
|
||||||
|
|
||||||
|
|
||||||
|
def set_model_for_less_flaky_test(model):
    """Set the epsilon of the norm layers found in `model` to 1.0, in place.

    Another way to make sure norm layers have desired epsilon. (Some models don't set it from its config.)

    Nothing happens unless torch is available and `model` is a `torch.nn.Module`. Submodules are selected by
    class name: any module whose class name ends with one of the norm suffixes listed below is patched. On such
    a module, each of the attributes `eps`, `epsilon` and `variance_epsilon` that already exists is set to 1.0.

    Args:
        model: The model whose norm layers should be patched.
    """
    if not (is_torch_available() and isinstance(model, torch.nn.Module)):
        return

    norm_class_suffixes = ("LayerNorm", "GroupNorm", "BatchNorm", "RMSNorm", "BatchNorm2d", "BatchNorm1d")
    epsilon_attr_names = ("eps", "epsilon", "variance_epsilon")

    for submodule in model.modules():
        # `str.endswith` accepts a tuple, so a single call checks every suffix.
        if not type(submodule).__name__.endswith(norm_class_suffixes):
            continue
        for epsilon_attr in epsilon_attr_names:
            if hasattr(submodule, epsilon_attr):
                setattr(submodule, epsilon_attr, 1.0)
|
||||||
|
|
||||||
|
|
||||||
class CaptureStd:
|
class CaptureStd:
|
||||||
"""
|
"""
|
||||||
Context manager to capture:
|
Context manager to capture:
|
||||||
|
|||||||
@@ -37,6 +37,9 @@ from transformers.testing_utils import (
|
|||||||
require_torch_multi_accelerator,
|
require_torch_multi_accelerator,
|
||||||
require_torch_multi_gpu,
|
require_torch_multi_gpu,
|
||||||
require_torch_sdpa,
|
require_torch_sdpa,
|
||||||
|
set_config_for_less_flaky_test,
|
||||||
|
set_model_for_less_flaky_test,
|
||||||
|
set_model_tester_for_less_flaky_test,
|
||||||
slow,
|
slow,
|
||||||
torch_device,
|
torch_device,
|
||||||
)
|
)
|
||||||
@@ -1921,11 +1924,13 @@ class GenerationTesterMixin:
|
|||||||
Tests that generating with static cache give almost same results as with dynamic cache, and the output cache
|
Tests that generating with static cache give almost same results as with dynamic cache, and the output cache
|
||||||
has the expected shapes
|
has the expected shapes
|
||||||
"""
|
"""
|
||||||
|
set_model_tester_for_less_flaky_test(self)
|
||||||
for model_class in self.all_generative_model_classes:
|
for model_class in self.all_generative_model_classes:
|
||||||
if not model_class._supports_static_cache:
|
if not model_class._supports_static_cache:
|
||||||
self.skipTest(reason="This model does not support the static cache format")
|
self.skipTest(reason="This model does not support the static cache format")
|
||||||
|
|
||||||
config, inputs_dict = self.prepare_config_and_inputs_for_generate()
|
config, inputs_dict = self.prepare_config_and_inputs_for_generate()
|
||||||
|
set_config_for_less_flaky_test(config)
|
||||||
main_input = inputs_dict[model_class.main_input_name]
|
main_input = inputs_dict[model_class.main_input_name]
|
||||||
|
|
||||||
if config.is_encoder_decoder:
|
if config.is_encoder_decoder:
|
||||||
@@ -1938,6 +1943,8 @@ class GenerationTesterMixin:
|
|||||||
|
|
||||||
for dtype in (torch.float32, torch.float16):
|
for dtype in (torch.float32, torch.float16):
|
||||||
model = model_class(config).to(torch_device).to(dtype).eval()
|
model = model_class(config).to(torch_device).to(dtype).eval()
|
||||||
|
set_model_for_less_flaky_test(model)
|
||||||
|
|
||||||
generation_kwargs = {
|
generation_kwargs = {
|
||||||
"max_new_tokens": max_new_tokens,
|
"max_new_tokens": max_new_tokens,
|
||||||
"return_dict_in_generate": True, # Required to return `past_key_values`
|
"return_dict_in_generate": True, # Required to return `past_key_values`
|
||||||
|
|||||||
@@ -41,6 +41,9 @@ from transformers.testing_utils import (
|
|||||||
require_torch_gpu,
|
require_torch_gpu,
|
||||||
require_torch_sdpa,
|
require_torch_sdpa,
|
||||||
require_torchaudio,
|
require_torchaudio,
|
||||||
|
set_config_for_less_flaky_test,
|
||||||
|
set_model_for_less_flaky_test,
|
||||||
|
set_model_tester_for_less_flaky_test,
|
||||||
slow,
|
slow,
|
||||||
torch_device,
|
torch_device,
|
||||||
)
|
)
|
||||||
@@ -516,8 +519,11 @@ class MusicgenMelodyDecoderTest(ModelTesterMixin, GenerationTesterMixin, unittes
|
|||||||
def get_mean_reldiff(failcase, x, ref, atol, rtol):
|
def get_mean_reldiff(failcase, x, ref, atol, rtol):
|
||||||
return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}"
|
return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}"
|
||||||
|
|
||||||
|
set_model_tester_for_less_flaky_test(self)
|
||||||
|
|
||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
set_config_for_less_flaky_test(config)
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
|
|
||||||
is_encoder_decoder = model.config.is_encoder_decoder
|
is_encoder_decoder = model.config.is_encoder_decoder
|
||||||
@@ -534,6 +540,9 @@ class MusicgenMelodyDecoderTest(ModelTesterMixin, GenerationTesterMixin, unittes
|
|||||||
)
|
)
|
||||||
model_eager = model_eager.eval().to(torch_device)
|
model_eager = model_eager.eval().to(torch_device)
|
||||||
|
|
||||||
|
set_model_for_less_flaky_test(model_eager)
|
||||||
|
set_model_for_less_flaky_test(model_sdpa)
|
||||||
|
|
||||||
# We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model,
|
# We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model,
|
||||||
# but it would be nicer to have an efficient way to use parameterized.expand
|
# but it would be nicer to have an efficient way to use parameterized.expand
|
||||||
fail_cases = []
|
fail_cases = []
|
||||||
@@ -1528,8 +1537,11 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
|
|||||||
def get_mean_reldiff(failcase, x, ref, atol, rtol):
|
def get_mean_reldiff(failcase, x, ref, atol, rtol):
|
||||||
return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}"
|
return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}"
|
||||||
|
|
||||||
|
set_model_tester_for_less_flaky_test(self)
|
||||||
|
|
||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
set_config_for_less_flaky_test(config)
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
|
|
||||||
is_encoder_decoder = model.config.is_encoder_decoder
|
is_encoder_decoder = model.config.is_encoder_decoder
|
||||||
@@ -1546,6 +1558,9 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
|
|||||||
)
|
)
|
||||||
model_eager = model_eager.eval().to(torch_device)
|
model_eager = model_eager.eval().to(torch_device)
|
||||||
|
|
||||||
|
set_model_for_less_flaky_test(model_eager)
|
||||||
|
set_model_for_less_flaky_test(model_sdpa)
|
||||||
|
|
||||||
# We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model,
|
# We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model,
|
||||||
# but it would be nicer to have an efficient way to use parameterized.expand
|
# but it would be nicer to have an efficient way to use parameterized.expand
|
||||||
fail_cases = []
|
fail_cases = []
|
||||||
|
|||||||
@@ -840,7 +840,13 @@ class SeamlessM4Tv2GenerationTest(unittest.TestCase):
|
|||||||
def test_speech_generation(self):
|
def test_speech_generation(self):
|
||||||
config, input_speech, input_text = self.prepare_speech_and_text_input()
|
config, input_speech, input_text = self.prepare_speech_and_text_input()
|
||||||
|
|
||||||
|
from transformers.testing_utils import set_config_for_less_flaky_test, set_model_for_less_flaky_test
|
||||||
|
|
||||||
|
set_config_for_less_flaky_test(config)
|
||||||
|
|
||||||
model = SeamlessM4Tv2Model(config=config)
|
model = SeamlessM4Tv2Model(config=config)
|
||||||
|
set_model_for_less_flaky_test(model)
|
||||||
|
|
||||||
self.update_generation(model)
|
self.update_generation(model)
|
||||||
model.save_pretrained(self.tmpdirname)
|
model.save_pretrained(self.tmpdirname)
|
||||||
model.to(torch_device)
|
model.to(torch_device)
|
||||||
@@ -852,6 +858,11 @@ class SeamlessM4Tv2GenerationTest(unittest.TestCase):
|
|||||||
state_dict = model.state_dict()
|
state_dict = model.state_dict()
|
||||||
|
|
||||||
text_model = SeamlessM4Tv2ForTextToSpeech.from_pretrained(self.tmpdirname)
|
text_model = SeamlessM4Tv2ForTextToSpeech.from_pretrained(self.tmpdirname)
|
||||||
|
# Even if this component is loaded after `model.save_pretrained` which is after
|
||||||
|
# `set_model_for_less_flaky_test(model)`, we still need to apply `set_model_for_less_flaky_test` here as the
|
||||||
|
# `eps` attribute in the model's norm layers is not set from the config.
|
||||||
|
set_model_for_less_flaky_test(text_model)
|
||||||
|
|
||||||
self.update_generation(text_model)
|
self.update_generation(text_model)
|
||||||
text_model.to(torch_device)
|
text_model.to(torch_device)
|
||||||
text_model.eval()
|
text_model.eval()
|
||||||
@@ -859,6 +870,11 @@ class SeamlessM4Tv2GenerationTest(unittest.TestCase):
|
|||||||
output_text = self.factory_generation_speech_test(model, input_text)
|
output_text = self.factory_generation_speech_test(model, input_text)
|
||||||
|
|
||||||
speech_model = SeamlessM4Tv2ForSpeechToSpeech.from_pretrained(self.tmpdirname)
|
speech_model = SeamlessM4Tv2ForSpeechToSpeech.from_pretrained(self.tmpdirname)
|
||||||
|
# Even if this component is loaded after `model.save_pretrained` which is after
|
||||||
|
# `set_model_for_less_flaky_test(model)`, we still need to apply `set_model_for_less_flaky_test` here as the
|
||||||
|
# `eps` attribute in the model's norm layers is not set from the config.
|
||||||
|
set_model_for_less_flaky_test(speech_model)
|
||||||
|
|
||||||
self.update_generation(speech_model)
|
self.update_generation(speech_model)
|
||||||
speech_model.to(torch_device)
|
speech_model.to(torch_device)
|
||||||
speech_model.eval()
|
speech_model.eval()
|
||||||
|
|||||||
@@ -89,6 +89,9 @@ from transformers.testing_utils import (
|
|||||||
require_torch_multi_accelerator,
|
require_torch_multi_accelerator,
|
||||||
require_torch_multi_gpu,
|
require_torch_multi_gpu,
|
||||||
require_torch_sdpa,
|
require_torch_sdpa,
|
||||||
|
set_config_for_less_flaky_test,
|
||||||
|
set_model_for_less_flaky_test,
|
||||||
|
set_model_tester_for_less_flaky_test,
|
||||||
slow,
|
slow,
|
||||||
torch_device,
|
torch_device,
|
||||||
)
|
)
|
||||||
@@ -3976,34 +3979,11 @@ class ModelTesterMixin:
|
|||||||
def get_mean_reldiff(failcase, x, ref, atol, rtol):
|
def get_mean_reldiff(failcase, x, ref, atol, rtol):
|
||||||
return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}"
|
return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}"
|
||||||
|
|
||||||
if hasattr(self.model_tester, "num_hidden_layers"):
|
set_model_tester_for_less_flaky_test(self)
|
||||||
self.model_tester.num_hidden_layers = 1
|
|
||||||
if hasattr(self.model_tester, "vision_config") and "num_hidden_layers" in self.model_tester.vision_config:
|
|
||||||
self.model_tester.vision_config = copy.deepcopy(self.model_tester.vision_config)
|
|
||||||
self.model_tester.vision_config["num_hidden_layers"] = 1
|
|
||||||
if hasattr(self.model_tester, "text_config") and "num_hidden_layers" in self.model_tester.text_config:
|
|
||||||
self.model_tester.text_config = copy.deepcopy(self.model_tester.text_config)
|
|
||||||
self.model_tester.text_config["num_hidden_layers"] = 1
|
|
||||||
|
|
||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
set_config_for_less_flaky_test(config)
|
||||||
config.rms_norm_eps = 1.0
|
|
||||||
config.layer_norm_eps = 1.0
|
|
||||||
config.norm_eps = 1.0
|
|
||||||
config.norm_epsilon = 1.0
|
|
||||||
config.layer_norm_epsilon = 1.0
|
|
||||||
|
|
||||||
# norm layers (layer/group norm, etc.) could cause flaky tests when the tensors have very small variance.
|
|
||||||
# (We don't need the original epsilon values to check eager/sdpa matches)
|
|
||||||
for attr in ["text_config", "vision_config", "text_encoder", "audio_encoder", "decoder"]:
|
|
||||||
if hasattr(config, attr):
|
|
||||||
getattr(config, attr).rms_norm_eps = 1.0
|
|
||||||
getattr(config, attr).layer_norm_eps = 1.0
|
|
||||||
getattr(config, attr).norm_eps = 1.0
|
|
||||||
getattr(config, attr).norm_epsilon = 1.0
|
|
||||||
getattr(config, attr).layer_norm_epsilon = 1.0
|
|
||||||
|
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
# FIXME: we deactivate boolean mask for models using "use_mask_token" in their constructors.
|
# FIXME: we deactivate boolean mask for models using "use_mask_token" in their constructors.
|
||||||
# These models support masking only in the case `use_mask_token=True`. Otherwise they cannot consume an input mask.
|
# These models support masking only in the case `use_mask_token=True`. Otherwise they cannot consume an input mask.
|
||||||
@@ -4029,13 +4009,8 @@ class ModelTesterMixin:
|
|||||||
)
|
)
|
||||||
model_eager = model_eager.eval().to(torch_device, dtype=torch_dtype)
|
model_eager = model_eager.eval().to(torch_device, dtype=torch_dtype)
|
||||||
|
|
||||||
# Another way to make sure norm layers have desired epsilon. (Some models don't set it from its config.)
|
set_model_for_less_flaky_test(model_eager)
|
||||||
for x in model_eager.modules():
|
set_model_for_less_flaky_test(model_sdpa)
|
||||||
if isinstance(x, (nn.LayerNorm, nn.GroupNorm)):
|
|
||||||
x.eps = 1.0
|
|
||||||
for x in model_sdpa.modules():
|
|
||||||
if isinstance(x, (nn.LayerNorm, nn.GroupNorm)):
|
|
||||||
x.eps = 1.0
|
|
||||||
|
|
||||||
# We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 16 times the model,
|
# We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 16 times the model,
|
||||||
# but it would be nicer to have an efficient way to use parameterized.expand
|
# but it would be nicer to have an efficient way to use parameterized.expand
|
||||||
|
|||||||
Reference in New Issue
Block a user