Remove @slow for test_eager_matches_sdpa_inference (#34558)
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
@@ -452,7 +452,6 @@ class MusicgenDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
|
||||
|
||||
@parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
|
||||
@require_torch_sdpa
|
||||
-@slow
|
||||
# Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_inference
|
||||
def test_eager_matches_sdpa_inference(self, torch_dtype: str):
|
||||
if not self.has_attentions:
|
||||
@@ -479,8 +478,10 @@ class MusicgenDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
|
||||
|
||||
atols = {
|
||||
("cpu", False, torch.float32): 1e-6,
|
||||
("cpu", False, torch.float16): 5e-3,
|
||||
("cpu", False, torch.bfloat16): 1e-2,
|
||||
("cpu", True, torch.float32): 1e-6,
|
||||
("cpu", True, torch.float16): 5e-3,
|
||||
("cpu", True, torch.bfloat16): 1e-2,
|
||||
("cuda", False, torch.float32): 1e-6,
|
||||
("cuda", False, torch.bfloat16): 1e-2,
|
||||
@@ -491,8 +492,10 @@ class MusicgenDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
|
||||
}
|
||||
rtols = {
|
||||
("cpu", False, torch.float32): 1e-4,
|
||||
("cpu", False, torch.float16): 5e-3,
|
||||
("cpu", False, torch.bfloat16): 1e-2,
|
||||
("cpu", True, torch.float32): 1e-4,
|
||||
("cpu", True, torch.float16): 5e-3,
|
||||
("cpu", True, torch.bfloat16): 1e-2,
|
||||
("cuda", False, torch.float32): 1e-4,
|
||||
("cuda", False, torch.bfloat16): 1e-2,
|
||||
@@ -528,7 +531,7 @@ class MusicgenDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
|
||||
fail_cases = []
|
||||
for padding_side in ["left", "right"]:
|
||||
for use_mask in [False, True]:
|
||||
-for batch_size in [1, 5]:
|
||||
+for batch_size in [7]:
|
||||
# Ignore copy
|
||||
batch_size_input_ids = self.model_tester.num_codebooks * batch_size
|
||||
dummy_input = inputs_dict[model.main_input_name]
|
||||
@@ -585,11 +588,11 @@ class MusicgenDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
|
||||
|
||||
dummy_attention_mask[:] = 1
|
||||
if padding_side == "left":
|
||||
-dummy_attention_mask[-1, :-1] = 1
|
||||
-dummy_attention_mask[-1, -4:] = 0
|
||||
+dummy_attention_mask[-1, :2] = 0
|
||||
+dummy_attention_mask[-1, 2:] = 1
|
||||
elif padding_side == "right":
|
||||
-dummy_attention_mask[-1, 1:] = 1
|
||||
-dummy_attention_mask[-1, :3] = 0
|
||||
+dummy_attention_mask[-1, -2:] = 0
|
||||
+dummy_attention_mask[-1, :-2] = 1
|
||||
|
||||
for enable_kernels in [False, True]:
|
||||
failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}"
|
||||
@@ -632,52 +635,32 @@ class MusicgenDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
|
||||
|
||||
# Masked tokens output slightly deviates - we don't mind that.
|
||||
if use_mask:
|
||||
_logits_sdpa = torch.zeros_like(input=logits_sdpa)
|
||||
_logits_eager = torch.zeros_like(input=logits_eager)
|
||||
|
||||
_logits_sdpa[:-1] = logits_sdpa[:-1]
|
||||
_logits_eager[:-1] = logits_eager[:-1]
|
||||
|
||||
if padding_side == "left":
|
||||
sub_sdpa = logits_sdpa[:-1]
|
||||
sub_eager = logits_eager[:-1]
|
||||
if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
|
||||
fail_cases.append(
|
||||
get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
|
||||
)
|
||||
_logits_sdpa[-1:, 2:] = logits_sdpa[-1:, 2:]
|
||||
_logits_eager[-1:, 2:] = logits_eager[-1:, 2:]
|
||||
|
||||
sub_sdpa = logits_sdpa[-1, :-4]
|
||||
sub_eager = logits_eager[-1, :-4]
|
||||
if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
|
||||
fail_cases.append(
|
||||
get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
|
||||
)
|
||||
|
||||
# Testing the padding tokens is not really meaningful but anyway
|
||||
# sub_sdpa = logits_sdpa[-1, -4:]
|
||||
# sub_eager = logits_eager[-1, -4:]
|
||||
# if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
|
||||
# fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2))
|
||||
elif padding_side == "right":
|
||||
sub_sdpa = logits_sdpa[:-1]
|
||||
sub_eager = logits_eager[:-1]
|
||||
if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
|
||||
fail_cases.append(
|
||||
get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
|
||||
)
|
||||
_logits_sdpa[-1:, 2:] = logits_sdpa[-1:, :-2]
|
||||
_logits_eager[-1:, 2:] = logits_eager[-1:, :-2]
|
||||
|
||||
sub_sdpa = logits_sdpa[-1, 3:]
|
||||
sub_eager = logits_eager[-1, 3:]
|
||||
if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
|
||||
fail_cases.append(
|
||||
get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
|
||||
)
|
||||
logits_sdpa = _logits_sdpa
|
||||
logits_eager = _logits_eager
|
||||
|
||||
# Testing the padding tokens is not really meaningful but anyway
|
||||
# sub_sdpa = logits_sdpa[-1, :3]
|
||||
# sub_eager = logits_eager[-1, :3]
|
||||
# if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
|
||||
# fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2))
|
||||
|
||||
else:
|
||||
if not torch.allclose(logits_sdpa, logits_eager, atol=atol, rtol=rtol):
|
||||
fail_cases.append(
|
||||
get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol)
|
||||
)
|
||||
results = [
|
||||
torch.allclose(_logits_sdpa, _logits_eager, atol=atol, rtol=rtol)
|
||||
for (_logits_sdpa, _logits_eager) in zip(logits_sdpa, logits_eager)
|
||||
]
|
||||
# If 80% batch elements have matched results, it's fine
|
||||
if np.mean(results) < 0.8:
|
||||
fail_cases.append(
|
||||
get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol)
|
||||
)
|
||||
|
||||
self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases))
|
||||
|
||||
@@ -1496,8 +1479,6 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
|
||||
@parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
|
||||
@require_torch_sdpa
|
||||
-@slow
|
||||
# Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_inference
|
||||
def test_eager_matches_sdpa_inference(self, torch_dtype: str):
|
||||
if not self.has_attentions:
|
||||
self.skipTest(reason="Model architecture does not support attentions")
|
||||
@@ -1523,8 +1504,10 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
|
||||
atols = {
|
||||
("cpu", False, torch.float32): 1e-6,
|
||||
("cpu", False, torch.float16): 5e-3,
|
||||
("cpu", False, torch.bfloat16): 1e-2,
|
||||
("cpu", True, torch.float32): 1e-6,
|
||||
("cpu", True, torch.float16): 5e-3,
|
||||
("cpu", True, torch.bfloat16): 1e-2,
|
||||
("cuda", False, torch.float32): 1e-6,
|
||||
("cuda", False, torch.bfloat16): 1e-2,
|
||||
@@ -1535,8 +1518,10 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
}
|
||||
rtols = {
|
||||
("cpu", False, torch.float32): 1e-4,
|
||||
("cpu", False, torch.float16): 5e-3,
|
||||
("cpu", False, torch.bfloat16): 1e-2,
|
||||
("cpu", True, torch.float32): 1e-4,
|
||||
("cpu", True, torch.float16): 5e-3,
|
||||
("cpu", True, torch.bfloat16): 1e-2,
|
||||
("cuda", False, torch.float32): 1e-4,
|
||||
("cuda", False, torch.bfloat16): 1e-2,
|
||||
@@ -1549,8 +1534,26 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
def get_mean_reldiff(failcase, x, ref, atol, rtol):
|
||||
return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}"
|
||||
|
||||
if hasattr(self.model_tester, "num_hidden_layers"):
|
||||
self.model_tester.num_hidden_layers = 1
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
config.rms_norm_eps = 1.0
|
||||
config.layer_norm_eps = 1.0
|
||||
config.norm_eps = 1.0
|
||||
config.norm_epsilon = 1.0
|
||||
config.layer_norm_epsilon = 1.0
|
||||
|
||||
for attr in ["text_config", "vision_config", "text_encoder", "audio_encoder", "decoder"]:
|
||||
if hasattr(config, attr):
|
||||
getattr(config, attr).rms_norm_eps = 1.0
|
||||
getattr(config, attr).layer_norm_eps = 1.0
|
||||
getattr(config, attr).norm_eps = 1.0
|
||||
getattr(config, attr).norm_epsilon = 1.0
|
||||
getattr(config, attr).layer_norm_epsilon = 1.0
|
||||
|
||||
model = model_class(config)
|
||||
|
||||
is_encoder_decoder = model.config.is_encoder_decoder
|
||||
@@ -1567,12 +1570,19 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
)
|
||||
model_eager = model_eager.eval().to(torch_device)
|
||||
|
||||
for x in model_eager.modules():
|
||||
if isinstance(x, (torch.nn.LayerNorm, torch.nn.GroupNorm)):
|
||||
x.eps = 1.0
|
||||
for x in model_sdpa.modules():
|
||||
if isinstance(x, (torch.nn.LayerNorm, torch.nn.GroupNorm)):
|
||||
x.eps = 1.0
|
||||
|
||||
# We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model,
|
||||
# but it would be nicer to have an efficient way to use parameterized.expand
|
||||
fail_cases = []
|
||||
for padding_side in ["left", "right"]:
|
||||
for use_mask in [False, True]:
|
||||
-for batch_size in [1, 5]:
|
||||
+for batch_size in [7]:
|
||||
dummy_input = inputs_dict[model.main_input_name]
|
||||
|
||||
if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]:
|
||||
@@ -1622,11 +1632,11 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
|
||||
dummy_attention_mask[:] = 1
|
||||
if padding_side == "left":
|
||||
-dummy_attention_mask[-1, :-1] = 1
|
||||
-dummy_attention_mask[-1, -4:] = 0
|
||||
+dummy_attention_mask[-1, :2] = 0
|
||||
+dummy_attention_mask[-1, 2:] = 1
|
||||
elif padding_side == "right":
|
||||
-dummy_attention_mask[-1, 1:] = 1
|
||||
-dummy_attention_mask[-1, :3] = 0
|
||||
+dummy_attention_mask[-1, -2:] = 0
|
||||
+dummy_attention_mask[-1, :-2] = 1
|
||||
|
||||
for enable_kernels in [False, True]:
|
||||
failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}"
|
||||
@@ -1687,52 +1697,32 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
|
||||
# Masked tokens output slightly deviates - we don't mind that.
|
||||
if use_mask:
|
||||
_logits_sdpa = torch.zeros_like(input=logits_sdpa)
|
||||
_logits_eager = torch.zeros_like(input=logits_eager)
|
||||
|
||||
_logits_sdpa[:-1] = logits_sdpa[:-1]
|
||||
_logits_eager[:-1] = logits_eager[:-1]
|
||||
|
||||
if padding_side == "left":
|
||||
sub_sdpa = logits_sdpa[:-1]
|
||||
sub_eager = logits_eager[:-1]
|
||||
if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
|
||||
fail_cases.append(
|
||||
get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
|
||||
)
|
||||
_logits_sdpa[-1:, 2:] = logits_sdpa[-1:, 2:]
|
||||
_logits_eager[-1:, 2:] = logits_eager[-1:, 2:]
|
||||
|
||||
sub_sdpa = logits_sdpa[-1, :-4]
|
||||
sub_eager = logits_eager[-1, :-4]
|
||||
if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
|
||||
fail_cases.append(
|
||||
get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
|
||||
)
|
||||
|
||||
# Testing the padding tokens is not really meaningful but anyway
|
||||
# sub_sdpa = logits_sdpa[-1, -4:]
|
||||
# sub_eager = logits_eager[-1, -4:]
|
||||
# if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
|
||||
# fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2))
|
||||
elif padding_side == "right":
|
||||
sub_sdpa = logits_sdpa[:-1]
|
||||
sub_eager = logits_eager[:-1]
|
||||
if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
|
||||
fail_cases.append(
|
||||
get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
|
||||
)
|
||||
_logits_sdpa[-1:, 2:] = logits_sdpa[-1:, :-2]
|
||||
_logits_eager[-1:, 2:] = logits_eager[-1:, :-2]
|
||||
|
||||
sub_sdpa = logits_sdpa[-1, 3:]
|
||||
sub_eager = logits_eager[-1, 3:]
|
||||
if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
|
||||
fail_cases.append(
|
||||
get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
|
||||
)
|
||||
logits_sdpa = _logits_sdpa
|
||||
logits_eager = _logits_eager
|
||||
|
||||
# Testing the padding tokens is not really meaningful but anyway
|
||||
# sub_sdpa = logits_sdpa[-1, :3]
|
||||
# sub_eager = logits_eager[-1, :3]
|
||||
# if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
|
||||
# fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2))
|
||||
|
||||
else:
|
||||
if not torch.allclose(logits_sdpa, logits_eager, atol=atol, rtol=rtol):
|
||||
fail_cases.append(
|
||||
get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol)
|
||||
)
|
||||
results = [
|
||||
torch.allclose(_logits_sdpa, _logits_eager, atol=atol, rtol=rtol)
|
||||
for (_logits_sdpa, _logits_eager) in zip(logits_sdpa, logits_eager)
|
||||
]
|
||||
# If 80% batch elements have matched results, it's fine
|
||||
if np.mean(results) < 0.8:
|
||||
fail_cases.append(
|
||||
get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol)
|
||||
)
|
||||
|
||||
self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user