Tests: upgrade test_eager_matches_sdpa_generate (#34386)

This commit is contained in:
Joao Gante
2024-10-25 11:55:07 +01:00
committed by GitHub
parent 8814043c8c
commit 186b8dc190
22 changed files with 85 additions and 946 deletions

View File

@@ -15,6 +15,7 @@
import copy import copy
import gc
import inspect import inspect
import tempfile import tempfile
import unittest import unittest
@@ -33,6 +34,7 @@ from transformers.testing_utils import (
require_torch_gpu, require_torch_gpu,
require_torch_multi_accelerator, require_torch_multi_accelerator,
require_torch_multi_gpu, require_torch_multi_gpu,
require_torch_sdpa,
slow, slow,
torch_device, torch_device,
) )
@@ -2046,6 +2048,86 @@ class GenerationTesterMixin:
for model_class in self.all_generative_model_classes: for model_class in self.all_generative_model_classes:
self.assertTrue("GenerationMixin" in str(model_class.__bases__)) self.assertTrue("GenerationMixin" in str(model_class.__bases__))
@require_torch_sdpa
@slow
def test_eager_matches_sdpa_generate(self):
max_new_tokens = 30
for model_class in self.all_generative_model_classes:
if not model_class._supports_sdpa:
self.skipTest(f"{model_class.__name__} does not support SDPA")
config, original_inputs_dict = self.prepare_config_and_inputs_for_generate()
inputs_dict = {}
for input_name, input_data in original_inputs_dict.items():
if isinstance(input_data, torch.Tensor) and input_data.dtype in [torch.float32, torch.bfloat16]:
inputs_dict[input_name] = input_data.to(torch.float16)
else:
inputs_dict[input_name] = input_data
main_input = inputs_dict[model_class.main_input_name]
# make sure that all models have enough positions for generation
if hasattr(config, "max_position_embeddings"):
config.max_position_embeddings = max_new_tokens + main_input.shape[1] + 1
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
del model
gc.collect()
generate_kwargs = {
"max_new_tokens": max_new_tokens,
"do_sample": False,
"return_dict_in_generate": True,
"output_scores": True,
}
model_sdpa = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
).to(torch_device)
res_sdpa = model_sdpa.generate(**inputs_dict, **generate_kwargs)
del model_sdpa
gc.collect()
model_eager = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
attn_implementation="eager",
).to(torch_device)
res_eager = model_eager.generate(**inputs_dict, **generate_kwargs)
del model_eager
gc.collect()
# Eager and SDPA are very similar, but not exactly the same. Because we are using random models, this
# test would be flaky if we only checked the sequences. Two situations in which this test passes:
# 1. The sequences are the same
# 2. The sequences are different, but the scores up until the first mismatch are nearly identical
output_matches = res_eager.sequences == res_sdpa.sequences
has_matching_outputs = output_matches.all()
has_matching_scores = None
if not has_matching_outputs:
input_length = main_input.shape[1]
for batch_idx in range(res_eager.sequences.shape[0]):
batch_matches = output_matches[batch_idx]
if batch_matches.all():
continue
first_mismatch_idx = batch_matches.int().argmin() # gets the index of the first False
first_mismatch_idx -= input_length # scores doesn't include data regarding input tokens
sdpa_first_mismatch_scores = res_sdpa.scores[first_mismatch_idx][batch_idx]
eager_first_mismatch_scores = res_eager.scores[first_mismatch_idx][batch_idx]
has_matching_scores = torch.allclose(
sdpa_first_mismatch_scores, eager_first_mismatch_scores, rtol=1e-3, atol=1e-3
)
if not has_matching_scores:
break
self.assertTrue(has_matching_outputs or has_matching_scores)
def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1): def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1):
# we can be sure what is batch size from main input but seq length depends on model type and whether input is text/audio/image # we can be sure what is batch size from main input but seq length depends on model type and whether input is text/audio/image
# so we infer actual text seq length from model_tester, same was as it is done in `test_modeling_common.py` tests` # so we infer actual text seq length from model_tester, same was as it is done in `test_modeling_common.py` tests`

View File

@@ -22,7 +22,6 @@ from transformers.testing_utils import (
CaptureLogger, CaptureLogger,
require_torch, require_torch,
require_torch_accelerator, require_torch_accelerator,
require_torch_sdpa,
slow, slow,
torch_device, torch_device,
) )
@@ -672,79 +671,6 @@ class BertModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
loaded = torch.jit.load(os.path.join(tmp, "bert.pt"), map_location=torch_device) loaded = torch.jit.load(os.path.join(tmp, "bert.pt"), map_location=torch_device)
loaded(inputs_dict["input_ids"].to(torch_device), inputs_dict["attention_mask"].to(torch_device)) loaded(inputs_dict["input_ids"].to(torch_device), inputs_dict["attention_mask"].to(torch_device))
# This test was copied from the common test_eager_matches_sdpa_generate(), but without low_cpu_mem_usage=True.
# TODO: Remove this and use the parent method (in common tests) once BERT supports low_cpu_mem_usage=True.
@require_torch_sdpa
@slow
def test_eager_matches_sdpa_generate(self):
max_new_tokens = 30
if len(self.all_generative_model_classes) == 0:
self.skipTest(f"{self.__class__.__name__} tests a model that does support generate: skipping this test")
for model_class in self.all_generative_model_classes:
if not model_class._supports_sdpa:
self.skipTest(f"{model_class.__name__} does not support SDPA")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
dummy_input = inputs_dict[model_class.main_input_name]
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
dummy_input = dummy_input.to(torch.float16)
# make sure that all models have enough positions for generation
if hasattr(config, "max_position_embeddings"):
config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
model_sdpa = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
# low_cpu_mem_usage=True,
).to(torch_device)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
model_eager = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
# low_cpu_mem_usage=True,
attn_implementation="eager",
).to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
has_sdpa = True
break
if not has_sdpa:
raise ValueError("The SDPA model should have SDPA attention layers")
# Just test that a large cache works as expected
res_eager = model_eager.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
)
res_sdpa = model_sdpa.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
)
self.assertTrue(torch.allclose(res_eager, res_sdpa))
@require_torch @require_torch
class BertModelIntegrationTest(unittest.TestCase): class BertModelIntegrationTest(unittest.TestCase):

View File

@@ -307,64 +307,6 @@ class CohereModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
def test_torch_fx_output_loss(self): def test_torch_fx_output_loss(self):
super().test_torch_fx_output_loss() super().test_torch_fx_output_loss()
@require_bitsandbytes
@require_torch_sdpa
@require_torch_multi_gpu
@slow
def test_eager_matches_sdpa_generate(self):
"""
Overwritting the common test as the test is flaky on tiny models
"""
max_new_tokens = 30
model_id = "CohereForAI/c4ai-command-r-v01-4bit"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model_sdpa = CohereForCausalLM.from_pretrained(
model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto"
)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
model_eager = CohereForCausalLM.from_pretrained(
model_id, torch_dtype=torch.float16, attn_implementation="eager", device_map="auto"
)
self.assertTrue(model_eager.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
has_sdpa = True
break
if not has_sdpa:
raise ValueError("The SDPA model should have SDPA attention layers")
texts = [
"hi here's a longer context, getting longer and",
"Hello this is a very long sentence my friend, very long for real",
"Today I am in Paris and",
]
for padding_side in ["left", "right"]:
tokenizer.padding_side = padding_side
tokenizer.pad_token = tokenizer.eos_token
inputs = tokenizer(texts, return_tensors="pt", padding=True).to(torch_device)
res_eager = model_eager.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
res_sdpa = model_sdpa.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
with self.subTest(f"{padding_side}"):
torch.testing.assert_close(
res_eager,
res_sdpa,
msg=f"\n{tokenizer.batch_decode(res_eager)} \nvs\n{tokenizer.batch_decode(res_sdpa)}",
)
@require_torch @require_torch
@slow @slow

View File

@@ -14,7 +14,6 @@
# limitations under the License. # limitations under the License.
"""Testing suite for the PyTorch Falcon model.""" """Testing suite for the PyTorch Falcon model."""
import tempfile
import unittest import unittest
from parameterized import parameterized from parameterized import parameterized
@@ -27,7 +26,6 @@ from transformers import (
set_seed, set_seed,
) )
from transformers.testing_utils import ( from transformers.testing_utils import (
is_flaky,
require_bitsandbytes, require_bitsandbytes,
require_torch, require_torch,
require_torch_sdpa, require_torch_sdpa,
@@ -520,78 +518,6 @@ class FalconModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
torch.testing.assert_close(ntk_sin_long, original_sin_long) torch.testing.assert_close(ntk_sin_long, original_sin_long)
self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all())
# TODO: @Fxmarty
@is_flaky(max_attempts=3, description="flaky on some models.")
@require_torch_sdpa
@slow
def test_eager_matches_sdpa_generate(self):
max_new_tokens = 30
if len(self.all_generative_model_classes) == 0:
self.skipTest(f"{self.__class__.__name__} tests a model that does support generate: skipping this test")
for model_class in self.all_generative_model_classes:
if not model_class._supports_sdpa:
self.skipTest(f"{model_class.__name__} does not support SDPA")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
dummy_input = inputs_dict[model_class.main_input_name]
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
dummy_input = dummy_input.to(torch.float16)
# make sure that all models have enough positions for generation
if hasattr(config, "max_position_embeddings"):
config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
model_sdpa = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
).to(torch_device)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
model_eager = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
attn_implementation="eager",
).to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
# NOTE: This check is disabled for Falcon as the non-SDPA/SDPA implementation is in the same class (legacy reason).
# for name, submodule in model_eager.named_modules():
# if "SdpaAttention" in submodule.__class__.__name__:
# raise ValueError("The eager model should not have SDPA attention layers")
# has_sdpa = False
# for name, submodule in model_sdpa.named_modules():
# if "SdpaAttention" in submodule.__class__.__name__:
# has_sdpa = True
# break
# if not has_sdpa:
# raise ValueError("The SDPA model should have SDPA attention layers")
# Just test that a large cache works as expected
res_eager = model_eager.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
)
res_sdpa = model_sdpa.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
)
self.assertTrue(torch.allclose(res_eager, res_sdpa))
@require_torch @require_torch
class FalconLanguageGenerationTest(unittest.TestCase): class FalconLanguageGenerationTest(unittest.TestCase):

View File

@@ -758,77 +758,6 @@ class GlmModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases))
@require_torch_sdpa
@slow
@is_flaky()
def test_eager_matches_sdpa_generate(self):
"""Overwrite to add flakyness: outputs sometimes start to diverge after some tokens"""
max_new_tokens = 30
for model_class in self.all_generative_model_classes:
if not model_class._supports_sdpa:
self.skipTest(f"{model_class.__name__} does not support SDPA")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
dummy_input = inputs_dict[model_class.main_input_name]
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
dummy_input = dummy_input.to(torch.float16)
# make sure that all models have enough positions for generation
if hasattr(config, "max_position_embeddings"):
config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
model_sdpa = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
).to(torch_device)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
model_eager = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
attn_implementation="eager",
).to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
has_sdpa = True
break
if not has_sdpa:
raise ValueError("The SDPA model should have SDPA attention layers")
# Just test that a large cache works as expected
res_eager = model_eager.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
)
res_sdpa = model_sdpa.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
)
self.assertTrue(torch.allclose(res_eager, res_sdpa))
@slow @slow
@require_torch_accelerator @require_torch_accelerator

View File

@@ -19,7 +19,7 @@ import unittest
from parameterized import parameterized from parameterized import parameterized
from transformers import AutoTokenizer, GPTNeoXConfig, is_torch_available, set_seed from transformers import AutoTokenizer, GPTNeoXConfig, is_torch_available, set_seed
from transformers.testing_utils import require_torch, require_torch_sdpa, slow, torch_device from transformers.testing_utils import require_torch, slow, torch_device
from ...generation.test_utils import GenerationTesterMixin from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester from ...test_configuration_common import ConfigTester
@@ -434,68 +434,6 @@ class GPTNeoXModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
torch.testing.assert_close(ntk_sin_long, original_sin_long) torch.testing.assert_close(ntk_sin_long, original_sin_long)
self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all())
@require_torch_sdpa
@slow
def test_eager_matches_sdpa_generate(self):
"""
Based on tests.models.llama.test_modeling_llama.LlamaModelTest.test_eager_matches_sdpa_generate
which also overwrites the common test as the test is flaky on tiny models.
"""
max_new_tokens = 30
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-1b")
model_sdpa = GPTNeoXForCausalLM.from_pretrained(
"EleutherAI/pythia-1b",
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
).to(torch_device)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
model_eager = GPTNeoXForCausalLM.from_pretrained(
"EleutherAI/pythia-1b",
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
attn_implementation="eager",
).to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
has_sdpa = True
break
if not has_sdpa:
raise ValueError("The SDPA model should have SDPA attention layers")
texts = [
"hi here's a longer context, getting longer and",
"Hello this is a very long sentence my friend, very long for real",
"Today I am in Paris and",
]
for padding_side in ["left", "right"]:
tokenizer.padding_side = padding_side
tokenizer.pad_token = tokenizer.eos_token
inputs = tokenizer(texts, return_tensors="pt", padding=True).to(torch_device)
res_eager = model_eager.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
res_sdpa = model_sdpa.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
with self.subTest(f"{padding_side}"):
torch.testing.assert_close(
res_eager,
res_sdpa,
msg=f"\n{tokenizer.batch_decode(res_eager)} \nvs\n{tokenizer.batch_decode(res_sdpa)}",
)
@require_torch @require_torch
class GPTNeoXLanguageGenerationTest(unittest.TestCase): class GPTNeoXLanguageGenerationTest(unittest.TestCase):

View File

@@ -24,11 +24,9 @@ from parameterized import parameterized
from transformers import AutoTokenizer, JetMoeConfig, is_torch_available from transformers import AutoTokenizer, JetMoeConfig, is_torch_available
from transformers.testing_utils import ( from transformers.testing_utils import (
backend_empty_cache, backend_empty_cache,
is_flaky,
require_flash_attn, require_flash_attn,
require_torch, require_torch,
require_torch_gpu, require_torch_gpu,
require_torch_sdpa,
slow, slow,
torch_device, torch_device,
) )
@@ -302,13 +300,6 @@ class JetMoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
test_disk_offload_bin = False test_disk_offload_bin = False
test_disk_offload_safetensors = False test_disk_offload_safetensors = False
# TODO: @Fxmarty
@is_flaky(max_attempts=3, description="flaky on some models.")
@require_torch_sdpa
@slow
def test_eager_matches_sdpa_generate(self):
super().test_eager_matches_sdpa_generate()
@parameterized.expand([(1, False), (1, True), (4, False)]) @parameterized.expand([(1, False), (1, True), (4, False)])
def test_new_cache_format(self, num_beams, do_sample): def test_new_cache_format(self, num_beams, do_sample):
pass pass

View File

@@ -32,7 +32,6 @@ from transformers.testing_utils import (
require_torch, require_torch,
require_torch_accelerator, require_torch_accelerator,
require_torch_gpu, require_torch_gpu,
require_torch_sdpa,
slow, slow,
torch_device, torch_device,
) )
@@ -651,67 +650,6 @@ class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
if not has_flash: if not has_flash:
raise ValueError("The flash model should have flash attention layers") raise ValueError("The flash model should have flash attention layers")
@require_torch_sdpa
@slow
def test_eager_matches_sdpa_generate(self):
"""
Overwritting the common test as the test is flaky on tiny models
"""
max_new_tokens = 30
tokenizer = LlamaTokenizer.from_pretrained("saibo/llama-1B")
model_sdpa = LlamaForCausalLM.from_pretrained(
"saibo/llama-1B",
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
).to(torch_device)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
model_eager = LlamaForCausalLM.from_pretrained(
"saibo/llama-1B",
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
attn_implementation="eager",
).to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
has_sdpa = True
break
if not has_sdpa:
raise ValueError("The SDPA model should have SDPA attention layers")
texts = [
"hi here's a longer context, getting longer and",
"Hello this is a very long sentence my friend, very long for real",
"Today I am in Paris and",
]
for padding_side in ["left", "right"]:
tokenizer.padding_side = padding_side
tokenizer.pad_token = tokenizer.eos_token
inputs = tokenizer(texts, return_tensors="pt", padding=True).to(torch_device)
res_eager = model_eager.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
res_sdpa = model_sdpa.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
with self.subTest(f"{padding_side}"):
torch.testing.assert_close(
res_eager,
res_sdpa,
msg=f"\n{tokenizer.batch_decode(res_eager)} \nvs\n{tokenizer.batch_decode(res_sdpa)}",
)
@unittest.skip("Broken by the loss update will fix soon @ArthurZucker") @unittest.skip("Broken by the loss update will fix soon @ArthurZucker")
def test_torch_fx_output_loss(self, *args, **kwargs): def test_torch_fx_output_loss(self, *args, **kwargs):
pass pass

View File

@@ -24,7 +24,6 @@ from packaging import version
from transformers import AutoTokenizer, MistralConfig, is_torch_available, set_seed from transformers import AutoTokenizer, MistralConfig, is_torch_available, set_seed
from transformers.testing_utils import ( from transformers.testing_utils import (
backend_empty_cache, backend_empty_cache,
is_flaky,
require_bitsandbytes, require_bitsandbytes,
require_flash_attn, require_flash_attn,
require_read_token, require_read_token,
@@ -332,13 +331,6 @@ class MistralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
): ):
return True return True
# TODO: @Fxmarty
@is_flaky(max_attempts=3, description="flaky on some models.")
@require_torch_sdpa
@slow
def test_eager_matches_sdpa_generate(self):
super().test_eager_matches_sdpa_generate()
def setUp(self): def setUp(self):
self.model_tester = MistralModelTester(self) self.model_tester = MistralModelTester(self)
self.config_tester = ConfigTester(self, config_class=MistralConfig, hidden_size=37) self.config_tester = ConfigTester(self, config_class=MistralConfig, hidden_size=37)

View File

@@ -21,11 +21,9 @@ import pytest
from transformers import MixtralConfig, is_torch_available from transformers import MixtralConfig, is_torch_available
from transformers.testing_utils import ( from transformers.testing_utils import (
is_flaky,
require_flash_attn, require_flash_attn,
require_torch, require_torch,
require_torch_gpu, require_torch_gpu,
require_torch_sdpa,
slow, slow,
torch_device, torch_device,
) )
@@ -332,13 +330,6 @@ class MixtralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
): ):
return True return True
# TODO: @Fxmarty
@is_flaky(max_attempts=3, description="flaky on some models.")
@require_torch_sdpa
@slow
def test_eager_matches_sdpa_generate(self):
super().test_eager_matches_sdpa_generate()
def setUp(self): def setUp(self):
self.model_tester = MixtralModelTester(self) self.model_tester = MixtralModelTester(self)
self.config_tester = ConfigTester(self, config_class=MixtralConfig, hidden_size=37) self.config_tester = ConfigTester(self, config_class=MixtralConfig, hidden_size=37)

View File

@@ -132,12 +132,6 @@ class MllamaForCausalLMModelTest(ModelTesterMixin, GenerationTesterMixin, unitte
self.model_tester = MllamaText2TextModelTester(self) self.model_tester = MllamaText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=MllamaTextConfig, has_text_modality=True) self.config_tester = ConfigTester(self, config_class=MllamaTextConfig, has_text_modality=True)
@require_torch_sdpa
@slow
@is_flaky()
def test_eager_matches_sdpa_generate(self):
super().test_eager_matches_sdpa_generate()
class MllamaVisionText2TextModelTester: class MllamaVisionText2TextModelTester:
def __init__( def __init__(
@@ -360,12 +354,6 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester
self.assertListEqual([layer_attention.shape for layer_attention in iter_attentions], expected_shapes) self.assertListEqual([layer_attention.shape for layer_attention in iter_attentions], expected_shapes)
@require_torch_sdpa
@slow
@is_flaky()
def test_eager_matches_sdpa_generate(self):
super().test_eager_matches_sdpa_generate()
@require_torch_sdpa @require_torch_sdpa
@slow @slow
@is_flaky() @is_flaky()

View File

@@ -788,14 +788,10 @@ class MoshiTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
@slow @slow
@is_flaky(max_attempts=5, description="flaky on some models.") @is_flaky(max_attempts=5, description="flaky on some models.")
def test_eager_matches_sdpa_generate(self): def test_eager_matches_sdpa_generate(self):
if not self.has_attentions: """Overwritten -- mochi has custom inputs and custom output checks"""
self.skipTest(reason="Model architecture does not support attentions")
max_new_tokens = 5 max_new_tokens = 5
if len(self.all_generative_model_classes) == 0:
self.skipTest(f"{self.__class__.__name__} tests a model that does support generate: skipping this test")
for model_class in self.all_generative_model_classes: for model_class in self.all_generative_model_classes:
if not model_class._supports_sdpa: if not model_class._supports_sdpa:
self.skipTest(f"{model_class.__name__} does not support SDPA") self.skipTest(f"{model_class.__name__} does not support SDPA")

View File

@@ -819,74 +819,6 @@ class MusicgenDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases))
@require_torch_sdpa
@slow
# Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_generate
def test_eager_matches_sdpa_generate(self):
max_new_tokens = 30
# Ignore copy
for model_class in self.greedy_sample_model_classes:
if not model_class._supports_sdpa:
self.skipTest(f"{model_class.__name__} does not support SDPA")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
dummy_input = inputs_dict[model_class.main_input_name]
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
dummy_input = dummy_input.to(torch.float16)
# make sure that all models have enough positions for generation
if hasattr(config, "max_position_embeddings"):
config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
model_sdpa = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
).to(torch_device)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
model_eager = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
attn_implementation="eager",
).to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
has_sdpa = True
break
if not has_sdpa:
raise ValueError("The SDPA model should have SDPA attention layers")
# Just test that a large cache works as expected
res_eager = model_eager.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
)
res_sdpa = model_sdpa.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
)
self.assertTrue(torch.allclose(res_eager, res_sdpa))
def prepare_musicgen_inputs_dict( def prepare_musicgen_inputs_dict(
config, config,
@@ -2085,74 +2017,6 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases))
@require_torch_sdpa
@slow
# Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_generate
def test_eager_matches_sdpa_generate(self):
max_new_tokens = 30
# Ignore copy
for model_class in self.greedy_sample_model_classes:
if not model_class._supports_sdpa:
self.skipTest(f"{model_class.__name__} does not support SDPA")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
dummy_input = inputs_dict[model_class.main_input_name]
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
dummy_input = dummy_input.to(torch.float16)
# make sure that all models have enough positions for generation
if hasattr(config, "max_position_embeddings"):
config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
model_sdpa = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
).to(torch_device)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
model_eager = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
attn_implementation="eager",
).to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
has_sdpa = True
break
if not has_sdpa:
raise ValueError("The SDPA model should have SDPA attention layers")
# Just test that a large cache works as expected
res_eager = model_eager.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
)
res_sdpa = model_sdpa.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
)
self.assertTrue(torch.allclose(res_eager, res_sdpa))
def test_requires_grad_with_frozen_encoders(self): def test_requires_grad_with_frozen_encoders(self):
config = self.model_tester.get_config() config = self.model_tester.get_config()
for model_class in self.all_model_classes: for model_class in self.all_model_classes:

View File

@@ -1866,74 +1866,6 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases))
@require_torch_sdpa
@slow
# Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_generate
def test_eager_matches_sdpa_generate(self):
max_new_tokens = 30
# Ignore copy
for model_class in self.greedy_sample_model_classes:
if not model_class._supports_sdpa:
self.skipTest(f"{model_class.__name__} does not support SDPA")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
dummy_input = inputs_dict[model_class.main_input_name]
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
dummy_input = dummy_input.to(torch.float16)
# make sure that all models have enough positions for generation
if hasattr(config, "max_position_embeddings"):
config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
model_sdpa = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
).to(torch_device)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
model_eager = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
attn_implementation="eager",
).to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
has_sdpa = True
break
if not has_sdpa:
raise ValueError("The SDPA model should have SDPA attention layers")
# Just test that a large cache works as expected
res_eager = model_eager.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
)
res_sdpa = model_sdpa.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
)
self.assertTrue(torch.allclose(res_eager, res_sdpa))
def test_requires_grad_with_frozen_encoders(self): def test_requires_grad_with_frozen_encoders(self):
config = self.model_tester.get_config() config = self.model_tester.get_config()
for model_class in self.all_model_classes: for model_class in self.all_model_classes:

View File

@@ -24,10 +24,8 @@ from transformers.generation.configuration_utils import GenerationConfig
from transformers.models.auto.tokenization_auto import AutoTokenizer from transformers.models.auto.tokenization_auto import AutoTokenizer
from transformers.models.gpt_neox.tokenization_gpt_neox_fast import GPTNeoXTokenizerFast from transformers.models.gpt_neox.tokenization_gpt_neox_fast import GPTNeoXTokenizerFast
from transformers.testing_utils import ( from transformers.testing_utils import (
is_flaky,
require_tokenizers, require_tokenizers,
require_torch, require_torch,
require_torch_sdpa,
slow, slow,
torch_device, torch_device,
) )
@@ -317,13 +315,6 @@ class OlmoModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
def test_save_load_fast_init_from_base(self): def test_save_load_fast_init_from_base(self):
pass pass
# TODO: @Fxmarty
@is_flaky(max_attempts=3, description="flaky on some models.")
@require_torch_sdpa
@slow
def test_eager_matches_sdpa_generate(self):
super().test_eager_matches_sdpa_generate()
@parameterized.expand([("linear",), ("dynamic",)]) @parameterized.expand([("linear",), ("dynamic",)])
def test_model_rope_scaling(self, scaling_type): def test_model_rope_scaling(self, scaling_type):
config, _ = self.model_tester.prepare_config_and_inputs_for_common() config, _ = self.model_tester.prepare_config_and_inputs_for_common()

View File

@@ -22,10 +22,8 @@ from transformers import OlmoeConfig, is_torch_available, set_seed
from transformers.models.auto.tokenization_auto import AutoTokenizer from transformers.models.auto.tokenization_auto import AutoTokenizer
from transformers.models.gpt_neox.tokenization_gpt_neox_fast import GPTNeoXTokenizerFast from transformers.models.gpt_neox.tokenization_gpt_neox_fast import GPTNeoXTokenizerFast
from transformers.testing_utils import ( from transformers.testing_utils import (
is_flaky,
require_tokenizers, require_tokenizers,
require_torch, require_torch,
require_torch_sdpa,
slow, slow,
torch_device, torch_device,
) )
@@ -330,13 +328,6 @@ class OlmoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
def test_save_load_fast_init_from_base(self): def test_save_load_fast_init_from_base(self):
pass pass
# TODO: @Fxmarty
@is_flaky(max_attempts=3, description="flaky on some models.")
@require_torch_sdpa
@slow
def test_eager_matches_sdpa_generate(self):
super().test_eager_matches_sdpa_generate()
@parameterized.expand([("linear",), ("dynamic",)]) @parameterized.expand([("linear",), ("dynamic",)])
def test_model_rope_scaling(self, scaling_type): def test_model_rope_scaling(self, scaling_type):
config, _ = self.model_tester.prepare_config_and_inputs_for_common() config, _ = self.model_tester.prepare_config_and_inputs_for_common()

View File

@@ -25,7 +25,6 @@ from transformers.testing_utils import (
require_torch, require_torch,
require_torch_accelerator, require_torch_accelerator,
require_torch_fp16, require_torch_fp16,
require_torch_sdpa,
slow, slow,
torch_device, torch_device,
) )
@@ -339,68 +338,6 @@ class OPTModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
@require_torch_sdpa
@slow
def test_eager_matches_sdpa_generate(self):
"""
Overwritting the common test as the test is flaky on tiny models
"""
max_new_tokens = 30
tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-350M")
texts = [
"hi here's a longer context, getting longer and",
"Hello this is a very long sentence my friend, very long for real",
"Today I am in Paris and",
]
model_sdpa = OPTForCausalLM.from_pretrained(
"facebook/opt-350M",
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
attn_implementation="sdpa",
).to(torch_device)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
model_eager = OPTForCausalLM.from_pretrained(
"facebook/opt-350M",
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
attn_implementation="eager",
).to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
for _, submodule in model_eager.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for _, submodule in model_sdpa.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
has_sdpa = True
break
if not has_sdpa:
raise ValueError("The SDPA model should have SDPA attention layers")
for padding_side in ["left", "right"]:
tokenizer.padding_side = padding_side
tokenizer.pad_token = tokenizer.eos_token
inputs = tokenizer(texts, return_tensors="pt", padding=True).to(torch_device)
res_eager = model_eager.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
res_sdpa = model_sdpa.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
with self.subTest(f"{padding_side}"):
torch.testing.assert_close(
res_eager,
res_sdpa,
msg=f"\n{tokenizer.batch_decode(res_eager)} \nvs\n{tokenizer.batch_decode(res_sdpa)}",
)
@unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.") @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.")
def test_model_parallelism(self): def test_model_parallelism(self):
super().test_model_parallelism() super().test_model_parallelism()

View File

@@ -343,14 +343,6 @@ class Qwen2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
): ):
return True return True
# Ignore copy
# TODO: @Fxmarty
@require_torch_sdpa
@slow
@unittest.skip(reason="Currently failing.")
def test_eager_matches_sdpa_generate(self):
super().test_eager_matches_sdpa_generate()
def setUp(self): def setUp(self):
self.model_tester = Qwen2ModelTester(self) self.model_tester = Qwen2ModelTester(self)
self.config_tester = ConfigTester(self, config_class=Qwen2Config, hidden_size=37) self.config_tester = ConfigTester(self, config_class=Qwen2Config, hidden_size=37)

View File

@@ -368,12 +368,6 @@ class Qwen2MoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
): ):
return True return True
# Ignore copy
@require_torch_sdpa
@slow
def test_eager_matches_sdpa_generate(self):
super().test_eager_matches_sdpa_generate()
def setUp(self): def setUp(self):
self.model_tester = Qwen2MoeModelTester(self) self.model_tester = Qwen2MoeModelTester(self)
self.config_tester = ConfigTester(self, config_class=Qwen2MoeConfig, hidden_size=37) self.config_tester = ConfigTester(self, config_class=Qwen2MoeConfig, hidden_size=37)

View File

@@ -21,11 +21,9 @@ from parameterized import parameterized
from transformers import StableLmConfig, is_torch_available, set_seed from transformers import StableLmConfig, is_torch_available, set_seed
from transformers.testing_utils import ( from transformers.testing_utils import (
is_flaky,
require_bitsandbytes, require_bitsandbytes,
require_flash_attn, require_flash_attn,
require_torch, require_torch,
require_torch_sdpa,
slow, slow,
torch_device, torch_device,
) )
@@ -558,67 +556,3 @@ class StableLmModelIntegrationTest(unittest.TestCase):
input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device) input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)
generated_ids = model.generate(input_ids, max_new_tokens=4, temperature=0) generated_ids = model.generate(input_ids, max_new_tokens=4, temperature=0)
self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-3:].tolist()) self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-3:].tolist())
# Copied from transformers.tests.models.llama.test_modeling_llama.LlamaModelTest.test_eager_matches_sdpa_generate with Llama->StableLm,saibo/llama-1B->stabilityai/stablelm-3b-4e1t
# TODO: @Fxmarty
@is_flaky(max_attempts=3, description="flaky on some models.")
@require_torch_sdpa
@slow
def test_eager_matches_sdpa_generate(self):
"""
Overwritting the common test as the test is flaky on tiny models
"""
max_new_tokens = 30
tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-3b-4e1t")
model_sdpa = StableLmForCausalLM.from_pretrained(
"stabilityai/stablelm-3b-4e1t",
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
).to(torch_device)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
model_eager = StableLmForCausalLM.from_pretrained(
"stabilityai/stablelm-3b-4e1t",
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
attn_implementation="eager",
).to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
has_sdpa = True
break
if not has_sdpa:
raise ValueError("The SDPA model should have SDPA attention layers")
texts = [
"hi here's a longer context, getting longer and",
"Hello this is a very long sentence my friend, very long for real",
"Today I am in Paris and",
]
for padding_side in ["left", "right"]:
tokenizer.padding_side = padding_side
tokenizer.pad_token = tokenizer.eos_token
inputs = tokenizer(texts, return_tensors="pt", padding=True).to(torch_device)
res_eager = model_eager.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
res_sdpa = model_sdpa.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
with self.subTest(f"{padding_side}"):
torch.testing.assert_close(
res_eager,
res_sdpa,
msg=f"\n{tokenizer.batch_decode(res_eager)} \nvs\n{tokenizer.batch_decode(res_sdpa)}",
)

View File

@@ -14,11 +14,10 @@
# limitations under the License. # limitations under the License.
import tempfile
import unittest import unittest
from transformers import XLMRobertaXLConfig, is_torch_available from transformers import XLMRobertaXLConfig, is_torch_available
from transformers.testing_utils import require_torch, require_torch_sdpa, slow, torch_device from transformers.testing_utils import require_torch, slow, torch_device
from ...generation.test_utils import GenerationTesterMixin from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester from ...test_configuration_common import ConfigTester
@@ -523,84 +522,6 @@ class XLMRobertaXLModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTes
self.assertEqual(position_ids.shape, expected_positions.shape) self.assertEqual(position_ids.shape, expected_positions.shape)
self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) self.assertTrue(torch.all(torch.eq(position_ids, expected_positions)))
# TODO: Remove this and use the parent method (in common tests) once XLM RoBERTa XL supports low_cpu_mem_usage=True.
@require_torch_sdpa
@slow
# Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_generate
def test_eager_matches_sdpa_generate(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
max_new_tokens = 30
if len(self.all_generative_model_classes) == 0:
self.skipTest(f"{self.__class__.__name__} tests a model that does support generate: skipping this test")
for model_class in self.all_generative_model_classes:
if not model_class._supports_sdpa:
self.skipTest(f"{model_class.__name__} does not support SDPA")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
dummy_input = inputs_dict[model_class.main_input_name]
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
dummy_input = dummy_input.to(torch.float16)
# make sure that all models have enough positions for generation
if hasattr(config, "max_position_embeddings"):
config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
# Ignore copy
model_sdpa = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=False,
).to(torch_device)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
# Ignore copy
model_eager = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=False,
attn_implementation="eager",
).to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
has_sdpa = True
break
if not has_sdpa:
raise ValueError("The SDPA model should have SDPA attention layers")
# Just test that a large cache works as expected
res_eager = model_eager.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
)
res_sdpa = model_sdpa.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
)
self.assertTrue(torch.allclose(res_eager, res_sdpa))
@require_torch @require_torch
class XLMRobertaModelXLIntegrationTest(unittest.TestCase): class XLMRobertaModelXLIntegrationTest(unittest.TestCase):

View File

@@ -4469,62 +4469,6 @@ class ModelTesterMixin:
with torch.no_grad(): with torch.no_grad():
_ = model(**inputs_dict) _ = model(**inputs_dict)
@require_torch_sdpa
@slow
def test_eager_matches_sdpa_generate(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
max_new_tokens = 30
if len(self.all_generative_model_classes) == 0:
self.skipTest(f"{self.__class__.__name__} tests a model that does support generate: skipping this test")
for model_class in self.all_generative_model_classes:
if not model_class._supports_sdpa:
self.skipTest(f"{model_class.__name__} does not support SDPA")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
dummy_input = inputs_dict[model_class.main_input_name]
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
dummy_input = dummy_input.to(torch.float16)
# make sure that all models have enough positions for generation
if hasattr(config, "max_position_embeddings"):
config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
model_sdpa = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
).to(torch_device)
model_eager = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
attn_implementation="eager",
).to(torch_device)
# Just test that a large cache works as expected
res_eager = model_eager.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
)
res_sdpa = model_sdpa.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
)
self.assertTrue(torch.allclose(res_eager, res_sdpa))
@require_torch_sdpa @require_torch_sdpa
def test_sdpa_matches_eager_sliding_window(self): def test_sdpa_matches_eager_sliding_window(self):
if not self.has_attentions: if not self.has_attentions: