Fix slow tests for important models to be compatible with A10 runners (#29905)

* fix mistral and mixtral

* add pdb

* fix mixtral tesst

* fix

* fix mistral ?

* add fix gemma

* fix mistral

* fix

* test

* anoter test

* fix

* fix

* fix mistral tests

* fix them again

* final fixes for mistral

* fix padding right

* fix whipser fa2

* fix

* fix

* fix gemma

* test

* fix llama

* fix

* fix

* fix llama gemma

* add class attribute

* fix CI

* clarify whisper

* compute_capability

* rename names in some comments

* Add   # fmt: skip

* make style

* Update tests/models/mistral/test_modeling_mistral.py

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* update

* update

---------

Co-authored-by: Younes Belkada <younesbelkada@gmail.com>
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
This commit is contained in:
Yih-Dar
2024-04-09 13:28:54 +02:00
committed by GitHub
parent e9c23fa056
commit 08a194fcd6
6 changed files with 246 additions and 110 deletions

View File

@@ -470,39 +470,68 @@ class MistralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
self.skipTest("Mistral flash attention does not support right padding")
@require_torch
@require_torch_gpu
class MistralIntegrationTest(unittest.TestCase):
# This variable is used to determine which CUDA device are we using for our runners (A10 or T4)
# Depending on the hardware we get different logits / generations
cuda_compute_capability_major_version = None
@classmethod
def setUpClass(cls):
if is_torch_available() and torch.cuda.is_available():
# 8 is for A100 / A10 and 7 for T4
cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0]
def tearDown(self):
torch.cuda.empty_cache()
gc.collect()
@slow
def test_model_7b_logits(self):
input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338]
model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", device_map="auto")
model = MistralForCausalLM.from_pretrained(
"mistralai/Mistral-7B-v0.1", device_map="auto", torch_dtype=torch.float16
)
input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)
with torch.no_grad():
out = model(input_ids).logits.cpu()
# Expected mean on dim = -1
EXPECTED_MEAN = torch.tensor([[-2.5548, -2.5737, -3.0600, -2.5906, -2.8478, -2.8118, -2.9325, -2.7694]])
torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2)
# slicing logits[0, 0, 0:30]
EXPECTED_SLICE = torch.tensor([-5.8781, -5.8616, -0.1052, -4.7200, -5.8781, -5.8774, -5.8773, -5.8777, -5.8781, -5.8780, -5.8781, -5.8779, -1.0787, 1.7583, -5.8779, -5.8780, -5.8783, -5.8778, -5.8776, -5.8781, -5.8784, -5.8778, -5.8778, -5.8777, -5.8779, -5.8778, -5.8776, -5.8780, -5.8779, -5.8781]) # fmt: skip
EXPECTED_SLICE = {
7: torch.tensor([-5.8781, -5.8616, -0.1052, -4.7200, -5.8781, -5.8774, -5.8773, -5.8777, -5.8781, -5.8780, -5.8781, -5.8779, -1.0787, 1.7583, -5.8779, -5.8780, -5.8783, -5.8778, -5.8776, -5.8781, -5.8784, -5.8778, -5.8778, -5.8777, -5.8779, -5.8778, -5.8776, -5.8780, -5.8779, -5.8781]),
8: torch.tensor([-5.8711, -5.8555, -0.1050, -4.7148, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -1.0781, 1.7568, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711]),
} # fmt: skip
print(out[0, 0, :30])
torch.testing.assert_close(out[0, 0, :30], EXPECTED_SLICE, atol=1e-4, rtol=1e-4)
torch.testing.assert_close(
out[0, 0, :30], EXPECTED_SLICE[self.cuda_compute_capability_major_version], atol=1e-4, rtol=1e-4
)
del model
backend_empty_cache(torch_device)
gc.collect()
@slow
@require_bitsandbytes
def test_model_7b_generation(self):
EXPECTED_TEXT_COMPLETION = """My favourite condiment is 100% ketchup. I love it on everything. Im not a big"""
EXPECTED_TEXT_COMPLETION = {
7: "My favourite condiment is 100% ketchup. I love it on everything. I'm not a big",
8: "My favourite condiment is 100% ketchup. Im not a fan of mustard, mayo,",
}
prompt = "My favourite condiment is "
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=False)
model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", device_map="auto")
model = MistralForCausalLM.from_pretrained(
"mistralai/Mistral-7B-v0.1", device_map={"": torch_device}, load_in_4bit=True
)
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.model.embed_tokens.weight.device)
# greedy generation outputs
generated_ids = model.generate(input_ids, max_new_tokens=20, temperature=0)
text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
self.assertEqual(EXPECTED_TEXT_COMPLETION, text)
self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], text)
del model
backend_empty_cache(torch_device)
@@ -517,7 +546,7 @@ class MistralIntegrationTest(unittest.TestCase):
input_ids = [1] + [306, 338] * 2048
model = MistralForCausalLM.from_pretrained(
"mistralai/Mistral-7B-v0.1",
device_map="auto",
device_map={"": torch_device},
load_in_4bit=True,
attn_implementation="flash_attention_2",
)
@@ -544,9 +573,7 @@ class MistralIntegrationTest(unittest.TestCase):
# An input with 4097 tokens that is above the size of the sliding window
input_ids = [1] + [306, 338] * 2048
model = MistralForCausalLM.from_pretrained(
"mistralai/Mistral-7B-v0.1",
device_map="auto",
attn_implementation="sdpa",
"mistralai/Mistral-7B-v0.1", device_map="auto", attn_implementation="sdpa", torch_dtype=torch.float16
)
input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)
generated_ids = model.generate(input_ids, max_new_tokens=4, temperature=0)
@@ -577,9 +604,10 @@ class MistralIntegrationTest(unittest.TestCase):
@slow
def test_speculative_generation(self):
EXPECTED_TEXT_COMPLETION = (
"My favourite condiment is 100% Sriracha. I love the heat, the tang and the fact costs"
)
EXPECTED_TEXT_COMPLETION = {
7: "My favourite condiment is 100% Sriracha. I love the heat, the tang and the fact costs",
8: "My favourite condiment is 100% Sriracha. I love the heat, the sweetness, the tang",
}
prompt = "My favourite condiment is "
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=False)
model = MistralForCausalLM.from_pretrained(
@@ -593,7 +621,7 @@ class MistralIntegrationTest(unittest.TestCase):
input_ids, max_new_tokens=20, do_sample=True, temperature=0.3, assistant_model=model
)
text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
self.assertEqual(EXPECTED_TEXT_COMPLETION, text)
self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], text)
del model
backend_empty_cache(torch_device)