Tests: move generate tests to the right mixin and delete redundant tests (#34464)

* tmp commit

* tmp commit

* cull overwrites of deleted tests

* typo

* more specific docstring

* make fixup

* parameterize at the top?

* correction

* more deletions :D

* tmp commit

* for VLMs too

* fix _check_outputs

* test nit

* make fixup

* fix another flaky

* test_generate_from_inputs_embeds -- handle missing attention mask
This commit is contained in:
Joao Gante
2024-10-30 10:59:08 +00:00
committed by GitHub
parent 913330ca9f
commit 8a734ea2c3
46 changed files with 265 additions and 2348 deletions

View File

@@ -22,7 +22,6 @@ import os.path
import random
import re
import tempfile
import time
import warnings
from collections import defaultdict
from contextlib import contextmanager
@@ -37,10 +36,7 @@ import transformers
from transformers import (
AutoModel,
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
AutoModelForSequenceClassification,
AutoTokenizer,
GenerationConfig,
PretrainedConfig,
PreTrainedModel,
is_torch_available,
@@ -86,7 +82,6 @@ from transformers.testing_utils import (
require_deepspeed,
require_flash_attn,
require_non_xpu,
require_read_token,
require_safetensors,
require_torch,
require_torch_accelerator,
@@ -3000,71 +2995,6 @@ class ModelTesterMixin:
)[0]
self.assertTrue(torch.allclose(out_embeds, out_ids))
def test_inputs_embeds_matches_input_ids_with_generate(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_generative_model_classes:
if model_class.__name__ not in [
*get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES),
*get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES),
]:
continue
model = model_class(config)
model.to(torch_device)
model.eval()
model_forward_args = inspect.signature(model.forward).parameters
if any(argument not in model_forward_args for argument in ["inputs_embeds", "position_ids"]):
self.skipTest(reason="This model doesn't use `inputs_embeds` or `position_ids`.")
has_inputs_embeds_forwarding = "inputs_embeds" in set(
inspect.signature(model.prepare_inputs_for_generation).parameters.keys()
)
if not has_inputs_embeds_forwarding:
self.skipTest(reason="This model doesn't support `inputs_embeds` passed to `generate`.")
inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1
# VLMs can't generate with embeds and pixels at the same time. We expect the user to pass merged
# embeds already
if model_class.__name__ in get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES):
inputs.pop("pixel_values", None)
inputs.pop("pixel_values_videos", None)
inputs.pop("pixel_values_images", None)
wte = model.get_input_embeddings()
if not self.is_encoder_decoder:
input_ids = inputs["input_ids"]
# some models infer position ids/attn mask differently when input ids
# by check if pad_token let's make sure no padding is in input ids
not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1
input_ids[input_ids == pad_token_id] = not_pad_token_id
del inputs["input_ids"]
inputs_embeds = wte(input_ids)
out_ids = model.generate(input_ids=input_ids, **inputs, max_new_tokens=2)[:, -2:]
out_embeds = model.generate(inputs_embeds=inputs_embeds, **inputs, max_new_tokens=2)
else:
encoder_input_ids = inputs["input_ids"]
decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
encoder_input_ids[encoder_input_ids == pad_token_id] = max(0, pad_token_id + 1)
decoder_input_ids[decoder_input_ids == pad_token_id] = max(0, pad_token_id + 1)
del inputs["input_ids"]
inputs.pop("decoder_input_ids", None)
inputs_embeds = wte(encoder_input_ids)
decoder_inputs_embeds = wte(decoder_input_ids)
out_ids = model.generate(
input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids, **inputs, max_new_tokens=2
)[:, -2:]
out_embeds = model.generate(
inputs_embeds=inputs_embeds,
decoder_inputs_embeds=decoder_inputs_embeds,
**inputs,
max_new_tokens=2,
)
# NOTE: this test changes the order of FP ops, there may be tiny differences in the output
number_of_different_tokens = (out_ids != out_embeds).sum()
max_differences = int(out_ids.shape[0] * out_ids.shape[1] * 0.1)
self.assertTrue(number_of_different_tokens <= max_differences) # accept up to 10% mismatch
@require_non_xpu
@require_torch_multi_gpu
def test_multi_gpu_data_parallel_forward(self):
@@ -3857,102 +3787,6 @@ class ModelTesterMixin:
assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2)
@require_flash_attn
@require_torch_gpu
@mark.flash_attn_test
@slow
@is_flaky()
def test_flash_attn_2_generate_left_padding(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
for model_class in self.all_generative_model_classes:
if not model_class._supports_flash_attn_2:
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to(
torch_device
)
dummy_input = inputs_dict[model.main_input_name]
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
dummy_input = dummy_input.to(torch.float16)
dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
# make sure we do left padding
dummy_attention_mask[:, :-1] = 0
dummy_attention_mask[:, -1:] = 1
out = model.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False
)
model = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
low_cpu_mem_usage=True,
).to(torch_device)
out_fa = model.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False
)
self.assertTrue(torch.allclose(out, out_fa))
@require_flash_attn
@require_torch_gpu
@mark.flash_attn_test
@is_flaky()
@slow
def test_flash_attn_2_generate_padding_right(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
for model_class in self.all_generative_model_classes:
if not model_class._supports_flash_attn_2:
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to(
torch_device
)
dummy_input = inputs_dict[model.main_input_name]
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
dummy_input = dummy_input.to(torch.float16)
dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
# make sure we do right padding
dummy_attention_mask[:, :-1] = 1
dummy_attention_mask[:, -1:] = 0
out = model.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False
)
model = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
low_cpu_mem_usage=True,
).to(torch_device)
out_fa = model.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False
)
self.assertTrue(torch.allclose(out, out_fa))
def test_attn_implementation_composite_models(self):
"""
Tests if composite models can receive a dict object as attn_implementation, where each key should be
@@ -4525,65 +4359,6 @@ class ModelTesterMixin:
torch.allclose(res_eager[attention_mask == 1], res_sdpa[attention_mask == 1], rtol=1e-4, atol=1e-4)
)
@require_flash_attn
@require_torch_gpu
@mark.flash_attn_test
@slow
def test_flash_attn_2_generate_use_cache(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
max_new_tokens = 30
for model_class in self.all_generative_model_classes:
if not model_class._supports_flash_attn_2:
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
dummy_input = inputs_dict[model_class.main_input_name]
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
dummy_input = dummy_input.to(torch.float16)
# make sure that all models have enough positions for generation
if hasattr(config, "max_position_embeddings"):
config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
model = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
low_cpu_mem_usage=True,
).to(torch_device)
# Just test that a large cache works as expected
_ = model.generate(
dummy_input,
attention_mask=dummy_attention_mask,
max_new_tokens=max_new_tokens,
do_sample=False,
use_cache=True,
)
# Generate with one batch only to test generation when attention mask will be None
# when real inputs are used, because there is no padding. See issue #32237 for more
dummy_input = dummy_input[:1, ...]
dummy_attention_mask = torch.ones_like(dummy_attention_mask[:1, ...])
_ = model.generate(
dummy_input,
attention_mask=dummy_attention_mask,
max_new_tokens=max_new_tokens,
do_sample=False,
use_cache=True,
)
@require_flash_attn
@require_torch_gpu
@mark.flash_attn_test
@@ -4640,62 +4415,6 @@ class ModelTesterMixin:
if not has_fa2:
raise ValueError("The FA2 model should have FA2 layers")
@require_flash_attn
@require_torch_gpu
@mark.flash_attn_test
@slow
def test_flash_attn_2_generate_reuse_cache(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
max_new_tokens = 2
for model_class in self.all_generative_model_classes:
if not model_class._supports_flash_attn_2:
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
dummy_input = inputs_dict[model_class.main_input_name]
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
dummy_input = dummy_input.to(torch.float16)
# make sure that all models have enough positions for generation
if hasattr(config, "max_position_embeddings"):
config.max_position_embeddings = dummy_input.shape[1] * 2 + max_new_tokens * 2 + 1
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
low_cpu_mem_usage=True,
).to(torch_device)
# run generate once to get filled cache
output = model.generate(
dummy_input,
max_new_tokens=max_new_tokens,
do_sample=False,
use_cache=True,
return_dict_in_generate=True,
)
past_key_values = output.past_key_values
# Try to continue generation from where we left, given that we have more than 1 new token to process
# e.g. this can happen in speculative decoding when feeding candidate tokens back to target model
dummy_input_updated = torch.cat([dummy_input, output.sequences], dim=-1)
_ = model.generate(
dummy_input_updated,
max_new_tokens=max_new_tokens,
do_sample=False,
use_cache=True,
past_key_values=past_key_values,
)
@require_flash_attn
@require_torch_gpu
@require_bitsandbytes
@@ -4999,82 +4718,6 @@ class ModelTesterMixin:
normalized_1 = F.softmax(out_shared_prefix_last_tokens)
torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4)
def test_static_cache_matches_dynamic(self):
"""
Tests that generating with static cache give almost same results as with dynamic cache.
This test does not compile the model and check only logits similarity for numerical precision
errors.
"""
if len(self.all_generative_model_classes) == 0:
self.skipTest(
reason="Model architecture has no generative classes, and thus not necessarily supporting 4D masks"
)
for model_class in self.all_generative_model_classes:
if not model_class._supports_static_cache:
self.skipTest(f"{model_class.__name__} does not support static cache")
if not model_class._supports_cache_class:
self.skipTest(f"{model_class.__name__} does not support cache class")
config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
if getattr(config, "sliding_window", 0) is not None and getattr(config, "sliding_window", 0) > 0:
self.skipTest(f"{model_class.__name__} with sliding window attention is not supported by this test")
model = model_class(config).to(device=torch_device, dtype=torch.float32)
model.eval()
dynamic_out = model.generate(
**inputs, do_sample=False, max_new_tokens=10, output_logits=True, return_dict_in_generate=True
)
static_out = model.generate(
**inputs,
do_sample=False,
max_new_tokens=10,
cache_implementation="static",
output_logits=True,
return_dict_in_generate=True,
)
self.assertTrue(torch.allclose(dynamic_out.logits[0], static_out.logits[0], rtol=1e-3, atol=1e-4))
# For now, Let's focus only on GPU for `torch.compile`
@slow
@require_torch_accelerator
@require_read_token
def test_torch_compile(self):
if version.parse(torch.__version__) < version.parse("2.3"):
self.skipTest(reason="This test requires torch >= 2.3 to run.")
torch.compiler.reset()
if not hasattr(self, "_torch_compile_test_ckpt"):
self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.")
ckpt = self._torch_compile_test_ckpt
revision = "main" if not hasattr(self, "_torch_compile_test_revision") else self._torch_compile_test_revision
os.environ["TOKENIZERS_PARALLELISM"] = "false"
batch_size = 1
n_iter = 3
tokenizer = AutoTokenizer.from_pretrained(ckpt)
if self.is_encoder_decoder:
model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to(
torch_device
)
else:
model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to(
torch_device
)
model.generation_config.max_new_tokens = 4
model.generation_config.cache_implementation = "static"
model.forward = torch.compile(model.forward, mode="reduce-overhead")
input_text = "Why dogs are cute?"
input_ids = tokenizer([input_text] * batch_size, return_tensors="pt").to(torch_device)
for i in range(n_iter):
_ = model.generate(**input_ids, do_sample=False)
@slow
@require_torch_gpu
def test_torch_compile_for_training(self):
@@ -5118,74 +4761,6 @@ class ModelTesterMixin:
for name, param in model._orig_mod.named_parameters():
torch.testing.assert_close(param.grad.detach().cpu(), params[name], rtol=1e-4, atol=1e-4)
@slow
@require_torch_gpu # Testing cuda graphs.
@require_read_token
def test_compile_cuda_graph_time(self):
if version.parse(torch.__version__) < version.parse("2.3"):
self.skipTest(reason="This test requires torch >= 2.3 to run.")
# TODO felix: All models supporting `StaticCache` or `torch.compile` should be tested.
# At the moment, only llama, gemma and gemma2 are tested here!
if not hasattr(self, "_torch_compile_test_ckpt"):
self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.")
ckpt = self._torch_compile_test_ckpt
revision = "main" if not hasattr(self, "_torch_compile_test_revision") else self._torch_compile_test_revision
os.environ["TOKENIZERS_PARALLELISM"] = "false"
tokenizer = AutoTokenizer.from_pretrained(ckpt)
if self.is_encoder_decoder:
model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to(
torch_device
)
else:
model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to(
torch_device
)
cache_implementation = "static"
if model.config.model_type == "gemma2":
cache_implementation = "hybrid"
new_tokens = 50
gen_config = GenerationConfig(
max_new_tokens=new_tokens,
min_new_tokens=new_tokens,
use_cache=True,
pad_token_id=tokenizer.pad_token_id,
num_beams=1,
do_sample=False,
eos_token_id=None, # This is required for min_new_tokens to actually have an effect.
)
model.generation_config.eos_token_id = None # greedy_search falls back on this eos_token_id that we need to set to None as well for min_new_tokens to have an effect.
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
inp = tokenizer("Why cats are cute?", return_tensors="pt").to(torch_device)
# First run: the first run warms up each graph, which does things like CuBlas or Triton benchmarking
start = time.perf_counter()
_ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation)
end = time.perf_counter()
graph_warmup_time = end - start
# Second run: CUDA Graph recording, and replays it
start = time.perf_counter()
_ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation)
end = time.perf_counter()
record_time = end - start
# Finally: we hit the optimized, CUDA Graph replay path
start = time.perf_counter()
_ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation)
end = time.perf_counter()
opt_time = end - start
# For the recording step, we expect only two cuda graphs and this step should be much faster than the first.
self.assertTrue(record_time < 0.15 * graph_warmup_time)
self.assertTrue(opt_time < record_time)
def test_forward_with_num_logits_to_keep(self):
for model_class in self.all_generative_model_classes:
if "num_logits_to_keep" not in set(inspect.signature(model_class.forward).parameters.keys()):