Tests: move generate tests to the right mixin and delete redundant tests (#34464)
* tmp commit * tmp commit * cull overwrites of deleted tests * typo * more specific docstring * make fixup * parameterize at the top? * correction * more deletions :D * tmp commit * for VLMs too * fix _check_outputs * test nit * make fixup * fix another flaky * test_generate_from_inputs_embeds -- handle missing attention mask
This commit is contained in:
@@ -22,7 +22,6 @@ import os.path
|
||||
import random
|
||||
import re
|
||||
import tempfile
|
||||
import time
|
||||
import warnings
|
||||
from collections import defaultdict
|
||||
from contextlib import contextmanager
|
||||
@@ -37,10 +36,7 @@ import transformers
|
||||
from transformers import (
|
||||
AutoModel,
|
||||
AutoModelForCausalLM,
|
||||
AutoModelForSeq2SeqLM,
|
||||
AutoModelForSequenceClassification,
|
||||
AutoTokenizer,
|
||||
GenerationConfig,
|
||||
PretrainedConfig,
|
||||
PreTrainedModel,
|
||||
is_torch_available,
|
||||
@@ -86,7 +82,6 @@ from transformers.testing_utils import (
|
||||
require_deepspeed,
|
||||
require_flash_attn,
|
||||
require_non_xpu,
|
||||
require_read_token,
|
||||
require_safetensors,
|
||||
require_torch,
|
||||
require_torch_accelerator,
|
||||
@@ -3000,71 +2995,6 @@ class ModelTesterMixin:
|
||||
)[0]
|
||||
self.assertTrue(torch.allclose(out_embeds, out_ids))
|
||||
|
||||
def test_inputs_embeds_matches_input_ids_with_generate(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
for model_class in self.all_generative_model_classes:
|
||||
if model_class.__name__ not in [
|
||||
*get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES),
|
||||
*get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES),
|
||||
]:
|
||||
continue
|
||||
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
model_forward_args = inspect.signature(model.forward).parameters
|
||||
if any(argument not in model_forward_args for argument in ["inputs_embeds", "position_ids"]):
|
||||
self.skipTest(reason="This model doesn't use `inputs_embeds` or `position_ids`.")
|
||||
has_inputs_embeds_forwarding = "inputs_embeds" in set(
|
||||
inspect.signature(model.prepare_inputs_for_generation).parameters.keys()
|
||||
)
|
||||
if not has_inputs_embeds_forwarding:
|
||||
self.skipTest(reason="This model doesn't support `inputs_embeds` passed to `generate`.")
|
||||
inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
|
||||
pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1
|
||||
|
||||
# VLMs can't generate with embeds and pixels at the same time. We expect the user to pass merged
|
||||
# embeds already
|
||||
if model_class.__name__ in get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES):
|
||||
inputs.pop("pixel_values", None)
|
||||
inputs.pop("pixel_values_videos", None)
|
||||
inputs.pop("pixel_values_images", None)
|
||||
|
||||
wte = model.get_input_embeddings()
|
||||
if not self.is_encoder_decoder:
|
||||
input_ids = inputs["input_ids"]
|
||||
# some models infer position ids/attn mask differently when input ids
|
||||
# by check if pad_token let's make sure no padding is in input ids
|
||||
not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1
|
||||
input_ids[input_ids == pad_token_id] = not_pad_token_id
|
||||
del inputs["input_ids"]
|
||||
inputs_embeds = wte(input_ids)
|
||||
out_ids = model.generate(input_ids=input_ids, **inputs, max_new_tokens=2)[:, -2:]
|
||||
out_embeds = model.generate(inputs_embeds=inputs_embeds, **inputs, max_new_tokens=2)
|
||||
else:
|
||||
encoder_input_ids = inputs["input_ids"]
|
||||
decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
|
||||
encoder_input_ids[encoder_input_ids == pad_token_id] = max(0, pad_token_id + 1)
|
||||
decoder_input_ids[decoder_input_ids == pad_token_id] = max(0, pad_token_id + 1)
|
||||
del inputs["input_ids"]
|
||||
inputs.pop("decoder_input_ids", None)
|
||||
inputs_embeds = wte(encoder_input_ids)
|
||||
decoder_inputs_embeds = wte(decoder_input_ids)
|
||||
out_ids = model.generate(
|
||||
input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids, **inputs, max_new_tokens=2
|
||||
)[:, -2:]
|
||||
out_embeds = model.generate(
|
||||
inputs_embeds=inputs_embeds,
|
||||
decoder_inputs_embeds=decoder_inputs_embeds,
|
||||
**inputs,
|
||||
max_new_tokens=2,
|
||||
)
|
||||
# NOTE: this test changes the order of FP ops, there may be tiny differences in the output
|
||||
number_of_different_tokens = (out_ids != out_embeds).sum()
|
||||
max_differences = int(out_ids.shape[0] * out_ids.shape[1] * 0.1)
|
||||
self.assertTrue(number_of_different_tokens <= max_differences) # accept up to 10% mismatch
|
||||
|
||||
@require_non_xpu
|
||||
@require_torch_multi_gpu
|
||||
def test_multi_gpu_data_parallel_forward(self):
|
||||
@@ -3857,102 +3787,6 @@ class ModelTesterMixin:
|
||||
|
||||
assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2)
|
||||
|
||||
@require_flash_attn
|
||||
@require_torch_gpu
|
||||
@mark.flash_attn_test
|
||||
@slow
|
||||
@is_flaky()
|
||||
def test_flash_attn_2_generate_left_padding(self):
|
||||
if not self.has_attentions:
|
||||
self.skipTest(reason="Model architecture does not support attentions")
|
||||
|
||||
for model_class in self.all_generative_model_classes:
|
||||
if not model_class._supports_flash_attn_2:
|
||||
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
|
||||
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
model = model_class(config)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
model.save_pretrained(tmpdirname)
|
||||
model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to(
|
||||
torch_device
|
||||
)
|
||||
|
||||
dummy_input = inputs_dict[model.main_input_name]
|
||||
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
|
||||
dummy_input = dummy_input.to(torch.float16)
|
||||
|
||||
dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
|
||||
# make sure we do left padding
|
||||
dummy_attention_mask[:, :-1] = 0
|
||||
dummy_attention_mask[:, -1:] = 1
|
||||
|
||||
out = model.generate(
|
||||
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False
|
||||
)
|
||||
|
||||
model = model_class.from_pretrained(
|
||||
tmpdirname,
|
||||
torch_dtype=torch.float16,
|
||||
attn_implementation="flash_attention_2",
|
||||
low_cpu_mem_usage=True,
|
||||
).to(torch_device)
|
||||
|
||||
out_fa = model.generate(
|
||||
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False
|
||||
)
|
||||
|
||||
self.assertTrue(torch.allclose(out, out_fa))
|
||||
|
||||
@require_flash_attn
|
||||
@require_torch_gpu
|
||||
@mark.flash_attn_test
|
||||
@is_flaky()
|
||||
@slow
|
||||
def test_flash_attn_2_generate_padding_right(self):
|
||||
if not self.has_attentions:
|
||||
self.skipTest(reason="Model architecture does not support attentions")
|
||||
|
||||
for model_class in self.all_generative_model_classes:
|
||||
if not model_class._supports_flash_attn_2:
|
||||
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
|
||||
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
model = model_class(config)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
model.save_pretrained(tmpdirname)
|
||||
model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to(
|
||||
torch_device
|
||||
)
|
||||
|
||||
dummy_input = inputs_dict[model.main_input_name]
|
||||
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
|
||||
dummy_input = dummy_input.to(torch.float16)
|
||||
|
||||
dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
|
||||
# make sure we do right padding
|
||||
dummy_attention_mask[:, :-1] = 1
|
||||
dummy_attention_mask[:, -1:] = 0
|
||||
|
||||
out = model.generate(
|
||||
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False
|
||||
)
|
||||
|
||||
model = model_class.from_pretrained(
|
||||
tmpdirname,
|
||||
torch_dtype=torch.float16,
|
||||
attn_implementation="flash_attention_2",
|
||||
low_cpu_mem_usage=True,
|
||||
).to(torch_device)
|
||||
|
||||
out_fa = model.generate(
|
||||
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False
|
||||
)
|
||||
|
||||
self.assertTrue(torch.allclose(out, out_fa))
|
||||
|
||||
def test_attn_implementation_composite_models(self):
|
||||
"""
|
||||
Tests if composite models can receive a dict object as attn_implementation, where each key should be
|
||||
@@ -4525,65 +4359,6 @@ class ModelTesterMixin:
|
||||
torch.allclose(res_eager[attention_mask == 1], res_sdpa[attention_mask == 1], rtol=1e-4, atol=1e-4)
|
||||
)
|
||||
|
||||
@require_flash_attn
|
||||
@require_torch_gpu
|
||||
@mark.flash_attn_test
|
||||
@slow
|
||||
def test_flash_attn_2_generate_use_cache(self):
|
||||
if not self.has_attentions:
|
||||
self.skipTest(reason="Model architecture does not support attentions")
|
||||
|
||||
max_new_tokens = 30
|
||||
|
||||
for model_class in self.all_generative_model_classes:
|
||||
if not model_class._supports_flash_attn_2:
|
||||
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
|
||||
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
dummy_input = inputs_dict[model_class.main_input_name]
|
||||
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
|
||||
dummy_input = dummy_input.to(torch.float16)
|
||||
|
||||
# make sure that all models have enough positions for generation
|
||||
if hasattr(config, "max_position_embeddings"):
|
||||
config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1
|
||||
|
||||
model = model_class(config)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
model.save_pretrained(tmpdirname)
|
||||
|
||||
dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
|
||||
|
||||
model = model_class.from_pretrained(
|
||||
tmpdirname,
|
||||
torch_dtype=torch.float16,
|
||||
attn_implementation="flash_attention_2",
|
||||
low_cpu_mem_usage=True,
|
||||
).to(torch_device)
|
||||
|
||||
# Just test that a large cache works as expected
|
||||
_ = model.generate(
|
||||
dummy_input,
|
||||
attention_mask=dummy_attention_mask,
|
||||
max_new_tokens=max_new_tokens,
|
||||
do_sample=False,
|
||||
use_cache=True,
|
||||
)
|
||||
|
||||
# Generate with one batch only to test generation when attention mask will be None
|
||||
# when real inputs are used, because there is no padding. See issue #32237 for more
|
||||
dummy_input = dummy_input[:1, ...]
|
||||
dummy_attention_mask = torch.ones_like(dummy_attention_mask[:1, ...])
|
||||
_ = model.generate(
|
||||
dummy_input,
|
||||
attention_mask=dummy_attention_mask,
|
||||
max_new_tokens=max_new_tokens,
|
||||
do_sample=False,
|
||||
use_cache=True,
|
||||
)
|
||||
|
||||
@require_flash_attn
|
||||
@require_torch_gpu
|
||||
@mark.flash_attn_test
|
||||
@@ -4640,62 +4415,6 @@ class ModelTesterMixin:
|
||||
if not has_fa2:
|
||||
raise ValueError("The FA2 model should have FA2 layers")
|
||||
|
||||
@require_flash_attn
|
||||
@require_torch_gpu
|
||||
@mark.flash_attn_test
|
||||
@slow
|
||||
def test_flash_attn_2_generate_reuse_cache(self):
|
||||
if not self.has_attentions:
|
||||
self.skipTest(reason="Model architecture does not support attentions")
|
||||
|
||||
max_new_tokens = 2
|
||||
for model_class in self.all_generative_model_classes:
|
||||
if not model_class._supports_flash_attn_2:
|
||||
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
|
||||
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
dummy_input = inputs_dict[model_class.main_input_name]
|
||||
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
|
||||
dummy_input = dummy_input.to(torch.float16)
|
||||
|
||||
# make sure that all models have enough positions for generation
|
||||
if hasattr(config, "max_position_embeddings"):
|
||||
config.max_position_embeddings = dummy_input.shape[1] * 2 + max_new_tokens * 2 + 1
|
||||
|
||||
model = model_class(config)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
model.save_pretrained(tmpdirname)
|
||||
|
||||
model = model_class.from_pretrained(
|
||||
tmpdirname,
|
||||
torch_dtype=torch.float16,
|
||||
attn_implementation="flash_attention_2",
|
||||
low_cpu_mem_usage=True,
|
||||
).to(torch_device)
|
||||
|
||||
# run generate once to get filled cache
|
||||
output = model.generate(
|
||||
dummy_input,
|
||||
max_new_tokens=max_new_tokens,
|
||||
do_sample=False,
|
||||
use_cache=True,
|
||||
return_dict_in_generate=True,
|
||||
)
|
||||
past_key_values = output.past_key_values
|
||||
|
||||
# Try to continue generation from where we left, given that we have more than 1 new token to process
|
||||
# e.g. this can happen in speculative decoding when feeding candidate tokens back to target model
|
||||
dummy_input_updated = torch.cat([dummy_input, output.sequences], dim=-1)
|
||||
_ = model.generate(
|
||||
dummy_input_updated,
|
||||
max_new_tokens=max_new_tokens,
|
||||
do_sample=False,
|
||||
use_cache=True,
|
||||
past_key_values=past_key_values,
|
||||
)
|
||||
|
||||
@require_flash_attn
|
||||
@require_torch_gpu
|
||||
@require_bitsandbytes
|
||||
@@ -4999,82 +4718,6 @@ class ModelTesterMixin:
|
||||
normalized_1 = F.softmax(out_shared_prefix_last_tokens)
|
||||
torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4)
|
||||
|
||||
def test_static_cache_matches_dynamic(self):
|
||||
"""
|
||||
Tests that generating with static cache give almost same results as with dynamic cache.
|
||||
This test does not compile the model and check only logits similarity for numerical precision
|
||||
errors.
|
||||
"""
|
||||
if len(self.all_generative_model_classes) == 0:
|
||||
self.skipTest(
|
||||
reason="Model architecture has no generative classes, and thus not necessarily supporting 4D masks"
|
||||
)
|
||||
for model_class in self.all_generative_model_classes:
|
||||
if not model_class._supports_static_cache:
|
||||
self.skipTest(f"{model_class.__name__} does not support static cache")
|
||||
|
||||
if not model_class._supports_cache_class:
|
||||
self.skipTest(f"{model_class.__name__} does not support cache class")
|
||||
|
||||
config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
if getattr(config, "sliding_window", 0) is not None and getattr(config, "sliding_window", 0) > 0:
|
||||
self.skipTest(f"{model_class.__name__} with sliding window attention is not supported by this test")
|
||||
|
||||
model = model_class(config).to(device=torch_device, dtype=torch.float32)
|
||||
model.eval()
|
||||
|
||||
dynamic_out = model.generate(
|
||||
**inputs, do_sample=False, max_new_tokens=10, output_logits=True, return_dict_in_generate=True
|
||||
)
|
||||
static_out = model.generate(
|
||||
**inputs,
|
||||
do_sample=False,
|
||||
max_new_tokens=10,
|
||||
cache_implementation="static",
|
||||
output_logits=True,
|
||||
return_dict_in_generate=True,
|
||||
)
|
||||
self.assertTrue(torch.allclose(dynamic_out.logits[0], static_out.logits[0], rtol=1e-3, atol=1e-4))
|
||||
|
||||
# For now, Let's focus only on GPU for `torch.compile`
|
||||
@slow
|
||||
@require_torch_accelerator
|
||||
@require_read_token
|
||||
def test_torch_compile(self):
|
||||
if version.parse(torch.__version__) < version.parse("2.3"):
|
||||
self.skipTest(reason="This test requires torch >= 2.3 to run.")
|
||||
torch.compiler.reset()
|
||||
if not hasattr(self, "_torch_compile_test_ckpt"):
|
||||
self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.")
|
||||
ckpt = self._torch_compile_test_ckpt
|
||||
revision = "main" if not hasattr(self, "_torch_compile_test_revision") else self._torch_compile_test_revision
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
|
||||
batch_size = 1
|
||||
n_iter = 3
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(ckpt)
|
||||
if self.is_encoder_decoder:
|
||||
model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to(
|
||||
torch_device
|
||||
)
|
||||
else:
|
||||
model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to(
|
||||
torch_device
|
||||
)
|
||||
|
||||
model.generation_config.max_new_tokens = 4
|
||||
|
||||
model.generation_config.cache_implementation = "static"
|
||||
model.forward = torch.compile(model.forward, mode="reduce-overhead")
|
||||
|
||||
input_text = "Why dogs are cute?"
|
||||
input_ids = tokenizer([input_text] * batch_size, return_tensors="pt").to(torch_device)
|
||||
|
||||
for i in range(n_iter):
|
||||
_ = model.generate(**input_ids, do_sample=False)
|
||||
|
||||
@slow
|
||||
@require_torch_gpu
|
||||
def test_torch_compile_for_training(self):
|
||||
@@ -5118,74 +4761,6 @@ class ModelTesterMixin:
|
||||
for name, param in model._orig_mod.named_parameters():
|
||||
torch.testing.assert_close(param.grad.detach().cpu(), params[name], rtol=1e-4, atol=1e-4)
|
||||
|
||||
@slow
|
||||
@require_torch_gpu # Testing cuda graphs.
|
||||
@require_read_token
|
||||
def test_compile_cuda_graph_time(self):
|
||||
if version.parse(torch.__version__) < version.parse("2.3"):
|
||||
self.skipTest(reason="This test requires torch >= 2.3 to run.")
|
||||
|
||||
# TODO felix: All models supporting `StaticCache` or `torch.compile` should be tested.
|
||||
# At the moment, only llama, gemma and gemma2 are tested here!
|
||||
if not hasattr(self, "_torch_compile_test_ckpt"):
|
||||
self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.")
|
||||
ckpt = self._torch_compile_test_ckpt
|
||||
revision = "main" if not hasattr(self, "_torch_compile_test_revision") else self._torch_compile_test_revision
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(ckpt)
|
||||
if self.is_encoder_decoder:
|
||||
model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to(
|
||||
torch_device
|
||||
)
|
||||
else:
|
||||
model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to(
|
||||
torch_device
|
||||
)
|
||||
|
||||
cache_implementation = "static"
|
||||
if model.config.model_type == "gemma2":
|
||||
cache_implementation = "hybrid"
|
||||
|
||||
new_tokens = 50
|
||||
gen_config = GenerationConfig(
|
||||
max_new_tokens=new_tokens,
|
||||
min_new_tokens=new_tokens,
|
||||
use_cache=True,
|
||||
pad_token_id=tokenizer.pad_token_id,
|
||||
num_beams=1,
|
||||
do_sample=False,
|
||||
eos_token_id=None, # This is required for min_new_tokens to actually have an effect.
|
||||
)
|
||||
model.generation_config.eos_token_id = None # greedy_search falls back on this eos_token_id that we need to set to None as well for min_new_tokens to have an effect.
|
||||
|
||||
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
|
||||
|
||||
inp = tokenizer("Why cats are cute?", return_tensors="pt").to(torch_device)
|
||||
|
||||
# First run: the first run warms up each graph, which does things like CuBlas or Triton benchmarking
|
||||
start = time.perf_counter()
|
||||
_ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation)
|
||||
end = time.perf_counter()
|
||||
graph_warmup_time = end - start
|
||||
|
||||
# Second run: CUDA Graph recording, and replays it
|
||||
start = time.perf_counter()
|
||||
_ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation)
|
||||
end = time.perf_counter()
|
||||
record_time = end - start
|
||||
|
||||
# Finally: we hit the optimized, CUDA Graph replay path
|
||||
start = time.perf_counter()
|
||||
_ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation)
|
||||
end = time.perf_counter()
|
||||
opt_time = end - start
|
||||
|
||||
# For the recording step, we expect only two cuda graphs and this step should be much faster than the first.
|
||||
self.assertTrue(record_time < 0.15 * graph_warmup_time)
|
||||
self.assertTrue(opt_time < record_time)
|
||||
|
||||
def test_forward_with_num_logits_to_keep(self):
|
||||
for model_class in self.all_generative_model_classes:
|
||||
if "num_logits_to_keep" not in set(inspect.signature(model_class.forward).parameters.keys()):
|
||||
|
||||
Reference in New Issue
Block a user