[tests] remove tf/flax tests in /generation (#36235)

This commit is contained in:
Joao Gante
2025-02-17 14:59:22 +00:00
committed by GitHub
parent c877c9fa5b
commit 55493f1390
26 changed files with 428 additions and 2663 deletions

View File

@@ -70,16 +70,6 @@ if is_tf_available():
TFAutoModelForSequenceClassification,
TFSharedEmbeddings,
)
from transformers.generation import (
TFBeamSampleDecoderOnlyOutput,
TFBeamSampleEncoderDecoderOutput,
TFBeamSearchDecoderOnlyOutput,
TFBeamSearchEncoderDecoderOutput,
TFGreedySearchDecoderOnlyOutput,
TFGreedySearchEncoderDecoderOutput,
TFSampleDecoderOnlyOutput,
TFSampleEncoderDecoderOutput,
)
from transformers.modeling_tf_utils import keras
tf.config.experimental.enable_tensor_float_32_execution(False)
@@ -1211,150 +1201,6 @@ class TFModelTesterMixin:
with self.assertRaises(tf.errors.InvalidArgumentError):
model(**prepared_inputs)
def test_lm_head_model_random_no_beam_search_generate(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
input_ids = inputs_dict.get("input_ids", None)
# iterate over all generative models
for model_class in self.all_generative_model_classes:
model = model_class(config)
if config.bos_token_id is None:
# if bos token id is not defined model needs input_ids
with self.assertRaises(ValueError):
model.generate(do_sample=True, max_length=5)
# num_return_sequences = 1
self._check_generated_ids(model.generate(input_ids, do_sample=True))
elif model_class.__name__ not in ["TFSpeech2TextForConditionalGeneration"]:
# Models with non-text inputs won't work here; num_return_sequences = 1
self._check_generated_ids(model.generate(do_sample=True, max_length=5))
with self.assertRaises(ValueError):
# generating multiple sequences when no beam search generation
# is not allowed as it would always generate the same sequences
model.generate(input_ids, do_sample=False, num_return_sequences=2)
# num_return_sequences > 1, sample
self._check_generated_ids(model.generate(input_ids, do_sample=True, num_return_sequences=2))
# check bad words tokens language generation
# create list of 1-seq bad token and list of 2-seq of bad tokens
bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)]
output_tokens = model.generate(
input_ids, do_sample=True, bad_words_ids=bad_words_ids, num_return_sequences=2
)
# only count generated tokens
generated_ids = output_tokens[:, input_ids.shape[-1] :]
self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids))
def test_lm_head_model_no_beam_search_generate_dict_outputs(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
input_ids = inputs_dict.get("input_ids", None)
if input_ids is None:
input_ids = inputs_dict.get("input_features", None)
# iterate over all generative models
for model_class in self.all_generative_model_classes:
model = model_class(config)
output_greedy = model.generate(
input_ids,
do_sample=False,
output_scores=True,
output_hidden_states=True,
output_attentions=True,
return_dict_in_generate=True,
)
output_sample = model.generate(
input_ids,
do_sample=True,
output_scores=True,
output_hidden_states=True,
output_attentions=True,
return_dict_in_generate=True,
)
if model.config.is_encoder_decoder:
self.assertIsInstance(output_greedy, TFGreedySearchEncoderDecoderOutput)
self.assertIsInstance(output_sample, TFSampleEncoderDecoderOutput)
else:
self.assertIsInstance(output_greedy, TFGreedySearchDecoderOnlyOutput)
self.assertIsInstance(output_sample, TFSampleDecoderOnlyOutput)
def test_lm_head_model_random_beam_search_generate(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
input_ids = inputs_dict.get("input_ids", None)
for model_class in self.all_generative_model_classes:
model = model_class(config)
if config.bos_token_id is None:
# if bos token id is not defined model needs input_ids, num_return_sequences = 1
self._check_generated_ids(model.generate(input_ids, do_sample=True, num_beams=2))
else:
# num_return_sequences = 1
self._check_generated_ids(model.generate(do_sample=True, max_length=5, num_beams=2))
with self.assertRaises(ValueError):
# generating more sequences than having beams leads is not possible
model.generate(input_ids, do_sample=False, num_return_sequences=3, num_beams=2)
# num_return_sequences > 1, sample
self._check_generated_ids(
model.generate(
input_ids,
do_sample=True,
num_beams=2,
num_return_sequences=2,
)
)
# num_return_sequences > 1, greedy
self._check_generated_ids(model.generate(input_ids, do_sample=False, num_beams=2, num_return_sequences=2))
# check bad words tokens language generation
# create list of 1-seq bad token and list of 2-seq of bad tokens
bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)]
output_tokens = model.generate(
input_ids, do_sample=False, bad_words_ids=bad_words_ids, num_beams=2, num_return_sequences=2
)
# only count generated tokens
generated_ids = output_tokens[:, input_ids.shape[-1] :]
self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids))
def test_lm_head_model_beam_search_generate_dict_outputs(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
input_ids = inputs_dict.get("input_ids", None)
if input_ids is None:
input_ids = inputs_dict.get("input_features", None)
# iterate over all generative models
for model_class in self.all_generative_model_classes:
model = model_class(config)
output_beam_search = model.generate(
input_ids,
num_beams=2,
do_sample=False,
output_scores=True,
output_hidden_states=True,
output_attentions=True,
return_dict_in_generate=True,
)
output_beam_sample = model.generate(
input_ids,
num_beams=2,
do_sample=True,
output_scores=True,
output_hidden_states=True,
output_attentions=True,
return_dict_in_generate=True,
)
if model.config.is_encoder_decoder:
self.assertIsInstance(output_beam_search, TFBeamSearchEncoderDecoderOutput)
self.assertIsInstance(output_beam_sample, TFBeamSampleEncoderDecoderOutput)
else:
self.assertIsInstance(output_beam_search, TFBeamSearchDecoderOnlyOutput)
self.assertIsInstance(output_beam_sample, TFBeamSampleDecoderOnlyOutput)
def test_loss_computation(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
@@ -1574,40 +1420,6 @@ class TFModelTesterMixin:
if tensor_spec.dtype.is_integer:
self.assertTrue(tensor_spec.dtype == tf.int32, "Input signatures should use tf.int32 for ints!")
def test_generate_with_headmasking(self):
attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_generative_model_classes:
model = model_class(config)
# We want to test only encoder-decoder models
if not config.is_encoder_decoder:
continue
head_masking = {
"head_mask": tf.zeros((config.encoder_layers, config.encoder_attention_heads)),
"decoder_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)),
"cross_attn_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)),
}
signature = inspect.signature(model.call)
if set(head_masking.keys()) < {*signature.parameters.keys()}:
continue
for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
out = model.generate(
inputs_dict["input_ids"],
num_beams=1,
max_length=inputs_dict["input_ids"] + 5,
output_attentions=True,
return_dict_in_generate=True,
**{name: mask},
)
# We check the state of decoder_attentions and cross_attentions just from the last step
attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
self.assertEqual(sum([tf.reduce_sum(w).numpy() for w in attn_weights]), 0.0)
def test_load_with_mismatched_shapes(self):
if not self.test_mismatched_shapes:
return
@@ -1717,133 +1529,6 @@ class TFModelTesterMixin:
model.compile(optimizer="sgd", run_eagerly=True)
model.train_on_batch(test_batch, test_batch_labels)
def _test_xla_generate(self, **generate_kwargs):
def _generate_and_check_results(model, inputs, is_input_ids):
# make sure there are no pad tokens in prompt, which may trigger unwanted behavior
if is_input_ids:
if model.generation_config.pad_token_id is not None:
if config.pad_token_id == 0:
new_pad_token = model.generation_config.pad_token_id + 1
else:
new_pad_token = model.generation_config.pad_token_id - 1
else:
new_pad_token = None
inputs = tf.where(inputs != model.generation_config.pad_token_id, inputs, new_pad_token)
generated = model.generate(inputs, **generate_kwargs).numpy()
generate_xla = tf.function(model.generate, jit_compile=True)
generated_xla = generate_xla(inputs, **generate_kwargs).numpy()
# Due to numerical instability, let's fail the test only if there are more than 10% of input sequences give
# different outputs between XLA and non-XLA versions. If there are less than 10 examples, let's be strict
# and not allow any difference.
diff = [[], []]
for _generated, _generated_xla in zip(generated.tolist(), generated_xla.tolist()):
if _generated != _generated_xla:
diff[0].append(_generated)
diff[1].append(_generated_xla)
ratio = len(diff[0]) / len(generated)
if ratio > 0.1 or (len(diff[0]) > 0 and len(generated) < 10):
self.assertListEqual(diff[0], diff[1])
for model_class in self.all_generative_model_classes:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.eos_token_id = None # Generate until max length
config.do_sample = False
# extract the input to the model
is_input_ids = "input_ids" in inputs_dict
is_input_features = "input_features" in inputs_dict
if not (is_input_ids or is_input_features):
raise ValueError("No valid generate input found in inputs_dict")
inputs = inputs_dict["input_ids"] if is_input_ids else inputs_dict["input_features"]
# fix config for models with additional sequence-length limiting settings
seq_len = inputs.get_shape()[1]
for var_name in ["max_position_embeddings", "max_target_positions"]:
attr = getattr(config, var_name, None)
if attr is not None and attr < seq_len + generate_kwargs["max_new_tokens"]:
try:
setattr(config, var_name, seq_len + generate_kwargs["max_new_tokens"])
except NotImplementedError:
# xlnet will raise an exception when trying to set
# max_position_embeddings.
pass
model = model_class(config)
if model.supports_xla_generation:
_generate_and_check_results(model, inputs, is_input_ids)
else:
with self.assertRaises(ValueError):
_generate_and_check_results(model, inputs, is_input_ids)
def test_xla_generate_fast(self):
"""
Basic quick test for generate-compatible classes that confirms that XLA-generated tokens are the same as their
non XLA counterparts.
Either the model supports XLA generation and passes the inner test, or it raises an appropriate exception
"""
self._test_xla_generate(num_beams=1, num_return_sequences=1, max_new_tokens=3)
@slow
def test_xla_generate_contrastive(self):
"""
Slow and challenging version of `test_xla_generate_fast` for contrastive search -- contrastive search directly
manipulates the model cache and other outputs, and this test ensures that they are in a valid format that is
also supported by XLA.
Either the model supports XLA generation and passes the inner test, or it raises an appropriate exception
"""
self._test_xla_generate(num_beams=1, num_return_sequences=1, max_new_tokens=16, penalty_alpha=0.5, top_k=4)
@slow
def test_xla_generate_slow(self):
"""
Slow and challenging version of `test_xla_generate_fast` -- this test asks for several long sequences using
beam search, with and without XLA. The two outputs should match, and a failure in this test indicates that the
model may need further analysis if it is to be used for XLA generation.
Either the model supports XLA generation and passes the inner test, or it raises an appropriate exception
"""
self._test_xla_generate(num_beams=8, num_return_sequences=2, max_new_tokens=128)
def _generate_random_bad_tokens(self, num_bad_tokens, model):
# special tokens cannot be bad tokens
special_tokens = []
if model.config.bos_token_id is not None:
special_tokens.append(model.config.bos_token_id)
if model.config.pad_token_id is not None:
special_tokens.append(model.config.pad_token_id)
if model.config.eos_token_id is not None:
special_tokens.append(model.config.eos_token_id)
# create random bad tokens that are not special tokens
bad_tokens = []
while len(bad_tokens) < num_bad_tokens:
token = tf.squeeze(ids_tensor((1, 1), self.model_tester.vocab_size), 0).numpy()[0]
if token not in special_tokens:
bad_tokens.append(token)
return bad_tokens
def _check_generated_ids(self, output_ids):
for token_id in output_ids[0].numpy().tolist():
self.assertGreaterEqual(token_id, 0)
self.assertLess(token_id, self.model_tester.vocab_size)
def _check_match_tokens(self, generated_ids, bad_words_ids):
# for all bad word tokens
for bad_word_ids in bad_words_ids:
# for all slices in batch
for generated_ids_slice in generated_ids:
# for all word idx
for i in range(len(bad_word_ids), len(generated_ids_slice)):
# if tokens match
if generated_ids_slice[i - len(bad_word_ids) : i] == bad_word_ids:
return True
return False
def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
"""Creates a random int32 tensor of the shape within the vocab size."""