[tests] remove tf/flax tests in /generation (#36235)

2025-02-17 14:59:22 +00:00
parent c877c9fa5b
commit 55493f1390
26 changed files with 428 additions and 2663 deletions
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -70,16 +70,6 @@ if is_tf_available():
        TFAutoModelForSequenceClassification,
        TFSharedEmbeddings,
    )
-    from transformers.generation import (
-        TFBeamSampleDecoderOnlyOutput,
-        TFBeamSampleEncoderDecoderOutput,
-        TFBeamSearchDecoderOnlyOutput,
-        TFBeamSearchEncoderDecoderOutput,
-        TFGreedySearchDecoderOnlyOutput,
-        TFGreedySearchEncoderDecoderOutput,
-        TFSampleDecoderOnlyOutput,
-        TFSampleEncoderDecoderOutput,
-    )
    from transformers.modeling_tf_utils import keras

    tf.config.experimental.enable_tensor_float_32_execution(False)
@@ -1211,150 +1201,6 @@ class TFModelTesterMixin:
            with self.assertRaises(tf.errors.InvalidArgumentError):
                model(**prepared_inputs)

-    def test_lm_head_model_random_no_beam_search_generate(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        input_ids = inputs_dict.get("input_ids", None)
-
-        # iterate over all generative models
-        for model_class in self.all_generative_model_classes:
-            model = model_class(config)
-
-            if config.bos_token_id is None:
-                # if bos token id is not defined model needs input_ids
-                with self.assertRaises(ValueError):
-                    model.generate(do_sample=True, max_length=5)
-                # num_return_sequences = 1
-                self._check_generated_ids(model.generate(input_ids, do_sample=True))
-            elif model_class.__name__ not in ["TFSpeech2TextForConditionalGeneration"]:
-                # Models with non-text inputs won't work here; num_return_sequences = 1
-                self._check_generated_ids(model.generate(do_sample=True, max_length=5))
-
-            with self.assertRaises(ValueError):
-                # generating multiple sequences when no beam search generation
-                # is not allowed as it would always generate the same sequences
-                model.generate(input_ids, do_sample=False, num_return_sequences=2)
-
-            # num_return_sequences > 1, sample
-            self._check_generated_ids(model.generate(input_ids, do_sample=True, num_return_sequences=2))
-
-            # check bad words tokens language generation
-            # create list of 1-seq bad token and list of 2-seq of bad tokens
-            bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)]
-            output_tokens = model.generate(
-                input_ids, do_sample=True, bad_words_ids=bad_words_ids, num_return_sequences=2
-            )
-            # only count generated tokens
-            generated_ids = output_tokens[:, input_ids.shape[-1] :]
-            self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids))
-
-    def test_lm_head_model_no_beam_search_generate_dict_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        input_ids = inputs_dict.get("input_ids", None)
-        if input_ids is None:
-            input_ids = inputs_dict.get("input_features", None)
-
-        # iterate over all generative models
-        for model_class in self.all_generative_model_classes:
-            model = model_class(config)
-            output_greedy = model.generate(
-                input_ids,
-                do_sample=False,
-                output_scores=True,
-                output_hidden_states=True,
-                output_attentions=True,
-                return_dict_in_generate=True,
-            )
-            output_sample = model.generate(
-                input_ids,
-                do_sample=True,
-                output_scores=True,
-                output_hidden_states=True,
-                output_attentions=True,
-                return_dict_in_generate=True,
-            )
-
-            if model.config.is_encoder_decoder:
-                self.assertIsInstance(output_greedy, TFGreedySearchEncoderDecoderOutput)
-                self.assertIsInstance(output_sample, TFSampleEncoderDecoderOutput)
-            else:
-                self.assertIsInstance(output_greedy, TFGreedySearchDecoderOnlyOutput)
-                self.assertIsInstance(output_sample, TFSampleDecoderOnlyOutput)
-
-    def test_lm_head_model_random_beam_search_generate(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        input_ids = inputs_dict.get("input_ids", None)
-
-        for model_class in self.all_generative_model_classes:
-            model = model_class(config)
-
-            if config.bos_token_id is None:
-                # if bos token id is not defined model needs input_ids, num_return_sequences = 1
-                self._check_generated_ids(model.generate(input_ids, do_sample=True, num_beams=2))
-            else:
-                # num_return_sequences = 1
-                self._check_generated_ids(model.generate(do_sample=True, max_length=5, num_beams=2))
-
-            with self.assertRaises(ValueError):
-                # generating more sequences than having beams leads is not possible
-                model.generate(input_ids, do_sample=False, num_return_sequences=3, num_beams=2)
-
-            # num_return_sequences > 1, sample
-            self._check_generated_ids(
-                model.generate(
-                    input_ids,
-                    do_sample=True,
-                    num_beams=2,
-                    num_return_sequences=2,
-                )
-            )
-            # num_return_sequences > 1, greedy
-            self._check_generated_ids(model.generate(input_ids, do_sample=False, num_beams=2, num_return_sequences=2))
-
-            # check bad words tokens language generation
-            # create list of 1-seq bad token and list of 2-seq of bad tokens
-            bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)]
-            output_tokens = model.generate(
-                input_ids, do_sample=False, bad_words_ids=bad_words_ids, num_beams=2, num_return_sequences=2
-            )
-            # only count generated tokens
-            generated_ids = output_tokens[:, input_ids.shape[-1] :]
-            self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids))
-
-    def test_lm_head_model_beam_search_generate_dict_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        input_ids = inputs_dict.get("input_ids", None)
-        if input_ids is None:
-            input_ids = inputs_dict.get("input_features", None)
-
-        # iterate over all generative models
-        for model_class in self.all_generative_model_classes:
-            model = model_class(config)
-            output_beam_search = model.generate(
-                input_ids,
-                num_beams=2,
-                do_sample=False,
-                output_scores=True,
-                output_hidden_states=True,
-                output_attentions=True,
-                return_dict_in_generate=True,
-            )
-            output_beam_sample = model.generate(
-                input_ids,
-                num_beams=2,
-                do_sample=True,
-                output_scores=True,
-                output_hidden_states=True,
-                output_attentions=True,
-                return_dict_in_generate=True,
-            )
-
-            if model.config.is_encoder_decoder:
-                self.assertIsInstance(output_beam_search, TFBeamSearchEncoderDecoderOutput)
-                self.assertIsInstance(output_beam_sample, TFBeamSampleEncoderDecoderOutput)
-            else:
-                self.assertIsInstance(output_beam_search, TFBeamSearchDecoderOnlyOutput)
-                self.assertIsInstance(output_beam_sample, TFBeamSampleDecoderOnlyOutput)
-
    def test_loss_computation(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        for model_class in self.all_model_classes:
@@ -1574,40 +1420,6 @@ class TFModelTesterMixin:
                if tensor_spec.dtype.is_integer:
                    self.assertTrue(tensor_spec.dtype == tf.int32, "Input signatures should use tf.int32 for ints!")

-    def test_generate_with_headmasking(self):
-        attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_generative_model_classes:
-            model = model_class(config)
-
-            # We want to test only encoder-decoder models
-            if not config.is_encoder_decoder:
-                continue
-
-            head_masking = {
-                "head_mask": tf.zeros((config.encoder_layers, config.encoder_attention_heads)),
-                "decoder_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)),
-                "cross_attn_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)),
-            }
-
-            signature = inspect.signature(model.call)
-            if set(head_masking.keys()) < {*signature.parameters.keys()}:
-                continue
-
-            for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
-                out = model.generate(
-                    inputs_dict["input_ids"],
-                    num_beams=1,
-                    max_length=inputs_dict["input_ids"] + 5,
-                    output_attentions=True,
-                    return_dict_in_generate=True,
-                    **{name: mask},
-                )
-                # We check the state of decoder_attentions and cross_attentions just from the last step
-                attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
-                self.assertEqual(sum([tf.reduce_sum(w).numpy() for w in attn_weights]), 0.0)
-
    def test_load_with_mismatched_shapes(self):
        if not self.test_mismatched_shapes:
            return
@@ -1717,133 +1529,6 @@ class TFModelTesterMixin:
                model.compile(optimizer="sgd", run_eagerly=True)
                model.train_on_batch(test_batch, test_batch_labels)

-    def _test_xla_generate(self, **generate_kwargs):
-        def _generate_and_check_results(model, inputs, is_input_ids):
-            # make sure there are no pad tokens in prompt, which may trigger unwanted behavior
-            if is_input_ids:
-                if model.generation_config.pad_token_id is not None:
-                    if config.pad_token_id == 0:
-                        new_pad_token = model.generation_config.pad_token_id + 1
-                    else:
-                        new_pad_token = model.generation_config.pad_token_id - 1
-                else:
-                    new_pad_token = None
-                inputs = tf.where(inputs != model.generation_config.pad_token_id, inputs, new_pad_token)
-
-            generated = model.generate(inputs, **generate_kwargs).numpy()
-            generate_xla = tf.function(model.generate, jit_compile=True)
-            generated_xla = generate_xla(inputs, **generate_kwargs).numpy()
-
-            # Due to numerical instability, let's fail the test only if there are more than 10% of input sequences give
-            # different outputs between XLA and non-XLA versions. If there are less than 10 examples, let's be strict
-            # and not allow any difference.
-            diff = [[], []]
-            for _generated, _generated_xla in zip(generated.tolist(), generated_xla.tolist()):
-                if _generated != _generated_xla:
-                    diff[0].append(_generated)
-                    diff[1].append(_generated_xla)
-            ratio = len(diff[0]) / len(generated)
-            if ratio > 0.1 or (len(diff[0]) > 0 and len(generated) < 10):
-                self.assertListEqual(diff[0], diff[1])
-
-        for model_class in self.all_generative_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            config.eos_token_id = None  # Generate until max length
-            config.do_sample = False
-
-            # extract the input to the model
-            is_input_ids = "input_ids" in inputs_dict
-            is_input_features = "input_features" in inputs_dict
-            if not (is_input_ids or is_input_features):
-                raise ValueError("No valid generate input found in inputs_dict")
-            inputs = inputs_dict["input_ids"] if is_input_ids else inputs_dict["input_features"]
-
-            # fix config for models with additional sequence-length limiting settings
-            seq_len = inputs.get_shape()[1]
-            for var_name in ["max_position_embeddings", "max_target_positions"]:
-                attr = getattr(config, var_name, None)
-                if attr is not None and attr < seq_len + generate_kwargs["max_new_tokens"]:
-                    try:
-                        setattr(config, var_name, seq_len + generate_kwargs["max_new_tokens"])
-                    except NotImplementedError:
-                        # xlnet will raise an exception when trying to set
-                        # max_position_embeddings.
-                        pass
-
-            model = model_class(config)
-
-            if model.supports_xla_generation:
-                _generate_and_check_results(model, inputs, is_input_ids)
-            else:
-                with self.assertRaises(ValueError):
-                    _generate_and_check_results(model, inputs, is_input_ids)
-
-    def test_xla_generate_fast(self):
-        """
-        Basic quick test for generate-compatible classes that confirms that XLA-generated tokens are the same as their
-        non XLA counterparts.
-
-        Either the model supports XLA generation and passes the inner test, or it raises an appropriate exception
-        """
-        self._test_xla_generate(num_beams=1, num_return_sequences=1, max_new_tokens=3)
-
-    @slow
-    def test_xla_generate_contrastive(self):
-        """
-        Slow and challenging version of `test_xla_generate_fast` for contrastive search -- contrastive search directly
-        manipulates the model cache and other outputs, and this test ensures that they are in a valid format that is
-        also supported by XLA.
-
-        Either the model supports XLA generation and passes the inner test, or it raises an appropriate exception
-        """
-        self._test_xla_generate(num_beams=1, num_return_sequences=1, max_new_tokens=16, penalty_alpha=0.5, top_k=4)
-
-    @slow
-    def test_xla_generate_slow(self):
-        """
-        Slow and challenging version of `test_xla_generate_fast` -- this test asks for several long sequences using
-        beam search, with and without XLA. The two outputs should match, and a failure in this test indicates that the
-        model may need further analysis if it is to be used for XLA generation.
-
-        Either the model supports XLA generation and passes the inner test, or it raises an appropriate exception
-        """
-        self._test_xla_generate(num_beams=8, num_return_sequences=2, max_new_tokens=128)
-
-    def _generate_random_bad_tokens(self, num_bad_tokens, model):
-        # special tokens cannot be bad tokens
-        special_tokens = []
-        if model.config.bos_token_id is not None:
-            special_tokens.append(model.config.bos_token_id)
-        if model.config.pad_token_id is not None:
-            special_tokens.append(model.config.pad_token_id)
-        if model.config.eos_token_id is not None:
-            special_tokens.append(model.config.eos_token_id)
-
-        # create random bad tokens that are not special tokens
-        bad_tokens = []
-        while len(bad_tokens) < num_bad_tokens:
-            token = tf.squeeze(ids_tensor((1, 1), self.model_tester.vocab_size), 0).numpy()[0]
-            if token not in special_tokens:
-                bad_tokens.append(token)
-        return bad_tokens
-
-    def _check_generated_ids(self, output_ids):
-        for token_id in output_ids[0].numpy().tolist():
-            self.assertGreaterEqual(token_id, 0)
-            self.assertLess(token_id, self.model_tester.vocab_size)
-
-    def _check_match_tokens(self, generated_ids, bad_words_ids):
-        # for all bad word tokens
-        for bad_word_ids in bad_words_ids:
-            # for all slices in batch
-            for generated_ids_slice in generated_ids:
-                # for all word idx
-                for i in range(len(bad_word_ids), len(generated_ids_slice)):
-                    # if tokens match
-                    if generated_ids_slice[i - len(bad_word_ids) : i] == bad_word_ids:
-                        return True
-        return False
-

 def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
    """Creates a random int32 tensor of the shape within the vocab size."""