TF generate refactor - Greedy Search (#15562)

* TF generate start refactor * Add tf tests for sample generate * re-organize * boom boom * Apply suggestions from code review * re-add * add all code * make random greedy pass * make encoder-decoder random work * further improvements * delete bogus file * make gpt2 and t5 tests work * finish logits tests * correct logits processors * correct past / encoder_outputs drama * refactor some methods * another fix * refactor shape_list * fix more shape list * import shape_list * finish docs * fix imports * make style * correct tf utils * Fix TFRag as well * Apply Lysandre's and Sylvain's suggestions * Update tests/test_generation_tf_logits_process.py Co-authored-by: Matt <Rocketknight1@users.noreply.github.com> * Update src/transformers/tf_utils.py Co-authored-by: Matt <Rocketknight1@users.noreply.github.com> * remove cpu according to gante * correct logit processor Co-authored-by: Matt <Rocketknight1@users.noreply.github.com>
2022-02-15 17:54:43 +01:00
parent a3dbbc3467
commit 2e12b907ae
56 changed files with 1491 additions and 222 deletions
--- a/tests/test_generation_tf_logits_process.py
+++ b/tests/test_generation_tf_logits_process.py
@@ -0,0 +1,172 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+from transformers import is_tf_available
+from transformers.testing_utils import require_tf
+
+
+if is_tf_available():
+    import tensorflow as tf
+
+    from transformers.generation_tf_logits_process import (
+        TFLogitsProcessorList,
+        TFMinLengthLogitsProcessor,
+        TFNoBadWordsLogitsProcessor,
+        TFNoRepeatNGramLogitsProcessor,
+        TFRepetitionPenaltyLogitsProcessor,
+    )
+    from transformers.tf_utils import set_tensor_by_indices_to_value
+
+    from .test_modeling_tf_common import ids_tensor
+
+
+@require_tf
+class TFLogitsProcessorTest(unittest.TestCase):
+    def _get_uniform_logits(self, batch_size: int, length: int):
+        scores = tf.ones((batch_size, length), dtype=tf.float32) / length
+        return scores
+
+    def test_min_length_dist_processor(self):
+        vocab_size = 20
+        batch_size = 4
+        eos_token_id = 0
+
+        min_dist_processor = TFMinLengthLogitsProcessor(min_length=10, eos_token_id=eos_token_id)
+
+        # check that min length is applied at length 5
+        input_ids = ids_tensor((batch_size, 5), vocab_size=20)
+        scores = self._get_uniform_logits(batch_size, vocab_size)
+        scores_before_min_length = min_dist_processor(input_ids, scores)
+        self.assertListEqual(scores_before_min_length[:, eos_token_id].numpy().tolist(), 4 * [-float("inf")])
+
+        # check that min length is not applied anymore at length 15
+        input_ids = ids_tensor((batch_size, 15), vocab_size=20)
+        scores = self._get_uniform_logits(batch_size, vocab_size)
+        scores_before_min_length = min_dist_processor(input_ids, scores)
+        self.assertFalse(tf.math.reduce_any(tf.math.is_inf(scores_before_min_length)).numpy())
+
+    def test_repetition_penalty_dist_process(self):
+        input_ids = tf.constant([[0, 1], [5, 0]], dtype=tf.int32)
+        vocab_size = 10
+
+        scores = self._get_uniform_logits(batch_size=2, length=vocab_size)
+
+        mask = tf.cast(tf.constant([[1] + 9 * [0], 10 * [0]]), tf.bool)
+        scores = set_tensor_by_indices_to_value(scores, mask, -1 / vocab_size)
+        mask = tf.cast(tf.constant([10 * [0], 5 * [0] + [1] + 4 * [0]]), tf.bool)
+        scores = set_tensor_by_indices_to_value(scores, mask, 4 / vocab_size)
+
+        rep_penalty_proc = TFRepetitionPenaltyLogitsProcessor(penalty=2.0)
+
+        scores = rep_penalty_proc(input_ids, tf.identity(scores))
+
+        # check that values were correctly changed
+        self.assertAlmostEqual(scores[0, 0].numpy(), -(1 / vocab_size) * 2)
+        self.assertAlmostEqual(scores[0, 1].numpy(), (1 / vocab_size) / 2)
+
+        self.assertAlmostEqual(scores[1, 0].numpy(), (1 / vocab_size) / 2)
+        self.assertAlmostEqual(scores[1, 5].numpy(), (4 / vocab_size) / 2)
+
+    def test_no_repeat_ngram_dist_processor(self):
+        vocab_size = 3
+        batch_size = 2
+
+        input_ids = tf.constant([[1, 1, 2, 1], [0, 1, 0, 1]], dtype=tf.int32)
+        scores = self._get_uniform_logits(batch_size, vocab_size)
+
+        no_repeat_proc_2_gram = TFNoRepeatNGramLogitsProcessor(2)
+        no_repeat_proc_3_gram = TFNoRepeatNGramLogitsProcessor(3)
+
+        filtered_scores_2_gram = no_repeat_proc_2_gram(input_ids, tf.identity(scores))
+        filtered_scores_3_gram = no_repeat_proc_3_gram(input_ids, tf.identity(scores))
+
+        # 2-gram would forbid 2nd and 3rd token (1,2) at 1st batch and 1st token (0) at 2nd batch
+        self.assertListEqual(
+            tf.math.is_inf(filtered_scores_2_gram).numpy().tolist(), [[False, True, True], [True, False, False]]
+        )
+
+        # 3-gram would forbid no token at 1st batch and 1st token (0) at 2nd batch
+        self.assertListEqual(
+            tf.math.is_inf(filtered_scores_3_gram).numpy().tolist(), [[False, False, False], [True, False, False]]
+        )
+
+    def test_no_bad_words_dist_processor(self):
+        vocab_size = 5
+        batch_size = 2
+        eos_token_id = 4
+
+        input_ids = tf.constant([[0, 1, 3, 1], [0, 1, 0, 1]], dtype=tf.int32)
+        bad_word_tokens = [[1], [4], [1, 0], [0, 1, 2], [1, 3, 1, 3]]
+        scores = self._get_uniform_logits(batch_size, vocab_size)
+
+        no_bad_words_dist_proc = TFNoBadWordsLogitsProcessor(bad_words_ids=bad_word_tokens, eos_token_id=eos_token_id)
+
+        filtered_scores = no_bad_words_dist_proc(input_ids, tf.identity(scores))
+
+        # batch 1: 1st, 2nd, and 4th (0, 1, 3) token are forbidden
+        # batch 2: 1st, 2nd, and 3rd (0, 1, 2) token are forbidden
+        self.assertListEqual(
+            tf.math.is_inf(filtered_scores).numpy().tolist(),
+            [[True, True, False, True, True], [True, True, True, False, True]],
+        )
+
+    def test_processor_list(self):
+        batch_size = 4
+        sequence_length = 10
+        vocab_size = 15
+        eos_token_id = 0
+
+        # dummy input_ids and scores
+        input_ids = ids_tensor((batch_size, sequence_length), vocab_size)
+        input_ids_comp = tf.identity(input_ids)
+
+        scores = self._get_uniform_logits(batch_size, vocab_size)
+        scores_comp = tf.identity(scores)
+
+        # instantiate all dist processors
+        min_dist_proc = TFMinLengthLogitsProcessor(min_length=10, eos_token_id=eos_token_id)
+        rep_penalty_proc = TFRepetitionPenaltyLogitsProcessor(penalty=2.0)
+        no_repeat_proc = TFNoRepeatNGramLogitsProcessor(2)
+        no_bad_words_dist_proc = TFNoBadWordsLogitsProcessor(bad_words_ids=[[1]], eos_token_id=eos_token_id)
+
+        # no processor list
+        scores = min_dist_proc(input_ids, scores)
+        scores = rep_penalty_proc(input_ids, scores)
+        scores = no_repeat_proc(input_ids, scores)
+        scores = no_bad_words_dist_proc(input_ids, scores)
+
+        # with processor list
+        processor = TFLogitsProcessorList(
+            [
+                min_dist_proc,
+                rep_penalty_proc,
+                no_repeat_proc,
+                no_bad_words_dist_proc,
+            ]
+        )
+        scores_comp = processor(input_ids, scores_comp)
+
+        # remove inf
+        scores = set_tensor_by_indices_to_value(scores, tf.math.is_inf(scores), -1e9)
+        scores_comp = set_tensor_by_indices_to_value(scores_comp, tf.math.is_inf(scores_comp), -1e9)
+
+        # scores should be equal
+        tf.debugging.assert_near(scores, scores_comp, atol=1e-3)
+
+        # input_ids should never be changed
+        self.assertListEqual(input_ids.numpy().tolist(), input_ids_comp.numpy().tolist())
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -955,7 +955,7 @@ class TFModelTesterMixin:
                # Models with non-text inputs won't work here; num_return_sequences = 1
                self._check_generated_ids(model.generate(do_sample=True, max_length=5))

-            with self.assertRaises(AssertionError):
+            with self.assertRaises(ValueError):
                # generating multiple sequences when no beam search generation
                # is not allowed as it would always generate the same sequences
                model.generate(input_ids, do_sample=False, num_return_sequences=2)
--- a/tests/test_modeling_tf_gpt2.py
+++ b/tests/test_modeling_tf_gpt2.py
@@ -26,14 +26,15 @@ from .test_modeling_tf_core import TFCoreModelTesterMixin
 if is_tf_available():
    import tensorflow as tf

+    from transformers import GPT2Tokenizer
    from transformers.models.gpt2.modeling_tf_gpt2 import (
        TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST,
        TFGPT2DoubleHeadsModel,
        TFGPT2ForSequenceClassification,
        TFGPT2LMHeadModel,
        TFGPT2Model,
-        shape_list,
    )
+    from transformers.tf_utils import shape_list


 class TFGPT2ModelTester:
@@ -428,60 +429,53 @@ class TFGPT2ModelTest(TFModelTesterMixin, TFCoreModelTesterMixin, unittest.TestC
@require_tf
 class TFGPT2ModelLanguageGenerationTest(unittest.TestCase):
    @slow
-    def test_lm_generate_gpt2(self):
-        model = TFGPT2LMHeadModel.from_pretrained("gpt2")
-        input_ids = tf.convert_to_tensor([[464, 3290]], dtype=tf.int32)  # The dog
-        expected_output_ids = [
-            464,
-            3290,
-            373,
-            1043,
-            287,
-            257,
-            2214,
-            1474,
-            262,
-            16246,
-            286,
-            2688,
-            290,
-            2688,
-            27262,
-            13,
-            198,
-            198,
-            464,
-            3290,
-        ]  # The dog was found in a field near the intersection of West and West Streets.\n\nThe dog
+    def test_lm_generate_distilgpt2(self):
+        model = TFGPT2LMHeadModel.from_pretrained("distilgpt2")
+        input_ids = tf.convert_to_tensor([[464, 1893]], dtype=tf.int32)  # The president
+
+        # The president of the United States, and the president of the United Kingdom, have been in the White
+        # fmt: off
+        expected_output_ids = [464, 1893, 286, 262, 1578, 1829, 11, 290, 262, 1893, 286, 262, 1578, 7526, 11, 423, 587, 287, 262, 2635]
+        # fmt: on
+
        output_ids = model.generate(input_ids, do_sample=False)
        self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids)

    @slow
-    def test_lm_generate_distilgpt2(self):
+    def test_lm_generate_distilgpt2_batch_special(self):
        model = TFGPT2LMHeadModel.from_pretrained("distilgpt2")
-        input_ids = tf.convert_to_tensor([[464, 1893]], dtype=tf.int32)  # The president
-        expected_output_ids = [
-            464,
-            1893,
-            286,
-            262,
-            1578,
-            1829,
-            11,
-            290,
-            262,
-            1893,
-            286,
-            262,
-            1578,
-            7526,
-            11,
-            423,
-            587,
-            287,
-            262,
-            2635,
-        ]  # The president of the United States, and the president of the United Kingdom, have been in the White
+        tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")

+        tokenizer.pad_token = tokenizer.eos_token
+        tokenizer.padding_side = "left"
+
+        sentences = ["Today is a beautiful day and", "Yesterday was"]
+        input_ids = tokenizer(sentences, return_tensors="tf", padding=True).input_ids
+
+        generation_kwargs = {
+            "bad_words_ids": [tokenizer("is").input_ids, tokenizer("angry about").input_ids],
+            "no_repeat_ngram_size": 2,
+            "do_sample": False,
+            "repetition_penalty": 1.3,
+        }
+
+        output_ids = model.generate(input_ids, **generation_kwargs)
+
+        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+        expected_output_string = [
+            "Today is a beautiful day and I am so happy to be able take part in this amazing event.",
+            "Yesterday was a very busy day for the first time since I started writing this post",
+        ]
+        self.assertListEqual(output_strings, expected_output_string)
+
+    @slow
+    def test_lm_generate_gpt2(self):
+        model = TFGPT2LMHeadModel.from_pretrained("gpt2")
+        input_ids = tf.convert_to_tensor([[464, 3290]], dtype=tf.int32)  # The dog
+
+        # The dog was found in a field near the intersection of West and West Streets.\n\nThe dog
+        # fmt: off
+        expected_output_ids = [464, 3290, 373, 1043, 287, 257, 2214, 1474, 262, 16246, 286, 2688, 290, 2688, 27262, 13, 198, 198, 464, 3290]
+        # fmt: on
        output_ids = model.generate(input_ids, do_sample=False)
        self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids)
--- a/tests/test_modeling_tf_longformer.py
+++ b/tests/test_modeling_tf_longformer.py
@@ -36,14 +36,7 @@ if is_tf_available():
        TFLongformerModel,
        TFLongformerSelfAttention,
    )
-
-    def shape_list(x):
-        """
-        copied from transformers.modeling_tf_utils
-        """
-        static = x.shape.as_list()
-        dynamic = tf.shape(x)
-        return [dynamic[i] if s is None else s for i, s in enumerate(static)]
+    from transformers.tf_utils import shape_list


 class TFLongformerModelTester:
--- a/tests/test_modeling_tf_speech_to_text.py
+++ b/tests/test_modeling_tf_speech_to_text.py
@@ -474,7 +474,7 @@ class TFSpeech2TextModelTest(TFModelTesterMixin, unittest.TestCase):
                # num_return_sequences = 1
                self._check_generated_ids(model.generate(input_features, do_sample=True))

-            with self.assertRaises(AssertionError):
+            with self.assertRaises(ValueError):
                # generating multiple sequences when no beam search generation
                # is not allowed as it would always generate the same sequences
                model.generate(input_features, do_sample=False, num_return_sequences=2)
--- a/tests/test_modeling_tf_t5.py
+++ b/tests/test_modeling_tf_t5.py
@@ -453,6 +453,34 @@ class TFT5EncoderOnlyModelTest(TFModelTesterMixin, unittest.TestCase):
        pass


+@require_tf
+@require_sentencepiece
+@require_tokenizers
+class TFT5GenerationIntegrationTests(unittest.TestCase):
+    @slow
+    def test_greedy_generate(self):
+        model = TFT5ForConditionalGeneration.from_pretrained("t5-small")
+        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+
+        sentences = ["Yesterday, my name was", "Today is a beautiful day and"]
+        input_ids = tokenizer(sentences, return_tensors="tf", padding=True).input_ids
+
+        generation_kwargs = {
+            "bad_words_ids": [tokenizer("my").input_ids, tokenizer("ein schöner").input_ids],
+            "no_repeat_ngram_size": 3,
+            "do_sample": False,
+            "repetition_penalty": 2.2,
+        }
+
+        output_ids = model.generate(input_ids, **generation_kwargs)
+
+        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+
+        expected_output_string = ["Yesterday, my name was", "Heute ist ein schöne Tag und"]
+
+        self.assertListEqual(expected_output_string, output_strings)
+
+
@require_tf
@require_sentencepiece
@require_tokenizers