Merge branch 'master' into add_models_special_tokens_to_specific_configs

2020-03-05 17:24:42 -05:00
parent f5b50c6b8e b623ddc000
commit 146c521235
161 changed files with 7362 additions and 10497 deletions
--- a/tests/test_configuration_common.py
+++ b/tests/test_configuration_common.py
@@ -57,8 +57,18 @@ class ConfigTester(object):

        self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())

+    def create_and_test_config_with_num_labels(self):
+        config = self.config_class(**self.inputs_dict, num_labels=5)
+        self.parent.assertEqual(len(config.id2label), 5)
+        self.parent.assertEqual(len(config.label2id), 5)
+
+        config.num_labels = 3
+        self.parent.assertEqual(len(config.id2label), 3)
+        self.parent.assertEqual(len(config.label2id), 3)
+
    def run_common_tests(self):
        self.create_and_test_config_common_properties()
        self.create_and_test_config_to_json_string()
        self.create_and_test_config_to_json_file()
        self.create_and_test_config_from_and_save_pretrained()
+        self.create_and_test_config_with_num_labels()
--- a/tests/test_doc_samples.py
+++ b/tests/test_doc_samples.py
@@ -78,6 +78,7 @@ class TestCodeExamples(unittest.TestCase):

        for file in files:
            # Open all files
+            print("Testing", file, end=" ")
            with open(os.path.join(directory, file)) as f:
                # Retrieve examples
                examples = get_examples_from_file(f)
@@ -99,7 +100,7 @@ class TestCodeExamples(unittest.TestCase):
                            joined_examples.append(example)
                            joined_examples_index += 1

-                print("Testing", file, str(len(joined_examples)) + "/" + str(len(joined_examples)))
+                print(str(len(joined_examples)) + "/" + str(len(joined_examples)))

                # Execute sub tests with every example.
                for index, code_example in enumerate(joined_examples):
@@ -114,7 +115,8 @@ class TestCodeExamples(unittest.TestCase):

    def test_main_doc_examples(self):
        doc_directory = "docs/source"
-        self.analyze_directory(doc_directory)
+        ignore_files = ["favicon.ico"]
+        self.analyze_directory(doc_directory, ignore_files=ignore_files)

    def test_modeling_examples(self):
        transformers_directory = "src/transformers"
@@ -125,5 +127,7 @@ class TestCodeExamples(unittest.TestCase):
            "modeling_tf_auto.py",
            "modeling_utils.py",
            "modeling_tf_t5.py",
+            "modeling_bart.py",
+            "modeling_tf_utils.py",
        ]
        self.analyze_directory(transformers_directory, identifier=modeling_files, ignore_files=ignore_files)
--- a/tests/test_modeling_albert.py
+++ b/tests/test_modeling_albert.py
@@ -29,6 +29,7 @@ if is_torch_available():
        AlbertModel,
        AlbertForMaskedLM,
        AlbertForSequenceClassification,
+        AlbertForTokenClassification,
        AlbertForQuestionAnswering,
    )
    from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@@ -207,6 +208,25 @@ class AlbertModelTest(ModelTesterMixin, unittest.TestCase):
            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
            self.check_loss_output(result)

+        def create_and_check_albert_for_token_classification(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
+            config.num_labels = self.num_labels
+            model = AlbertForTokenClassification(config=config)
+            model.to(torch_device)
+            model.eval()
+            loss, logits = model(
+                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
+            )
+            result = {
+                "loss": loss,
+                "logits": logits,
+            }
+            self.parent.assertListEqual(
+                list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels]
+            )
+            self.check_loss_output(result)
+
        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -36,6 +36,7 @@ if is_torch_available():
        BertModel,
        BertConfig,
        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        top_k_top_p_filtering,
    )


@@ -68,7 +69,7 @@ class ModelTesterMixin:
            model.eval()
            with torch.no_grad():
                outputs = model(**inputs_dict)
-            out_2 = outputs[0].numpy()
+            out_2 = outputs[0].cpu().numpy()
            out_2[np.isnan(out_2)] = 0

            with tempfile.TemporaryDirectory() as tmpdirname:
@@ -263,7 +264,7 @@ class ModelTesterMixin:
            # Prepare head_mask
            # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
            head_mask = torch.ones(
-                self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device
+                self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device,
            )
            head_mask[0, 0] = 0
            head_mask[-1, :-1] = 0
@@ -303,7 +304,7 @@ class ModelTesterMixin:
            return

        for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            (config, inputs_dict,) = self.model_tester.prepare_config_and_inputs_for_common()

            if "head_mask" in inputs_dict:
                del inputs_dict["head_mask"]
@@ -313,7 +314,10 @@ class ModelTesterMixin:
            model = model_class(config=config)
            model.to(torch_device)
            model.eval()
-            heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
+            heads_to_prune = {
+                0: list(range(1, self.model_tester.num_attention_heads)),
+                -1: [0],
+            }
            model.prune_heads(heads_to_prune)
            with torch.no_grad():
                outputs = model(**inputs_dict)
@@ -329,7 +333,7 @@ class ModelTesterMixin:
            return

        for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            (config, inputs_dict,) = self.model_tester.prepare_config_and_inputs_for_common()

            if "head_mask" in inputs_dict:
                del inputs_dict["head_mask"]
@@ -339,7 +343,10 @@ class ModelTesterMixin:
            model = model_class(config=config)
            model.to(torch_device)
            model.eval()
-            heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
+            heads_to_prune = {
+                0: list(range(1, self.model_tester.num_attention_heads)),
+                -1: [0],
+            }
            model.prune_heads(heads_to_prune)

            with tempfile.TemporaryDirectory() as temp_dir_name:
@@ -359,7 +366,7 @@ class ModelTesterMixin:
            return

        for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            (config, inputs_dict,) = self.model_tester.prepare_config_and_inputs_for_common()

            if "head_mask" in inputs_dict:
                del inputs_dict["head_mask"]
@@ -367,7 +374,10 @@ class ModelTesterMixin:
            config.output_attentions = True
            config.output_hidden_states = False

-            heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
+            heads_to_prune = {
+                0: list(range(1, self.model_tester.num_attention_heads)),
+                -1: [0],
+            }
            config.pruned_heads = heads_to_prune

            model = model_class(config=config)
@@ -387,7 +397,7 @@ class ModelTesterMixin:
            return

        for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            (config, inputs_dict,) = self.model_tester.prepare_config_and_inputs_for_common()

            if "head_mask" in inputs_dict:
                del inputs_dict["head_mask"]
@@ -465,13 +475,14 @@ class ModelTesterMixin:
            )

    def test_resize_tokens_embeddings(self):
-        original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (original_config, inputs_dict,) = self.model_tester.prepare_config_and_inputs_for_common()
        if not self.test_resize_embeddings:
            return

        for model_class in self.all_model_classes:
            config = copy.deepcopy(original_config)
            model = model_class(config)
+            model.to(torch_device)

            model_vocab_size = config.vocab_size
            # Retrieve the embeddings and clone theme
@@ -515,6 +526,21 @@ class ModelTesterMixin:
            x = model.get_output_embeddings()
            self.assertTrue(x is None or isinstance(x, torch.nn.Linear))

+    def test_correct_missing_keys(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            base_model_prefix = model.base_model_prefix
+
+            if hasattr(model, base_model_prefix):
+                with tempfile.TemporaryDirectory() as temp_dir_name:
+                    model.base_model.save_pretrained(temp_dir_name)
+                    model, loading_info = model_class.from_pretrained(temp_dir_name, output_loading_info=True)
+
+                    with self.subTest(msg="Missing keys for {}".format(model.__class__.__name__)):
+                        self.assertGreater(len(loading_info["missing_keys"]), 0)
+
    def test_tie_model_weights(self):
        if not self.test_torchscript:
            return
@@ -620,10 +646,20 @@ class ModelTesterMixin:
                # batch_size = 1, num_beams > 1
                self._check_generated_tokens(model.generate(max_length=5, num_beams=3))

+            with self.assertRaises(AssertionError):
+                # generating multiple sequences with greedy (non-beam) generation
+                # is not allowed, as it would always generate the same sequence
+                model.generate(input_ids, do_sample=False, num_return_sequences=2)
+
+            with self.assertRaises(AssertionError):
+                # generating more sequences than there are beams is not possible
+                model.generate(input_ids, do_sample=False, num_return_sequences=3, num_beams=2)
+
            # batch_size > 1, sample
            self._check_generated_tokens(model.generate(input_ids, num_return_sequences=3))
            # batch_size > 1, greedy
-            self._check_generated_tokens(model.generate(input_ids, do_sample=False, num_return_sequences=3))
+            self._check_generated_tokens(model.generate(input_ids, do_sample=False))
+
            # batch_size > 1, num_beams > 1, sample
            self._check_generated_tokens(model.generate(input_ids, num_beams=3, num_return_sequences=3,))
            # batch_size > 1, num_beams > 1, greedy
@@ -694,3 +730,110 @@ class ModelUtilsTest(unittest.TestCase):
            self.assertEqual(model.config.output_attentions, True)
            self.assertEqual(model.config.output_hidden_states, True)
            self.assertEqual(model.config, config)
+
+
+@require_torch
+class UtilsFunctionsTest(unittest.TestCase):
+
+    # tests whether the top_k_top_p function behaves as expected
+    def test_top_k_top_p_filtering(self):
+        logits = torch.tensor(
+            [
+                [
+                    8.2220991,  # 3rd highest value; idx. 0
+                    -0.5620044,
+                    5.23229752,
+                    4.0386393,
+                    -6.8798378,
+                    -0.54785802,
+                    -3.2012153,
+                    2.92777176,
+                    1.88171953,
+                    7.35341276,  # 5th highest value; idx. 9
+                    8.43207833,  # 2nd highest value; idx. 10
+                    -9.85711836,
+                    -5.96209236,
+                    -1.13039161,
+                    -7.1115294,
+                    -0.8369633,
+                    -5.3186408,
+                    7.06427407,
+                    0.81369344,
+                    -0.82023817,
+                    -5.9179796,
+                    0.58813443,
+                    -6.99778438,
+                    4.71551189,
+                    -0.18771637,
+                    7.44020759,  # 4th highest value; idx. 25
+                    9.38450987,  # 1st highest value; idx. 26
+                    2.12662941,
+                    -9.32562038,
+                    2.35652522,
+                ],  # cumulative prob of 5 highest values <= 0.6
+                [
+                    0.58425518,
+                    4.53139238,
+                    -5.57510464,
+                    -6.28030699,
+                    -7.19529503,
+                    -4.02122551,
+                    1.39337037,
+                    -6.06707057,
+                    1.59480517,
+                    -9.643119,
+                    0.03907799,
+                    0.67231762,
+                    -8.88206726,
+                    6.27115922,  # 4th highest value; idx. 13
+                    2.28520723,
+                    4.82767506,
+                    4.30421368,
+                    8.8275313,  # 2nd highest value; idx. 17
+                    5.44029958,  # 5th highest value; idx. 18
+                    -4.4735794,
+                    7.38579536,  # 3rd highest value; idx. 20
+                    -2.91051663,
+                    2.61946077,
+                    -2.5674762,
+                    -9.48959302,
+                    -4.02922645,
+                    -1.35416918,
+                    9.67702323,  # 1st highest value; idx. 27
+                    -5.89478553,
+                    1.85370467,
+                ],  # cumulative prob of 5 highest values <= 0.6
+            ],
+            dtype=torch.float,
+            device=torch_device,
+        )
+
+        non_inf_expected_idx = torch.tensor(
+            [[0, 0], [0, 9], [0, 10], [0, 25], [0, 26], [1, 13], [1, 17], [1, 18], [1, 20], [1, 27]],
+            dtype=torch.long,
+            device=torch_device,
+        )  # expected non filtered idx as noted above
+
+        non_inf_expected_output = torch.tensor(
+            [
+                8.2221,
+                7.3534,
+                8.4321,
+                7.4402,
+                9.3845,
+                6.2712,
+                8.8275,
+                5.4403,
+                7.3858,
+                9.6770,
+            ],  # expected non filtered values as noted above
+            dtype=torch.float,
+            device=torch_device,
+        )
+
+        output = top_k_top_p_filtering(logits, top_k=10, top_p=0.6, min_tokens_to_keep=4)
+        non_inf_output = output[output != -float("inf")].to(device=torch_device)
+        non_inf_idx = (output != -float("inf")).nonzero().to(device=torch_device)
+
+        self.assertTrue(torch.allclose(non_inf_expected_output, non_inf_output, atol=1e-12))
+        self.assertTrue(torch.all(torch.eq(non_inf_expected_idx, non_inf_idx)))
--- a/tests/test_modeling_encoder_decoder.py
+++ b/tests/test_modeling_encoder_decoder.py
@@ -1,50 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Hugging Face Inc. Team
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import unittest
-
-from transformers import is_torch_available
-
-from .utils import require_torch, slow
-
-
-if is_torch_available():
-    from transformers import BertModel, BertForMaskedLM, Model2Model
-    from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@require_torch
-class EncoderDecoderModelTest(unittest.TestCase):
-    @slow
-    def test_model2model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
-        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = Model2Model.from_pretrained(model_name)
-            self.assertIsInstance(model.encoder, BertModel)
-            self.assertIsInstance(model.decoder, BertForMaskedLM)
-            self.assertEqual(model.decoder.config.is_decoder, True)
-            self.assertEqual(model.encoder.config.is_decoder, False)
-
-    def test_model2model_from_pretrained_not_bert(self):
-        logging.basicConfig(level=logging.INFO)
-        with self.assertRaises(ValueError):
-            _ = Model2Model.from_pretrained("roberta")
-
-        with self.assertRaises(ValueError):
-            _ = Model2Model.from_pretrained("distilbert")
-
-        with self.assertRaises(ValueError):
-            _ = Model2Model.from_pretrained("does-not-exist")
--- a/tests/test_modeling_gpt2.py
+++ b/tests/test_modeling_gpt2.py
@@ -170,6 +170,74 @@ class GPT2ModelTest(ModelTesterMixin, unittest.TestCase):
            )
            self.parent.assertEqual(len(result["presents"]), config.n_layer)

+        def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
+            model = GPT2Model(config=config)
+            model.to(torch_device)
+            model.eval()
+
+            # first forward pass
+            output, past = model(input_ids, token_type_ids=token_type_ids)
+
+            # create hypothetical next token and extend to next_input_ids
+            next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
+            next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size)
+
+            # append to next input_ids and token_type_ids
+            next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+            next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1)
+
+            output_from_no_past, _ = model(next_input_ids, token_type_ids=next_token_type_ids)
+            output_from_past, _ = model(next_tokens, token_type_ids=next_token_types, past=past)
+
+            # select random slice
+            random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+            output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
+            output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
+
+            # test that outputs are equal for slice
+            self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
+
+        def create_and_check_gpt2_model_attention_mask_past(
+            self, config, input_ids, input_mask, head_mask, token_type_ids, *args
+        ):
+            model = GPT2Model(config=config)
+            model.to(torch_device)
+            model.eval()
+
+            # create attention mask
+            attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
+            half_seq_length = self.seq_length // 2
+            attn_mask[:, half_seq_length:] = 0
+
+            # first forward pass
+            output, past = model(input_ids, attention_mask=attn_mask)
+
+            # create hypothetical next token and extend to next_input_ids
+            next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
+
+            # change a random masked slice from input_ids
+            random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
+            random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
+            input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens
+
+            # append to next input_ids and attn_mask
+            next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+            attn_mask = torch.cat(
+                [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], dim=1
+            )
+
+            # get two different outputs
+            output_from_no_past, _ = model(next_input_ids, attention_mask=attn_mask)
+            output_from_past, _ = model(next_tokens, past=past, attention_mask=attn_mask)
+
+            # select random slice
+            random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+            output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
+            output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
+
+            # test that outputs are equal for slice
+            self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
+
        def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = GPT2LMHeadModel(config)
            model.to(torch_device)
@@ -248,6 +316,14 @@ class GPT2ModelTest(ModelTesterMixin, unittest.TestCase):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_gpt2_model(*config_and_inputs)

+    def test_gpt2_model_past(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_gpt2_model_past(*config_and_inputs)
+
+    def test_gpt2_model_att_mask_past(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_gpt2_model_attention_mask_past(*config_and_inputs)
+
    def test_gpt2_lm_head_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_lm_head_model(*config_and_inputs)
@@ -299,30 +375,29 @@ class GPT2ModelLanguageGenerationTest(unittest.TestCase):
    @slow
    def test_lm_generate_distilgpt2(self):
        model = GPT2LMHeadModel.from_pretrained("distilgpt2")
-        input_ids = torch.Tensor([[464, 3290, 318, 13779]]).long()  # The dog is cute
+        input_ids = torch.Tensor([[464, 1893]]).long()  # The president
        expected_output_ids = [
            464,
-            3290,
-            318,
-            13779,
-            996,
-            339,
-            460,
-            3360,
-            655,
-            2513,
+            1893,
+            286,
+            262,
+            1578,
+            1829,
+            11,
+            290,
+            262,
+            1893,
+            286,
+            262,
+            1578,
+            7526,
+            11,
+            423,
+            587,
            287,
            262,
-            3952,
-            13,
-            632,
-            318,
-            407,
-            845,
-            3621,
-            284,
-        ]  # The dog is cute though he can sometimes just walk in the park. It is not very nice to
-        torch.manual_seed(0)
+            2635,
+        ]  # The president of the United States, and the president of the United Kingdom, have been in the White

-        output_ids = model.generate(input_ids)
+        output_ids = model.generate(input_ids, do_sample=False)
        self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
--- a/tests/test_modeling_roberta.py
+++ b/tests/test_modeling_roberta.py
@@ -329,10 +329,15 @@ class RobertaModelIntegrationTest(unittest.TestCase):
        expected_shape = torch.Size((1, 11, 50265))
        self.assertEqual(output.shape, expected_shape)
        # compare the actual values for a slice.
-        expected_slice = torch.Tensor(
-            [[[33.8843, -4.3107, 22.7779], [4.6533, -2.8099, 13.6252], [1.8222, -3.6898, 8.8600]]]
+        expected_slice = torch.tensor(
+            [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]]
        )
-        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))
+
+        # roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')
+        # roberta.eval()
+        # expected_slice = roberta.model.forward(input_ids)[0][:, :3, :3].detach()
+
+        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))

    @slow
    def test_inference_no_head(self):
@@ -341,10 +346,15 @@ class RobertaModelIntegrationTest(unittest.TestCase):
        input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
        output = model(input_ids)[0]
        # compare the actual values for a slice.
-        expected_slice = torch.Tensor(
-            [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0539, -0.0174], [0.0548, 0.0799, 0.1687]]]
+        expected_slice = torch.tensor(
+            [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0540, -0.0175], [0.0548, 0.0799, 0.1687]]]
        )
-        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))
+
+        # roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')
+        # roberta.eval()
+        # expected_slice = roberta.extract_features(input_ids)[:, :3, :3].detach()
+
+        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))

    @slow
    def test_inference_classification_head(self):
@@ -354,5 +364,10 @@ class RobertaModelIntegrationTest(unittest.TestCase):
        output = model(input_ids)[0]
        expected_shape = torch.Size((1, 3))
        self.assertEqual(output.shape, expected_shape)
-        expected_tensor = torch.Tensor([[-0.9469, 0.3913, 0.5118]])
-        self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-3))
+        expected_tensor = torch.tensor([[-0.9469, 0.3913, 0.5118]])
+
+        # roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
+        # roberta.eval()
+        # expected_tensor = roberta.predict("mnli", input_ids, return_logits=True).detach()
+
+        self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4))
--- a/tests/test_modeling_t5.py
+++ b/tests/test_modeling_t5.py
@@ -20,7 +20,7 @@ from transformers import is_torch_available

 from .test_configuration_common import ConfigTester
 from .test_modeling_common import ModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_torch, slow
+from .utils import CACHE_DIR, require_torch, slow, torch_device


 if is_torch_available():
@@ -125,6 +125,7 @@ class T5ModelTest(ModelTesterMixin, unittest.TestCase):
            decoder_lm_labels,
        ):
            model = T5Model(config=config)
+            model.to(torch_device)
            model.eval()
            decoder_output, encoder_output = model(
                encoder_input_ids=encoder_input_ids,
@@ -157,6 +158,7 @@ class T5ModelTest(ModelTesterMixin, unittest.TestCase):
            decoder_lm_labels,
        ):
            model = T5WithLMHeadModel(config=config)
+            model.to(torch_device)
            model.eval()
            outputs = model(
                encoder_input_ids=encoder_input_ids,
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -18,17 +18,32 @@ import copy
 import os
 import random
 import tempfile
+import unittest

 from transformers import is_tf_available, is_torch_available

-from .utils import require_tf
+from .utils import _tf_gpu_memory_limit, require_tf


 if is_tf_available():
    import tensorflow as tf
    import numpy as np

-    # from transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+    from transformers import tf_top_k_top_p_filtering
+
+    if _tf_gpu_memory_limit is not None:
+        gpus = tf.config.list_physical_devices("GPU")
+        for gpu in gpus:
+            # Restrict TensorFlow to only allocate x GB of memory on the GPUs
+            try:
+                tf.config.experimental.set_virtual_device_configuration(
+                    gpu, [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)]
+                )
+                logical_gpus = tf.config.experimental.list_logical_devices("GPU")
+                print("Logical GPUs", logical_gpus)
+            except RuntimeError as e:
+                # Virtual devices must be set before GPUs have been initialized
+                print(e)


 def _config_zero_init(config):
@@ -44,6 +59,7 @@ class TFModelTesterMixin:

    model_tester = None
    all_model_classes = ()
+    all_generative_model_classes = ()
    test_torchscript = True
    test_pruning = True
    test_resize_embeddings = True
@@ -204,7 +220,7 @@ class TFModelTesterMixin:
            outputs_dict = model(inputs_dict)

            inputs_keywords = copy.deepcopy(inputs_dict)
-            input_ids = inputs_keywords.pop("input_ids" if not self.is_encoder_decoder else "decoder_input_ids", None)
+            input_ids = inputs_keywords.pop("input_ids" if not self.is_encoder_decoder else "decoder_input_ids", None,)
            outputs_keywords = model(input_ids, **inputs_keywords)

            output_dict = outputs_dict[0].numpy()
@@ -287,7 +303,7 @@ class TFModelTesterMixin:
            self.assertEqual(model.config.output_hidden_states, True)
            self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
            self.assertListEqual(
-                list(hidden_states[0].shape[-2:]), [self.model_tester.seq_length, self.model_tester.hidden_size]
+                list(hidden_states[0].shape[-2:]), [self.model_tester.seq_length, self.model_tester.hidden_size],
            )

    def test_model_common_attributes(self):
@@ -304,7 +320,10 @@ class TFModelTesterMixin:

        for model_class in self.all_model_classes:
            model = model_class(config)
-            first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0]
+            first, second = (
+                model(inputs_dict, training=False)[0],
+                model(inputs_dict, training=False)[0],
+            )
            out_1 = first.numpy()
            out_2 = second.numpy()
            out_1 = out_1[~np.isnan(out_1)]
@@ -326,9 +345,9 @@ class TFModelTesterMixin:
                    x = wte([input_ids, None, None, None], mode="embedding")
                except Exception:
                    if hasattr(self.model_tester, "embedding_size"):
-                        x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
+                        x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32,)
                    else:
-                        x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
+                        x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32,)
        return x

    def test_inputs_embeds(self):
@@ -354,6 +373,55 @@ class TFModelTesterMixin:

            model(inputs_dict)

+    def test_lm_head_model_random_generate(self):
+        """Run generate() on each generative model class and check error cases and generated token ids."""
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        input_ids = inputs_dict.get(
+            "input_ids", None
+        )  # TODO (PVP): ugly workaround to make code work for t5 for the moment - has to be changed when t5 is fixed.
+
+        for model_class in self.all_generative_model_classes:
+            model = model_class(config)
+
+            if config.bos_token_id is None:
+                # without a bos_token_id, calling generate() without input_ids must fail
+                with self.assertRaises(AssertionError):
+                    model.generate(max_length=5)
+                # batch_size = 1
+                self._check_generated_tokens(model.generate(input_ids))
+                # batch_size = 1, num_beams > 1
+                self._check_generated_tokens(model.generate(input_ids, num_beams=3))
+            else:
+                # batch_size = 1
+                self._check_generated_tokens(model.generate(max_length=5))
+                # batch_size = 1, num_beams > 1
+                self._check_generated_tokens(model.generate(max_length=5, num_beams=3))
+
+            with self.assertRaises(AssertionError):
+                # generating multiple sequences with greedy (non-sampling, no beam) generation
+                # is not allowed as it would always generate the same sequences
+                model.generate(input_ids, do_sample=False, num_return_sequences=2)
+
+            with self.assertRaises(AssertionError):
+                # generating more sequences than there are beams is not possible
+                model.generate(input_ids, do_sample=False, num_return_sequences=3, num_beams=2)
+
+            # batch_size > 1, sample
+            self._check_generated_tokens(model.generate(input_ids, num_return_sequences=3))
+            # batch_size > 1, greedy
+            self._check_generated_tokens(model.generate(input_ids, do_sample=False))
+
+            # batch_size > 1, num_beams > 1, sample
+            self._check_generated_tokens(model.generate(input_ids, num_beams=3, num_return_sequences=3,))
+            # batch_size > 1, num_beams > 1, greedy
+            self._check_generated_tokens(
+                model.generate(input_ids, do_sample=False, num_beams=3, num_return_sequences=3)
+            )
+
+    def _check_generated_tokens(self, output_ids):  # asserts 0 <= id < model_tester.vocab_size
+        for token_id in output_ids[0].numpy().tolist():  # NOTE: only row 0 of output_ids is inspected
+            self.assertGreaterEqual(token_id, 0)
+            self.assertLess(token_id, self.model_tester.vocab_size)
+

 def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
    """Creates a random int32 tensor of the shape within the vocab size."""
@@ -371,3 +439,98 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
    output = tf.constant(values, shape=shape, dtype=dtype if dtype is not None else tf.int32)

    return output
+
+
+@require_tf
+class UtilsFunctionsTest(unittest.TestCase):
+    """Tests for utility functions (currently only tf_top_k_top_p_filtering)."""
+    # tests whether the top_k_top_p_filtering function behaves as expected
+    def test_top_k_top_p_filtering(self):
+        logits = tf.convert_to_tensor(
+            [
+                [
+                    8.2220991,  # 3rd highest value; idx. 0
+                    -0.5620044,
+                    5.23229752,
+                    4.0386393,
+                    -6.8798378,
+                    -0.54785802,
+                    -3.2012153,
+                    2.92777176,
+                    1.88171953,
+                    7.35341276,  # 5th highest value; idx. 9
+                    8.43207833,  # 2nd highest value; idx. 10
+                    -9.85711836,
+                    -5.96209236,
+                    -1.13039161,
+                    -7.1115294,
+                    -0.8369633,
+                    -5.3186408,
+                    7.06427407,
+                    0.81369344,
+                    -0.82023817,
+                    -5.9179796,
+                    0.58813443,
+                    -6.99778438,
+                    4.71551189,
+                    -0.18771637,
+                    7.44020759,  # 4th highest value; idx. 25
+                    9.38450987,  # 1st highest value; idx. 26
+                    2.12662941,
+                    -9.32562038,
+                    2.35652522,
+                ],  # cumulative prob of 5 highest values <= 0.6
+                [
+                    0.58425518,
+                    4.53139238,
+                    -5.57510464,
+                    -6.28030699,
+                    -7.19529503,
+                    -4.02122551,
+                    1.39337037,
+                    -6.06707057,
+                    1.59480517,
+                    -9.643119,
+                    0.03907799,
+                    0.67231762,
+                    -8.88206726,
+                    6.27115922,  # 4th highest value; idx. 13
+                    2.28520723,
+                    4.82767506,
+                    4.30421368,
+                    8.8275313,  # 2nd highest value; idx. 17
+                    5.44029958,  # 5th highest value; idx. 18
+                    -4.4735794,
+                    7.38579536,  # 3rd highest value; idx. 20
+                    -2.91051663,
+                    2.61946077,
+                    -2.5674762,
+                    -9.48959302,
+                    -4.02922645,
+                    -1.35416918,
+                    9.67702323,  # 1st highest value; idx. 27
+                    -5.89478553,
+                    1.85370467,
+                ],  # cumulative prob of 5 highest values <= 0.6
+            ],
+            dtype=tf.float32,
+        )
+
+        non_inf_expected_idx = tf.convert_to_tensor(
+            [[0, 0], [0, 9], [0, 10], [0, 25], [0, 26], [1, 13], [1, 17], [1, 18], [1, 20], [1, 27]], dtype=tf.int32,
+        )  # expected non filtered idx as noted above
+
+        non_inf_expected_output = tf.convert_to_tensor(
+            [8.222099, 7.3534126, 8.432078, 7.4402075, 9.38451, 6.271159, 8.827531, 5.4402995, 7.3857956, 9.677023],
+            dtype=tf.float32,
+        )  # expected non filtered values as noted above
+
+        output = tf_top_k_top_p_filtering(logits, top_k=10, top_p=0.6, min_tokens_to_keep=4)
+
+        # entries that are not -inf are the ones that survived the filtering
+        non_inf_output = output[output != -float("inf")]
+        non_inf_idx = tf.cast(
+            tf.where(tf.not_equal(output, tf.constant(-float("inf"), dtype=tf.float32))), dtype=tf.int32,
+        )
+
+        tf.debugging.assert_near(non_inf_output, non_inf_expected_output, rtol=1e-12)
+        tf.debugging.assert_equal(non_inf_idx, non_inf_expected_idx)
--- a/tests/test_modeling_tf_ctrl.py
+++ b/tests/test_modeling_tf_ctrl.py
@@ -31,6 +31,7 @@ if is_tf_available():
 class TFCTRLModelTest(TFModelTesterMixin, unittest.TestCase):

    all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel) if is_tf_available() else ()
+    all_generative_model_classes = (TFCTRLLMHeadModel,) if is_tf_available() else ()

    class TFCTRLModelTester(object):
        def __init__(
--- a/tests/test_modeling_tf_gpt2.py
+++ b/tests/test_modeling_tf_gpt2.py
@@ -30,6 +30,7 @@ if is_tf_available():
        TFGPT2LMHeadModel,
        TFGPT2DoubleHeadsModel,
        TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+        shape_list,
    )


@@ -37,7 +38,7 @@ if is_tf_available():
 class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):

    all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel) if is_tf_available() else ()
-    # all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel) if is_tf_available() else ()
+    all_generative_model_classes = (TFGPT2LMHeadModel,) if is_tf_available() else ()

    class TFGPT2ModelTester(object):
        def __init__(
@@ -89,6 +90,8 @@ class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope
+            self.bos_token_id = vocab_size - 1
+            self.eos_token_id = vocab_size - 1

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
@@ -123,9 +126,11 @@ class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):
                # hidden_dropout_prob=self.hidden_dropout_prob,
                # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                n_positions=self.max_position_embeddings,
-                n_ctx=self.max_position_embeddings
+                n_ctx=self.max_position_embeddings,
                # type_vocab_size=self.type_vocab_size,
                # initializer_range=self.initializer_range
+                bos_token_id=self.bos_token_id,
+                eos_token_ids=self.eos_token_id,
            )

            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
@@ -144,7 +149,11 @@ class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):

        def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = TFGPT2Model(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+            inputs = {
+                "input_ids": input_ids,
+                "attention_mask": input_mask,
+                "token_type_ids": token_type_ids,
+            }
            sequence_output = model(inputs)[0]

            inputs = [input_ids, None, input_mask]  # None is the input for 'past'
@@ -156,18 +165,89 @@ class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):
                "sequence_output": sequence_output.numpy(),
            }
            self.parent.assertListEqual(
-                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
+                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size],
            )

+        def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
+            model = TFGPT2Model(config=config)
+
+            # first forward pass; `past` is fed back into the model for the single-token call below
+            output, past = model(input_ids, token_type_ids=token_type_ids)
+
+            # create hypothetical next token and extend to next_input_ids
+            next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
+            next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size)
+
+            # append to next input_ids and token_type_ids
+            next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
+            next_token_type_ids = tf.concat([token_type_ids, next_token_types], axis=-1)
+
+            # full sequence without `past` vs. only the new token together with `past`
+            output_from_no_past, _ = model(next_input_ids, token_type_ids=next_token_type_ids)
+            output_from_past, _ = model(next_tokens, token_type_ids=next_token_types, past=past)
+
+            # select a random index along the last output dimension
+            random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1]))
+            output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
+            output_from_past_slice = output_from_past[:, 0, random_slice_idx]
+
+            # test that outputs are equal for slice
+            tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-12)
+
+        def create_and_check_gpt2_model_attention_mask_past(
+            self, config, input_ids, input_mask, head_mask, token_type_ids, *args
+        ):
+            model = TFGPT2Model(config=config)
+
+            # create attention mask: ones for the first half of the sequence, zeros for the rest
+            half_seq_length = self.seq_length // 2
+            attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32)
+            attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32)
+            attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1)
+
+            # first forward pass; `past` is fed back into the model for the single-token call below
+            output, past = model(input_ids, attention_mask=attn_mask)
+
+            # create hypothetical next token and extend to next_input_ids
+            next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
+
+            # change a random masked slice from input_ids (the outputs compared below are expected to stay equal)
+            random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1
+            random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size)
+            vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change)
+            condition = tf.transpose(
+                tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size))
+            )
+            input_ids = tf.where(condition, random_other_next_tokens, input_ids)
+
+            # append to next input_ids and attn_mask
+            next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
+            attn_mask = tf.concat([attn_mask, tf.ones((shape_list(attn_mask)[0], 1), dtype=tf.int32)], axis=1)
+
+            # get two different outputs
+            output_from_no_past, _ = model(next_input_ids, attention_mask=attn_mask)
+            output_from_past, _ = model(next_tokens, past=past, attention_mask=attn_mask)
+
+            # select a random index along the last output dimension
+            random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1]))
+            output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
+            output_from_past_slice = output_from_past[:, 0, random_slice_idx]
+
+            # test that outputs are equal for slice
+            tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-12)
+
        def create_and_check_gpt2_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = TFGPT2LMHeadModel(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+            inputs = {
+                "input_ids": input_ids,
+                "attention_mask": input_mask,
+                "token_type_ids": token_type_ids,
+            }
            prediction_scores = model(inputs)[0]
            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
-                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
+                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size],
            )

        def create_and_check_gpt2_double_head(
@@ -188,7 +268,7 @@ class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):
            lm_logits, mc_logits = model(inputs)[:2]
            result = {"lm_logits": lm_logits.numpy(), "mc_logits": mc_logits.numpy()}
            self.parent.assertListEqual(
-                list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size]
+                list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size],
            )
            self.parent.assertListEqual(list(result["mc_logits"].shape), [self.batch_size, self.num_choices])

@@ -207,7 +287,11 @@ class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):
                choice_labels,
            ) = config_and_inputs

-            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
+            inputs_dict = {
+                "input_ids": input_ids,
+                "token_type_ids": token_type_ids,
+                "attention_mask": input_mask,
+            }
            return config, inputs_dict

    def setUp(self):
@@ -221,6 +305,14 @@ class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_gpt2_model(*config_and_inputs)

+    def test_gpt2_model_past(self):
+        tester = self.model_tester  # fresh config/inputs are forwarded straight to the `past` check
+        tester.create_and_check_gpt2_model_past(*tester.prepare_config_and_inputs())
+
+    def test_gpt2_model_att_mask_past(self):
+        tester = self.model_tester  # fresh config/inputs are forwarded to the attention-mask + `past` check
+        tester.create_and_check_gpt2_model_attention_mask_past(*tester.prepare_config_and_inputs())
+
    def test_gpt2_lm_head(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_gpt2_lm_head(*config_and_inputs)
@@ -234,3 +326,48 @@ class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):
        for model_name in list(TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = TFGPT2Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
            self.assertIsNotNone(model)
+
+
+def prepare_generation_special_tokens():
+    return dict(bos_token_id=50256, eos_token_id=50256)  # same id is used for both bos and eos
+
+
+class TFGPT2ModelLanguageGenerationTest(unittest.TestCase):
+    """Slow test comparing non-sampling generation of pretrained distilgpt2 against fixed token ids."""
+    special_tokens = prepare_generation_special_tokens()
+
+    @slow
+    def test_lm_generate_distilgpt2(self):
+        model = TFGPT2LMHeadModel.from_pretrained("distilgpt2")
+        input_ids = tf.convert_to_tensor([[464, 1893]], dtype=tf.int32)  # The president
+        expected_output_ids = [
+            464,
+            1893,
+            286,
+            262,
+            1578,
+            1829,
+            11,
+            290,
+            262,
+            1893,
+            286,
+            262,
+            1578,
+            7526,
+            11,
+            423,
+            587,
+            287,
+            262,
+            2635,
+        ]  # The president of the United States, and the president of the United Kingdom, have been in the White
+
+        # do_sample=False: the exact id comparison below relies on the output not being sampled
+        output_ids = model.generate(
+            input_ids,
+            do_sample=False,
+            bos_token_id=self.special_tokens["bos_token_id"],
+            eos_token_ids=self.special_tokens["eos_token_id"],
+        )
+        # only the first returned sequence is compared
+        self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids)
--- a/tests/test_modeling_tf_openai_gpt.py
+++ b/tests/test_modeling_tf_openai_gpt.py
@@ -39,6 +39,9 @@ class TFOpenAIGPTModelTest(TFModelTesterMixin, unittest.TestCase):
    all_model_classes = (
        (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel) if is_tf_available() else ()
    )
+    all_generative_model_classes = (
+        (TFOpenAIGPTLMHeadModel,) if is_tf_available() else ()
+    )  # TODO (PVP): Add Double HeadsModel when generate() function is changed accordingly

    class TFOpenAIGPTModelTester(object):
        def __init__(
--- a/tests/test_modeling_tf_roberta.py
+++ b/tests/test_modeling_tf_roberta.py
@@ -222,9 +222,9 @@ class TFRobertaModelIntegrationTest(unittest.TestCase):
        self.assertEqual(list(output.numpy().shape), expected_shape)
        # compare the actual values for a slice.
        expected_slice = tf.constant(
-            [[[33.8843, -4.3107, 22.7779], [4.6533, -2.8099, 13.6252], [1.8222, -3.6898, 8.8600]]]
+            [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]]
        )
-        self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3))
+        self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4))

    @slow
    def test_inference_no_head(self):
@@ -234,9 +234,9 @@ class TFRobertaModelIntegrationTest(unittest.TestCase):
        output = model(input_ids)[0]
        # compare the actual values for a slice.
        expected_slice = tf.constant(
-            [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0539, -0.0174], [0.0548, 0.0799, 0.1687]]]
+            [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0540, -0.0175], [0.0548, 0.0799, 0.1687]]]
        )
-        self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3))
+        self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4))

    @slow
    def test_inference_classification_head(self):
@@ -247,4 +247,4 @@ class TFRobertaModelIntegrationTest(unittest.TestCase):
        expected_shape = [1, 3]
        self.assertEqual(list(output.numpy().shape), expected_shape)
        expected_tensor = tf.constant([[-0.9469, 0.3913, 0.5118]])
-        self.assertTrue(numpy.allclose(output.numpy(), expected_tensor.numpy(), atol=1e-3))
+        self.assertTrue(numpy.allclose(output.numpy(), expected_tensor.numpy(), atol=1e-4))
--- a/tests/test_modeling_tf_transfo_xl.py
+++ b/tests/test_modeling_tf_transfo_xl.py
@@ -37,6 +37,8 @@ if is_tf_available():
 class TFTransfoXLModelTest(TFModelTesterMixin, unittest.TestCase):

    all_model_classes = (TFTransfoXLModel, TFTransfoXLLMHeadModel) if is_tf_available() else ()
+    all_generative_model_classes = () if is_tf_available() else ()
+    # TODO: add this test when TFTransfoXLLMHead has a linear output layer implemented
    test_pruning = False
    test_torchscript = False
    test_resize_embeddings = False
@@ -62,6 +64,7 @@ class TFTransfoXLModelTest(TFModelTesterMixin, unittest.TestCase):
            num_hidden_layers=5,
            scope=None,
            seed=1,
+            eos_token_id=0,
        ):
            self.parent = parent
            self.batch_size = batch_size
@@ -82,6 +85,7 @@ class TFTransfoXLModelTest(TFModelTesterMixin, unittest.TestCase):
            self.num_hidden_layers = num_hidden_layers
            self.scope = scope
            self.seed = seed
+            self.eos_token_id = eos_token_id

        def prepare_config_and_inputs(self):
            input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
@@ -103,6 +107,7 @@ class TFTransfoXLModelTest(TFModelTesterMixin, unittest.TestCase):
                d_inner=self.d_inner,
                div_val=self.div_val,
                n_layer=self.num_hidden_layers,
+                eos_token_ids=self.eos_token_id,
            )

            return (config, input_ids_1, input_ids_2, lm_labels)
--- a/tests/test_modeling_tf_xlm.py
+++ b/tests/test_modeling_tf_xlm.py
@@ -43,6 +43,9 @@ class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
        if is_tf_available()
        else ()
    )
+    all_generative_model_classes = (
+        (TFXLMWithLMHeadModel,) if is_tf_available() else ()
+    )  # TODO (PVP): Check other models whether language generation is also applicable

    class TFXLMModelTester(object):
        def __init__(
@@ -75,6 +78,7 @@ class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
            summary_type="last",
            use_proj=True,
            scope=None,
+            bos_token_id=0,
        ):
            self.parent = parent
            self.batch_size = batch_size
@@ -105,6 +109,7 @@ class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope
+            self.bos_token_id = bos_token_id

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
@@ -145,6 +150,7 @@ class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
                initializer_range=self.initializer_range,
                summary_type=self.summary_type,
                use_proj=self.use_proj,
+                bos_token_id=self.bos_token_id,
            )

            return (
--- a/tests/test_modeling_tf_xlnet.py
+++ b/tests/test_modeling_tf_xlnet.py
@@ -51,6 +51,9 @@ class TFXLNetModelTest(TFModelTesterMixin, unittest.TestCase):
        if is_tf_available()
        else ()
    )
+    all_generative_model_classes = (
+        (TFXLNetLMHeadModel,) if is_tf_available() else ()
+    )  # TODO (PVP): Check other models whether language generation is also applicable
    test_pruning = False

    class TFXLNetModelTester(object):
@@ -77,6 +80,9 @@ class TFXLNetModelTest(TFModelTesterMixin, unittest.TestCase):
            initializer_range=0.05,
            seed=1,
            type_vocab_size=2,
+            bos_token_id=1,
+            eos_token_id=2,
+            pad_token_id=5,
        ):
            self.parent = parent
            self.batch_size = batch_size
@@ -100,6 +106,9 @@ class TFXLNetModelTest(TFModelTesterMixin, unittest.TestCase):
            self.seed = seed
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
+            self.bos_token_id = bos_token_id
+            self.pad_token_id = pad_token_id
+            self.eos_token_id = eos_token_id

        def prepare_config_and_inputs(self):
            input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
@@ -139,6 +148,9 @@ class TFXLNetModelTest(TFModelTesterMixin, unittest.TestCase):
                bi_data=self.bi_data,
                initializer_range=self.initializer_range,
                num_labels=self.type_sequence_label_size,
+                bos_token_id=self.bos_token_id,
+                pad_token_id=self.pad_token_id,
+                eos_token_id=self.eos_token_id,
            )

            return (
--- a/tests/test_modeling_xlm_roberta.py
+++ b/tests/test_modeling_xlm_roberta.py
@@ -0,0 +1,66 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+from transformers import is_torch_available
+
+from .utils import slow
+
+
+if is_torch_available():
+    import torch
+    from transformers import XLMRobertaModel
+
+
+class XLMRobertaModelIntegrationTest(unittest.TestCase):
+    @slow
+    def test_xlm_roberta_base(self):
+        model = XLMRobertaModel.from_pretrained("xlm-roberta-base")
+        input_ids = torch.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]])
+        # The dog is cute and lives in the garden house
+
+        expected_output_shape = torch.Size((1, 12, 768))  # batch_size, sequence_length, embedding_vector_dim
+        expected_output_values_last_dim = torch.tensor(
+            [[-0.0101, 0.1218, -0.0803, 0.0801, 0.1327, 0.0776, -0.1215, 0.2383, 0.3338, 0.3106, 0.0300, 0.0252]]
+        )
+        #  Presumably the recipe used to obtain the expected values above (fairseq reference model):
+        #  xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.base'); xlmr.eval()
+        #  expected_output_values_last_dim = xlmr.extract_features(input_ids[0])[:, :, -1]
+
+        output = model(input_ids)[0].detach()
+        self.assertEqual(output.shape, expected_output_shape)
+        # compare the actual values for a slice of last dim
+        self.assertTrue(torch.allclose(output[:, :, -1], expected_output_values_last_dim, atol=1e-3))
+
+    @slow
+    def test_xlm_roberta_large(self):
+        model = XLMRobertaModel.from_pretrained("xlm-roberta-large")
+        input_ids = torch.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]])
+        # The dog is cute and lives in the garden house
+
+        expected_output_shape = torch.Size((1, 12, 1024))  # batch_size, sequence_length, embedding_vector_dim
+        expected_output_values_last_dim = torch.tensor(
+            [[-0.0699, -0.0318, 0.0705, -0.1241, 0.0999, -0.0520, 0.1004, -0.1838, -0.4704, 0.1437, 0.0821, 0.0126]]
+        )
+        #  Presumably the recipe used to obtain the expected values above (fairseq reference model):
+        #  xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.large'); xlmr.eval()
+        #  expected_output_values_last_dim = xlmr.extract_features(input_ids[0])[:, :, -1]
+
+        output = model(input_ids)[0].detach()
+        self.assertEqual(output.shape, expected_output_shape)
+        # compare the actual values for a slice of last dim
+        self.assertTrue(torch.allclose(output[:, :, -1], expected_output_values_last_dim, atol=1e-3))
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -2,9 +2,16 @@ import unittest
 from typing import Iterable, List, Optional

 from transformers import pipeline
-from transformers.pipelines import Pipeline
+from transformers.pipelines import (
+    FeatureExtractionPipeline,
+    FillMaskPipeline,
+    NerPipeline,
+    Pipeline,
+    QuestionAnsweringPipeline,
+    TextClassificationPipeline,
+)

-from .utils import require_tf, require_torch
+from .utils import require_tf, require_torch, slow


 QA_FINETUNED_MODELS = [
@@ -304,3 +311,30 @@ class MultiColumnInputTestCase(unittest.TestCase):
        for tokenizer, model, config in TF_QA_FINETUNED_MODELS:
            nlp = pipeline(task="question-answering", model=model, config=config, tokenizer=tokenizer, framework="tf")
            self._test_multicolumn_pipeline(nlp, valid_samples, invalid_samples, mandatory_output_keys)
+
+
+class PipelineCommonTests(unittest.TestCase):
+    """Checks that every listed pipeline class can be built with only a `framework` argument."""
+    pipelines = (
+        NerPipeline,
+        FeatureExtractionPipeline,
+        QuestionAnsweringPipeline,
+        FillMaskPipeline,
+        TextClassificationPipeline,
+    )
+
+    @slow
+    @require_tf
+    def test_tf_defaults(self):
+        # Test that pipelines can be correctly loaded without any argument besides the framework
+        for default_pipeline in self.pipelines:
+            with self.subTest(msg="Testing TF defaults with TensorFlow and {}".format(default_pipeline.task)):
+                default_pipeline(framework="tf")
+
+    @slow
+    @require_torch
+    def test_pt_defaults(self):
+        # Test that pipelines can be correctly loaded without any argument besides the framework
+        for default_pipeline in self.pipelines:
+            with self.subTest(msg="Testing Torch defaults with PyTorch and {}".format(default_pipeline.task)):
+                default_pipeline(framework="pt")
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -449,6 +449,10 @@ class TokenizerTesterMixin:

        sequence = "Sequence"
        padding_size = 10
+
+        # check correct behaviour if no pad_token_id exists, then add a pad token if it is missing
+        self._check_no_pad_token_padding(tokenizer, sequence)
+
        padding_idx = tokenizer.pad_token_id

        # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
@@ -490,6 +494,10 @@ class TokenizerTesterMixin:
        tokenizer = self.get_tokenizer()

        sequence = "Sequence"
+
+        # check correct behaviour if no pad_token_id exists, then add a pad token if it is missing
+        self._check_no_pad_token_padding(tokenizer, sequence)
+
        padding_size = 10
        padding_idx = tokenizer.pad_token_id
        token_type_padding_idx = tokenizer.pad_token_type_id
@@ -503,6 +511,7 @@ class TokenizerTesterMixin:

        # Test right padding
        tokenizer.padding_side = "right"
+
        padded_sequence = tokenizer.encode_plus(
            sequence,
            max_length=sequence_length + padding_size,
@@ -588,10 +597,14 @@ class TokenizerTesterMixin:

        maximum_length = len(max([encoded_sequence["input_ids"] for encoded_sequence in encoded_sequences], key=len))

+        # check correct behaviour if no pad_token_id exists, then add a pad token if it is missing
+        self._check_no_pad_token_padding(tokenizer, sequences)
+
        encoded_sequences_padded = [
            tokenizer.encode_plus(sequence, pad_to_max_length=True, max_length=maximum_length)
            for sequence in sequences
        ]
+
        encoded_sequences_batch_padded = tokenizer.batch_encode_plus(sequences, pad_to_max_length=True)
        self.assertListEqual(
            encoded_sequences_padded,
@@ -610,6 +623,10 @@ class TokenizerTesterMixin:
        ]

        max_length = 100
+
+        # check correct behaviour if no pad_token_id exists and add it eventually
+        self._check_no_pad_token_padding(tokenizer, sequences)
+
        encoded_sequences = [
            tokenizer.encode_plus(sequence, pad_to_max_length=True, max_length=max_length) for sequence in sequences
        ]
@@ -620,6 +637,7 @@ class TokenizerTesterMixin:

        # Left padding tests
        tokenizer = self.get_tokenizer()
+
        tokenizer.padding_side = "left"
        sequences = [
            "Testing batch encode plus",
@@ -628,6 +646,10 @@ class TokenizerTesterMixin:
        ]

        max_length = 100
+
+        # check correct behaviour if no pad_token_id exists and add it eventually
+        self._check_no_pad_token_padding(tokenizer, sequences)
+
        encoded_sequences = [
            tokenizer.encode_plus(sequence, pad_to_max_length=True, max_length=max_length) for sequence in sequences
        ]
@@ -668,3 +690,15 @@ class TokenizerTesterMixin:
                encoded_value = encoded_sequences[key]

                self.assertEqual(pytorch_value, tensorflow_value, encoded_value)
+
+    def _check_no_pad_token_padding(self, tokenizer, sequences):
+        """Check that padding without a pad token raises ValueError, then register a pad token."""
+        if tokenizer.pad_token_id is not None:
+            return
+
+        with self.assertRaises(ValueError):
+            # a list is sent through the batch API, anything else through the single-input API
+            encode = tokenizer.batch_encode_plus if isinstance(sequences, list) else tokenizer.encode_plus
+            encode(sequences, pad_to_max_length=True)
+
+        tokenizer.add_special_tokens({"pad_token": "<PAD>"})  # so the caller's later padding calls work
--- a/tests/test_tokenization_xlm_roberta.py
+++ b/tests/test_tokenization_xlm_roberta.py
@@ -0,0 +1,111 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+from transformers.tokenization_xlm_roberta import XLMRobertaTokenizer
+
+from .utils import slow
+
+
+class XLMRobertaTokenizationIntegrationTest(unittest.TestCase):
+    """Checks XLMRobertaTokenizer ("xlm-roberta-base") output against hard-coded reference token ids."""
+    @slow
+    def test_tokenization_base_easy_symbols(self):
+        tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
+
+        symbols = "Hello World!"
+        original_tokenizer_encodings = [0, 35378, 6661, 38, 2]
+        # Reference ids presumably produced with fairseq (xlmr.large has same tokenizer):
+        # xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.base'); xlmr.eval(); xlmr.encode(symbols)
+
+        self.assertListEqual(original_tokenizer_encodings, tokenizer.encode(symbols))
+
+    @slow
+    def test_tokenization_base_hard_symbols(self):
+        tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
+
+        # NOTE: the misspelling "exsist" is part of the input under test; editing it would presumably change the ids
+        symbols = 'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth'
+        original_tokenizer_encodings = [
+            0,
+            3293,
+            83,
+            10,
+            4552,
+            4989,
+            7986,
+            678,
+            10,
+            5915,
+            111,
+            179459,
+            124850,
+            4,
+            6044,
+            237,
+            12,
+            6,
+            5,
+            6,
+            4,
+            6780,
+            705,
+            15,
+            1388,
+            44,
+            378,
+            10114,
+            711,
+            152,
+            20,
+            6,
+            5,
+            22376,
+            642,
+            1221,
+            15190,
+            34153,
+            450,
+            5608,
+            959,
+            1119,
+            57702,
+            136,
+            186,
+            47,
+            1098,
+            29367,
+            47,
+            4426,
+            3678,
+            2740,
+            4,
+            6044,
+            237,
+            6284,
+            50901,
+            528,
+            31,
+            90,
+            34,
+            927,
+            2,
+        ]
+        # Reference ids presumably produced with fairseq (xlmr.large has same tokenizer):
+        # xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.base'); xlmr.eval(); xlmr.encode(symbols)
+
+        self.assertListEqual(original_tokenizer_encodings, tokenizer.encode(symbols))
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -29,8 +29,22 @@ def parse_flag_from_env(key, default=False):
    return _value


+def parse_int_from_env(key, default=None):
+    """Parse environment variable `key` as an int; return `default` if unset, raise ValueError if malformed."""
+    raw = os.environ.get(key)
+    if raw is None:
+        return default
+
+    try:
+        return int(raw)
+    except ValueError:
+        # re-raise with a message that names the offending variable
+        raise ValueError("If set, {} must be a int.".format(key))
+
+
 _run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False)
 _run_custom_tokenizers = parse_flag_from_env("RUN_CUSTOM_TOKENIZERS", default=False)
+_tf_gpu_memory_limit = parse_int_from_env("TF_GPU_MEMORY_LIMIT", default=None)  # None when the variable is unset


 def slow(test_case):