From 27cf1d97f043729a9e086108bd35fb6a49798da3 Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomwolf@users.noreply.github.com>
Date: Thu, 25 Jun 2020 17:24:28 +0200
Subject: [PATCH] [Tokenization] Fix #5181 - make #5155 more explicit - move
 back the default logging level in tests to WARNING (#5252)

* fix-5181

Padding to max sequence length while truncation to another length was wrong on slow tokenizers

* clean up and fix #5155

* fix XLM test

* Fix tests for Transfo-XL

* logging only above WARNING in tests

* switch slow tokenizers tests in @slow

* fix Marian truncation tokenization test

* style and quality

* make the test a lot faster by limiting the sequence length used in tests
---
 src/transformers/tokenization_utils.py      |  70 +++++++------
 src/transformers/tokenization_utils_base.py |   2 +-
 tests/test_modeling_auto.py                 |  12 ---
 tests/test_modeling_common.py               |   2 -
 tests/test_modeling_tf_auto.py              |   8 --
 tests/test_tokenization_auto.py             |   6 --
 tests/test_tokenization_common.py           | 106 +++++++++++++++++---
 tests/test_tokenization_fast.py             |   2 -
 tests/test_trainer_distributed.py           |   1 -
 9 files changed, 134 insertions(+), 75 deletions(-)

diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index 414e2e1095..405f7cf17b 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -576,18 +576,6 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
         Args:
             batch_ids_pairs: list of tokenized input ids or input ids pairs
         """
-        if padding_strategy == PaddingStrategy.LONGEST:
-            # For simplicity we keep the single sentnce path here
-            def total_sequence_length(input_pairs):
-                first_ids, second_ids = input_pairs
-                return len(first_ids) + (
-                    self.num_special_tokens_to_add()
-                    if second_ids is None
-                    else (len(second_ids) + self.num_special_tokens_to_add(pair=True))
-                )
-
-            max_length = max([total_sequence_length(input_pairs) for input_pairs in batch_ids_pairs])
-            padding_strategy = PaddingStrategy.MAX_LENGTH
 
         batch_outputs = {}
         for first_ids, second_ids in batch_ids_pairs:
@@ -595,16 +583,16 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
                 first_ids,
                 second_ids,
                 add_special_tokens=add_special_tokens,
-                padding_strategy=padding_strategy,
+                padding_strategy=PaddingStrategy.DO_NOT_PAD,  # we pad in batch afterward
                 truncation_strategy=truncation_strategy,
                 max_length=max_length,
                 stride=stride,
-                return_attention_mask=return_attention_mask,
+                return_attention_mask=False,  # we pad in batch afterward
                 return_token_type_ids=return_token_type_ids,
                 return_overflowing_tokens=return_overflowing_tokens,
                 return_special_tokens_mask=return_special_tokens_mask,
                 return_length=return_length,
-                return_tensors=None,  # We will convert the whole batch to tensors at the end
+                return_tensors=None,  # We convert the whole batch to tensors at the end
                 prepend_batch_axis=False,
                 verbose=verbose,
             )
@@ -614,6 +602,13 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
                     batch_outputs[key] = []
                 batch_outputs[key].append(value)
 
+        batch_outputs = self.pad(
+            batch_outputs,
+            padding=padding_strategy.value,
+            max_length=max_length,
+            return_attention_mask=return_attention_mask,
+        )
+
         batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
 
         return batch_outputs
@@ -700,12 +695,13 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
             )
 
         # Padding
-        encoded_inputs = self.pad(
-            encoded_inputs,
-            max_length=max_length,
-            padding=padding_strategy.value,
-            return_attention_mask=return_attention_mask,
-        )
+        if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
+            encoded_inputs = self.pad(
+                encoded_inputs,
+                max_length=max_length,
+                padding=padding_strategy.value,
+                return_attention_mask=return_attention_mask,
+            )
 
         if return_length:
             encoded_inputs["length"] = len(encoded_inputs["input_ids"])
@@ -768,15 +764,29 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
                 else:
                     pair_ids = pair_ids[:-1]
         elif truncation_strategy == TruncationStrategy.ONLY_FIRST:
-            assert len(ids) > num_tokens_to_remove
-            window_len = min(len(ids), stride + num_tokens_to_remove)
-            overflowing_tokens = ids[-window_len:]
-            ids = ids[:-num_tokens_to_remove]
-        elif truncation_strategy == TruncationStrategy.ONLY_SECOND:
-            assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove
-            window_len = min(len(pair_ids), stride + num_tokens_to_remove)
-            overflowing_tokens = pair_ids[-window_len:]
-            pair_ids = pair_ids[:-num_tokens_to_remove]
+            if len(ids) > num_tokens_to_remove:
+                window_len = min(len(ids), stride + num_tokens_to_remove)
+                overflowing_tokens = ids[-window_len:]
+                ids = ids[:-num_tokens_to_remove]
+            else:
+                logger.error(
+                    f"We need to remove {num_tokens_to_remove} to truncate the input"
+                    f"but the first sequence has a length {len(ids)}. "
+                    f"Please select another truncation strategy than {truncation_strategy}, "
+                    f"for instance 'longest_first' or 'only_second'."
+                )
+        elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None:
+            if len(pair_ids) > num_tokens_to_remove:
+                window_len = min(len(pair_ids), stride + num_tokens_to_remove)
+                overflowing_tokens = pair_ids[-window_len:]
+                pair_ids = pair_ids[:-num_tokens_to_remove]
+            else:
+                logger.error(
+                    f"We need to remove {num_tokens_to_remove} to truncate the input"
+                    f"but the second sequence has a length {len(pair_ids)}. "
+                    f"Please select another truncation strategy than {truncation_strategy}, "
+                    f"for instance 'longest_first' or 'only_first'."
+                )
 
         return (ids, pair_ids, overflowing_tokens)
 
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 3e6caebcec..3181517cd8 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1890,7 +1890,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
         if return_attention_mask is None:
             return_attention_mask = "attention_mask" in self.model_input_names
 
-        if padding_strategy == PaddingStrategy.LONGEST and max_length is None:
+        if padding_strategy == PaddingStrategy.LONGEST:
             max_length = len(encoded_inputs["input_ids"])
 
         needs_to_be_padded = (
diff --git a/tests/test_modeling_auto.py b/tests/test_modeling_auto.py
index 21a8aa4e81..0529528c60 100644
--- a/tests/test_modeling_auto.py
+++ b/tests/test_modeling_auto.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 
-import logging
 import unittest
 
 from transformers import is_torch_available
@@ -67,7 +66,6 @@ if is_torch_available():
 class AutoModelTest(unittest.TestCase):
     @slow
     def test_model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
         for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
             config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
@@ -82,7 +80,6 @@ class AutoModelTest(unittest.TestCase):
 
     @slow
     def test_model_for_pretraining_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
         for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
             config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
@@ -98,7 +95,6 @@ class AutoModelTest(unittest.TestCase):
 
     @slow
     def test_lmhead_model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
         for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
             config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
@@ -111,7 +107,6 @@ class AutoModelTest(unittest.TestCase):
 
     @slow
     def test_model_for_causal_lm(self):
-        logging.basicConfig(level=logging.INFO)
         for model_name in GPT2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
             config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
@@ -124,7 +119,6 @@ class AutoModelTest(unittest.TestCase):
 
     @slow
     def test_model_for_masked_lm(self):
-        logging.basicConfig(level=logging.INFO)
         for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
             config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
@@ -137,7 +131,6 @@ class AutoModelTest(unittest.TestCase):
 
     @slow
     def test_model_for_encoder_decoder_lm(self):
-        logging.basicConfig(level=logging.INFO)
         for model_name in T5_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
             config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
@@ -150,7 +143,6 @@ class AutoModelTest(unittest.TestCase):
 
     @slow
     def test_sequence_classification_model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
         for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
             config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
@@ -165,7 +157,6 @@ class AutoModelTest(unittest.TestCase):
 
     @slow
     def test_question_answering_model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
         for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
             config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
@@ -178,7 +169,6 @@ class AutoModelTest(unittest.TestCase):
 
     @slow
     def test_token_classification_model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
         for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
             config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
@@ -190,14 +180,12 @@ class AutoModelTest(unittest.TestCase):
             self.assertIsInstance(model, BertForTokenClassification)
 
     def test_from_pretrained_identifier(self):
-        logging.basicConfig(level=logging.INFO)
         model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
         self.assertIsInstance(model, BertForMaskedLM)
         self.assertEqual(model.num_parameters(), 14830)
         self.assertEqual(model.num_parameters(only_trainable=True), 14830)
 
     def test_from_identifier_from_model_type(self):
-        logging.basicConfig(level=logging.INFO)
         model = AutoModelWithLMHead.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER)
         self.assertIsInstance(model, RobertaForMaskedLM)
         self.assertEqual(model.num_parameters(), 14830)
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index ea27aa7278..4df1f75011 100644
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import copy
-import logging
 import os.path
 import random
 import tempfile
@@ -855,7 +854,6 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None):
 class ModelUtilsTest(unittest.TestCase):
     @slow
     def test_model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
         for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
             config = BertConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
diff --git a/tests/test_modeling_tf_auto.py b/tests/test_modeling_tf_auto.py
index 2aaec4b0b1..9cf3854f59 100644
--- a/tests/test_modeling_tf_auto.py
+++ b/tests/test_modeling_tf_auto.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 
-import logging
 import unittest
 
 from transformers import is_tf_available
@@ -48,7 +47,6 @@ class TFAutoModelTest(unittest.TestCase):
 
         self.assertTrue(h5py.version.hdf5_version.startswith("1.10"))
 
-        logging.basicConfig(level=logging.INFO)
         # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
         for model_name in ["bert-base-uncased"]:
             config = AutoConfig.from_pretrained(model_name)
@@ -65,7 +63,6 @@ class TFAutoModelTest(unittest.TestCase):
 
         self.assertTrue(h5py.version.hdf5_version.startswith("1.10"))
 
-        logging.basicConfig(level=logging.INFO)
         # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
         for model_name in ["bert-base-uncased"]:
             config = AutoConfig.from_pretrained(model_name)
@@ -78,7 +75,6 @@ class TFAutoModelTest(unittest.TestCase):
 
     @slow
     def test_lmhead_model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
         # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
         for model_name in ["bert-base-uncased"]:
             config = AutoConfig.from_pretrained(model_name)
@@ -91,7 +87,6 @@ class TFAutoModelTest(unittest.TestCase):
 
     @slow
     def test_sequence_classification_model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
         # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
         for model_name in ["bert-base-uncased"]:
             config = AutoConfig.from_pretrained(model_name)
@@ -104,7 +99,6 @@ class TFAutoModelTest(unittest.TestCase):
 
     @slow
     def test_question_answering_model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
         # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
         for model_name in ["bert-base-uncased"]:
             config = AutoConfig.from_pretrained(model_name)
@@ -116,14 +110,12 @@ class TFAutoModelTest(unittest.TestCase):
             self.assertIsInstance(model, TFBertForQuestionAnswering)
 
     def test_from_pretrained_identifier(self):
-        logging.basicConfig(level=logging.INFO)
         model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
         self.assertIsInstance(model, TFBertForMaskedLM)
         self.assertEqual(model.num_parameters(), 14830)
         self.assertEqual(model.num_parameters(only_trainable=True), 14830)
 
     def test_from_identifier_from_model_type(self):
-        logging.basicConfig(level=logging.INFO)
         model = TFAutoModelWithLMHead.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER)
         self.assertIsInstance(model, TFRobertaForMaskedLM)
         self.assertEqual(model.num_parameters(), 14830)
diff --git a/tests/test_tokenization_auto.py b/tests/test_tokenization_auto.py
index e39d18bac0..bc07117429 100644
--- a/tests/test_tokenization_auto.py
+++ b/tests/test_tokenization_auto.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 
-import logging
 import unittest
 
 from transformers import (
@@ -36,7 +35,6 @@ from .utils import DUMMY_UNKWOWN_IDENTIFIER, SMALL_MODEL_IDENTIFIER, slow  # noq
 class AutoTokenizerTest(unittest.TestCase):
     # @slow
     def test_tokenizer_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
         for model_name in (x for x in BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys() if "japanese" not in x):
             tokenizer = AutoTokenizer.from_pretrained(model_name)
             self.assertIsNotNone(tokenizer)
@@ -50,19 +48,16 @@ class AutoTokenizerTest(unittest.TestCase):
             self.assertGreater(len(tokenizer), 0)
 
     def test_tokenizer_from_pretrained_identifier(self):
-        logging.basicConfig(level=logging.INFO)
         tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
         self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
         self.assertEqual(tokenizer.vocab_size, 12)
 
     def test_tokenizer_from_model_type(self):
-        logging.basicConfig(level=logging.INFO)
         tokenizer = AutoTokenizer.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER)
         self.assertIsInstance(tokenizer, (RobertaTokenizer, RobertaTokenizerFast))
         self.assertEqual(tokenizer.vocab_size, 20)
 
     def test_tokenizer_identifier_with_correct_config(self):
-        logging.basicConfig(level=logging.INFO)
         for tokenizer_class in [BertTokenizer, BertTokenizerFast, AutoTokenizer]:
             tokenizer = tokenizer_class.from_pretrained("wietsedv/bert-base-dutch-cased")
             self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
@@ -75,7 +70,6 @@ class AutoTokenizerTest(unittest.TestCase):
             self.assertEqual(tokenizer.max_len, 512)
 
     def test_tokenizer_identifier_non_existent(self):
-        logging.basicConfig(level=logging.INFO)
         for tokenizer_class in [BertTokenizer, BertTokenizerFast, AutoTokenizer]:
             with self.assertRaises(EnvironmentError):
                 _ = tokenizer_class.from_pretrained("julien-c/herlolip-not-exists")
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index 82286c14fa..08d3d24a09 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -22,7 +22,7 @@ import tempfile
 from collections import OrderedDict
 from typing import TYPE_CHECKING, Dict, List, Tuple, Union
 
-from tests.utils import require_tf, require_torch
+from tests.utils import require_tf, require_torch, slow
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast
 
 
@@ -71,7 +71,7 @@ class TokenizerTesterMixin:
         input_txt = self.get_clean_sequence(tokenizer)[0]
         return input_txt, input_txt
 
-    def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=None) -> Tuple[str, list]:
+    def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20) -> Tuple[str, list]:
         toks = [(i, tokenizer.decode([i], clean_up_tokenization_spaces=False)) for i in range(len(tokenizer))]
         toks = list(filter(lambda t: re.match(r"^[ a-zA-Z]+$", t[1]), toks))
         toks = list(filter(lambda t: [t[0]] == tokenizer.encode(t[1], add_special_tokens=False), toks))
@@ -436,17 +436,51 @@ class TokenizerTesterMixin:
                     )
 
     def test_maximum_encoding_length_single_input(self):
-        tokenizers = self.get_tokenizers(do_lower_case=False)
+        tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100)
         for tokenizer in tokenizers:
             with self.subTest(f"{tokenizer.__class__.__name__}"):
-                seq_0, ids = self.get_clean_sequence(tokenizer)
-                stride = 2
+                seq_0, ids = self.get_clean_sequence(tokenizer, max_length=20)
 
                 sequence = tokenizer.encode(seq_0, add_special_tokens=False)
-                # self.assertEqual(sequence, ids)
-
                 total_length = len(sequence)
-                information = tokenizer.encode_plus(
+
+                assert total_length > 1, "Issue with the testing sequence, please update it it's too short"
+
+                # Test with max model input length
+                model_max_length = tokenizer.model_max_length
+                self.assertEqual(model_max_length, 100)
+                seq_1 = seq_0 * model_max_length
+
+                sequence1 = tokenizer(seq_1, add_special_tokens=False)
+                total_length1 = len(sequence1["input_ids"])
+                assert (
+                    total_length1 > model_max_length
+                ), "Issue with the testing sequence, please update it it's too short"
+
+                # Simple
+                padding_strategies = (
+                    [False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False]
+                )
+                for padding_state in padding_strategies:
+                    with self.subTest(f"Padding: {padding_state}"):
+                        for truncation_state in [True, "longest_first", "only_first"]:
+                            with self.subTest(f"Truncation: {truncation_state}"):
+                                output = tokenizer(seq_1, padding=padding_state, truncation=truncation_state)
+                                self.assertEqual(len(output["input_ids"]), model_max_length)
+
+                                output = tokenizer([seq_1], padding=padding_state, truncation=truncation_state)
+                                self.assertEqual(len(output["input_ids"][0]), model_max_length)
+
+                        # Simple with no truncation
+                        output = tokenizer(seq_1, padding=padding_state, truncation=False)
+                        self.assertNotEqual(len(output["input_ids"]), model_max_length)
+
+                        output = tokenizer([seq_1], padding=padding_state, truncation=False)
+                        self.assertNotEqual(len(output["input_ids"][0]), model_max_length)
+
+                # Overflowing tokens
+                stride = 2
+                information = tokenizer(
                     seq_0,
                     max_length=total_length - 2,
                     add_special_tokens=False,
@@ -479,22 +513,22 @@ class TokenizerTesterMixin:
                     )  # No overflowing tokens when using 'longest' in python tokenizers
 
     def test_maximum_encoding_length_pair_input(self):
-        tokenizers = self.get_tokenizers(do_lower_case=False)
+        tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100)
         for tokenizer in tokenizers:
             with self.subTest(f"{tokenizer.__class__.__name__}"):
                 # Build a sequence from our model's vocabulary
                 stride = 2
-                seq_0, ids = self.get_clean_sequence(tokenizer)
+                seq_0, ids = self.get_clean_sequence(tokenizer, max_length=20)
                 if len(ids) <= 2 + stride:
-                    seq_0 = [s for s in seq_0 for _ in range(2 + stride)]
-                    ids = [i for i in ids for _ in range(2 + stride)]
+                    seq_0 = (seq_0 + " ") * (2 + stride)
+                    ids = None
 
                 seq0_tokens = tokenizer.encode(seq_0, add_special_tokens=False)
                 assert len(seq0_tokens) > 2 + stride
 
                 seq_1 = "This is another sentence to be encoded."
                 seq1_tokens = tokenizer.encode(seq_1, add_special_tokens=False)
-                if len(seq0_tokens) == len(seq1_tokens):
+                if abs(len(seq0_tokens) - len(seq1_tokens)) <= 2:
                     seq1_tokens = seq1_tokens + seq1_tokens
                     seq_1 = tokenizer.decode(seq1_tokens, clean_up_tokenization_spaces=False)
                 seq1_tokens = tokenizer.encode(seq_1, add_special_tokens=False)
@@ -506,6 +540,49 @@ class TokenizerTesterMixin:
                 # We are not using the special tokens - a bit too hard to test all the tokenizers with this
                 # TODO try this again later
                 sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=False)  # , add_prefix_space=False)
+
+                # Test with max model input length
+                model_max_length = tokenizer.model_max_length
+                self.assertEqual(model_max_length, 100)
+                seq_2 = seq_0 * model_max_length
+
+                sequence1 = tokenizer(seq_1, add_special_tokens=False)
+                total_length1 = len(sequence1["input_ids"])
+                sequence2 = tokenizer(seq_2, seq_1, add_special_tokens=False)
+                total_length2 = len(sequence2["input_ids"])
+                assert total_length1 < model_max_length - 10, "Issue with the testing sequence, please update it."
+                assert total_length2 > model_max_length, "Issue with the testing sequence, please update it."
+
+                # Simple
+                padding_strategies = (
+                    [False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False]
+                )
+                for padding_state in padding_strategies:
+                    with self.subTest(f"Padding: {padding_state}"):
+                        for truncation_state in [True, "longest_first", "only_first"]:
+                            with self.subTest(f"Truncation: {truncation_state}"):
+                                output = tokenizer(seq_2, seq_1, padding=padding_state, truncation=truncation_state)
+                                self.assertEqual(len(output["input_ids"]), model_max_length)
+
+                                output = tokenizer(
+                                    [seq_2], [seq_1], padding=padding_state, truncation=truncation_state
+                                )
+                                self.assertEqual(len(output["input_ids"][0]), model_max_length)
+
+                        # Simple
+                        output = tokenizer(seq_1, seq_2, padding=padding_state, truncation="only_second")
+                        self.assertEqual(len(output["input_ids"]), model_max_length)
+
+                        output = tokenizer([seq_1], [seq_2], padding=padding_state, truncation="only_second")
+                        self.assertEqual(len(output["input_ids"][0]), model_max_length)
+
+                        # Simple with no truncation
+                        output = tokenizer(seq_1, seq_2, padding=padding_state, truncation=False)
+                        self.assertNotEqual(len(output["input_ids"]), model_max_length)
+
+                        output = tokenizer([seq_1], [seq_2], padding=padding_state, truncation=False)
+                        self.assertNotEqual(len(output["input_ids"][0]), model_max_length)
+
                 truncated_first_sequence = tokenizer.encode(seq_0, add_special_tokens=False)[:-2] + tokenizer.encode(
                     seq_1, add_special_tokens=False
                 )
@@ -1229,6 +1306,7 @@ class TokenizerTesterMixin:
             # add pad_token_id to pass subsequent tests
             tokenizer.add_special_tokens({"pad_token": "<PAD>"})
 
+    @slow
     @require_torch
     def test_torch_encode_plus_sent_to_model(self):
         import torch
@@ -1278,6 +1356,7 @@ class TokenizerTesterMixin:
         #     model(**encoded_sequence_fast)
         #     model(**batch_encoded_sequence_fast)
 
+    @slow
     @require_tf
     def test_tf_encode_plus_sent_to_model(self):
         from transformers import TF_MODEL_MAPPING, TOKENIZER_MAPPING
@@ -1312,6 +1391,7 @@ class TokenizerTesterMixin:
                 model(batch_encoded_sequence)
 
     # TODO: Check if require_torch is the best to test for numpy here ... Maybe move to require_flax when available
+    @slow
     @require_torch
     def test_np_encode_plus_sent_to_model(self):
         from transformers import MODEL_MAPPING, TOKENIZER_MAPPING
diff --git a/tests/test_tokenization_fast.py b/tests/test_tokenization_fast.py
index 57bb773132..7f2f662c75 100644
--- a/tests/test_tokenization_fast.py
+++ b/tests/test_tokenization_fast.py
@@ -22,8 +22,6 @@ from transformers.tokenization_roberta import RobertaTokenizerFast
 from transformers.tokenization_transfo_xl import TransfoXLTokenizerFast
 
 
-logging.basicConfig(level=logging.INFO)
-
 logger = logging.getLogger(__name__)
 
 NON_ENGLISH_TAGS = ["chinese", "dutch", "french", "finnish", "german", "multilingual"]
diff --git a/tests/test_trainer_distributed.py b/tests/test_trainer_distributed.py
index 3836930544..0a31e2eaad 100644
--- a/tests/test_trainer_distributed.py
+++ b/tests/test_trainer_distributed.py
@@ -62,7 +62,6 @@ if __name__ == "__main__":
     parser = HfArgumentParser((TrainingArguments,))
     training_args = parser.parse_args_into_dataclasses(sys.argv + ["--output_dir", "./examples"])[0]
 
-    logging.basicConfig(level=logging.INFO)
     logger.warning(
         "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
         training_args.local_rank,