[Tokenization] Fix #5181 - make #5155 more explicit - move back the default logging level in tests to WARNING (#5252)

* fix-5181: padding to max sequence length while truncating to another length was wrong on slow tokenizers
* clean up and fix #5155
* fix XLM test
* fix tests for Transfo-XL
* logging only above WARNING in tests
* switch slow tokenizers tests to @slow
* fix Marian truncation tokenization test
* style and quality
* make the test a lot faster by limiting the sequence length used in tests
2020-06-25 17:24:28 +02:00
parent e008d520bb
commit 27cf1d97f0
9 changed files with 134 additions and 75 deletions
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -22,7 +22,7 @@ import tempfile
 from collections import OrderedDict
 from typing import TYPE_CHECKING, Dict, List, Tuple, Union

-from tests.utils import require_tf, require_torch
+from tests.utils import require_tf, require_torch, slow
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast


@@ -71,7 +71,7 @@ class TokenizerTesterMixin:
        input_txt = self.get_clean_sequence(tokenizer)[0]
        return input_txt, input_txt

-    def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=None) -> Tuple[str, list]:
+    def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20) -> Tuple[str, list]:
        toks = [(i, tokenizer.decode([i], clean_up_tokenization_spaces=False)) for i in range(len(tokenizer))]
        toks = list(filter(lambda t: re.match(r"^[ a-zA-Z]+$", t[1]), toks))
        toks = list(filter(lambda t: [t[0]] == tokenizer.encode(t[1], add_special_tokens=False), toks))
@@ -436,17 +436,51 @@ class TokenizerTesterMixin:
                    )

    def test_maximum_encoding_length_single_input(self):
-        tokenizers = self.get_tokenizers(do_lower_case=False)
+        tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                seq_0, ids = self.get_clean_sequence(tokenizer)
-                stride = 2
+                seq_0, ids = self.get_clean_sequence(tokenizer, max_length=20)

                sequence = tokenizer.encode(seq_0, add_special_tokens=False)
-                # self.assertEqual(sequence, ids)
-
                total_length = len(sequence)
-                information = tokenizer.encode_plus(
+
+                assert total_length > 1, "Issue with the testing sequence, please update it it's too short"
+
+                # Test with max model input length
+                model_max_length = tokenizer.model_max_length
+                self.assertEqual(model_max_length, 100)
+                seq_1 = seq_0 * model_max_length
+
+                sequence1 = tokenizer(seq_1, add_special_tokens=False)
+                total_length1 = len(sequence1["input_ids"])
+                assert (
+                    total_length1 > model_max_length
+                ), "Issue with the testing sequence, please update it it's too short"
+
+                # Simple
+                padding_strategies = (
+                    [False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False]
+                )
+                for padding_state in padding_strategies:
+                    with self.subTest(f"Padding: {padding_state}"):
+                        for truncation_state in [True, "longest_first", "only_first"]:
+                            with self.subTest(f"Truncation: {truncation_state}"):
+                                output = tokenizer(seq_1, padding=padding_state, truncation=truncation_state)
+                                self.assertEqual(len(output["input_ids"]), model_max_length)
+
+                                output = tokenizer([seq_1], padding=padding_state, truncation=truncation_state)
+                                self.assertEqual(len(output["input_ids"][0]), model_max_length)
+
+                        # Simple with no truncation
+                        output = tokenizer(seq_1, padding=padding_state, truncation=False)
+                        self.assertNotEqual(len(output["input_ids"]), model_max_length)
+
+                        output = tokenizer([seq_1], padding=padding_state, truncation=False)
+                        self.assertNotEqual(len(output["input_ids"][0]), model_max_length)
+
+                # Overflowing tokens
+                stride = 2
+                information = tokenizer(
                    seq_0,
                    max_length=total_length - 2,
                    add_special_tokens=False,
@@ -479,22 +513,22 @@ class TokenizerTesterMixin:
                    )  # No overflowing tokens when using 'longest' in python tokenizers

    def test_maximum_encoding_length_pair_input(self):
-        tokenizers = self.get_tokenizers(do_lower_case=False)
+        tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                # Build a sequence from our model's vocabulary
                stride = 2
-                seq_0, ids = self.get_clean_sequence(tokenizer)
+                seq_0, ids = self.get_clean_sequence(tokenizer, max_length=20)
                if len(ids) <= 2 + stride:
-                    seq_0 = [s for s in seq_0 for _ in range(2 + stride)]
-                    ids = [i for i in ids for _ in range(2 + stride)]
+                    seq_0 = (seq_0 + " ") * (2 + stride)
+                    ids = None

                seq0_tokens = tokenizer.encode(seq_0, add_special_tokens=False)
                assert len(seq0_tokens) > 2 + stride

                seq_1 = "This is another sentence to be encoded."
                seq1_tokens = tokenizer.encode(seq_1, add_special_tokens=False)
-                if len(seq0_tokens) == len(seq1_tokens):
+                if abs(len(seq0_tokens) - len(seq1_tokens)) <= 2:
                    seq1_tokens = seq1_tokens + seq1_tokens
                    seq_1 = tokenizer.decode(seq1_tokens, clean_up_tokenization_spaces=False)
                seq1_tokens = tokenizer.encode(seq_1, add_special_tokens=False)
@@ -506,6 +540,49 @@ class TokenizerTesterMixin:
                # We are not using the special tokens - a bit too hard to test all the tokenizers with this
                # TODO try this again later
                sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=False)  # , add_prefix_space=False)
+
+                # Test with max model input length
+                model_max_length = tokenizer.model_max_length
+                self.assertEqual(model_max_length, 100)
+                seq_2 = seq_0 * model_max_length
+
+                sequence1 = tokenizer(seq_1, add_special_tokens=False)
+                total_length1 = len(sequence1["input_ids"])
+                sequence2 = tokenizer(seq_2, seq_1, add_special_tokens=False)
+                total_length2 = len(sequence2["input_ids"])
+                assert total_length1 < model_max_length - 10, "Issue with the testing sequence, please update it."
+                assert total_length2 > model_max_length, "Issue with the testing sequence, please update it."
+
+                # Simple
+                padding_strategies = (
+                    [False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False]
+                )
+                for padding_state in padding_strategies:
+                    with self.subTest(f"Padding: {padding_state}"):
+                        for truncation_state in [True, "longest_first", "only_first"]:
+                            with self.subTest(f"Truncation: {truncation_state}"):
+                                output = tokenizer(seq_2, seq_1, padding=padding_state, truncation=truncation_state)
+                                self.assertEqual(len(output["input_ids"]), model_max_length)
+
+                                output = tokenizer(
+                                    [seq_2], [seq_1], padding=padding_state, truncation=truncation_state
+                                )
+                                self.assertEqual(len(output["input_ids"][0]), model_max_length)
+
+                        # Simple
+                        output = tokenizer(seq_1, seq_2, padding=padding_state, truncation="only_second")
+                        self.assertEqual(len(output["input_ids"]), model_max_length)
+
+                        output = tokenizer([seq_1], [seq_2], padding=padding_state, truncation="only_second")
+                        self.assertEqual(len(output["input_ids"][0]), model_max_length)
+
+                        # Simple with no truncation
+                        output = tokenizer(seq_1, seq_2, padding=padding_state, truncation=False)
+                        self.assertNotEqual(len(output["input_ids"]), model_max_length)
+
+                        output = tokenizer([seq_1], [seq_2], padding=padding_state, truncation=False)
+                        self.assertNotEqual(len(output["input_ids"][0]), model_max_length)
+
                truncated_first_sequence = tokenizer.encode(seq_0, add_special_tokens=False)[:-2] + tokenizer.encode(
                    seq_1, add_special_tokens=False
                )
@@ -1229,6 +1306,7 @@ class TokenizerTesterMixin:
            # add pad_token_id to pass subsequent tests
            tokenizer.add_special_tokens({"pad_token": "<PAD>"})

+    @slow
    @require_torch
    def test_torch_encode_plus_sent_to_model(self):
        import torch
@@ -1278,6 +1356,7 @@ class TokenizerTesterMixin:
        #     model(**encoded_sequence_fast)
        #     model(**batch_encoded_sequence_fast)

+    @slow
    @require_tf
    def test_tf_encode_plus_sent_to_model(self):
        from transformers import TF_MODEL_MAPPING, TOKENIZER_MAPPING
@@ -1312,6 +1391,7 @@ class TokenizerTesterMixin:
                model(batch_encoded_sequence)

    # TODO: Check if require_torch is the best to test for numpy here ... Maybe move to require_flax when available
+    @slow
    @require_torch
    def test_np_encode_plus_sent_to_model(self):
        from transformers import MODEL_MAPPING, TOKENIZER_MAPPING