Correct order of overflowing_tokens for slow tokenizer (#13179)

* correct order of overflowing_tokens for slow tokenizer (issue fix #13148)
* python 3.9 requires sentencepiece version 0.1.94 or above
* slicing of ids fixed in truncated_sequence()
* Update setup.py
* Correct order of overflowing tokens for pair of sentences
* code reformatted
* Update tokenization_utils_base.py
* reformatting file
* test to check single_input added
* missing function restored
* test to check pair_input overflowing tokens order
* test to check pair_input overflowing tokens order
* test to check pair_input overflowing tokens order
* added an error message for pair of seq and longest_first strategy
* test for pair_input modified
* variable name corrected
* fixed a typo in error message
* requested changes implemented
* required test added
* Corrected the message to match test message
* added error message for Luke Tokenizer
* lost test recovered
* docstring for truncate_sequences and prepare_for_model updated
* docstring for luke tokenizer updated
* updated ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING
* aligned text and fixed punctuation
* improved style and quality of code
* fixed error_msg in truncate_sequences
* replaced encode_plus method with regular call method
* clean up
* rephrased the docstring
2021-09-02 15:28:23 +05:30
parent c9184a2e03
commit b91e65afe0
3 changed files with 119 additions and 65 deletions
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -941,6 +941,7 @@ class TokenizerTesterMixin:
                    self.assertEqual(truncated_sequence, sequence[:-2])

                    self.assertEqual(len(overflowing_tokens), 2 + stride)
+                    self.assertEqual(overflowing_tokens, sequence[-(2 + stride) :])

    def test_maximum_encoding_length_pair_input(self):
        tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100)
@@ -1053,18 +1054,18 @@ class TokenizerTesterMixin:
                    overflow_first_sequence if len(seq0_tokens) > len(seq1_tokens) else overflow_second_sequence
                )

-                information = tokenizer.encode_plus(
-                    seq_0,
-                    seq_1,
-                    max_length=len(sequence) - 2,
-                    add_special_tokens=False,
-                    stride=stride,
-                    truncation="longest_first",
-                    return_overflowing_tokens=True,
-                    # add_prefix_space=False,
-                )
                # Overflowing tokens are handled quite differently in slow and fast tokenizers
                if isinstance(tokenizer, PreTrainedTokenizerFast):
+                    information = tokenizer(
+                        seq_0,
+                        seq_1,
+                        max_length=len(sequence) - 2,
+                        add_special_tokens=False,
+                        stride=stride,
+                        truncation="longest_first",
+                        return_overflowing_tokens=True,
+                        # add_prefix_space=False,
+                    )
                    truncated_sequence = information["input_ids"][0]
                    overflowing_tokens = information["input_ids"][1]
                    self.assertEqual(len(information["input_ids"]), 2)
@@ -1075,28 +1076,39 @@ class TokenizerTesterMixin:
                    self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest))
                    self.assertEqual(overflowing_tokens, overflow_longest_sequence)
                else:
-                    truncated_sequence = information["input_ids"]
-                    overflowing_tokens = information["overflowing_tokens"]
+                    # No overflowing tokens when using 'longest' in python tokenizers
+                    with self.assertRaises(ValueError) as context:
+                        information = tokenizer(
+                            seq_0,
+                            seq_1,
+                            max_length=len(sequence) - 2,
+                            add_special_tokens=False,
+                            stride=stride,
+                            truncation="longest_first",
+                            return_overflowing_tokens=True,
+                            # add_prefix_space=False,
+                        )

-                    self.assertEqual(len(truncated_sequence), len(sequence) - 2)
-                    self.assertEqual(truncated_sequence, truncated_longest_sequence)
+                    self.assertTrue(
+                        context.exception.args[0].startswith(
+                            "Not possible to return overflowing tokens for pair of sequences with the "
+                            "`longest_first`. Please select another truncation strategy than `longest_first`, "
+                            "for instance `only_second` or `only_first`."
+                        )
+                    )

-                    self.assertEqual(
-                        len(overflowing_tokens), 2 + stride
-                    )  # No overflowing tokens when using 'longest' in python tokenizers
-
-                information = tokenizer.encode_plus(
-                    seq_0,
-                    seq_1,
-                    max_length=len(sequence) - 2,
-                    add_special_tokens=False,
-                    stride=stride,
-                    truncation=True,
-                    return_overflowing_tokens=True,
-                    # add_prefix_space=False,
-                )
                # Overflowing tokens are handled quite differently in slow and fast tokenizers
                if isinstance(tokenizer, PreTrainedTokenizerFast):
+                    information = tokenizer(
+                        seq_0,
+                        seq_1,
+                        max_length=len(sequence) - 2,
+                        add_special_tokens=False,
+                        stride=stride,
+                        truncation=True,
+                        return_overflowing_tokens=True,
+                        # add_prefix_space=False,
+                    )
                    truncated_sequence = information["input_ids"][0]
                    overflowing_tokens = information["input_ids"][1]
                    self.assertEqual(len(information["input_ids"]), 2)
@@ -1107,17 +1119,28 @@ class TokenizerTesterMixin:
                    self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest))
                    self.assertEqual(overflowing_tokens, overflow_longest_sequence)
                else:
-                    truncated_sequence = information["input_ids"]
-                    overflowing_tokens = information["overflowing_tokens"]
+                    # No overflowing tokens when using 'longest' in python tokenizers
+                    with self.assertRaises(ValueError) as context:
+                        information = tokenizer(
+                            seq_0,
+                            seq_1,
+                            max_length=len(sequence) - 2,
+                            add_special_tokens=False,
+                            stride=stride,
+                            truncation=True,
+                            return_overflowing_tokens=True,
+                            # add_prefix_space=False,
+                        )

-                    self.assertEqual(len(truncated_sequence), len(sequence) - 2)
-                    self.assertEqual(truncated_sequence, truncated_longest_sequence)
+                    self.assertTrue(
+                        context.exception.args[0].startswith(
+                            "Not possible to return overflowing tokens for pair of sequences with the "
+                            "`longest_first`. Please select another truncation strategy than `longest_first`, "
+                            "for instance `only_second` or `only_first`."
+                        )
+                    )

-                    self.assertEqual(
-                        len(overflowing_tokens), 2 + stride
-                    )  # No overflowing tokens when using 'longest' in python tokenizers
-
-                information_first_truncated = tokenizer.encode_plus(
+                information_first_truncated = tokenizer(
                    seq_0,
                    seq_1,
                    max_length=len(sequence) - 2,
@@ -1148,7 +1171,7 @@ class TokenizerTesterMixin:
                    self.assertEqual(len(overflowing_tokens), 2 + stride)
                    self.assertEqual(overflowing_tokens, seq0_tokens[-(2 + stride) :])

-                information_second_truncated = tokenizer.encode_plus(
+                information_second_truncated = tokenizer(
                    seq_0,
                    seq_1,
                    max_length=len(sequence) - 2,