Correct order of overflowing_tokens for slow tokenizer (#13179)

* correct order of overflowing_tokens for slow tokenizer (issue fix #13148)

* python 3.9 requires sentencepiece version 0.1.94 or above

* slicing of ids fixed in truncated_sequence()

* Update setup.py

* Correct order of overflowing tokens for pair of sentences

* code reformatted

* Update tokenization_utils_base.py

* reformatting file

* test to check single_input added

* missing function restored

* test to check pair_input overflowing tokens order

* test to check pair_input overflowing tokens order

* test to check pair_input overflowing tokens order

* added an error message for pair of seq and longest_first strategy

* test for pair_input modified

* variable name corrected

* fixed a typo in error message

* requested changes implemented

* required test added

* Corrected the message to match test message

* added error message for Luke Tokenizer

* lost test recovered

* docstring for truncate_sequences and prepare_for_model updated

* docstring for luke tokenizer updated

* updated ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING

* aligned text and fixed puncuatations

* improved style and quality of code

* fixed error_msg in truncate_sequences

* replaced encode_plus method with regular call method

* clean up

* rephrased the docstring
This commit is contained in:
Apoorv Garg
2021-09-02 15:28:23 +05:30
committed by GitHub
parent c9184a2e03
commit b91e65afe0
3 changed files with 119 additions and 65 deletions

View File

@@ -941,6 +941,7 @@ class TokenizerTesterMixin:
self.assertEqual(truncated_sequence, sequence[:-2])
self.assertEqual(len(overflowing_tokens), 2 + stride)
self.assertEqual(overflowing_tokens, sequence[-(2 + stride) :])
def test_maximum_encoding_length_pair_input(self):
tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100)
@@ -1053,18 +1054,18 @@ class TokenizerTesterMixin:
overflow_first_sequence if len(seq0_tokens) > len(seq1_tokens) else overflow_second_sequence
)
information = tokenizer.encode_plus(
seq_0,
seq_1,
max_length=len(sequence) - 2,
add_special_tokens=False,
stride=stride,
truncation="longest_first",
return_overflowing_tokens=True,
# add_prefix_space=False,
)
# Overflowing tokens are handled quite differently in slow and fast tokenizers
if isinstance(tokenizer, PreTrainedTokenizerFast):
information = tokenizer(
seq_0,
seq_1,
max_length=len(sequence) - 2,
add_special_tokens=False,
stride=stride,
truncation="longest_first",
return_overflowing_tokens=True,
# add_prefix_space=False,
)
truncated_sequence = information["input_ids"][0]
overflowing_tokens = information["input_ids"][1]
self.assertEqual(len(information["input_ids"]), 2)
@@ -1075,28 +1076,39 @@ class TokenizerTesterMixin:
self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest))
self.assertEqual(overflowing_tokens, overflow_longest_sequence)
else:
truncated_sequence = information["input_ids"]
overflowing_tokens = information["overflowing_tokens"]
# No overflowing tokens when using 'longest' in python tokenizers
with self.assertRaises(ValueError) as context:
information = tokenizer(
seq_0,
seq_1,
max_length=len(sequence) - 2,
add_special_tokens=False,
stride=stride,
truncation="longest_first",
return_overflowing_tokens=True,
# add_prefix_space=False,
)
self.assertEqual(len(truncated_sequence), len(sequence) - 2)
self.assertEqual(truncated_sequence, truncated_longest_sequence)
self.assertTrue(
context.exception.args[0].startswith(
"Not possible to return overflowing tokens for pair of sequences with the "
"`longest_first`. Please select another truncation strategy than `longest_first`, "
"for instance `only_second` or `only_first`."
)
)
self.assertEqual(
len(overflowing_tokens), 2 + stride
) # No overflowing tokens when using 'longest' in python tokenizers
information = tokenizer.encode_plus(
seq_0,
seq_1,
max_length=len(sequence) - 2,
add_special_tokens=False,
stride=stride,
truncation=True,
return_overflowing_tokens=True,
# add_prefix_space=False,
)
# Overflowing tokens are handled quite differently in slow and fast tokenizers
if isinstance(tokenizer, PreTrainedTokenizerFast):
information = tokenizer(
seq_0,
seq_1,
max_length=len(sequence) - 2,
add_special_tokens=False,
stride=stride,
truncation=True,
return_overflowing_tokens=True,
# add_prefix_space=False,
)
truncated_sequence = information["input_ids"][0]
overflowing_tokens = information["input_ids"][1]
self.assertEqual(len(information["input_ids"]), 2)
@@ -1107,17 +1119,28 @@ class TokenizerTesterMixin:
self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest))
self.assertEqual(overflowing_tokens, overflow_longest_sequence)
else:
truncated_sequence = information["input_ids"]
overflowing_tokens = information["overflowing_tokens"]
# No overflowing tokens when using 'longest' in python tokenizers
with self.assertRaises(ValueError) as context:
information = tokenizer(
seq_0,
seq_1,
max_length=len(sequence) - 2,
add_special_tokens=False,
stride=stride,
truncation=True,
return_overflowing_tokens=True,
# add_prefix_space=False,
)
self.assertEqual(len(truncated_sequence), len(sequence) - 2)
self.assertEqual(truncated_sequence, truncated_longest_sequence)
self.assertTrue(
context.exception.args[0].startswith(
"Not possible to return overflowing tokens for pair of sequences with the "
"`longest_first`. Please select another truncation strategy than `longest_first`, "
"for instance `only_second` or `only_first`."
)
)
self.assertEqual(
len(overflowing_tokens), 2 + stride
) # No overflowing tokens when using 'longest' in python tokenizers
information_first_truncated = tokenizer.encode_plus(
information_first_truncated = tokenizer(
seq_0,
seq_1,
max_length=len(sequence) - 2,
@@ -1148,7 +1171,7 @@ class TokenizerTesterMixin:
self.assertEqual(len(overflowing_tokens), 2 + stride)
self.assertEqual(overflowing_tokens, seq0_tokens[-(2 + stride) :])
information_second_truncated = tokenizer.encode_plus(
information_second_truncated = tokenizer(
seq_0,
seq_1,
max_length=len(sequence) - 2,