Correct order of overflowing_tokens for slow tokenizer (#13179)
* correct order of overflowing_tokens for slow tokenizer (issue fix #13148)
* python 3.9 requires sentencepiece version 0.1.94 or above
* slicing of ids fixed in truncated_sequence()
* Update setup.py
* Correct order of overflowing tokens for pair of sentences
* code reformatted
* Update tokenization_utils_base.py
* reformatting file
* test to check single_input added
* missing function restored
* test to check pair_input overflowing tokens order
* test to check pair_input overflowing tokens order
* test to check pair_input overflowing tokens order
* added an error message for pair of seq and longest_first strategy
* test for pair_input modified
* variable name corrected
* fixed a typo in error message
* requested changes implemented
* required test added
* Corrected the message to match test message
* added error message for Luke Tokenizer
* lost test recovered
* docstring for truncate_sequences and prepare_for_model updated
* docstring for luke tokenizer updated
* updated ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING
* aligned text and fixed punctuation
* improved style and quality of code
* fixed error_msg in truncate_sequences
* replaced encode_plus method with regular call method
* clean up
* rephrased the docstring
This commit is contained in:
@@ -941,6 +941,7 @@ class TokenizerTesterMixin:
|
||||
self.assertEqual(truncated_sequence, sequence[:-2])
|
||||
|
||||
self.assertEqual(len(overflowing_tokens), 2 + stride)
|
||||
self.assertEqual(overflowing_tokens, sequence[-(2 + stride) :])
|
||||
|
||||
def test_maximum_encoding_length_pair_input(self):
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100)
|
||||
@@ -1053,18 +1054,18 @@ class TokenizerTesterMixin:
|
||||
overflow_first_sequence if len(seq0_tokens) > len(seq1_tokens) else overflow_second_sequence
|
||||
)
|
||||
|
||||
information = tokenizer.encode_plus(
|
||||
seq_0,
|
||||
seq_1,
|
||||
max_length=len(sequence) - 2,
|
||||
add_special_tokens=False,
|
||||
stride=stride,
|
||||
truncation="longest_first",
|
||||
return_overflowing_tokens=True,
|
||||
# add_prefix_space=False,
|
||||
)
|
||||
# Overflowing tokens are handled quite differently in slow and fast tokenizers
|
||||
if isinstance(tokenizer, PreTrainedTokenizerFast):
|
||||
information = tokenizer(
|
||||
seq_0,
|
||||
seq_1,
|
||||
max_length=len(sequence) - 2,
|
||||
add_special_tokens=False,
|
||||
stride=stride,
|
||||
truncation="longest_first",
|
||||
return_overflowing_tokens=True,
|
||||
# add_prefix_space=False,
|
||||
)
|
||||
truncated_sequence = information["input_ids"][0]
|
||||
overflowing_tokens = information["input_ids"][1]
|
||||
self.assertEqual(len(information["input_ids"]), 2)
|
||||
@@ -1075,28 +1076,39 @@ class TokenizerTesterMixin:
|
||||
self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest))
|
||||
self.assertEqual(overflowing_tokens, overflow_longest_sequence)
|
||||
else:
|
||||
truncated_sequence = information["input_ids"]
|
||||
overflowing_tokens = information["overflowing_tokens"]
|
||||
# No overflowing tokens when using 'longest' in python tokenizers
|
||||
with self.assertRaises(ValueError) as context:
|
||||
information = tokenizer(
|
||||
seq_0,
|
||||
seq_1,
|
||||
max_length=len(sequence) - 2,
|
||||
add_special_tokens=False,
|
||||
stride=stride,
|
||||
truncation="longest_first",
|
||||
return_overflowing_tokens=True,
|
||||
# add_prefix_space=False,
|
||||
)
|
||||
|
||||
self.assertEqual(len(truncated_sequence), len(sequence) - 2)
|
||||
self.assertEqual(truncated_sequence, truncated_longest_sequence)
|
||||
self.assertTrue(
|
||||
context.exception.args[0].startswith(
|
||||
"Not possible to return overflowing tokens for pair of sequences with the "
|
||||
"`longest_first`. Please select another truncation strategy than `longest_first`, "
|
||||
"for instance `only_second` or `only_first`."
|
||||
)
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
len(overflowing_tokens), 2 + stride
|
||||
) # No overflowing tokens when using 'longest' in python tokenizers
|
||||
|
||||
information = tokenizer.encode_plus(
|
||||
seq_0,
|
||||
seq_1,
|
||||
max_length=len(sequence) - 2,
|
||||
add_special_tokens=False,
|
||||
stride=stride,
|
||||
truncation=True,
|
||||
return_overflowing_tokens=True,
|
||||
# add_prefix_space=False,
|
||||
)
|
||||
# Overflowing tokens are handled quite differently in slow and fast tokenizers
|
||||
if isinstance(tokenizer, PreTrainedTokenizerFast):
|
||||
information = tokenizer(
|
||||
seq_0,
|
||||
seq_1,
|
||||
max_length=len(sequence) - 2,
|
||||
add_special_tokens=False,
|
||||
stride=stride,
|
||||
truncation=True,
|
||||
return_overflowing_tokens=True,
|
||||
# add_prefix_space=False,
|
||||
)
|
||||
truncated_sequence = information["input_ids"][0]
|
||||
overflowing_tokens = information["input_ids"][1]
|
||||
self.assertEqual(len(information["input_ids"]), 2)
|
||||
@@ -1107,17 +1119,28 @@ class TokenizerTesterMixin:
|
||||
self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest))
|
||||
self.assertEqual(overflowing_tokens, overflow_longest_sequence)
|
||||
else:
|
||||
truncated_sequence = information["input_ids"]
|
||||
overflowing_tokens = information["overflowing_tokens"]
|
||||
# No overflowing tokens when using 'longest' in python tokenizers
|
||||
with self.assertRaises(ValueError) as context:
|
||||
information = tokenizer(
|
||||
seq_0,
|
||||
seq_1,
|
||||
max_length=len(sequence) - 2,
|
||||
add_special_tokens=False,
|
||||
stride=stride,
|
||||
truncation=True,
|
||||
return_overflowing_tokens=True,
|
||||
# add_prefix_space=False,
|
||||
)
|
||||
|
||||
self.assertEqual(len(truncated_sequence), len(sequence) - 2)
|
||||
self.assertEqual(truncated_sequence, truncated_longest_sequence)
|
||||
self.assertTrue(
|
||||
context.exception.args[0].startswith(
|
||||
"Not possible to return overflowing tokens for pair of sequences with the "
|
||||
"`longest_first`. Please select another truncation strategy than `longest_first`, "
|
||||
"for instance `only_second` or `only_first`."
|
||||
)
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
len(overflowing_tokens), 2 + stride
|
||||
) # No overflowing tokens when using 'longest' in python tokenizers
|
||||
|
||||
information_first_truncated = tokenizer.encode_plus(
|
||||
information_first_truncated = tokenizer(
|
||||
seq_0,
|
||||
seq_1,
|
||||
max_length=len(sequence) - 2,
|
||||
@@ -1148,7 +1171,7 @@ class TokenizerTesterMixin:
|
||||
self.assertEqual(len(overflowing_tokens), 2 + stride)
|
||||
self.assertEqual(overflowing_tokens, seq0_tokens[-(2 + stride) :])
|
||||
|
||||
information_second_truncated = tokenizer.encode_plus(
|
||||
information_second_truncated = tokenizer(
|
||||
seq_0,
|
||||
seq_1,
|
||||
max_length=len(sequence) - 2,
|
||||
|
||||
Reference in New Issue
Block a user