From b91e65afe0f467e24183928bf57d92b2cef4b69f Mon Sep 17 00:00:00 2001 From: Apoorv Garg <57873504+Apoorvgarg-creator@users.noreply.github.com> Date: Thu, 2 Sep 2021 15:28:23 +0530 Subject: [PATCH] Correct order of overflowing_tokens for slow tokenizer (#13179) * correct order of overflowing_tokens for slow tokenizer (issue fix #13148) * python 3.9 requires sentencepiece version 0.1.94 or above * slicing of ids fixed in truncated_sequence() * Update setup.py * Correct order of overflowing tokens for pair of sentences * code reformatted * Update tokenization_utils_base.py * reformatting file * test to check single_input added * missing function restored * test to check pair_input overflowing tokens order * test to check pair_input overflowing tokens order * test to check pair_input overflowing tokens order * added an error message for pair of seq and longest_first strategy * test for pair_input modified * variable name corrected * fixed a typo in error message * requested changes implemented * required test added * Corrected the message to match test message * added error message for Luke Tokenizer * lost test recovered * docstring for truncate_sequences and prepare_for_model updated * docstring for luke tokenizer updated * updated ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING * aligned text and fixed puncuatations * improved style and quality of code * fixed error_msg in truncate_sequences * replaced encode_plus method with regular call method * clean up * rephrased the docstring --- .../models/luke/tokenization_luke.py | 19 +++- src/transformers/tokenization_utils_base.py | 66 ++++++++----- tests/test_tokenization_common.py | 99 ++++++++++++------- 3 files changed, 119 insertions(+), 65 deletions(-) diff --git a/src/transformers/models/luke/tokenization_luke.py b/src/transformers/models/luke/tokenization_luke.py index c5739317f0..fb9d7f7acd 100644 --- a/src/transformers/models/luke/tokenization_luke.py +++ b/src/transformers/models/luke/tokenization_luke.py @@ -85,7 +85,9 @@ ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r""" `What are attention masks? <../glossary.html#attention-mask>`__ return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not to return overflowing token sequences. + Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch + of pairs) is provided with :obj:`truncation_strategy = longest_first` or :obj:`True`, an error is + raised instead of returning overflowing tokens. return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether or not to return special tokens mask information. return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`): @@ -1037,8 +1039,9 @@ class LukeTokenizer(RobertaTokenizer): Prepares a sequence of input id, entity id and entity span, or a pair of sequences of inputs ids, entity ids, entity spans so that it can be used by the model. It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and manages a moving window (with user defined stride) for - overflowing tokens - + overflowing tokens. Please Note, for `pair_ids` different than `None` and `truncation_strategy = longest_first` + or `True`, it is not possible to return overflowing tokens. Such a combination of arguments will raise an + error. Args: ids (:obj:`List[int]`): @@ -1078,6 +1081,16 @@ class LukeTokenizer(RobertaTokenizer): "results in an undefined behavior. Please set add_special_tokens to True or " "set return_token_type_ids to None." ) + if ( + return_overflowing_tokens + and truncation_strategy == TruncationStrategy.LONGEST_FIRST + and pair_ids is not None + ): + raise ValueError( + "Not possible to return overflowing tokens for pair of sequences with the " + "`longest_first`. Please select another truncation strategy than `longest_first`, " + "for instance `only_second` or `only_first`." + ) # Load from model defaults if return_token_type_ids is None: diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 963199bb7e..690fba4aed 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1324,7 +1324,9 @@ ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r""" `What are attention masks? <../glossary.html#attention-mask>`__ return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not to return overflowing token sequences. + Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch + of pairs) is provided with :obj:`truncation_strategy = longest_first` or :obj:`True`, an error is + raised instead of returning overflowing tokens. return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether or not to return special tokens mask information. return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`): @@ -2838,7 +2840,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and - manages a moving window (with user defined stride) for overflowing tokens + manages a moving window (with user defined stride) for overflowing tokens. Please Note, for `pair_ids` + different than `None` and `truncation_strategy = longest_first` or `True`, it is not possible to return + overflowing tokens. Such a combination of arguments will raise an error. Args: ids (:obj:`List[int]`): @@ -2870,6 +2874,17 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): "set return_token_type_ids to None." ) + if ( + return_overflowing_tokens + and truncation_strategy == TruncationStrategy.LONGEST_FIRST + and pair_ids is not None + ): + raise ValueError( + "Not possible to return overflowing tokens for pair of sequences with the " + "`longest_first`. Please select another truncation strategy than `longest_first`, " + "for instance `only_second` or `only_first`." + ) + # Load from model defaults if return_token_type_ids is None: return_token_type_ids = "token_type_ids" in self.model_input_names @@ -2977,7 +2992,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): Returns: :obj:`Tuple[List[int], List[int], List[int]]`: The truncated ``ids``, the truncated ``pair_ids`` and the - list of overflowing tokens. + list of overflowing tokens. Note: The `longest_first` strategy returns empty list of overflowing_tokens if + a pair of sequences (or a batch of pairs) is provided. """ if num_tokens_to_remove <= 0: return ids, pair_ids, [] @@ -2986,34 +3002,36 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): truncation_strategy = TruncationStrategy(truncation_strategy) overflowing_tokens = [] - if truncation_strategy == TruncationStrategy.LONGEST_FIRST: - for _ in range(num_tokens_to_remove): - if pair_ids is None or len(ids) > len(pair_ids): - if not overflowing_tokens: - window_len = min(len(ids), stride + 1) - else: - window_len = 1 - overflowing_tokens.extend(ids[-window_len:]) - ids = ids[:-1] - else: - if not overflowing_tokens: - window_len = min(len(pair_ids), stride + 1) - else: - window_len = 1 - overflowing_tokens.extend(pair_ids[-window_len:]) - pair_ids = pair_ids[:-1] - elif truncation_strategy == TruncationStrategy.ONLY_FIRST: + if truncation_strategy == TruncationStrategy.ONLY_FIRST or ( + truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is None + ): if len(ids) > num_tokens_to_remove: window_len = min(len(ids), stride + num_tokens_to_remove) overflowing_tokens = ids[-window_len:] ids = ids[:-num_tokens_to_remove] else: - logger.error( - f"We need to remove {num_tokens_to_remove} to truncate the input" + error_msg = ( + f"We need to remove {num_tokens_to_remove} to truncate the input " f"but the first sequence has a length {len(ids)}. " - f"Please select another truncation strategy than {truncation_strategy}, " - f"for instance 'longest_first' or 'only_second'." ) + if truncation_strategy == TruncationStrategy.ONLY_FIRST: + error_msg = ( + error_msg + "Please select another truncation strategy than " + f"{truncation_strategy}, for instance 'longest_first' or 'only_second'." + ) + logger.error(error_msg) + elif truncation_strategy == TruncationStrategy.LONGEST_FIRST: + logger.warning( + f"Be aware, overflowing tokens are not returned for the setting you have chosen," + f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' " + f"truncation strategy. So the returned list will always be empty even if some " + f"tokens have been removed." + ) + for _ in range(num_tokens_to_remove): + if pair_ids is None or len(ids) > len(pair_ids): + ids = ids[:-1] + else: + pair_ids = pair_ids[:-1] elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None: if len(pair_ids) > num_tokens_to_remove: window_len = min(len(pair_ids), stride + num_tokens_to_remove) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 6e35828dc4..5dfc738814 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -941,6 +941,7 @@ class TokenizerTesterMixin: self.assertEqual(truncated_sequence, sequence[:-2]) self.assertEqual(len(overflowing_tokens), 2 + stride) + self.assertEqual(overflowing_tokens, sequence[-(2 + stride) :]) def test_maximum_encoding_length_pair_input(self): tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100) @@ -1053,18 +1054,18 @@ class TokenizerTesterMixin: overflow_first_sequence if len(seq0_tokens) > len(seq1_tokens) else overflow_second_sequence ) - information = tokenizer.encode_plus( - seq_0, - seq_1, - max_length=len(sequence) - 2, - add_special_tokens=False, - stride=stride, - truncation="longest_first", - return_overflowing_tokens=True, - # add_prefix_space=False, - ) # Overflowing tokens are handled quite differently in slow and fast tokenizers if isinstance(tokenizer, PreTrainedTokenizerFast): + information = tokenizer( + seq_0, + seq_1, + max_length=len(sequence) - 2, + add_special_tokens=False, + stride=stride, + truncation="longest_first", + return_overflowing_tokens=True, + # add_prefix_space=False, + ) truncated_sequence = information["input_ids"][0] overflowing_tokens = information["input_ids"][1] self.assertEqual(len(information["input_ids"]), 2) @@ -1075,28 +1076,39 @@ class TokenizerTesterMixin: self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest)) self.assertEqual(overflowing_tokens, overflow_longest_sequence) else: - truncated_sequence = information["input_ids"] - overflowing_tokens = information["overflowing_tokens"] + # No overflowing tokens when using 'longest' in python tokenizers + with self.assertRaises(ValueError) as context: + information = tokenizer( + seq_0, + seq_1, + max_length=len(sequence) - 2, + add_special_tokens=False, + stride=stride, + truncation="longest_first", + return_overflowing_tokens=True, + # add_prefix_space=False, + ) - self.assertEqual(len(truncated_sequence), len(sequence) - 2) - self.assertEqual(truncated_sequence, truncated_longest_sequence) + self.assertTrue( + context.exception.args[0].startswith( + "Not possible to return overflowing tokens for pair of sequences with the " + "`longest_first`. Please select another truncation strategy than `longest_first`, " + "for instance `only_second` or `only_first`." + ) + ) - self.assertEqual( - len(overflowing_tokens), 2 + stride - ) # No overflowing tokens when using 'longest' in python tokenizers - - information = tokenizer.encode_plus( - seq_0, - seq_1, - max_length=len(sequence) - 2, - add_special_tokens=False, - stride=stride, - truncation=True, - return_overflowing_tokens=True, - # add_prefix_space=False, - ) # Overflowing tokens are handled quite differently in slow and fast tokenizers if isinstance(tokenizer, PreTrainedTokenizerFast): + information = tokenizer( + seq_0, + seq_1, + max_length=len(sequence) - 2, + add_special_tokens=False, + stride=stride, + truncation=True, + return_overflowing_tokens=True, + # add_prefix_space=False, + ) truncated_sequence = information["input_ids"][0] overflowing_tokens = information["input_ids"][1] self.assertEqual(len(information["input_ids"]), 2) @@ -1107,17 +1119,28 @@ class TokenizerTesterMixin: self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest)) self.assertEqual(overflowing_tokens, overflow_longest_sequence) else: - truncated_sequence = information["input_ids"] - overflowing_tokens = information["overflowing_tokens"] + # No overflowing tokens when using 'longest' in python tokenizers + with self.assertRaises(ValueError) as context: + information = tokenizer( + seq_0, + seq_1, + max_length=len(sequence) - 2, + add_special_tokens=False, + stride=stride, + truncation=True, + return_overflowing_tokens=True, + # add_prefix_space=False, + ) - self.assertEqual(len(truncated_sequence), len(sequence) - 2) - self.assertEqual(truncated_sequence, truncated_longest_sequence) + self.assertTrue( + context.exception.args[0].startswith( + "Not possible to return overflowing tokens for pair of sequences with the " + "`longest_first`. Please select another truncation strategy than `longest_first`, " + "for instance `only_second` or `only_first`." + ) + ) - self.assertEqual( - len(overflowing_tokens), 2 + stride - ) # No overflowing tokens when using 'longest' in python tokenizers - - information_first_truncated = tokenizer.encode_plus( + information_first_truncated = tokenizer( seq_0, seq_1, max_length=len(sequence) - 2, @@ -1148,7 +1171,7 @@ class TokenizerTesterMixin: self.assertEqual(len(overflowing_tokens), 2 + stride) self.assertEqual(overflowing_tokens, seq0_tokens[-(2 + stride) :]) - information_second_truncated = tokenizer.encode_plus( + information_second_truncated = tokenizer( seq_0, seq_1, max_length=len(sequence) - 2,