From 135791e8ef12802ceb21a4abbb3a93f7da1bf390 Mon Sep 17 00:00:00 2001 From: Funtowicz Morgan Date: Fri, 26 Jun 2020 11:55:57 +0200 Subject: [PATCH] Add pad_to_multiple_of on tokenizers (reimport) (#5054) * Add new parameter `pad_to_multiple_of` on tokenizers. * unittest for pad_to_multiple_of * Add .name when logging enum. * Fix missing .items() on dict in tests. * Add special check + warning if the tokenizer doesn't have proper pad_token. * Use the correct logger format specifier. * Ensure tokenizer with no pad_token do not modify the underlying padding strategy. * Skip test if tokenizer doesn't have pad_token * Fix RobertaTokenizer on empty input * Format. Signed-off-by: Morgan Funtowicz * fix and updating to simpler API Co-authored-by: Thomas Wolf --- src/transformers/tokenization_roberta.py | 2 +- src/transformers/tokenization_utils.py | 13 ++++- src/transformers/tokenization_utils_base.py | 56 +++++++++++++++++++-- src/transformers/tokenization_utils_fast.py | 30 ++++++----- tests/test_tokenization_common.py | 34 +++++++++++++ 5 files changed, 117 insertions(+), 18 deletions(-) diff --git a/src/transformers/tokenization_roberta.py b/src/transformers/tokenization_roberta.py index f5ec1f3a00..19b482976c 100644 --- a/src/transformers/tokenization_roberta.py +++ b/src/transformers/tokenization_roberta.py @@ -244,7 +244,7 @@ class RobertaTokenizer(GPT2Tokenizer): def prepare_for_tokenization(self, text, is_pretokenized=False, **kwargs): add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) - if (is_pretokenized or add_prefix_space) and text: + if (is_pretokenized or add_prefix_space) and (len(text) > 0 and not text[0].isspace()): text = " " + text return (text, kwargs) diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index 405f7cf17b..4a69ecc725 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -409,6 +409,7 @@ class 
PreTrainedTokenizer(PreTrainedTokenizerBase): max_length: Optional[int] = None, stride: int = 0, is_pretokenized: bool = False, + pad_to_multiple_of: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -461,6 +462,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): truncation_strategy=truncation_strategy, max_length=max_length, stride=stride, + pad_to_multiple_of=pad_to_multiple_of, return_tensors=return_tensors, prepend_batch_axis=True, return_attention_mask=return_attention_mask, @@ -487,6 +489,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): max_length: Optional[int] = None, stride: int = 0, is_pretokenized: bool = False, + pad_to_multiple_of: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -541,6 +544,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): truncation_strategy=truncation_strategy, max_length=max_length, stride=stride, + pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -561,6 +565,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, max_length: Optional[int] = None, stride: int = 0, + pad_to_multiple_of: Optional[int] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -587,6 +592,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): truncation_strategy=truncation_strategy, max_length=max_length, stride=stride, + pad_to_multiple_of=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, 
return_overflowing_tokens=return_overflowing_tokens, @@ -606,6 +612,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): batch_outputs, padding=padding_strategy.value, max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) @@ -623,6 +630,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, max_length: Optional[int] = None, stride: int = 0, + pad_to_multiple_of: Optional[int] = None, return_tensors: Optional[str] = None, prepend_batch_axis: bool = False, return_token_type_ids: Optional[bool] = None, @@ -654,8 +662,10 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): encoded_inputs = {} - # Truncation: Handle max sequence length + # Compute the total size of the returned encodings total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0) + + # Truncation: Handle max sequence length if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length: ids, pair_ids, overflowing_tokens = self.truncate_sequences( ids, @@ -700,6 +710,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): encoded_inputs, max_length=max_length, padding=padding_strategy.value, + pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 2d5c62aa0e..13e24bf02c 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -960,6 +960,9 @@ ENCODE_KWARGS_DOCSTRING = r""" The value of this argument defines the number of overlapping tokens. is_pretokenized (:obj:`bool`, defaults to :obj:`False`): Set to True to indicate the input is already tokenized + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + >= 7.0 (Volta). return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): Can be set to 'tf', 'pt' or 'np' to return respectively TensorFlow :obj:`tf.constant`, PyTorch :obj:`torch.Tensor` or Numpy :oj: `np.ndarray` instead of a list of python integers. @@ -1427,7 +1430,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): raise NotImplementedError def _get_padding_truncation_strategies( - self, padding=False, truncation=False, max_length=None, verbose=True, **kwargs + self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs ): """ Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy and pad_to_max_length) and behaviors. @@ -1527,6 +1530,19 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`." ) + # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided + if ( + truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE + and padding_strategy != PaddingStrategy.DO_NOT_PAD + and pad_to_multiple_of is not None + and max_length is not None + and (max_length % pad_to_multiple_of != 0) + ): + raise ValueError( + f"Truncation and padding are both activated but " + f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})." 
+ ) + return padding_strategy, truncation_strategy, max_length, kwargs @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) @@ -1540,6 +1556,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): max_length: Optional[int] = None, stride: int = 0, is_pretokenized: bool = False, + pad_to_multiple_of: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1581,6 +1598,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): max_length=max_length, stride=stride, is_pretokenized=is_pretokenized, + pad_to_multiple_of=pad_to_multiple_of, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -1601,6 +1619,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): max_length=max_length, stride=stride, is_pretokenized=is_pretokenized, + pad_to_multiple_of=pad_to_multiple_of, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -1623,6 +1642,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): max_length: Optional[int] = None, stride: int = 0, is_pretokenized: bool = False, + pad_to_multiple_of: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1650,7 +1670,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( - padding, truncation, max_length, verbose, **kwargs + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, ) return self._encode_plus( @@ -1662,6 +1687,7 @@ class 
PreTrainedTokenizerBase(SpecialTokensMixin): max_length=max_length, stride=stride, is_pretokenized=is_pretokenized, + pad_to_multiple_of=pad_to_multiple_of, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -1683,6 +1709,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): max_length: Optional[int] = None, stride: int = 0, is_pretokenized: bool = False, + pad_to_multiple_of: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1712,6 +1739,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): max_length: Optional[int] = None, stride: int = 0, is_pretokenized: bool = False, + pad_to_multiple_of: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1738,7 +1766,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( - padding, truncation, max_length, verbose, **kwargs + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, ) return self._batch_encode_plus( @@ -1749,6 +1782,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): max_length=max_length, stride=stride, is_pretokenized=is_pretokenized, + pad_to_multiple_of=pad_to_multiple_of, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -1776,6 +1810,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): max_length: Optional[int] = None, stride: int = 0, is_pretokenized: bool = False, + pad_to_multiple_of: Optional[int] = None, return_tensors: Optional[Union[str, 
TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1799,6 +1834,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ], padding: Union[bool, str] = True, max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, return_attention_mask: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, verbose: bool = True, @@ -1820,6 +1856,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): - 'do_not_pad' (or `False`): Do not pad max_length: maximum length of the returned list and optionally padding length (see below). Will truncate by taking into account the special tokens. + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + >= 7.0 (Volta). return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): Can be set to 'tf', 'pt' or 'np' to return respectively TensorFlow :obj:`tf.constant`, @@ -1842,7 +1881,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): encoded_inputs["attention_mask"] = [] return encoded_inputs - # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + # Convert padding_strategy in PaddingStrategy padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies( padding=padding, max_length=max_length, verbose=verbose ) @@ -1852,6 +1891,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): encoded_inputs, max_length=max_length, padding_strategy=padding_strategy, + pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) return BatchEncoding(encoded_inputs, tensor_type=return_tensors) @@ -1872,6 +1912,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): inputs, max_length=max_length, 
padding_strategy=padding_strategy, + pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) @@ -1887,6 +1928,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ Pad encoded inputs (on left/right and up to predefined legnth or max length in the batch) @@ -1902,6 +1944,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): The tokenizer padding sides are defined in self.padding_side: - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + >= 7.0 (Volta). 
return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ # Load from model defaults @@ -1911,6 +1956,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): if padding_strategy == PaddingStrategy.LONGEST: max_length = len(encoded_inputs["input_ids"]) + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + needs_to_be_padded = ( padding_strategy != PaddingStrategy.DO_NOT_PAD and len(encoded_inputs["input_ids"]) != max_length ) diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py index 199445d2d2..f78b6313ec 100644 --- a/src/transformers/tokenization_utils_fast.py +++ b/src/transformers/tokenization_utils_fast.py @@ -241,25 +241,26 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): return self._tokenizer.encode(text, pair, add_special_tokens=add_special_tokens).tokens def set_truncation_and_padding( - self, padding_strategy: PaddingStrategy, truncation_strategy: TruncationStrategy, max_length: int, stride: int, + self, + padding_strategy: PaddingStrategy, + truncation_strategy: TruncationStrategy, + max_length: int, + stride: int, + pad_to_multiple_of: Optional[int], ): - """ This contextmanager is in charge of defining the truncation and the padding strategies for fast tokenizers + """ Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers library) and restore the tokenizer settings afterwards. - This contextmanager assumes the provider tokenizer has no padding / truncation strategy + The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a padding / truncation strategy before, then it will be reset to no padding/truncation when exiting the managed section. 
Args: - tokenizer (BaseTokenizerFast): The tokenizer which will be used - max_length (int): The maximum size of the sequence - stride (int): The stride to use when handling overflow - strategy (str): Overflowing logic to use - pad_to_max_length (bool): Boolean indicating if the output needs to be padded up to max_length - padding_side (str): "left" or "right" indicating the direction the output sequence will be padded - pad_token_id (int): The integer representation of the padding token to use - pad_token_type_id (int): The integer representation of the padding token type to use - pad_token (str): The string representation of the padding token to use + padding_strategy (:obj:`PaddingStrategy`): The kind of padding that will be applied to the input + truncation_strategy (:obj:`TruncationStrategy`): The kind of truncation that will be applied to the input + max_length (:obj:`int`): The maximum size of the sequence + stride (:obj:`int`): The stride to use when handling overflow + pad_to_multiple_of (:obj:`int`, `optional`, defaults to `None`): If set, pad the sequence to a multiple of the provided value """ # Set truncation and padding on the backend tokenizer @@ -275,6 +276,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): pad_id=self.pad_token_id, pad_type_id=self.pad_token_type_id, pad_token=self.pad_token, + pad_to_multiple_of=pad_to_multiple_of, ) else: self._tokenizer.no_padding() @@ -290,6 +292,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): max_length: Optional[int] = None, stride: int = 0, is_pretokenized: bool = False, + pad_to_multiple_of: Optional[int] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -315,6 +318,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): truncation_strategy=truncation_strategy, max_length=max_length, stride=stride, + pad_to_multiple_of=pad_to_multiple_of, ) # Avoid thread overhead if only one example. 
@@ -383,6 +387,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): max_length: Optional[int] = None, stride: int = 0, is_pretokenized: bool = False, + pad_to_multiple_of: Optional[int] = None, return_tensors: Optional[bool] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -403,6 +408,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): truncation_strategy=truncation_strategy, max_length=max_length, stride=stride, + pad_to_multiple_of=pad_to_multiple_of, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 08d3d24a09..92098916ad 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -883,6 +883,40 @@ class TokenizerTesterMixin: assert sequence_length == padded_sequence_right_length assert encoded_sequence == padded_sequence_right + def test_padding_to_multiple_of(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + if tokenizer.pad_token is None: + self.skipTest("No padding token.") + else: + with self.subTest(f"{tokenizer.__class__.__name__}"): + empty_tokens = tokenizer("", padding=True, pad_to_multiple_of=8) + normal_tokens = tokenizer("This is a sample input", padding=True, pad_to_multiple_of=8) + for key, value in empty_tokens.items(): + self.assertEqual(len(value) % 8, 0, "BatchEncoding.{} is not multiple of 8".format(key)) + for key, value in normal_tokens.items(): + self.assertEqual(len(value) % 8, 0, "BatchEncoding.{} is not multiple of 8".format(key)) + + normal_tokens = tokenizer("This", pad_to_multiple_of=8) + for key, value in normal_tokens.items(): + self.assertNotEqual(len(value) % 8, 0, "BatchEncoding.{} is not multiple of 8".format(key)) + + # Should also work with truncation + normal_tokens = tokenizer("This", padding=True, truncation=True, pad_to_multiple_of=8) + 
for key, value in normal_tokens.items(): + self.assertEqual(len(value) % 8, 0, "BatchEncoding.{} is not multiple of 8".format(key)) + + # truncation to something which is not a multiple of pad_to_multiple_of raises an error + self.assertRaises( + ValueError, + tokenizer.__call__, + "This", + padding=True, + truncation=True, + max_length=12, + pad_to_multiple_of=8, + ) + def test_encode_plus_with_padding(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: