Add pad_to_multiple_of on tokenizers (reimport) (#5054)

* Add new parameter `pad_to_multiple_of` on tokenizers.

* unittest for pad_to_multiple_of

* Add .name when logging enum.

* Fix missing .items() on dict in tests.

* Add special check + warning if the tokenizer doesn't have proper pad_token.

* Use the correct logger format specifier.

* Ensure tokenizer with no pad_token do not modify the underlying padding strategy.

* Skip test if tokenizer doesn't have pad_token

* Fix RobertaTokenizer on empty input

* Format.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* fix and updating to simpler API

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
This commit is contained in:
Funtowicz Morgan
2020-06-26 11:55:57 +02:00
committed by GitHub
parent 7cc15bdd96
commit 135791e8ef
5 changed files with 117 additions and 18 deletions

View File

@@ -960,6 +960,9 @@ ENCODE_KWARGS_DOCSTRING = r"""
The value of this argument defines the number of overlapping tokens.
is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
Set to True to indicate the input is already tokenized
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
>= 7.5 (Volta).
return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`):
Can be set to 'tf', 'pt' or 'np' to return respectively TensorFlow :obj:`tf.constant`,
PyTorch :obj:`torch.Tensor` or Numpy :oj: `np.ndarray` instead of a list of python integers.
@@ -1427,7 +1430,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
raise NotImplementedError
def _get_padding_truncation_strategies(
self, padding=False, truncation=False, max_length=None, verbose=True, **kwargs
self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
):
""" Find the correct padding/truncation strategy with backward compatibility
for old arguments (truncation_strategy and pad_to_max_length) and behaviors.
@@ -1527,6 +1530,19 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
"or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
)
# Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
if (
truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
and padding_strategy != PaddingStrategy.DO_NOT_PAD
and pad_to_multiple_of is not None
and max_length is not None
and (max_length % pad_to_multiple_of != 0)
):
raise ValueError(
f"Truncation and padding are both activated but "
f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
)
return padding_strategy, truncation_strategy, max_length, kwargs
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
@@ -1540,6 +1556,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
max_length: Optional[int] = None,
stride: int = 0,
is_pretokenized: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -1581,6 +1598,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
max_length=max_length,
stride=stride,
is_pretokenized=is_pretokenized,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -1601,6 +1619,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
max_length=max_length,
stride=stride,
is_pretokenized=is_pretokenized,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -1623,6 +1642,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
max_length: Optional[int] = None,
stride: int = 0,
is_pretokenized: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -1650,7 +1670,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding, truncation, max_length, verbose, **kwargs
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
return self._encode_plus(
@@ -1662,6 +1687,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
max_length=max_length,
stride=stride,
is_pretokenized=is_pretokenized,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -1683,6 +1709,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
max_length: Optional[int] = None,
stride: int = 0,
is_pretokenized: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -1712,6 +1739,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
max_length: Optional[int] = None,
stride: int = 0,
is_pretokenized: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -1738,7 +1766,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding, truncation, max_length, verbose, **kwargs
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
return self._batch_encode_plus(
@@ -1749,6 +1782,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
max_length=max_length,
stride=stride,
is_pretokenized=is_pretokenized,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -1776,6 +1810,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
max_length: Optional[int] = None,
stride: int = 0,
is_pretokenized: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -1799,6 +1834,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
],
padding: Union[bool, str] = True,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
verbose: bool = True,
@@ -1820,6 +1856,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
- 'do_not_pad' (or `False`): Do not pad
max_length: maximum length of the returned list and optionally padding length (see below).
Will truncate by taking into account the special tokens.
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
>= 7.5 (Volta).
return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics)
return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`):
Can be set to 'tf', 'pt' or 'np' to return respectively TensorFlow :obj:`tf.constant`,
@@ -1842,7 +1881,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
encoded_inputs["attention_mask"] = []
return encoded_inputs
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
# Convert padding_strategy in PaddingStrategy
padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
padding=padding, max_length=max_length, verbose=verbose
)
@@ -1852,6 +1891,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
encoded_inputs,
max_length=max_length,
padding_strategy=padding_strategy,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
)
return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
@@ -1872,6 +1912,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
inputs,
max_length=max_length,
padding_strategy=padding_strategy,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
)
@@ -1887,6 +1928,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
""" Pad encoded inputs (on left/right and up to predefined legnth or max length in the batch)
@@ -1902,6 +1944,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
The tokenizer padding sides are defined in self.padding_side:
- 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
>= 7.5 (Volta).
return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
# Load from model defaults
@@ -1911,6 +1956,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
if padding_strategy == PaddingStrategy.LONGEST:
max_length = len(encoded_inputs["input_ids"])
if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
needs_to_be_padded = (
padding_strategy != PaddingStrategy.DO_NOT_PAD and len(encoded_inputs["input_ids"]) != max_length
)