Fixes to make life easier with the nlp library (#6423)

* allow using tokenizer.pad as a collate_fn in pytorch

* allow using tokenizer.pad as a collate_fn in pytorch

* Add documentation and tests

* Make attention mask the right shape

* Better test

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
This commit is contained in:
Sylvain Gugger
2020-08-12 08:00:56 -04:00
committed by GitHub
parent 87b359439f
commit e9c3031463
3 changed files with 127 additions and 43 deletions

View File

@@ -553,11 +553,12 @@ class BatchEncoding(UserDict):
tensor = as_tensor(value)
# at-least2d
if tensor.ndim > 2:
tensor = tensor.squeeze(0)
elif tensor.ndim < 2:
tensor = tensor[None, :]
# Removing this for now in favor of controling the shape with `prepend_batch_axis`
# # at-least2d
# if tensor.ndim > 2:
# tensor = tensor.squeeze(0)
# elif tensor.ndim < 2:
# tensor = tensor[None, :]
self[key] = tensor
except: # noqa E722
@@ -589,43 +590,6 @@ class BatchEncoding(UserDict):
return self
# class AddedToken(UserString):
# """ AddedToken represents a token to be added to a Tokenizer
# An AddedToken can have special options defining the way it should behave.
# Args:
# content: str:
# The content of the token
# single_word: bool
# Whether this token should only match against single word. If True,
# this token will never match inside of a word.
# lstrip: bool
# Whether this token should strip all potential whitespaces on the left side.
# If True, this token will greedily match any whitespace on the left and then strip
# them out.
# rstrip: bool
# Whether this token should strip all potential whitespaces on the right side.
# If True, this token will greedily match any whitespace on the right and then strip
# them out.
# """
# def __init__(
# self, data: str, single_word: bool = False, lstrip: bool = False, rstrip: bool = False,
# ):
# super().__init__(data)
# self._single_word = single_word
# self._lstrip = lstrip
# self._rstrip = rstrip
# def lower(self):
# return AddedToken(self.data.lower(), self._single_word, self._lstrip, self._rstrip)
class SpecialTokensMixin:
"""
A mixin derived by :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast`
@@ -2225,12 +2189,21 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Padding side (left/right) padding token ids are defined at the tokenizer level
(with ``self.padding_side``, ``self.pad_token_id`` and ``self.pad_token_type_id``)
.. note::
If the ``encoded_inputs`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
result will use the same type unless you provide a different tensor type with ``return_tensors``. In the
case of PyTorch tensors, you will lose the specific device of your tensors however.
Args:
encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`):
Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or
:obj:`Dict[str, List[int]]`) or a batch of tokenized inputs (list of
:class:`~transformers.BatchEncoding`, `Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`) so
you can use this method during preprocessing as well as in a PyTorch Dataloader collate function.
Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
see the note above for the return type.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
index) among:
@@ -2263,6 +2236,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Whether or not to print informations and warnings.
"""
# If we have a list of dicts, let's convert it in a dict of lists
# We do this to allow using this method as a collate_fn function in PyTorch Dataloader
if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)):
encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
@@ -2277,6 +2251,40 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
encoded_inputs["attention_mask"] = []
return encoded_inputs
# If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
# and rebuild them afterwards if no return_tensors is specified
# Note that we lose the specific device the tensor may be on for PyTorch
first_element = encoded_inputs["input_ids"][0]
if isinstance(first_element, (list, tuple)) and first_element:
first_element = first_element[0]
if not isinstance(first_element, int):
if is_tf_available() and isinstance(first_element, tf.Tensor):
return_tensors = "tf" if return_tensors is None else return_tensors
elif is_torch_available() and isinstance(first_element, torch.Tensor):
return_tensors = "pt" if return_tensors is None else return_tensors
elif isinstance(first_element, np.ndarray):
return_tensors = "np" if return_tensors is None else return_tensors
else:
raise ValueError(
f"type of {first_element} unknown: {type(first_element)}. "
f"Should be one of a python, numpy, pytorch or tensorflow object."
)
def to_py_obj(obj):
if isinstance(obj, (list, tuple)):
return [to_py_obj(o) for o in obj]
elif is_tf_available() and isinstance(obj, tf.Tensor):
return obj.numpy().tolist()
elif is_torch_available() and isinstance(obj, torch.Tensor):
return obj.cpu().tolist()
elif isinstance(obj, np.ndarray):
return obj.tolist()
else:
return obj
for key, value in encoded_inputs.items():
encoded_inputs[key] = to_py_obj(value)
# Convert padding_strategy in PaddingStrategy
padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
padding=padding, max_length=max_length, verbose=verbose