Fixes to make life easier with the nlp library (#6423)
* allow using tokenizer.pad as a collate_fn in pytorch * allow using tokenizer.pad as a collate_fn in pytorch * Add documentation and tests * Make attention mask the right shape * Better test Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
This commit is contained in:
@@ -2318,7 +2318,7 @@ class ConversationalPipeline(Pipeline):
|
|||||||
max_len = max([len(item) for item in outputs])
|
max_len = max([len(item) for item in outputs])
|
||||||
outputs = [output + [self.pad_token_id] * (max_len - len(output)) for output in outputs]
|
outputs = [output + [self.pad_token_id] * (max_len - len(output)) for output in outputs]
|
||||||
outputs = BatchEncoding(
|
outputs = BatchEncoding(
|
||||||
{"input_ids": outputs, "attention_mask": [1] * len(outputs)}, tensor_type=self.framework
|
{"input_ids": outputs, "attention_mask": [[1] * len(outputs)]}, tensor_type=self.framework,
|
||||||
)
|
)
|
||||||
return outputs
|
return outputs
|
||||||
|
|
||||||
|
|||||||
@@ -553,11 +553,12 @@ class BatchEncoding(UserDict):
|
|||||||
|
|
||||||
tensor = as_tensor(value)
|
tensor = as_tensor(value)
|
||||||
|
|
||||||
# at-least2d
|
# Removing this for now in favor of controling the shape with `prepend_batch_axis`
|
||||||
if tensor.ndim > 2:
|
# # at-least2d
|
||||||
tensor = tensor.squeeze(0)
|
# if tensor.ndim > 2:
|
||||||
elif tensor.ndim < 2:
|
# tensor = tensor.squeeze(0)
|
||||||
tensor = tensor[None, :]
|
# elif tensor.ndim < 2:
|
||||||
|
# tensor = tensor[None, :]
|
||||||
|
|
||||||
self[key] = tensor
|
self[key] = tensor
|
||||||
except: # noqa E722
|
except: # noqa E722
|
||||||
@@ -589,43 +590,6 @@ class BatchEncoding(UserDict):
|
|||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
# class AddedToken(UserString):
|
|
||||||
# """ AddedToken represents a token to be added to a Tokenizer
|
|
||||||
|
|
||||||
# An AddedToken can have special options defining the way it should behave.
|
|
||||||
|
|
||||||
# Args:
|
|
||||||
# content: str:
|
|
||||||
# The content of the token
|
|
||||||
|
|
||||||
# single_word: bool
|
|
||||||
# Whether this token should only match against single word. If True,
|
|
||||||
# this token will never match inside of a word.
|
|
||||||
|
|
||||||
# lstrip: bool
|
|
||||||
# Whether this token should strip all potential whitespaces on the left side.
|
|
||||||
# If True, this token will greedily match any whitespace on the left and then strip
|
|
||||||
# them out.
|
|
||||||
|
|
||||||
# rstrip: bool
|
|
||||||
# Whether this token should strip all potential whitespaces on the right side.
|
|
||||||
# If True, this token will greedily match any whitespace on the right and then strip
|
|
||||||
# them out.
|
|
||||||
# """
|
|
||||||
|
|
||||||
# def __init__(
|
|
||||||
# self, data: str, single_word: bool = False, lstrip: bool = False, rstrip: bool = False,
|
|
||||||
# ):
|
|
||||||
# super().__init__(data)
|
|
||||||
|
|
||||||
# self._single_word = single_word
|
|
||||||
# self._lstrip = lstrip
|
|
||||||
# self._rstrip = rstrip
|
|
||||||
|
|
||||||
# def lower(self):
|
|
||||||
# return AddedToken(self.data.lower(), self._single_word, self._lstrip, self._rstrip)
|
|
||||||
|
|
||||||
|
|
||||||
class SpecialTokensMixin:
|
class SpecialTokensMixin:
|
||||||
"""
|
"""
|
||||||
A mixin derived by :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast`
|
A mixin derived by :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast`
|
||||||
@@ -2225,12 +2189,21 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
|||||||
Padding side (left/right) padding token ids are defined at the tokenizer level
|
Padding side (left/right) padding token ids are defined at the tokenizer level
|
||||||
(with ``self.padding_side``, ``self.pad_token_id`` and ``self.pad_token_type_id``)
|
(with ``self.padding_side``, ``self.pad_token_id`` and ``self.pad_token_type_id``)
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
If the ``encoded_inputs`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
|
||||||
|
result will use the same type unless you provide a different tensor type with ``return_tensors``. In the
|
||||||
|
case of PyTorch tensors, you will lose the specific device of your tensors however.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`):
|
encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`):
|
||||||
Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or
|
Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or
|
||||||
:obj:`Dict[str, List[int]]`) or a batch of tokenized inputs (list of
|
:obj:`Dict[str, List[int]]`) or a batch of tokenized inputs (list of
|
||||||
:class:`~transformers.BatchEncoding`, `Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`) so
|
:class:`~transformers.BatchEncoding`, `Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`) so
|
||||||
you can use this method during preprocessing as well as in a PyTorch Dataloader collate function.
|
you can use this method during preprocessing as well as in a PyTorch Dataloader collate function.
|
||||||
|
|
||||||
|
Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
|
||||||
|
see the note above for the return type.
|
||||||
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
|
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
|
||||||
Select a strategy to pad the returned sequences (according to the model's padding side and padding
|
Select a strategy to pad the returned sequences (according to the model's padding side and padding
|
||||||
index) among:
|
index) among:
|
||||||
@@ -2263,6 +2236,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
|||||||
Whether or not to print informations and warnings.
|
Whether or not to print informations and warnings.
|
||||||
"""
|
"""
|
||||||
# If we have a list of dicts, let's convert it in a dict of lists
|
# If we have a list of dicts, let's convert it in a dict of lists
|
||||||
|
# We do this to allow using this method as a collate_fn function in PyTorch Dataloader
|
||||||
if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)):
|
if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)):
|
||||||
encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
|
encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
|
||||||
|
|
||||||
@@ -2277,6 +2251,40 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
|||||||
encoded_inputs["attention_mask"] = []
|
encoded_inputs["attention_mask"] = []
|
||||||
return encoded_inputs
|
return encoded_inputs
|
||||||
|
|
||||||
|
# If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
|
||||||
|
# and rebuild them afterwards if no return_tensors is specified
|
||||||
|
# Note that we lose the specific device the tensor may be on for PyTorch
|
||||||
|
first_element = encoded_inputs["input_ids"][0]
|
||||||
|
if isinstance(first_element, (list, tuple)) and first_element:
|
||||||
|
first_element = first_element[0]
|
||||||
|
if not isinstance(first_element, int):
|
||||||
|
if is_tf_available() and isinstance(first_element, tf.Tensor):
|
||||||
|
return_tensors = "tf" if return_tensors is None else return_tensors
|
||||||
|
elif is_torch_available() and isinstance(first_element, torch.Tensor):
|
||||||
|
return_tensors = "pt" if return_tensors is None else return_tensors
|
||||||
|
elif isinstance(first_element, np.ndarray):
|
||||||
|
return_tensors = "np" if return_tensors is None else return_tensors
|
||||||
|
else:
|
||||||
|
raise ValueError(
|
||||||
|
f"type of {first_element} unknown: {type(first_element)}. "
|
||||||
|
f"Should be one of a python, numpy, pytorch or tensorflow object."
|
||||||
|
)
|
||||||
|
|
||||||
|
def to_py_obj(obj):
|
||||||
|
if isinstance(obj, (list, tuple)):
|
||||||
|
return [to_py_obj(o) for o in obj]
|
||||||
|
elif is_tf_available() and isinstance(obj, tf.Tensor):
|
||||||
|
return obj.numpy().tolist()
|
||||||
|
elif is_torch_available() and isinstance(obj, torch.Tensor):
|
||||||
|
return obj.cpu().tolist()
|
||||||
|
elif isinstance(obj, np.ndarray):
|
||||||
|
return obj.tolist()
|
||||||
|
else:
|
||||||
|
return obj
|
||||||
|
|
||||||
|
for key, value in encoded_inputs.items():
|
||||||
|
encoded_inputs[key] = to_py_obj(value)
|
||||||
|
|
||||||
# Convert padding_strategy in PaddingStrategy
|
# Convert padding_strategy in PaddingStrategy
|
||||||
padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
|
padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
|
||||||
padding=padding, max_length=max_length, verbose=verbose
|
padding=padding, max_length=max_length, verbose=verbose
|
||||||
|
|||||||
@@ -16,6 +16,8 @@ import pickle
|
|||||||
import unittest
|
import unittest
|
||||||
from typing import Callable, Optional
|
from typing import Callable, Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
from transformers import BatchEncoding, BertTokenizer, BertTokenizerFast, PreTrainedTokenizer, TensorType
|
from transformers import BatchEncoding, BertTokenizer, BertTokenizerFast, PreTrainedTokenizer, TensorType
|
||||||
from transformers.testing_utils import require_tf, require_torch, slow
|
from transformers.testing_utils import require_tf, require_torch, slow
|
||||||
from transformers.tokenization_gpt2 import GPT2Tokenizer
|
from transformers.tokenization_gpt2 import GPT2Tokenizer
|
||||||
@@ -135,3 +137,77 @@ class TokenizerUtilsTest(unittest.TestCase):
|
|||||||
|
|
||||||
with self.subTest("Rust Tokenizer"):
|
with self.subTest("Rust Tokenizer"):
|
||||||
self.assertTrue(tokenizer_r("Small example to_encode").is_fast)
|
self.assertTrue(tokenizer_r("Small example to_encode").is_fast)
|
||||||
|
|
||||||
|
def test_batch_encoding_with_labels(self):
|
||||||
|
batch = BatchEncoding({"inputs": [[1, 2, 3], [4, 5, 6]], "labels": [0, 1]})
|
||||||
|
tensor_batch = batch.convert_to_tensors(tensor_type="np")
|
||||||
|
self.assertEqual(tensor_batch["inputs"].shape, (2, 3))
|
||||||
|
self.assertEqual(tensor_batch["labels"].shape, (2,))
|
||||||
|
|
||||||
|
batch = BatchEncoding({"inputs": [1, 2, 3], "labels": 0})
|
||||||
|
tensor_batch = batch.convert_to_tensors(tensor_type="np", prepend_batch_axis=True)
|
||||||
|
self.assertEqual(tensor_batch["inputs"].shape, (1, 3))
|
||||||
|
self.assertEqual(tensor_batch["labels"].shape, (1,))
|
||||||
|
|
||||||
|
@require_torch
|
||||||
|
def test_batch_encoding_with_labels_pt(self):
|
||||||
|
batch = BatchEncoding({"inputs": [[1, 2, 3], [4, 5, 6]], "labels": [0, 1]})
|
||||||
|
tensor_batch = batch.convert_to_tensors(tensor_type="pt")
|
||||||
|
self.assertEqual(tensor_batch["inputs"].shape, (2, 3))
|
||||||
|
self.assertEqual(tensor_batch["labels"].shape, (2,))
|
||||||
|
|
||||||
|
batch = BatchEncoding({"inputs": [1, 2, 3], "labels": 0})
|
||||||
|
tensor_batch = batch.convert_to_tensors(tensor_type="pt", prepend_batch_axis=True)
|
||||||
|
self.assertEqual(tensor_batch["inputs"].shape, (1, 3))
|
||||||
|
self.assertEqual(tensor_batch["labels"].shape, (1,))
|
||||||
|
|
||||||
|
@require_tf
|
||||||
|
def test_batch_encoding_with_labels_tf(self):
|
||||||
|
batch = BatchEncoding({"inputs": [[1, 2, 3], [4, 5, 6]], "labels": [0, 1]})
|
||||||
|
tensor_batch = batch.convert_to_tensors(tensor_type="tf")
|
||||||
|
self.assertEqual(tensor_batch["inputs"].shape, (2, 3))
|
||||||
|
self.assertEqual(tensor_batch["labels"].shape, (2,))
|
||||||
|
|
||||||
|
batch = BatchEncoding({"inputs": [1, 2, 3], "labels": 0})
|
||||||
|
tensor_batch = batch.convert_to_tensors(tensor_type="tf", prepend_batch_axis=True)
|
||||||
|
self.assertEqual(tensor_batch["inputs"].shape, (1, 3))
|
||||||
|
self.assertEqual(tensor_batch["labels"].shape, (1,))
|
||||||
|
|
||||||
|
def test_padding_accepts_tensors(self):
|
||||||
|
features = [{"input_ids": np.array([0, 1, 2])}, {"input_ids": np.array([0, 1, 2, 3])}]
|
||||||
|
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
|
||||||
|
|
||||||
|
batch = tokenizer.pad(features, padding=True)
|
||||||
|
self.assertTrue(isinstance(batch["input_ids"], np.ndarray))
|
||||||
|
self.assertEqual(batch["input_ids"].tolist(), [[0, 1, 2, tokenizer.pad_token_id], [0, 1, 2, 3]])
|
||||||
|
batch = tokenizer.pad(features, padding=True, return_tensors="np")
|
||||||
|
self.assertTrue(isinstance(batch["input_ids"], np.ndarray))
|
||||||
|
self.assertEqual(batch["input_ids"].tolist(), [[0, 1, 2, tokenizer.pad_token_id], [0, 1, 2, 3]])
|
||||||
|
|
||||||
|
@require_torch
|
||||||
|
def test_padding_accepts_tensors_pt(self):
|
||||||
|
import torch
|
||||||
|
|
||||||
|
features = [{"input_ids": torch.tensor([0, 1, 2])}, {"input_ids": torch.tensor([0, 1, 2, 3])}]
|
||||||
|
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
|
||||||
|
|
||||||
|
batch = tokenizer.pad(features, padding=True)
|
||||||
|
self.assertTrue(isinstance(batch["input_ids"], torch.Tensor))
|
||||||
|
self.assertEqual(batch["input_ids"].tolist(), [[0, 1, 2, tokenizer.pad_token_id], [0, 1, 2, 3]])
|
||||||
|
batch = tokenizer.pad(features, padding=True, return_tensors="pt")
|
||||||
|
self.assertTrue(isinstance(batch["input_ids"], torch.Tensor))
|
||||||
|
self.assertEqual(batch["input_ids"].tolist(), [[0, 1, 2, tokenizer.pad_token_id], [0, 1, 2, 3]])
|
||||||
|
|
||||||
|
@require_tf
|
||||||
|
def test_padding_accepts_tensors_tf(self):
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
features = [{"input_ids": tf.constant([0, 1, 2])}, {"input_ids": tf.constant([0, 1, 2, 3])}]
|
||||||
|
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
|
||||||
|
|
||||||
|
batch = tokenizer.pad(features, padding=True)
|
||||||
|
self.assertTrue(isinstance(batch["input_ids"], tf.Tensor))
|
||||||
|
self.assertEqual(batch["input_ids"].numpy().tolist(), [[0, 1, 2, tokenizer.pad_token_id], [0, 1, 2, 3]])
|
||||||
|
batch = tokenizer.pad(features, padding=True, return_tensors="tf")
|
||||||
|
self.assertTrue(isinstance(batch["input_ids"], tf.Tensor))
|
||||||
|
self.assertEqual(batch["input_ids"].numpy().tolist(), [[0, 1, 2, tokenizer.pad_token_id], [0, 1, 2, 3]])
|
||||||
|
|||||||
Reference in New Issue
Block a user