[tests|tokenizers] Refactoring pipelines test backbone - Small tokenizers improvements - General tests speedups (#7970)

* WIP refactoring pipeline tests - switching to fast tokenizers

* fix dialog pipeline and fill-mask

* refactoring pipeline tests backbone

* make large tests slow

* fix tests (tf Bart inactive for now)

* fix doc...

* clean up for merge

* fixing tests - remove bart from summarization until there is TF

* fix quality and RAG

* Add new translation pipeline tests - fix JAX tests

* only slow for dialog

* Fixing the missing TF-BART imports in modeling_tf_auto

* spin out pipeline tests in separate CI job

* adding pipeline test to CI YAML

* add slow pipeline tests

* speed up tf and pt join test to avoid redoing all the standalone pt and tf tests

* Update src/transformers/tokenization_utils_base.py

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

* Update src/transformers/pipelines.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/pipelines.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/testing_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* add require_torch and require_tf in is_pt_tf_cross_test

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
This commit is contained in:
Thomas Wolf
2020-10-23 15:58:19 +02:00
committed by GitHub
parent 88b3a91e61
commit 3a40cdf58d
32 changed files with 1587 additions and 1143 deletions

View File

@@ -175,6 +175,23 @@ class TokenSpan(NamedTuple):
end: int
def to_py_obj(obj):
    """
    Convert a TensorFlow tensor, PyTorch tensor, Numpy array, Numpy scalar or python list/tuple
    to plain python objects.

    Args:
        obj: The object to convert. Lists and tuples are converted element by element.

    Returns:
        A python list for lists, tuples, tensors and arrays with at least one dimension, a python scalar
        for zero-dimensional tensors/arrays and Numpy scalars, and :obj:`obj` itself for any other type.
    """
    if isinstance(obj, (list, tuple)):
        # Tuples are returned as lists as well, so nested containers end up with a single type.
        return [to_py_obj(o) for o in obj]
    if is_tf_available() and isinstance(obj, tf.Tensor):
        return obj.numpy().tolist()
    if is_torch_available() and isinstance(obj, torch.Tensor):
        return obj.detach().cpu().tolist()
    # `np.number` covers Numpy scalars (e.g. an ``np.int64`` obtained by indexing an array), which would
    # otherwise be returned unconverted; ``tolist()`` turns them into the matching python scalar.
    if isinstance(obj, (np.ndarray, np.number)):
        return obj.tolist()
    return obj
class BatchEncoding(UserDict):
"""
Holds the output of the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus`
@@ -1025,6 +1042,38 @@ class SpecialTokensMixin:
"""
return self.convert_tokens_to_ids(self.additional_special_tokens)
@bos_token_id.setter
def bos_token_id(self, value):
    """
    Set :obj:`bos_token` from a token id.

    Args:
        value (:obj:`int`, `optional`): Id of the token in the vocabulary. :obj:`None` unsets the token.
    """
    # `_bos_token` holds the token, not its id, so the id has to be mapped back to a token.
    self._bos_token = self.convert_ids_to_tokens(value) if value is not None else None
@eos_token_id.setter
def eos_token_id(self, value):
    """
    Set :obj:`eos_token` from a token id.

    Args:
        value (:obj:`int`, `optional`): Id of the token in the vocabulary. :obj:`None` unsets the token.
    """
    # `_eos_token` holds the token, not its id, so the id has to be mapped back to a token.
    self._eos_token = self.convert_ids_to_tokens(value) if value is not None else None
@unk_token_id.setter
def unk_token_id(self, value):
    """
    Set :obj:`unk_token` from a token id.

    Args:
        value (:obj:`int`, `optional`): Id of the token in the vocabulary. :obj:`None` unsets the token.
    """
    # `_unk_token` holds the token, not its id, so the id has to be mapped back to a token.
    self._unk_token = self.convert_ids_to_tokens(value) if value is not None else None
@sep_token_id.setter
def sep_token_id(self, value):
    """
    Set :obj:`sep_token` from a token id.

    Args:
        value (:obj:`int`, `optional`): Id of the token in the vocabulary. :obj:`None` unsets the token.
    """
    # `_sep_token` holds the token, not its id, so the id has to be mapped back to a token.
    self._sep_token = self.convert_ids_to_tokens(value) if value is not None else None
@pad_token_id.setter
def pad_token_id(self, value):
    """
    Set :obj:`pad_token` from a token id.

    Args:
        value (:obj:`int`, `optional`): Id of the token in the vocabulary. :obj:`None` unsets the token.
    """
    # `_pad_token` holds the token, not its id, so the id has to be mapped back to a token.
    self._pad_token = self.convert_ids_to_tokens(value) if value is not None else None
@cls_token_id.setter
def cls_token_id(self, value):
    """
    Set :obj:`cls_token` from a token id.

    Args:
        value (:obj:`int`, `optional`): Id of the token in the vocabulary. :obj:`None` unsets the token.
    """
    # `_cls_token` holds the token, not its id, so the id has to be mapped back to a token.
    self._cls_token = self.convert_ids_to_tokens(value) if value is not None else None
@mask_token_id.setter
def mask_token_id(self, value):
    """
    Set :obj:`mask_token` from a token id.

    Args:
        value (:obj:`int`, `optional`): Id of the token in the vocabulary. :obj:`None` unsets the token.
    """
    # `_mask_token` holds the token, not its id, so the id has to be mapped back to a token.
    self._mask_token = self.convert_ids_to_tokens(value) if value is not None else None
@additional_special_tokens_ids.setter
def additional_special_tokens_ids(self, values):
    """
    Set :obj:`additional_special_tokens` from a list of token ids.

    Args:
        values (:obj:`List[int]`): Ids of the tokens in the vocabulary.
    """
    # The matching getter converts the stored tokens to ids, so tokens (not ids) must be stored here.
    self._additional_special_tokens = [self.convert_ids_to_tokens(value) for value in values]
@property
def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
"""
@@ -1424,6 +1473,18 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
f"padding_side='{self.padding_side}', special_tokens={self.special_tokens_map_extended})"
)
def get_vocab(self) -> Dict[str, int]:
    """
    Return the vocabulary as a mapping from token to index.

    For any :obj:`token` that is in the vocab, :obj:`tokenizer.get_vocab()[token]` gives the same result as
    :obj:`tokenizer.convert_tokens_to_ids(token)`.

    Returns:
        :obj:`Dict[str, int]`: The vocabulary.

    Raises:
        NotImplementedError: Always, no implementation is provided here.
    """
    raise NotImplementedError
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
r"""
@@ -1852,6 +1913,32 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
"""
raise NotImplementedError
def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
    """
    Converts a string into a sequence of tokens, using the backend Rust tokenizer.

    Note that this method behaves differently between fast and slow tokenizers:

        - in fast tokenizers (instances of :class:`~transformers.PreTrainedTokenizerFast`), this method
          replaces the unknown tokens with the :obj:`unk_token`,
        - in slow tokenizers (instances of :class:`~transformers.PreTrainedTokenizer`), this method
          keeps unknown tokens unchanged.

    Args:
        text (:obj:`str`):
            The sequence to be encoded.
        pair (:obj:`str`, `optional`):
            A second sequence to be encoded with the first.
        add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to add the special tokens associated with the corresponding model.
        kwargs (additional keyword arguments, `optional`):
            Will be passed to the underlying model specific encode method.
            See details in :meth:`~transformers.PreTrainedTokenizer.__call__`

    Returns:
        :obj:`List[str]`: The list of tokens.

    Raises:
        NotImplementedError: Always, no implementation is provided here.
    """
    raise NotImplementedError()
@add_end_docstrings(
ENCODE_KWARGS_DOCSTRING,
"""
@@ -2456,18 +2543,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
f"Should be one of a python, numpy, pytorch or tensorflow object."
)
def to_py_obj(obj):
    """
    Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list/tuple to plain python
    objects; any other object is returned unchanged.
    """
    # Containers are converted element by element (tuples come back as lists).
    if isinstance(obj, (list, tuple)):
        return [to_py_obj(item) for item in obj]
    # Framework checks run first so the tensor classes are only touched when the framework is available.
    if is_tf_available() and isinstance(obj, tf.Tensor):
        return obj.numpy().tolist()
    if is_torch_available() and isinstance(obj, torch.Tensor):
        return obj.cpu().tolist()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return obj
for key, value in encoded_inputs.items():
encoded_inputs[key] = to_py_obj(value)
@@ -2862,33 +2937,53 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
return encoded_inputs
def convert_tokens_to_string(self, tokens: List[str]) -> str:
    """
    Converts a sequence of tokens into a single string.

    The simplest way to do it is ``" ".join(tokens)`` but we often want to remove
    sub-word tokenization artifacts at the same time.

    Args:
        tokens (:obj:`List[str]`): The tokens to join into a string.

    Returns:
        :obj:`str`: The joined tokens.

    Raises:
        NotImplementedError: Always, no implementation is provided here.
    """
    raise NotImplementedError()
def batch_decode(
    self,
    sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
    skip_special_tokens: bool = False,
    clean_up_tokenization_spaces: bool = True,
    **kwargs
) -> List[str]:
    """
    Convert a list of lists of token ids into a list of strings by calling decode.

    Args:
        sequences (:obj:`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
            List of tokenized input ids. Can be obtained using the ``__call__`` method.
        skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to remove special tokens in the decoding.
        clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not to clean up the tokenization spaces.
        kwargs (additional keyword arguments, `optional`):
            Will be passed to the underlying model specific decode method.

    Returns:
        :obj:`List[str]`: The list of decoded sentences.
    """
    # Each element obtained by iterating over `sequences` is decoded on its own, with the same options.
    return [
        self.decode(
            seq,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )
        for seq in sequences
    ]
def decode(
self,
token_ids: List[int],
token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = True,
**kwargs
@@ -2900,16 +2995,35 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
Args:
token_ids (:obj:`List[int]`):
token_ids (:obj:`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
List of tokenized input ids. Can be obtained using the ``__call__`` method.
skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to remove special tokens in the decoding.
clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to clean up the tokenization spaces.
kwargs (additional keyword arguments, `optional`):
Will be passed to the underlying model specific decode method.
Returns:
:obj:`str`: The decoded sentence.
"""
# Convert inputs to python lists
token_ids = to_py_obj(token_ids)
return self._decode(
token_ids=token_ids,
skip_special_tokens=skip_special_tokens,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)
def _decode(
    self,
    token_ids: Union[int, List[int]],
    skip_special_tokens: bool = False,
    clean_up_tokenization_spaces: bool = True,
    **kwargs
) -> str:
    """
    Decoding hook with the same arguments as :obj:`decode`.

    Raises:
        NotImplementedError: Always, no implementation is provided here.
    """
    raise NotImplementedError()
def get_special_tokens_mask(