[tests|tokenizers] Refactoring pipelines test backbone - Small tokenizers improvements - General tests speedups (#7970)

* WIP refactoring pipeline tests - switching to fast tokenizers * fix dialog pipeline and fill-mask * refactoring pipeline tests backbone * make large tests slow * fix tests (tf Bart inactive for now) * fix doc... * clean up for merge * fixing tests - remove bart from summarization until there is TF * fix quality and RAG * Add new translation pipeline tests - fix JAX tests * only slow for dialog * Fixing the missing TF-BART imports in modeling_tf_auto * spin out pipeline tests in separate CI job * adding pipeline test to CI YAML * add slow pipeline tests * speed up tf and pt join test to avoid redoing all the standalone pt and tf tests * Update src/transformers/tokenization_utils_base.py Co-authored-by: Sam Shleifer <sshleifer@gmail.com> * Update src/transformers/pipelines.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/pipelines.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Update src/transformers/testing_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * add require_torch and require_tf in is_pt_tf_cross_test Co-authored-by: Sam Shleifer <sshleifer@gmail.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-10-23 15:58:19 +02:00
parent 88b3a91e61
commit 3a40cdf58d
32 changed files with 1587 additions and 1143 deletions
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -175,6 +175,23 @@ class TokenSpan(NamedTuple):
    end: int


+def to_py_obj(obj):
+    """
+    Convert a TensorFlow tensor, PyTorch tensor, NumPy array or (possibly nested) python list/tuple
+    to a python list. Any other object is returned unchanged.
+    """
+    if isinstance(obj, (list, tuple)):
+        return [to_py_obj(o) for o in obj]
+    elif is_tf_available() and isinstance(obj, tf.Tensor):
+        return obj.numpy().tolist()
+    elif is_torch_available() and isinstance(obj, torch.Tensor):
+        return obj.detach().cpu().tolist()
+    elif isinstance(obj, np.ndarray):
+        return obj.tolist()
+    else:
+        return obj
+
+
 class BatchEncoding(UserDict):
    """
    Holds the output of the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus`
@@ -1025,6 +1042,38 @@ class SpecialTokensMixin:
        """
        return self.convert_tokens_to_ids(self.additional_special_tokens)

+    @bos_token_id.setter
+    def bos_token_id(self, value):  # value is a token id (or None); the matching token is what gets stored
+        self._bos_token = self.convert_ids_to_tokens(value) if value is not None else None
+
+    @eos_token_id.setter
+    def eos_token_id(self, value):  # id -> token; None clears the token
+        self._eos_token = self.convert_ids_to_tokens(value) if value is not None else None
+
+    @unk_token_id.setter
+    def unk_token_id(self, value):  # id -> token; None clears the token
+        self._unk_token = self.convert_ids_to_tokens(value) if value is not None else None
+
+    @sep_token_id.setter
+    def sep_token_id(self, value):  # id -> token; None clears the token
+        self._sep_token = self.convert_ids_to_tokens(value) if value is not None else None
+
+    @pad_token_id.setter
+    def pad_token_id(self, value):  # id -> token; None clears the token
+        self._pad_token = self.convert_ids_to_tokens(value) if value is not None else None
+
+    @cls_token_id.setter
+    def cls_token_id(self, value):  # id -> token; None clears the token
+        self._cls_token = self.convert_ids_to_tokens(value) if value is not None else None
+
+    @mask_token_id.setter
+    def mask_token_id(self, value):  # id -> token; None clears the token
+        self._mask_token = self.convert_ids_to_tokens(value) if value is not None else None
+
+    @additional_special_tokens_ids.setter
+    def additional_special_tokens_ids(self, values):  # values: iterable of token ids, each mapped to its token
+        self._additional_special_tokens = [self.convert_ids_to_tokens(value) for value in values]
+
    @property
    def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
        """
@@ -1424,6 +1473,18 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
            f"padding_side='{self.padding_side}', special_tokens={self.special_tokens_map_extended})"
        )

+    def get_vocab(self) -> Dict[str, int]:
+        """
+        Returns the vocabulary as a dictionary of token to index.
+
+        :obj:`tokenizer.get_vocab()[token]` is equivalent to :obj:`tokenizer.convert_tokens_to_ids(token)` when
+        :obj:`token` is in the vocab.
+
+        Returns:
+            :obj:`Dict[str, int]`: The vocabulary.
+        """
+        raise NotImplementedError()
+
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
        r"""
@@ -1852,6 +1913,32 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        """
        raise NotImplementedError

+    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
+        """
+        Converts a string into a sequence of tokens, using the backend Rust tokenizer.
+
+        Note that this method behaves differently between fast and slow tokenizers:
+            - in fast tokenizers (instances of :class:`~transformers.PreTrainedTokenizerFast`), this method
+                will replace the unknown tokens with the :obj:`unk_token`,
+            - in slow tokenizers (instances of :class:`~transformers.PreTrainedTokenizer`), this method
+                keeps unknown tokens unchanged.
+
+        Args:
+            text (:obj:`str`):
+                The sequence to be encoded.
+            pair (:obj:`str`, `optional`):
+                A second sequence to be encoded with the first.
+            add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Whether or not to add the special tokens associated with the corresponding model.
+            kwargs (additional keyword arguments, `optional`):
+                Will be passed to the underlying model specific encode method.
+                See details in :meth:`~transformers.PreTrainedTokenizer.__call__`
+
+        Returns:
+            :obj:`List[str]`: The list of tokens.
+        """
+        raise NotImplementedError
+
    @add_end_docstrings(
        ENCODE_KWARGS_DOCSTRING,
        """
@@ -2456,18 +2543,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
                    f"Should be one of a python, numpy, pytorch or tensorflow object."
                )

-            def to_py_obj(obj):
-                if isinstance(obj, (list, tuple)):
-                    return [to_py_obj(o) for o in obj]
-                elif is_tf_available() and isinstance(obj, tf.Tensor):
-                    return obj.numpy().tolist()
-                elif is_torch_available() and isinstance(obj, torch.Tensor):
-                    return obj.cpu().tolist()
-                elif isinstance(obj, np.ndarray):
-                    return obj.tolist()
-                else:
-                    return obj
-
            for key, value in encoded_inputs.items():
                encoded_inputs[key] = to_py_obj(value)

@@ -2862,33 +2937,53 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):

        return encoded_inputs

+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        """
+        Converts a sequence of tokens into a single string.
+        The simplest way to do it is ``" ".join(tokens)`` but we often want to remove
+        sub-word tokenization artifacts at the same time.
+        Args:
+            tokens (:obj:`List[str]`): The tokens to join into a string.
+        Return: The joined tokens, as a single string.
+        """
+        raise NotImplementedError
+
    def batch_decode(
-        self, sequences: List[List[int]], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
+        self,
+        sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: bool = True,
+        **kwargs
    ) -> List[str]:
        """
        Convert a list of lists of token ids into a list of strings by calling decode.

        Args:
-            sequences (:obj:`List[List[int]]`):
+            sequences (:obj:`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the ``__call__`` method.
            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to clean up the tokenization spaces.
+            kwargs (additional keyword arguments, `optional`):
+                Will be passed to the underlying model specific decode method.

        Returns:
            :obj:`List[str]`: The list of decoded sentences.
        """
        return [
            self.decode(
-                seq, skip_special_tokens=skip_special_tokens, clean_up_tokenization_spaces=clean_up_tokenization_spaces
+                seq,
+                skip_special_tokens=skip_special_tokens,
+                clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+                **kwargs,
            )
            for seq in sequences
        ]

    def decode(
        self,
-        token_ids: List[int],
+        token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = True,
        **kwargs
@@ -2900,16 +2995,35 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.

        Args:
-            token_ids (:obj:`List[int]`):
+            token_ids (:obj:`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the ``__call__`` method.
            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to clean up the tokenization spaces.
+            kwargs (additional keyword arguments, `optional`):
+                Will be passed to the underlying model specific decode method.

        Returns:
            :obj:`str`: The decoded sentence.
        """
+        # Convert inputs to python lists
+        token_ids = to_py_obj(token_ids)
+
+        return self._decode(
+            token_ids=token_ids,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+
+    def _decode(
+        self,
+        token_ids: Union[int, List[int]],
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: bool = True,
+        **kwargs
+    ) -> str:
        raise NotImplementedError

    def get_special_tokens_mask(