[breaking|pipelines|tokenizers] Adding slow-fast tokenizers equivalence tests pipelines - Removing sentencepiece as a required dependency (#8073)

* Fixing roberta for slow-fast tests * WIP getting equivalence on pipelines * slow-to-fast equivalence - working on question-answering pipeline * optional FAISS tests * Pipeline Q&A * Move pipeline tests to their own test job again * update tokenizer to add sequence id methods * update to tokenizers 0.9.4 * set sentencepiece as optional * clean up squad * clean up pipelines to use sequence_ids * style/quality * wording * Switch to use_fast = True by default * update tests for use_fast at True by default * fix rag tokenizer test * removing protobuf from required dependencies * fix NER test for use_fast = True by default * fixing example tests (Q&A examples use slow tokenizers for now) * protobuf in main deps extras["sentencepiece"] and example deps * fix protobuf install test * try to fix seq2seq by switching to slow tokenizers for now * Update src/transformers/tokenization_utils_base.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Update src/transformers/tokenization_utils_base.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-11-15 22:50:59 +01:00
parent 24184e73c4
commit f4e04cd2c6
23 changed files with 689 additions and 262 deletions
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -182,7 +182,9 @@ def to_py_obj(obj):
    """
    Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list.
    """
-    if isinstance(obj, (list, tuple)):
+    if isinstance(obj, (dict, BatchEncoding)):
+        return {k: to_py_obj(v) for k, v in obj.items()}
+    elif isinstance(obj, (list, tuple)):
        return [to_py_obj(o) for o in obj]
    elif is_tf_available() and isinstance(obj, tf.Tensor):
        return obj.numpy().tolist()
@@ -216,6 +218,9 @@ class BatchEncoding(UserDict):
            initialization.
        prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to add a batch axis when converting to tensors (see :obj:`tensor_type` above).
+        n_sequences (:obj:`Optional[int]`, `optional`):
+            The number of sequences used to generate each sample of the batch: :obj:`1` (a single sentence) or
+            :obj:`2` (a pair of sentences). When :obj:`None`, it is read from the first :obj:`encoding`, if any.
    """

    def __init__(
@@ -224,6 +229,7 @@ class BatchEncoding(UserDict):
        encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None,
        tensor_type: Union[None, str, TensorType] = None,
        prepend_batch_axis: bool = False,
+        n_sequences: Optional[int] = None,
    ):
        super().__init__(data)

@@ -232,8 +238,22 @@ class BatchEncoding(UserDict):

        self._encodings = encoding

+        if n_sequences is None and encoding is not None and len(encoding):
+            n_sequences = encoding[0].n_sequences
+
+        self._n_sequences = n_sequences
+
        self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)

+    @property
+    def n_sequences(self) -> Optional[int]:
+        """
+        :obj:`Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this
+        :class:`~transformers.BatchEncoding`. Currently can be one of :obj:`None` (unknown), :obj:`1` (a single
+        sentence) or :obj:`2` (a pair of sentences).
+        """
+        return self._n_sequences
+
    @property
    def is_fast(self) -> bool:
        """
@@ -311,6 +331,27 @@ class BatchEncoding(UserDict):
            raise ValueError("tokens() is not available when using Python-based tokenizers")
        return self._encodings[batch_index].tokens

+    def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]:
+        """
+        Return a list mapping the tokens to the id of their original sentences:
+
+            - :obj:`None` for special tokens added around or between sequences,
+            - :obj:`0` for tokens corresponding to words in the first sequence,
+            - :obj:`1` for tokens corresponding to words in the second sequence of a jointly encoded pair.
+
+        Args:
+            batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.
+
+        Returns:
+            :obj:`List[Optional[int]]`: A list with the sequence id of each token, using the values listed above.
+
+        Raises:
+            :obj:`ValueError`: If no encodings are stored on this object (Python-based tokenizers).
+        """
+        if not self._encodings:
+            raise ValueError("sequence_ids() is not available when using Python-based tokenizers")
+        return self._encodings[batch_index].sequence_ids
+
    def words(self, batch_index: int = 0) -> List[Optional[int]]:
        """
        Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.
@@ -325,7 +366,67 @@ class BatchEncoding(UserDict):
        """
        if not self._encodings:
            raise ValueError("words() is not available when using Python-based tokenizers")
-        return self._encodings[batch_index].words
+        warnings.warn(
+            "`BatchEncoding.words()` property is deprecated and should be replaced with the identical, "
+            "but more self-explanatory `BatchEncoding.word_ids()` property.",
+            FutureWarning,
+        )
+        return self.word_ids(batch_index)
+
+    def word_ids(self, batch_index: int = 0) -> List[Optional[int]]:
+        """
+        Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.
+
+        Args:
+            batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.
+
+        Returns:
+            :obj:`List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by
+            the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their corresponding
+            word (several tokens will be mapped to the same word index if they are parts of that word).
+        """
+        if not self._encodings:  # the mapping is read from the stored encodings; without them, raise
+            raise ValueError("word_ids() is not available when using Python-based tokenizers")
+        return self._encodings[batch_index].word_ids
+
+    def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
+        """
+        Get the index of the sequence represented by the given token. In the general use case, this method returns
+        :obj:`0` for a single sequence or the first sequence of a pair, and :obj:`1` for the second sequence of a pair.
+
+        Can be called as:
+
+        - ``self.token_to_sequence(token_index)`` if batch size is 1
+        - ``self.token_to_sequence(batch_index, token_index)`` if batch size is greater than 1
+
+        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
+        words are defined by the user). In this case it allows to easily associate encoded tokens with provided
+        tokenized words.
+
+        Args:
+            batch_or_token_index (:obj:`int`):
+                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
+                the token in the sequence.
+            token_index (:obj:`int`, `optional`):
+                If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the
+                sequence.
+
+        Returns:
+            :obj:`int`: Index of the sequence (:obj:`0` or :obj:`1` in the general use case) the token belongs to.
+        """
+
+        if not self._encodings:
+            raise ValueError("token_to_sequence() is not available when using Python based tokenizers")
+        if token_index is not None:
+            batch_index = batch_or_token_index  # called as (batch_index, token_index)
+        else:
+            batch_index = 0  # called as (token_index): use the first batch entry
+            token_index = batch_or_token_index
+        if batch_index < 0:
+            batch_index = self._batch_size + batch_index  # negative index: shifted by self._batch_size
+        if token_index < 0:
+            token_index = self._seq_len + token_index  # negative index: shifted by self._seq_len
+        return self._encodings[batch_index].token_to_sequence(token_index)

    def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
        """
@@ -365,9 +466,11 @@ class BatchEncoding(UserDict):
            token_index = self._seq_len + token_index
        return self._encodings[batch_index].token_to_word(token_index)

-    def word_to_tokens(self, batch_or_word_index: int, word_index: Optional[int] = None) -> Optional[TokenSpan]:
+    def word_to_tokens(
+        self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
+    ) -> Optional[TokenSpan]:
        """
-        Get the encoded token span corresponding to a word in the sequence of the batch.
+        Get the encoded token span corresponding to a word in a sequence of the batch.

        Token spans are returned as a :class:`~transformers.tokenization_utils_base.TokenSpan` with:

@@ -376,8 +479,9 @@ class BatchEncoding(UserDict):

        Can be called as:

-        - ``self.word_to_tokens(word_index)`` if batch size is 1
-        - ``self.word_to_tokens(batch_index, word_index)`` if batch size is greater or equal to 1
+        - ``self.word_to_tokens(word_index, sequence_index: int = 0)`` if batch size is 1
+        - ``self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)`` if batch size is greater or equal
+          to 1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
        are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
@@ -390,6 +494,9 @@ class BatchEncoding(UserDict):
            word_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
                sequence.
+            sequence_index (:obj:`int`, `optional`, defaults to 0):
+                If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair
+                (0 or 1) the provided word index belongs to.

        Returns:
            Optional :class:`~transformers.tokenization_utils_base.TokenSpan` Span of tokens in the encoded sequence.
@@ -407,7 +514,7 @@ class BatchEncoding(UserDict):
            batch_index = self._batch_size + batch_index
        if word_index < 0:
            word_index = self._seq_len + word_index
-        span = self._encodings[batch_index].word_to_tokens(word_index)
+        span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index)
        return TokenSpan(*span) if span is not None else None

    def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan:
@@ -446,7 +553,9 @@ class BatchEncoding(UserDict):
            token_index = batch_or_token_index
        return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index)))

-    def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int:
+    def char_to_token(
+        self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0
+    ) -> int:
        """
        Get the index of the token in the encoded output comprising a character in the original string for a sequence
        of the batch.
@@ -467,6 +576,9 @@ class BatchEncoding(UserDict):
            char_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
                sequence.
+            sequence_index (:obj:`int`, `optional`, defaults to 0):
+                If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair
+                (0 or 1) the provided character index belongs to.


        Returns:
@@ -480,9 +592,11 @@ class BatchEncoding(UserDict):
        else:
            batch_index = 0
            char_index = batch_or_char_index
-        return self._encodings[batch_index].char_to_token(char_index)
+        return self._encodings[batch_index].char_to_token(char_index, sequence_index)

-    def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = None) -> CharSpan:
+    def word_to_chars(
+        self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
+    ) -> CharSpan:
        """
        Get the character span in the original string corresponding to given word in a sequence of the batch.

@@ -503,6 +617,9 @@ class BatchEncoding(UserDict):
            word_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
                sequence.
+            sequence_index (:obj:`int`, `optional`, defaults to 0):
+                If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair
+                (0 or 1) the provided word index belongs to.

        Returns:
            :obj:`CharSpan` or :obj:`List[CharSpan]`: Span(s) of the associated character or characters in the string.
@@ -520,9 +637,9 @@ class BatchEncoding(UserDict):
        else:
            batch_index = 0
            word_index = batch_or_word_index
-        return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index)))
+        return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index, sequence_index)))

-    def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int:
+    def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0) -> int:
        """
        Get the word in the original string corresponding to a character in the original string of a sequence of the
        batch.
@@ -543,6 +660,9 @@ class BatchEncoding(UserDict):
            char_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index of the character in the
                original string.
+            sequence_index (:obj:`int`, `optional`, defaults to 0):
+                If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair
+                (0 or 1) the provided character index belongs to.


        Returns:
@@ -556,7 +676,7 @@ class BatchEncoding(UserDict):
        else:
            batch_index = 0
            char_index = batch_or_char_index
-        return self._encodings[batch_index].char_to_word(char_index)
+        return self._encodings[batch_index].char_to_word(char_index, sequence_index)

    def convert_to_tensors(
        self, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False
@@ -1872,6 +1992,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
                "Only fast tokenizers (instances of PretrainedTokenizerFast) can be saved in non legacy format."
            )

+        save_directory = str(save_directory)
+
        added_tokens_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
        )