From 5bf9afbf351f9419505eb1c9e0c5ab78883c3caf Mon Sep 17 00:00:00 2001
From: Funtowicz Morgan <mfuntowicz@users.noreply.github.com>
Date: Thu, 4 Jun 2020 04:57:01 +0000
Subject: [PATCH] Introduce a new tensor type for return_tensors on tokenizer
 for NumPy (#4585)

* Refactor tensor creation in tokenizers.

* Make sure to convert string to TensorType

* Refactor convert_to_tensors_

* Introduce numpy tensor creation

* Format

* Add unittest for TensorType creation from str

* sorting imports

* Added unittests for numpy tensor conversion.

* Do not use in-place version for squeeze as numpy doesn't provide such feature.

* Added extra parameter prepend_batch_axis: bool on prepare_for_model.

* Ensure test_np_encode_plus_sent_to_model is not executed if encoder/decoder model.

* style.

* numpy tests require_torch for now while flax not merged.

* Hopefully will make flake8 happy.

* One more time :notes:
---
 src/transformers/__init__.py           |   2 +-
 src/transformers/tokenization_utils.py | 165 ++++++++++++-------------
 tests/test_tokenization_common.py      |  44 +++++++
 tests/test_tokenization_utils.py       |   7 +-
 4 files changed, 133 insertions(+), 85 deletions(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index e291b29ea1..c5822619c9 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -132,7 +132,7 @@ from .tokenization_reformer import ReformerTokenizer
 from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
 from .tokenization_t5 import T5Tokenizer
 from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer, TransfoXLTokenizerFast
-from .tokenization_utils import PreTrainedTokenizer
+from .tokenization_utils import PreTrainedTokenizer, TensorType
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_xlm_roberta import XLMRobertaTokenizer
 from .tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer
diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index a6be3a48c3..8ee2346783 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -25,8 +25,10 @@ import re
 import warnings
 from collections import UserDict, defaultdict
 from contextlib import contextmanager
-from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
+from enum import Enum
+from typing import Any, Dict, List, MutableMapping, NamedTuple, Optional, Sequence, Tuple, Union
 
+import numpy as np
 from tokenizers import AddedToken as AddedTokenFast
 from tokenizers import Encoding as EncodingFast
 from tokenizers.decoders import Decoder as DecoderFast
@@ -42,6 +44,18 @@ if is_torch_available():
 
 logger = logging.getLogger(__name__)
 
+NO_PAD_TOKEN_FOR_BATCH_MSG = (  # raised when a batch that fails tensor conversion contains `None` entries
+    "No padding token is set for this model, therefore no batch can be made with uneven "
+    "sequences. Set a padding token or adjust the lengths of the sequences building the "
+    "batch so that every sequence is of the same length."
+)
+
+UNEVEN_SEQUENCES_FOR_BATCH_MSG = (  # raised when a batch fails tensor conversion and holds no `None` entry
+    "The sequences building the batch are not of the same size, no tensor "
+    "can be built. Set `pad_to_max_length=True` to pad the smaller sequences "
+    "up to the larger sequence's length."
+)
+
 SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
 ADDED_TOKENS_FILE = "added_tokens.json"
 TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
@@ -58,6 +72,12 @@ PreTokenizedInputPair = Tuple[List[str], List[str]]
 EncodedInputPair = Tuple[List[int], List[int]]
 
 
+class TensorType(Enum):  # values accepted for `return_tensors`; TensorType("pt") looks a member up by value
+    PYTORCH = "pt"  # converted with torch.tensor
+    TENSORFLOW = "tf"  # converted with tf.constant
+    NUMPY = "np"  # converted with np.asarray
+
+
 class CharSpan(NamedTuple):
     """ Character span in the original string
 
@@ -161,6 +181,51 @@ def truncate_and_pad(
         tokenizer.no_padding()
 
 
+def convert_to_tensors(
+    batch_outputs: MutableMapping, return_tensors: Union[str, TensorType], prepend_batch_axis: bool = False
+) -> MutableMapping:
+    """ Convert, in place, every value of `batch_outputs` to a "pt"/"tf"/"np" tensor and return the mapping.
+
+        With `prepend_batch_axis` every value is first wrapped in a list; 1D results get a leading axis.
+        Raises ImportError if the framework is unavailable, ValueError if the values can't form a tensor.
+    """
+    if not isinstance(return_tensors, TensorType):
+        return_tensors = TensorType(return_tensors)
+    # Get a function reference for the correct framework
+    if return_tensors == TensorType.TENSORFLOW and is_tf_available():
+        as_tensor = tf.constant
+    elif return_tensors == TensorType.PYTORCH and is_torch_available():
+        as_tensor = torch.tensor
+    elif return_tensors == TensorType.NUMPY:
+        as_tensor = np.asarray
+    else:
+        raise ImportError(
+            "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
+                return_tensors
+            )
+        )
+
+    for key, value in batch_outputs.items():
+        try:
+            if prepend_batch_axis:
+                value = [value]
+            tensor = as_tensor(value)
+            if tensor.ndim > 2:
+                tensor = tensor.squeeze(0)
+            elif tensor.ndim < 2:
+                tensor = tensor[None, :]
+            batch_outputs[key] = tensor
+        except (ValueError, RuntimeError) as error:
+            # torch.tensor reports `None` entries as RuntimeError; any other RuntimeError is re-raised untouched
+            if None in [item for sequence in value for item in sequence]:
+                raise ValueError(NO_PAD_TOKEN_FOR_BATCH_MSG)
+            elif isinstance(error, RuntimeError):
+                raise
+            raise ValueError(UNEVEN_SEQUENCES_FOR_BATCH_MSG)
+
+    return batch_outputs
+
+
 class BatchEncoding(UserDict):
     """ BatchEncoding hold the output of the encode and batch_encode methods (tokens, attention_masks, etc).
         This class is derived from a python Dictionary and can be used as a dictionnary.
@@ -755,18 +820,6 @@ class PreTrainedTokenizer(SpecialTokensMixin):
 
     padding_side: str = "right"
 
-    NO_PAD_TOKEN_FOR_BATCH_MSG = (
-        "No padding token is set for this model, therefore no batch can be made with uneven "
-        "sequences. Set a padding token or adjust the lengths of the sequences building the "
-        "batch so that every sequence is of the same length."
-    )
-
-    UNEVEN_SEQUENCES_FOR_BATCH_MSG = (
-        "The sequences building the batch are not of the same size, no tensor "
-        "can be built. Set `pad_to_max_length=True` to pad the smaller sequences"
-        "up to the larger sequence's length."
-    )
-
     @property
     def vocab_size(self) -> int:
         """ Size of the base vocabulary (without the added tokens) """
@@ -1373,7 +1426,7 @@ class PreTrainedTokenizer(SpecialTokensMixin):
         stride: int = 0,
         truncation_strategy: str = "longest_first",
         pad_to_max_length: bool = False,
-        return_tensors: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
         **kwargs
     ):
         """
@@ -1447,7 +1500,7 @@ class PreTrainedTokenizer(SpecialTokensMixin):
         truncation_strategy: str = "longest_first",
         pad_to_max_length: bool = False,
         is_pretokenized: bool = False,
-        return_tensors: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
         return_overflowing_tokens: bool = False,
@@ -1590,6 +1643,7 @@ class PreTrainedTokenizer(SpecialTokensMixin):
             return_token_type_ids=return_token_type_ids,
             return_overflowing_tokens=return_overflowing_tokens,
             return_special_tokens_mask=return_special_tokens_mask,
+            prepend_batch_axis=return_tensors is not None,
         )
 
     def batch_encode_plus(
@@ -1608,7 +1662,7 @@ class PreTrainedTokenizer(SpecialTokensMixin):
         truncation_strategy: str = "longest_first",
         pad_to_max_length: bool = False,
         is_pretokenized: bool = False,
-        return_tensors: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_masks: Optional[bool] = None,
         return_overflowing_tokens: bool = False,
@@ -1783,39 +1837,10 @@ class PreTrainedTokenizer(SpecialTokensMixin):
                 batch_outputs[key].append(value)
 
         if return_tensors is not None:
+            convert_to_tensors(batch_outputs, return_tensors)
 
-            self.convert_to_tensors_(batch_outputs, return_tensors)
         return BatchEncoding(batch_outputs)
 
-    def convert_to_tensors_(self, batch_outputs: dict, return_tensors: str) -> None:
-        # Do the tensor conversion in batch
-        for key, value in batch_outputs.items():
-            if return_tensors == "tf" and is_tf_available():
-                try:
-                    batch_outputs[key] = tf.constant(value)
-                except ValueError:
-                    if None in [item for sequence in value for item in sequence]:
-                        raise ValueError(self.NO_PAD_TOKEN_FOR_BATCH_MSG)
-                    else:
-                        raise ValueError(self.UNEVEN_SEQUENCES_FOR_BATCH_MSG)
-            elif return_tensors == "pt" and is_torch_available():
-                try:
-                    batch_outputs[key] = torch.tensor(value)
-                except ValueError:
-                    raise ValueError(self.UNEVEN_SEQUENCES_FOR_BATCH_MSG)
-                except RuntimeError:
-                    if None in [item for sequence in value for item in sequence]:
-                        raise ValueError(self.NO_PAD_TOKEN_FOR_BATCH_MSG)
-                    else:
-                        raise
-
-            elif return_tensors is not None:
-                logger.warning(
-                    "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
-                        return_tensors
-                    )
-                )
-
     def prepare_for_model(
         self,
         ids: List[int],
@@ -1825,12 +1850,13 @@ class PreTrainedTokenizer(SpecialTokensMixin):
         stride: int = 0,
         truncation_strategy: str = "longest_first",
         pad_to_max_length: bool = False,
-        return_tensors: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
         return_overflowing_tokens: bool = False,
         return_special_tokens_mask: bool = False,
         return_lengths: bool = False,
+        prepend_batch_axis: bool = False,
     ) -> BatchEncoding:
         """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
         It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
@@ -1866,6 +1892,9 @@ class PreTrainedTokenizer(SpecialTokensMixin):
             return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False).
             return_lengths (:obj:`bool`, `optional`, defaults to :obj:`False`):
                 If set the resulting dictionary will include the length of each encoded inputs
+            prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                If set, each value is wrapped in a list (an unsqueeze at position 0) before being converted
+                to a tensor. Ignored unless :obj:`return_tensors` is set.
 
         Return:
             A Dictionary of shape::
@@ -1990,29 +2019,8 @@ class PreTrainedTokenizer(SpecialTokensMixin):
             encoded_inputs["length"] = len(encoded_inputs["input_ids"])
 
         # Prepare model inputs as tensors if asked
-        if return_tensors == "tf" and is_tf_available():
-            encoded_inputs["input_ids"] = tf.constant([encoded_inputs["input_ids"]])
-
-            if "token_type_ids" in encoded_inputs:
-                encoded_inputs["token_type_ids"] = tf.constant([encoded_inputs["token_type_ids"]])
-
-            if "attention_mask" in encoded_inputs:
-                encoded_inputs["attention_mask"] = tf.constant([encoded_inputs["attention_mask"]])
-
-        elif return_tensors == "pt" and is_torch_available():
-            encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]])
-
-            if "token_type_ids" in encoded_inputs:
-                encoded_inputs["token_type_ids"] = torch.tensor([encoded_inputs["token_type_ids"]])
-
-            if "attention_mask" in encoded_inputs:
-                encoded_inputs["attention_mask"] = torch.tensor([encoded_inputs["attention_mask"]])
-        elif return_tensors is not None:
-            logger.warning(
-                "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
-                    return_tensors
-                )
-            )
+        if return_tensors is not None:
+            convert_to_tensors(encoded_inputs, return_tensors, prepend_batch_axis)
 
         return BatchEncoding(encoded_inputs)
 
@@ -2305,7 +2313,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
     def _convert_encoding(
         self,
         encoding: EncodingFast,
-        return_tensors: Optional[bool] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
         return_overflowing_tokens: bool = False,
@@ -2345,16 +2353,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
                 encoding_dict["offset_mapping"].append(e.offsets)
 
         if return_tensors is not None:
-            for key, value in encoding_dict.items():
-                if return_tensors == "tf" and is_tf_available():
-                    encoding_dict[key] = tf.constant(value)
-                elif return_tensors == "pt" and is_torch_available():
-                    encoding_dict[key] = torch.tensor(value)
-                elif return_tensors is not None:
-                    logger.warning(
-                        "Unable to convert output to tensors format {}, "
-                        "PyTorch or TensorFlow is not available.".format(return_tensors)
-                    )
+            encoding_dict = convert_to_tensors(encoding_dict, return_tensors)
 
         return encoding_dict
 
@@ -2438,7 +2437,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
         truncation_strategy: str = "longest_first",
         pad_to_max_length: bool = False,
         is_pretokenized: bool = False,
-        return_tensors: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
         return_overflowing_tokens: bool = False,
@@ -2575,7 +2574,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
         stride: int = 0,
         truncation_strategy: str = "longest_first",
         is_pretokenized: bool = False,
-        return_tensors: Optional[bool] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
         return_overflowing_tokens: bool = False,
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index 5bbb087192..1b1a54f3c4 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -832,3 +832,47 @@ class TokenizerTesterMixin:
             # This should not fail
             model(encoded_sequence_fast)
             model(batch_encoded_sequence_fast)
+
+    # TODO: require_torch may not be the right guard for a numpy-only test; move to require_flax once available.
+    @require_torch
+    def test_np_encode_plus_sent_to_model(self):
+        from transformers import MODEL_MAPPING, TOKENIZER_MAPPING
+
+        MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING)
+
+        tokenizer = self.get_tokenizer()
+        if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
+            return
+
+        config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
+        config = config_class()
+
+        if config.is_encoder_decoder or config.pad_token_id is None:
+            return
+
+        # Build a space-joined sequence from the first ten vocabulary entries
+        first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]
+        sequence = " ".join(first_ten_tokens)
+        encoded_sequence = tokenizer.encode_plus(sequence, return_tensors="np")
+        batch_encoded_sequence = tokenizer.batch_encode_plus([sequence, sequence], return_tensors="np")
+
+        # TODO: add forward through JAX/Flax when PR is merged
+        # Until then these checks only use the variables so that flake8 does not complain.
+        if encoded_sequence is None:
+            raise ValueError("Cannot convert list to numpy tensor on  encode_plus()")
+
+        if batch_encoded_sequence is None:
+            raise ValueError("Cannot convert list to numpy tensor on  batch_encode_plus()")
+
+        if self.test_rust_tokenizer:
+            fast_tokenizer = self.get_rust_tokenizer()
+            encoded_sequence_fast = fast_tokenizer.encode_plus(sequence, return_tensors="np")
+            batch_encoded_sequence_fast = fast_tokenizer.batch_encode_plus([sequence, sequence], return_tensors="np")
+
+            # TODO: add forward through JAX/Flax when PR is merged
+            # Until then these checks only use the variables so that flake8 does not complain.
+            if encoded_sequence_fast is None:
+                raise ValueError("Cannot convert list to numpy tensor on  encode_plus() (fast)")
+
+            if batch_encoded_sequence_fast is None:
+                raise ValueError("Cannot convert list to numpy tensor on  batch_encode_plus() (fast)")
diff --git a/tests/test_tokenization_utils.py b/tests/test_tokenization_utils.py
index 2909b4f9da..de0ac69ac0 100644
--- a/tests/test_tokenization_utils.py
+++ b/tests/test_tokenization_utils.py
@@ -16,7 +16,7 @@
 
 import unittest
 
-from transformers import PreTrainedTokenizer
+from transformers import PreTrainedTokenizer, TensorType
 from transformers.tokenization_gpt2 import GPT2Tokenizer
 
 from .utils import slow
@@ -39,3 +39,8 @@ class TokenizerUtilsTest(unittest.TestCase):
     @slow
     def test_pretrained_tokenizers(self):
         self.check_tokenizer_from_pretrained(GPT2Tokenizer)
+
+    def test_tensor_type_from_str(self):  # the "test" prefix is what makes unittest collect this method
+        self.assertEqual(TensorType("tf"), TensorType.TENSORFLOW)
+        self.assertEqual(TensorType("pt"), TensorType.PYTORCH)
+        self.assertEqual(TensorType("np"), TensorType.NUMPY)