From 9d44236f7021028a0581345b3917589320e479b8 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 24 Sep 2019 07:03:24 -0400
Subject: [PATCH] Updated DistilBERT

---
 examples/utils_glue.py                        |   2 +-
 .../tests/tokenization_tests_commons.py       |  12 +-
 pytorch_transformers/tokenization_utils.py    | 146 +++++++++++-------
 3 files changed, 98 insertions(+), 62 deletions(-)

diff --git a/examples/utils_glue.py b/examples/utils_glue.py
index e2fc3a119a..efe42189e4 100644
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -412,7 +412,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
             output_mask=True,
             max_length=max_seq_length
         )
-        input_ids, segment_ids = inputs["sequence"], inputs["mask"]
+        input_ids, segment_ids = inputs["input_ids"], inputs["output_token_type"]
 
         # The mask has 1 for real tokens and 0 for padding tokens. Only real
         # tokens are attended to.
diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py
index 8a3b56a058..1f84d36e7d 100644
--- a/pytorch_transformers/tests/tokenization_tests_commons.py
+++ b/pytorch_transformers/tests/tokenization_tests_commons.py
@@ -196,8 +196,8 @@ class CommonTestCases:
             if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
                 seq_0 = "Test this method."
                 seq_1 = "With these inputs."
-                information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_mask=True)
-                sequences, mask = information["sequence"], information["mask"]
+                information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_token_type=True)
+                sequences, mask = information["input_ids"], information["output_token_type"]
                 assert len(sequences) == len(mask)
 
         def test_number_of_added_tokens(self):
@@ -224,7 +224,7 @@ class CommonTestCases:
             total_length = len(sequence) + num_added_tokens
             information = tokenizer.encode_plus(seq_0, max_length=total_length - 2, add_special_tokens=True, stride=stride)
 
-            truncated_sequence = information["sequence"]
+            truncated_sequence = information["input_ids"]
             overflowing_tokens = information["overflowing_tokens"]
 
             assert len(overflowing_tokens) == 2 + stride
@@ -249,12 +249,12 @@ class CommonTestCases:
             )
 
             information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True,
-                                                stride=stride)
+                                                stride=stride, truncate_first_sequence=False)
             information_first_truncated = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2,
                                                                 add_special_tokens=True, stride=stride,
-                                                                truncate_second_sequence_first=False)
+                                                                truncate_first_sequence=True)
 
-            truncated_sequence = information["sequence"]
+            truncated_sequence = information["input_ids"]
             overflowing_tokens = information["overflowing_tokens"]
             overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]
 
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index f2cb383143..1209c60de5 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -536,13 +536,7 @@ class PreTrainedTokenizer(object):
 
         if pair:
             initial_tokens_len = len(self.encode("This is a sequence") + self.encode("This is another"))
-            final_tokens = self.encode("This is a sequence", "This is another", add_special_tokens=True)
-
-            # In some models (e.g. GPT-2), there is no sequence pair encoding.
-            if len(final_tokens) == 2:
-                return 0
-            else:
-                final_tokens_len = len(final_tokens)
+            final_tokens_len = len(self.encode("This is a sequence", "This is another", add_special_tokens=True))
         else:
             initial_tokens_len = len(self.encode("This is a sequence"))
             final_tokens_len = len(self.encode("This is a sequence", add_special_tokens=True))
@@ -700,86 +694,93 @@ class PreTrainedTokenizer(object):
         Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
 
         Args:
-            text: The first sequence to be encoded.
-            text_pair: Optional second sequence to be encoded.
+            text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
+                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
+                method)
+            text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
+                string using the `tokenize` method) or a list of integers (tokenized string ids using the
+                `convert_tokens_to_ids` method)
             add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
                 to their model.
+            **kwargs: passed to the `self.tokenize()` method
         """
-        if text_pair is None:
-            if add_special_tokens:
-                sequence_tokens = self.convert_tokens_to_ids(self.tokenize(text, **kwargs)) if isinstance(text, six.string_types) else text
-                return self.add_special_tokens_single_sequence(sequence_tokens)
-            else:
-                ids = self.convert_tokens_to_ids(self.tokenize(text, **kwargs)) if isinstance(text, six.string_types) else text
-                return ids
-
-        first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text, **kwargs)] if isinstance(text, six.string_types) else text
-        second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair, **kwargs)] if isinstance(text_pair, six.string_types) else text_pair
-
-        if add_special_tokens:
-            return self.add_special_tokens_sequence_pair(first_sentence_tokens, second_sentence_tokens)
-        else:
-            logger.warning("No special tokens were added. The two sequences have been concatenated.")
-            return first_sentence_tokens + second_sentence_tokens
+        return self.encode_plus(text, text_pair, add_special_tokens, **kwargs)["input_ids"]
 
     def encode_plus(self,
                     text,
                     text_pair=None,
                     add_special_tokens=False,
-                    output_mask=False,
+                    output_token_type=False,
                     max_length=None,
                     stride=0,
-                    truncate_second_sequence_first=True,
+                    truncate_first_sequence=True,
                     **kwargs):
         """
         Returns a dictionary containing the encoded sequence or sequence pair. Other values can be returned by this
         method: the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
 
         Args:
-            text: The first sequence to be encoded.
-            text_pair: Optional second sequence to be encoded.
+            text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
+                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
+                method)
+            text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
+                string using the `tokenize` method) or a list of integers (tokenized string ids using the
+                `convert_tokens_to_ids` method)
             add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
                 to their model.
-            output_mask: if set to ``True``, returns the text pair corresponding mask with 0 for the first sequence,
+            output_token_type: if set to ``True``, returns the mask corresponding to the text pair, with 0 for the first sequence,
                 and 1 for the second.
             max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
                 If there are overflowing tokens, those will be added to the returned dictionary
             stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
                 from the main sequence returned. The value of this argument defined the number of additional tokens.
-            truncate_second_sequence_first: if there is a specified max_length, this flag will choose which sequence
+            truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence
                 will be truncated.
             **kwargs: passed to the `self.tokenize()` method
         """
 
         information = {}
 
+        def get_input_ids(text):
+            if isinstance(text, six.string_types):
+                input_ids = self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
+            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
+                input_ids = self.convert_tokens_to_ids(text)
+            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
+                input_ids = text
+            else:
+                raise ValueError("Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.")
+
+            return input_ids
+
         if text_pair is None:
-            sequence_tokens = self.convert_tokens_to_ids(self.tokenize(text, **kwargs)) if isinstance(text, six.string_types) else text
+            sequence_tokens = get_input_ids(text)
+
             if add_special_tokens:
-                information = self.prepare_for_model(sequence_tokens, max_length, stride)
+                information = self.prepare_for_model(sequence_tokens, max_length=max_length, stride=stride)
             else:
                 if max_length:
                     information["overflowing_tokens"] = sequence_tokens[max_length - stride:]
                     sequence_tokens = sequence_tokens[:max_length]
-                information["sequence"] = sequence_tokens
+                information["input_ids"] = sequence_tokens
 
-            if output_mask:
-                information["mask"] = [0] * len(information["sequence"])
+            if output_token_type:
+                information["output_token_type"] = [0] * len(information["input_ids"])
         else:
-            first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text, **kwargs)] if isinstance(text, six.string_types) else text
-            second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair, **kwargs)] if isinstance(text_pair, six.string_types) else text_pair
+            first_sentence_tokens = get_input_ids(text)
+            second_sentence_tokens = get_input_ids(text_pair)
 
             if add_special_tokens:
                 information = self.prepare_pair_for_model(
                     first_sentence_tokens,
                     second_sentence_tokens,
-                    max_length,
-                    truncate_second_sequence_first,
-                    stride
+                    max_length=max_length,
+                    truncate_first_sequence=truncate_first_sequence,
+                    stride=stride
                 )
 
-                if output_mask:
-                    information["mask"] = self.create_mask_from_sequences(text, text_pair)
+                if output_token_type:
+                    information["output_token_type"] = self.create_mask_from_sequences(text, text_pair)
             else:
                 logger.warning("No special tokens were added. The two sequences have been concatenated.")
                 sequence = first_sentence_tokens + second_sentence_tokens
@@ -787,43 +788,78 @@ class PreTrainedTokenizer(object):
                 if max_length:
                     information["overflowing_tokens"] = sequence[max_length - stride:]
                     sequence = sequence[:max_length]
-                if output_mask:
-                    information["mask"] = [0] * len(sequence)
+                if output_token_type:
+                    information["output_token_type"] = [0] * len(sequence)
 
-                information["sequence"] = sequence
+                information["input_ids"] = sequence
 
         return information
 
     def prepare_for_model(self, ids, max_length=None, stride=0):
+        """
+        Prepares a list of tokenized input ids so that it can be used by the model. It adds special tokens, truncates
+        sequences if overflowing while taking into account the special tokens and manages a window stride for
+        overflowing tokens
+
+        Args:
+            ids: list of tokenized input ids. Can be obtained from a string by chaining the
+                `tokenize` and `convert_tokens_to_ids` methods.
+            max_length: maximum length of the returned list. Will truncate by taking into account the special tokens.
+            stride: window stride for overflowing tokens. Can be useful for edge effect removal when using a
+                sequential list of inputs.
+
+        Return:
+            a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given.
+        """
         information = {}
-        n_added_tokens = self.num_added_tokens()
         if max_length:
+            n_added_tokens = self.num_added_tokens()
             information["overflowing_tokens"] = ids[max_length - n_added_tokens - stride:]
             ids = ids[:max_length - n_added_tokens]
-        information["sequence"] = self.add_special_tokens_single_sequence(ids)
+        information["input_ids"] = self.add_special_tokens_single_sequence(ids)
 
         return information
 
-    def prepare_pair_for_model(self, ids_0, ids_1, max_length=None, truncate_second_sequence_first=True, stride=0):
+    def prepare_pair_for_model(self, ids_0, ids_1, max_length=None, truncate_first_sequence=True, stride=0):
+        """
+        Prepares a list of tokenized input ids pair so that it can be used by the model. It adds special tokens,
+        truncates sequences if overflowing while taking into account the special tokens and manages a window stride for
+        overflowing tokens
+
+        Args:
+            ids_0: list of tokenized input ids. Can be obtained from a string by chaining the
+                `tokenize` and `convert_tokens_to_ids` methods.
+            ids_1: second list of tokenized input ids. Can be obtained from a string by chaining the
+                `tokenize` and `convert_tokens_to_ids` methods.
+            max_length: maximum length of the returned list. Will truncate by taking into account the special tokens.
+            truncate_first_sequence: if set to `True`, alongside a specified `max_length`, will truncate the first
+                sequence if the total size is greater than the specified `max_length`. If set to `False`, will
+                truncate the second sequence instead.
+            stride: window stride for overflowing tokens. Can be useful for edge effect removal when using a
+                sequential list of inputs.
+
+        Return:
+            a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given.
+        """
         f_len, s_len = len(ids_0), len(ids_1)
-        n_added_tokens = self.num_added_tokens(pair=True)
         information = {}
 
         if max_length:
+            n_added_tokens = self.num_added_tokens(pair=True)
             if len(ids_0) + n_added_tokens >= max_length:
                 logger.warning(
                     "The first sequence is longer than the maximum specified length. This sequence will not be truncated.")
             else:
                 if f_len + s_len + self.num_added_tokens(pair=True) > max_length:
-                    if truncate_second_sequence_first:
-                        information["overflowing_tokens"] = ids_1[max_length - f_len - n_added_tokens - stride:]
-                        ids_1 = ids_1[:max_length - f_len - n_added_tokens]
-                    else:
+                    if truncate_first_sequence:
                         information["overflowing_tokens"] = ids_0[max_length - s_len - n_added_tokens - stride:]
                         ids_0 = ids_0[:max_length - s_len - n_added_tokens]
+                    else:
+                        information["overflowing_tokens"] = ids_1[max_length - f_len - n_added_tokens - stride:]
+                        ids_1 = ids_1[:max_length - f_len - n_added_tokens]
 
         sequence = self.add_special_tokens_sequence_pair(ids_0, ids_1)
-        information["sequence"] = sequence
+        information["input_ids"] = sequence
 
         return information