From a6981076eca5494b9d230f13217c14b93443888a Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 24 Sep 2019 16:46:26 +0200
Subject: [PATCH] various updates

---
 examples/utils_glue.py                        |   1 -
 .../tests/tokenization_tests_commons.py       |   2 +-
 pytorch_transformers/tokenization_bert.py     |   4 +-
 pytorch_transformers/tokenization_roberta.py  |   4 +-
 pytorch_transformers/tokenization_utils.py    | 153 +++++++-----------
 pytorch_transformers/tokenization_xlm.py      |   4 +-
 pytorch_transformers/tokenization_xlnet.py    |   4 +-
 7 files changed, 63 insertions(+), 109 deletions(-)

diff --git a/examples/utils_glue.py b/examples/utils_glue.py
index 2557540cc6..e0ca9caa0a 100644
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -409,7 +409,6 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
             example.text_a,
             example.text_b,
             add_special_tokens=True,
-            output_token_type=True,
             max_length=max_seq_length,
             truncate_first_sequence=True  # We're truncating the first sequence as a priority
         )
diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py
index 4ad92c8192..b71ba44436 100644
--- a/pytorch_transformers/tests/tokenization_tests_commons.py
+++ b/pytorch_transformers/tests/tokenization_tests_commons.py
@@ -196,7 +196,7 @@ class CommonTestCases:
             if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
                 seq_0 = "Test this method."
                 seq_1 = "With these inputs."
-                information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_token_type=True)
+                information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
                 sequences, mask = information["input_ids"], information["token_type_ids"]
                 assert len(sequences) == len(mask)
 
diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index 7eca60a140..225152e065 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -204,7 +204,7 @@ class BertTokenizer(PreTrainedTokenizer):
 
         return cls + token_ids_0 + sep + token_ids_1 + sep
 
-    def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         A BERT sequence pair mask has the following format:
@@ -214,7 +214,7 @@ class BertTokenizer(PreTrainedTokenizer):
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
 
-        return len(cls + self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1) + sep) * [1]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
 
     def save_vocabulary(self, vocab_path):
         """Save the tokenizer vocabulary to a directory or file."""
diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py
index 475aee47fa..ee8e97d6bf 100644
--- a/pytorch_transformers/tokenization_roberta.py
+++ b/pytorch_transformers/tokenization_roberta.py
@@ -97,7 +97,7 @@ class RobertaTokenizer(GPT2Tokenizer):
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
 
-    def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         A RoBERTa sequence pair mask has the following format:
@@ -107,4 +107,4 @@ class RobertaTokenizer(GPT2Tokenizer):
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
 
-        return len(cls + self.encode(sequence_0) + sep + sep) * [0] + len(self.encode(sequence_1) + sep) * [1]
\ No newline at end of file
+        return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
\ No newline at end of file
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index c5efd37a53..136429b7d1 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -704,13 +704,14 @@ class PreTrainedTokenizer(object):
                 to their model.
             **kwargs: passed to the `self.tokenize()` method
         """
-        return self.encode_plus(text, text_pair, add_special_tokens, **kwargs)["input_ids"]
+        encoded_inputs = self.encode_plus(text, text_pair=text_pair, add_special_tokens=add_special_tokens, **kwargs)
+
+        return encoded_inputs["input_ids"]
 
     def encode_plus(self,
                     text,
                     text_pair=None,
                     add_special_tokens=False,
-                    output_token_type=False,
                     max_length=None,
                     stride=0,
                     truncate_first_sequence=True,
@@ -728,8 +729,6 @@ class PreTrainedTokenizer(object):
                 `convert_tokens_to_ids` method)
             add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
                 to their model.
-            output_token_type: if set to ``True``, returns the text pair corresponding mask with 0 for the first sequence,
-                and 1 for the second.
             max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
                 If there are overflowing tokens, those will be added to the returned dictionary
             stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
@@ -739,133 +738,89 @@ class PreTrainedTokenizer(object):
             **kwargs: passed to the `self.tokenize()` method
         """
 
-        information = {}
-
         def get_input_ids(text):
             if isinstance(text, six.string_types):
-                input_ids = self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
+                return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], six.string_types):
-                input_ids = self.convert_tokens_to_ids(text)
+                return self.convert_tokens_to_ids(text)
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
-                input_ids = text
+                return text
             else:
                 raise ValueError("Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.")
 
-            return input_ids
+        first_ids = get_input_ids(text)
+        second_ids = get_input_ids(text_pair) if text_pair is not None else None
 
-        if text_pair is None:
-            sequence_tokens = get_input_ids(text)
+        return self.prepare_for_model(first_ids,
+                                      pair_ids=second_ids,
+                                      max_length=max_length,
+                                      add_special_tokens=add_special_tokens,
+                                      stride=stride,
+                                      truncate_first_sequence=truncate_first_sequence)
 
-            if add_special_tokens:
-                information = self.prepare_for_model(sequence_tokens, max_length=max_length, stride=stride)
-            else:
-                if max_length:
-                    information["overflowing_tokens"] = sequence_tokens[max_length - stride:]
-                    sequence_tokens = sequence_tokens[:max_length]
-                information["input_ids"] = sequence_tokens
 
-            if output_token_type:
-                information["token_type_ids"] = [0] * len(information["input_ids"])
-        else:
-            first_sentence_tokens = get_input_ids(text)
-            second_sentence_tokens = get_input_ids(text_pair)
-
-            if add_special_tokens:
-                information = self.prepare_pair_for_model(
-                    first_sentence_tokens,
-                    second_sentence_tokens,
-                    max_length=max_length,
-                    truncate_first_sequence=truncate_first_sequence,
-                    stride=stride
-                )
-
-                if output_token_type:
-                    information["token_type_ids"] = self.create_token_type_ids_from_sequences(text, text_pair)
-            else:
-                logger.warning("No special tokens were added. The two sequences have been concatenated.")
-                sequence = first_sentence_tokens + second_sentence_tokens
-
-                if max_length:
-                    information["overflowing_tokens"] = sequence[max_length - stride:]
-                    sequence = sequence[:max_length]
-                if output_token_type:
-                    information["token_type_ids"] = [0] * len(sequence)
-
-                information["input_ids"] = sequence
-
-        return information
-
-    def prepare_for_model(self, ids, max_length=None, stride=0):
+    def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0, truncate_first_sequence=True):
         """
-        Prepares a list of tokenized input ids so that it can be used by the model. It adds special tokens, truncates
+        Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model.
+        It adds special tokens, truncates
         sequences if overflowing while taking into account the special tokens and manages a window stride for
         overflowing tokens
 
         Args:
             ids: list of tokenized input ids. Can be obtained from a string by chaining the
                 `tokenize` and `convert_tokens_to_ids` methods.
+            pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the
+                `tokenize` and `convert_tokens_to_ids` methods.
             max_length: maximum length of the returned list. Will truncate by taking into account the special tokens.
+            add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
+                to their model.
             stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
                 list of inputs.
+            truncate_first_sequence: if set to `True` and an optional second list of input ids is provided,
+                alongside a specified `max_length`, will truncate the first sequence if the total size is greater
+                than the specified `max_length`. If set to `False`, will truncate the second sequence instead.
 
         Return:
             a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given.
+            The dictionary also always contains `token_type_ids`.
         """
-        information = {}
+        pair = pair_ids is not None
+        len_ids = len(ids)
+        len_pair_ids = len(pair_ids) if pair else 0
+
+        encoded_inputs = {}
         if max_length:
-            n_added_tokens = self.num_added_tokens()
-            information["overflowing_tokens"] = ids[max_length - n_added_tokens - stride:]
-            ids = ids[:max_length - n_added_tokens]
-        information["input_ids"] = self.add_special_tokens_single_sequence(ids)
-
-        return information
-
-    def prepare_pair_for_model(self, ids_0, ids_1, max_length=None, truncate_first_sequence=True, stride=0):
-        """
-        Prepares a list of tokenized input ids pair so that it can be used by the model. It adds special tokens,
-        truncates sequences if overflowing while taking into account the special tokens and manages a window stride for
-        overflowing tokens
-
-        Args:
-            ids_0: list of tokenized input ids. Can be obtained from a string by chaining the
-                `tokenize` and `convert_tokens_to_ids` methods.
-            ids_1: second list of tokenized input ids. Can be obtained from a string by chaining the
-                `tokenize` and `convert_tokens_to_ids` methods.
-            max_length: maximum length of the returned list. Will truncate by taking into account the special tokens.
-            truncate_first_sequence: if set to `True`, alongside a specified `max_length`, will truncate the first
-                sequence if the total size is superior than the specified `max_length`. If set to `False`, will
-                truncate the second sequence instead.
-            stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
-                list of inputs.
-
-        Return:
-            a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given.
-        """
-        f_len, s_len = len(ids_0), len(ids_1)
-        information = {}
-
-        if max_length:
-            n_added_tokens = self.num_added_tokens(pair=True)
-            if len(ids_0) + n_added_tokens >= max_length:
+            n_added_tokens = self.num_added_tokens(pair=pair) if add_special_tokens else 0
+            # For a pair, the sequence that is not selected for truncation must leave room for the other one
+            # (after the added tokens are counted); otherwise we only warn and skip truncation entirely.
+            if pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length:
                 logger.warning(
-                    "The first sequence is longer than the maximum specified length. This sequence will not be truncated.")
+                    "You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length. "
+                    "This pair of sequences will not be truncated.")
             else:
-                if f_len + s_len + self.num_added_tokens(pair=True) > max_length:
-                    if truncate_first_sequence:
-                        information["overflowing_tokens"] = ids_0[max_length - s_len - n_added_tokens - stride:]
-                        ids_0 = ids_0[:max_length - s_len - n_added_tokens]
+                if n_added_tokens + len_ids + len_pair_ids > max_length:
+                    if truncate_first_sequence or not pair:
+                        encoded_inputs["overflowing_tokens"] = ids[max_length - len_pair_ids - n_added_tokens - stride:]
+                        ids = ids[:max_length - len_pair_ids - n_added_tokens]
                     else:
-                        information["overflowing_tokens"] = ids_1[max_length - f_len - n_added_tokens - stride:]
-                        ids_1 = ids_1[:max_length - f_len - n_added_tokens]
+                        encoded_inputs["overflowing_tokens"] = pair_ids[max_length - len_ids - n_added_tokens - stride:]
+                        pair_ids = pair_ids[:max_length - len_ids - n_added_tokens]
 
-        sequence = self.add_special_tokens_sequence_pair(ids_0, ids_1)
-        information["input_ids"] = sequence
+        if add_special_tokens:
+            sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids)
+            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence)
+        else:
+            sequence = ids + pair_ids if pair else ids
+            token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
 
-        return information
+        encoded_inputs["input_ids"] = sequence
+        encoded_inputs["token_type_ids"] = token_type_ids
 
-    def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
+        return encoded_inputs
+
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
         logger.warning("This tokenizer does not make use of special tokens.")
-        return [0] * len(self.encode(sequence_0)) + [1] * len(self.encode(sequence_1))
+        return [0] * len(token_ids_0) + [1] * len(token_ids_1)
 
     def add_special_tokens_single_sequence(self, token_ids):
         logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py
index 833a8d8be6..f1e49416a4 100644
--- a/pytorch_transformers/tokenization_xlm.py
+++ b/pytorch_transformers/tokenization_xlm.py
@@ -770,7 +770,7 @@ class XLMTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + token_ids_1 + sep
 
-    def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         An XLM sequence pair mask has the following format:
@@ -780,7 +780,7 @@ class XLMTokenizer(PreTrainedTokenizer):
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
 
-        return len(cls + self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1) + sep) * [1]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
 
     def save_vocabulary(self, save_directory):
         """Save the tokenizer vocabulary and merge files to a directory."""
diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py
index 5febf16418..941c6c5bc3 100644
--- a/pytorch_transformers/tokenization_xlnet.py
+++ b/pytorch_transformers/tokenization_xlnet.py
@@ -200,7 +200,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return token_ids_0 + sep + token_ids_1 + sep + cls
 
-    def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         A BERT sequence pair mask has the following format:
@@ -211,7 +211,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         cls_segment_id = [2]
 
-        return len(self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1) + sep) * [1] + cls_segment_id
+        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
 
     def save_vocabulary(self, save_directory):
         """ Save the sentencepiece vocabulary (copy original file) and special tokens file