From f5bcde0b2fcfab961580129ec27ac72f43eaa267 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Mon, 30 Sep 2019 16:04:55 -0400
Subject: [PATCH 01/11] [multiple-choice] Simplify and use
 tokenizer.encode_plus

---
 examples/README.md                 |   8 +-
 examples/run_multiple_choice.py    |  16 +--
 examples/utils_multiple_choice.py  | 166 ++++++++++-------------------
 transformers/tokenization_utils.py |  68 ++++++++----
 4 files changed, 116 insertions(+), 142 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index fb5de20a2a..382d794fcb 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -9,7 +9,7 @@ similar API between the different models.
 | [Language Generation](#language-generation) | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.                                         |
 | [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision.                              |
 | [SQuAD](#squad) | Using BERT for question answering, examples with distributed training.                                                                                  |
-| [Multiple Choice](#multiple choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. 
+| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. 
 
 ## Language model fine-tuning
 
@@ -283,17 +283,17 @@ The results  are the following:
   loss = 0.04755385363816904
 ```
 
-##Multiple Choice
+## Multiple Choice
 
 Based on the script [`run_multiple_choice.py`]().
 
 #### Fine-tuning on SWAG
 Download [swag](https://github.com/rowanz/swagaf/tree/master/data) data
 
-```
+```bash
 #training on 4 tesla V100(16GB) GPUS
 export SWAG_DIR=/path/to/swag_data_dir
-python ./examples/single_model_scripts/run_multiple_choice.py \
+python ./examples/run_multiple_choice.py \
 --model_type roberta \
 --task_name swag \
 --model_name_or_path roberta-base \
diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py
index 54f3a8a904..2ce1327c98 100644
--- a/examples/run_multiple_choice.py
+++ b/examples/run_multiple_choice.py
@@ -306,14 +306,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
         else:
             examples = processor.get_train_examples(args.data_dir)
         logger.info("Training number: %s", str(len(examples)))
-        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer,
-            cls_token_at_end=bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
-            cls_token=tokenizer.cls_token,
-            sep_token=tokenizer.sep_token,
-            sep_token_extra=bool(args.model_type in ['roberta']),
-            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
+        features = convert_examples_to_features(
+            examples,
+            label_list,
+            args.max_seq_length,
+            tokenizer,
             pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
-            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
+            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0
+        )
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
             torch.save(features, cached_features_file)
@@ -362,7 +362,7 @@ def main():
                         help="Whether to run eval on the dev set.")
     parser.add_argument("--do_test", action='store_true', help='Whether to run test on the test set')
     parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Rul evaluation during training at each logging step.")
+                        help="Run evaluation during training at each logging step.")
     parser.add_argument("--do_lower_case", action='store_true',
                         help="Set this flag if you are using an uncased model.")
 
diff --git a/examples/utils_multiple_choice.py b/examples/utils_multiple_choice.py
index 7abcc5e1e9..50bb491243 100644
--- a/examples/utils_multiple_choice.py
+++ b/examples/utils_multiple_choice.py
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" BERT multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension  """
+""" Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension  """
 
 from __future__ import absolute_import, division, print_function
 
@@ -26,6 +26,8 @@ import json
 import csv
 import glob
 import tqdm
+from typing import List
+from transformers import PreTrainedTokenizer
 
 
 logger = logging.getLogger(__name__)
@@ -34,13 +36,13 @@ logger = logging.getLogger(__name__)
 class InputExample(object):
     """A single training/test example for multiple choice"""
 
-    def __init__(self, example_id, question,  contexts, endings, label=None):
+    def __init__(self, example_id, question, contexts, endings, label=None):
         """Constructs a InputExample.
 
         Args:
             example_id: Unique id for the example.
             contexts: list of str. The untokenized text of the first sequence (context of corresponding question).
-            question: string. The untokenized text of the second sequence (qustion).
+            question: string. The untokenized text of the second sequence (question).
             endings: list of str. multiple choice's options. Its length must be equal to contexts' length.
             label: (Optional) string. The label of the example. This should be
             specified for train and dev examples, but not for test examples.
@@ -66,7 +68,7 @@ class InputFeatures(object):
                 'input_mask': input_mask,
                 'segment_ids': segment_ids
             }
-            for _, input_ids, input_mask, segment_ids in choices_features
+            for input_ids, input_mask, segment_ids in choices_features
         ]
         self.label = label
 
@@ -192,7 +194,7 @@ class SwagProcessor(DataProcessor):
             return lines
 
 
-    def _create_examples(self, lines, type):
+    def _create_examples(self, lines: List[List[str]], type: str):
         """Creates examples for the training and dev sets."""
         if type == "train" and lines[0][-1] != 'label':
             raise ValueError(
@@ -300,24 +302,18 @@ class ArcProcessor(DataProcessor):
         return examples
 
 
-def convert_examples_to_features(examples, label_list, max_seq_length,
-                                 tokenizer,
-                                 cls_token_at_end=False,
-                                 cls_token='[CLS]',
-                                 cls_token_segment_id=1,
-                                 sep_token='[SEP]',
-                                 sequence_a_segment_id=0,
-                                 sequence_b_segment_id=1,
-                                 sep_token_extra=False,
-                                 pad_token_segment_id=0,
-                                 pad_on_left=False,
-                                 pad_token=0,
-                                 mask_padding_with_zero=True):
-    """ Loads a data file into a list of `InputBatch`s
-        `cls_token_at_end` define the location of the CLS token:
-            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
-            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
-        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
+def convert_examples_to_features(
+    examples: List[InputExample],
+    label_list: List[str],
+    max_length: int,
+    tokenizer: PreTrainedTokenizer,
+    pad_token_segment_id=0,
+    pad_on_left=False,
+    pad_token=0,
+    mask_padding_with_zero=True,
+) -> List[InputFeatures]:
+    """
+    Loads a data file into a list of `InputFeatures`
     """
 
     label_map = {label : i for i, label in enumerate(label_list)}
@@ -328,125 +324,71 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
             logger.info("Writing example %d of %d" % (ex_index, len(examples)))
         choices_features = []
         for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)):
-            tokens_a = tokenizer.tokenize(context)
-            tokens_b = None
+            text_a = context
             if example.question.find("_") != -1:
-                #this is for cloze question
-                tokens_b = tokenizer.tokenize(example.question.replace("_", ending))
+                # this is for cloze question
+                text_b = example.question.replace("_", ending)
             else:
-                tokens_b = tokenizer.tokenize(example.question + " " + ending)
-                # you can add seq token between quesiotn and ending. This does not make too much difference.
-                # tokens_b = tokenizer.tokenize(example.question)
-                # tokens_b += [sep_token]
-                # if sep_token_extra:
-                #     tokens_b += [sep_token]
-                # tokens_b += tokenizer.tokenize(ending)
+                text_b = example.question + " " + ending
 
-            special_tokens_count = 4 if sep_token_extra else 3
-            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
+            inputs = tokenizer.encode_plus(
+                text_a,
+                text_b,
+                add_special_tokens=True,
+                max_length=max_length,
+                truncate_both_sequences=True
+            )
+            if 'overflowing_tokens' in inputs and len(inputs['overflowing_tokens']) > 0:
+                logger.info('Attention! you are cropping tokens (swag task is ok). '
+                        'If you are training ARC and RACE and you are poping question + options,'
+                        'you need to try to use a bigger max seq length!')
 
-            # The convention in BERT is:
-            # (a) For sequence pairs:
-            #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-            #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
-            # (b) For single sequences:
-            #  tokens:   [CLS] the dog is hairy . [SEP]
-            #  type_ids:   0   0   0   0  0     0   0
-            #
-            # Where "type_ids" are used to indicate whether this is the first
-            # sequence or the second sequence. The embedding vectors for `type=0` and
-            # `type=1` were learned during pre-training and are added to the wordpiece
-            # embedding vector (and position vector). This is not *strictly* necessary
-            # since the [SEP] token unambiguously separates the sequences, but it makes
-            # it easier for the model to learn the concept of sequences.
-            #
-            # For classification tasks, the first vector (corresponding to [CLS]) is
-            # used as as the "sentence vector". Note that this only makes sense because
-            # the entire model is fine-tuned.
-            tokens = tokens_a + [sep_token]
-            if sep_token_extra:
-                # roberta uses an extra separator b/w pairs of sentences
-                tokens += [sep_token]
-
-            segment_ids = [sequence_a_segment_id] * len(tokens)
-
-            if tokens_b:
-                tokens += tokens_b + [sep_token]
-                segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
-
-            if cls_token_at_end:
-                tokens = tokens + [cls_token]
-                segment_ids = segment_ids + [cls_token_segment_id]
-            else:
-                tokens = [cls_token] + tokens
-                segment_ids = [cls_token_segment_id] + segment_ids
-
-            input_ids = tokenizer.convert_tokens_to_ids(tokens)
+            input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
 
             # The mask has 1 for real tokens and 0 for padding tokens. Only real
             # tokens are attended to.
-            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+            attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
 
             # Zero-pad up to the sequence length.
-            padding_length = max_seq_length - len(input_ids)
+            padding_length = max_length - len(input_ids)
             if pad_on_left:
                 input_ids = ([pad_token] * padding_length) + input_ids
-                input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
-                segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
+                attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
+                token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
             else:
                 input_ids = input_ids + ([pad_token] * padding_length)
-                input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
-                segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
+                attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
+                token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
+
+            assert len(input_ids) == max_length
+            assert len(attention_mask) == max_length
+            assert len(token_type_ids) == max_length
+            choices_features.append((input_ids, attention_mask, token_type_ids))
+
 
-            assert len(input_ids) == max_seq_length
-            assert len(input_mask) == max_seq_length
-            assert len(segment_ids) == max_seq_length
-            choices_features.append((tokens, input_ids, input_mask, segment_ids))
         label = label_map[example.label]
 
         if ex_index < 2:
             logger.info("*** Example ***")
             logger.info("race_id: {}".format(example.example_id))
-            for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
+            for choice_idx, (input_ids, attention_mask, token_type_ids) in enumerate(choices_features):
                 logger.info("choice: {}".format(choice_idx))
-                logger.info("tokens: {}".format(' '.join(tokens)))
                 logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
-                logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
-                logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
+                logger.info("attention_mask: {}".format(' '.join(map(str, attention_mask))))
+                logger.info("token_type_ids: {}".format(' '.join(map(str, token_type_ids))))
                 logger.info("label: {}".format(label))
 
         features.append(
             InputFeatures(
-                example_id = example.example_id,
-                choices_features = choices_features,
-                label = label
+                example_id=example.example_id,
+                choices_features=choices_features,
+                label=label,
             )
         )
 
     return features
 
 
-def _truncate_seq_pair(tokens_a, tokens_b, max_length):
-    """Truncates a sequence pair in place to the maximum length."""
-
-    # This is a simple heuristic which will always truncate the longer sequence
-    # one token at a time. This makes more sense than truncating an equal percent
-    # of tokens from each, since if one sequence is very short then each token
-    # that's truncated likely contains more information than a longer sequence.
-
-    # However, since we'd better not to remove tokens of options and questions, you can choose to use a bigger
-    # length or only pop from context
-    while True:
-        total_length = len(tokens_a) + len(tokens_b)
-        if total_length <= max_length:
-            break
-        if len(tokens_a) > len(tokens_b):
-            tokens_a.pop()
-        else:
-            logger.info('Attention! you are removing from token_b (swag task is ok). '
-                        'If you are training ARC and RACE (you are poping question + options), '
-                        'you need to try to use a bigger max seq length!')
-            tokens_b.pop()
 
 
 processors = {
@@ -456,7 +398,7 @@ processors = {
 }
 
 
-GLUE_TASKS_NUM_LABELS = {
+MULTIPLE_CHOICE_TASKS_NUM_LABELS = {
     "race", 4,
     "swag", 4,
     "arc", 4
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 1e20588f83..ea90f33a28 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -699,6 +699,7 @@ class PreTrainedTokenizer(object):
                 max_length=None,
                 stride=0,
                 truncate_first_sequence=True,
+                truncate_both_sequences=False,
                 return_tensors=None,
                 **kwargs):
         """
@@ -718,7 +719,7 @@ class PreTrainedTokenizer(object):
             max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
                 If there are overflowing tokens, those will be added to the returned dictionary
             stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
-                from the main sequence returned. The value of this argument defined the number of additional tokens.
+                from the main sequence returned. The value of this argument defines the number of additional tokens.
             truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence
                 will be truncated.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
@@ -731,6 +732,7 @@ class PreTrainedTokenizer(object):
                                           add_special_tokens=add_special_tokens,
                                           stride=stride,
                                           truncate_first_sequence=truncate_first_sequence,
+                                          truncate_both_sequences=truncate_both_sequences,
                                           return_tensors=return_tensors,
                                           **kwargs)
 
@@ -743,6 +745,7 @@ class PreTrainedTokenizer(object):
                     max_length=None,
                     stride=0,
                     truncate_first_sequence=True,
+                    truncate_both_sequences=False,
                     return_tensors=None,
                     **kwargs):
         """
@@ -761,7 +764,7 @@ class PreTrainedTokenizer(object):
             max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
                 If there are overflowing tokens, those will be added to the returned dictionary
             stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
-                from the main sequence returned. The value of this argument defined the number of additional tokens.
+                from the main sequence returned. The value of this argument defines the number of additional tokens.
             truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence
                 will be truncated.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
@@ -788,11 +791,12 @@ class PreTrainedTokenizer(object):
                                       add_special_tokens=add_special_tokens,
                                       stride=stride,
                                       truncate_first_sequence=truncate_first_sequence,
+                                      truncate_both_sequences=truncate_both_sequences,
                                       return_tensors=return_tensors)
 
 
     def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0,
-                          truncate_first_sequence=True, return_tensors=None):
+                          truncate_first_sequence=True, truncate_both_sequences=True, return_tensors=None):
         """
         Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
         It adds special tokens, truncates
@@ -825,21 +829,30 @@ class PreTrainedTokenizer(object):
         encoded_inputs = {}
         if max_length:
             n_added_tokens = self.num_added_tokens(pair=pair) if add_special_tokens else 0
-            if pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length:
-                logger.warning(
-                    "You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length."
-                    "This pair of sequences will not be truncated.")
-            else:
-                if n_added_tokens + len_ids + len_pair_ids > max_length:
-                    if truncate_first_sequence or not pair:
-                        encoded_inputs["overflowing_tokens"] = ids[max_length - len_pair_ids - n_added_tokens - stride:]
-                        ids = ids[:max_length - len_pair_ids - n_added_tokens]
-                    elif not truncate_first_sequence and pair:
-                        encoded_inputs["overflowing_tokens"] = pair_ids[max_length - len_ids - n_added_tokens - stride:]
-                        pair_ids = pair_ids[:max_length - len_ids - n_added_tokens]
-                    else:
-                        logger.warning(
-                            "Cannot truncate second sequence as it is not provided. No truncation.")
+
+            if n_added_tokens + len_ids + len_pair_ids > max_length:
+                if truncate_both_sequences:
+                    tokens_a, tokens_b = self._truncate_seq_pair(
+                        copy.deepcopy(ids),
+                        copy.deepcopy(pair_ids),
+                        max_length=max_length - n_added_tokens
+                    )
+                    encoded_inputs["overflowing_tokens"] = ids[- (len_ids - len(tokens_a)):] + pair_ids[- (len_pair_ids - len(tokens_b)):]
+                    ids = tokens_a
+                    pair_ids = tokens_b
+                elif pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length:
+                    logger.warning(
+                        "You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length."
+                        "This pair of sequences will not be truncated.")
+                elif truncate_first_sequence or not pair:
+                    encoded_inputs["overflowing_tokens"] = ids[max_length - len_pair_ids - n_added_tokens - stride:]
+                    ids = ids[:max_length - len_pair_ids - n_added_tokens]
+                elif not truncate_first_sequence and pair:
+                    encoded_inputs["overflowing_tokens"] = pair_ids[max_length - len_ids - n_added_tokens - stride:]
+                    pair_ids = pair_ids[:max_length - len_ids - n_added_tokens]
+                else:
+                    logger.warning(
+                        "Cannot truncate second sequence as it is not provided. No truncation.")
 
         if add_special_tokens:
             sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids)
@@ -862,6 +875,25 @@ class PreTrainedTokenizer(object):
 
         return encoded_inputs
 
+    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
+        """Truncates a sequence pair in place to the maximum length."""
+
+        # This is a simple heuristic which will always truncate the longer sequence
+        # one token at a time. This makes more sense than truncating an equal percent
+        # of tokens from each, since if one sequence is very short then each token
+        # that's truncated likely contains more information than a longer sequence.
+
+        # However, since we'd better not to remove tokens of options and questions, you can choose to use a bigger
+        # length or only pop from context
+        while True:
+            total_length = len(tokens_a) + len(tokens_b)
+            if total_length <= max_length:
+                return (tokens_a, tokens_b)
+            if len(tokens_a) > len(tokens_b):
+                tokens_a.pop()
+            else:
+                tokens_b.pop()
+
     def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
         logger.warning("This tokenizer does not make use of special tokens.")
         return [0] * len(token_ids_0) + [1] * len(token_ids_1)

From b3506629551b5496aa7aa80923abade04b70f166 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Mon, 30 Sep 2019 16:37:09 -0400
Subject: [PATCH 02/11] overflowing_tokens do not really make sense here, let's
 just return a number

Co-Authored-By: Lysandre Debut <lysandre.debut@reseau.eseo.fr>
---
 examples/utils_multiple_choice.py  | 2 +-
 transformers/tokenization_utils.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/utils_multiple_choice.py b/examples/utils_multiple_choice.py
index 50bb491243..a7fc1b1222 100644
--- a/examples/utils_multiple_choice.py
+++ b/examples/utils_multiple_choice.py
@@ -338,7 +338,7 @@ def convert_examples_to_features(
                 max_length=max_length,
                 truncate_both_sequences=True
             )
-            if 'overflowing_tokens' in inputs and len(inputs['overflowing_tokens']) > 0:
+            if 'num_truncated_tokens' in inputs and inputs['num_truncated_tokens'] > 0:
                 logger.info('Attention! you are cropping tokens (swag task is ok). '
                         'If you are training ARC and RACE and you are poping question + options,'
                         'you need to try to use a bigger max seq length!')
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index ea90f33a28..27b9b0638d 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -837,7 +837,8 @@ class PreTrainedTokenizer(object):
                         copy.deepcopy(pair_ids),
                         max_length=max_length - n_added_tokens
                     )
-                    encoded_inputs["overflowing_tokens"] = ids[- (len_ids - len(tokens_a)):] + pair_ids[- (len_pair_ids - len(tokens_b)):]
+                    truncated_tokens = ids[- (len_ids - len(tokens_a)):] + pair_ids[- (len_pair_ids - len(tokens_b)):]
+                    encoded_inputs["num_truncated_tokens"] = len(truncated_tokens)
                     ids = tokens_a
                     pair_ids = tokens_b
                 elif pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length:

From 7c789c337d9a4f6e9e904d3c1351c28a94183894 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Mon, 30 Sep 2019 10:20:14 -0400
Subject: [PATCH 03/11] Always truncate argument in the encode method

---
 .../tests/tokenization_tests_commons.py       | 17 ++++++++
 transformers/tokenization_utils.py            | 43 +++++++++++++------
 2 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py
index b71ba44436..73a2cb44b3 100644
--- a/transformers/tests/tokenization_tests_commons.py
+++ b/transformers/tests/tokenization_tests_commons.py
@@ -232,6 +232,23 @@ class CommonTestCases:
             assert len(truncated_sequence) == total_length - 2
             assert truncated_sequence == tokenizer.add_special_tokens_single_sequence(sequence[:-2])
 
+        def test_always_truncate(self):
+            tokenizer = self.get_tokenizer()
+
+            seq_0 = "This is a sentence to be encoded."
+            length_single_sequence = len(tokenizer.encode(seq_0))
+            length = len(tokenizer.encode(seq_0, seq_0, add_special_tokens=True))
+
+            not_truncated = tokenizer.encode(seq_0, seq_0, add_special_tokens=True, max_length=length_single_sequence)
+            truncated = tokenizer.encode(
+                seq_0, seq_0,
+                max_length=length_single_sequence,
+                add_special_tokens=True,
+                always_truncate=True
+            )
+
+            assert truncated == not_truncated[:length_single_sequence - length]
+
         def test_maximum_encoding_length_pair_input(self):
             tokenizer = self.get_tokenizer()
 
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index db9e9cd72e..fc7fe1df67 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -693,14 +693,15 @@ class PreTrainedTokenizer(object):
         raise NotImplementedError
 
     def encode(self,
-                text,
-                text_pair=None,
-                add_special_tokens=False,
-                max_length=None,
-                stride=0,
-                truncate_first_sequence=True,
-                return_tensors=None,
-                **kwargs):
+               text,
+               text_pair=None,
+               add_special_tokens=False,
+               max_length=None,
+               stride=0,
+               truncate_first_sequence=True,
+               return_tensors=None,
+               always_truncate=False,
+               **kwargs):
         """
         Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
 
@@ -721,6 +722,8 @@ class PreTrainedTokenizer(object):
                 from the main sequence returned. The value of this argument defined the number of additional tokens.
             truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence
                 will be truncated.
+            always_truncate: if set to True, will always truncate the sequences when overflowing, even if one of the
+                sequences may be lost in the process.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
             **kwargs: passed to the `self.tokenize()` method
@@ -732,6 +735,7 @@ class PreTrainedTokenizer(object):
                                           stride=stride,
                                           truncate_first_sequence=truncate_first_sequence,
                                           return_tensors=return_tensors,
+                                          always_truncate=always_truncate,
                                           **kwargs)
 
         return encoded_inputs["input_ids"]
@@ -744,6 +748,7 @@ class PreTrainedTokenizer(object):
                     stride=0,
                     truncate_first_sequence=True,
                     return_tensors=None,
+                    always_truncate=False,
                     **kwargs):
         """
         Returns a dictionary containing the encoded sequence or sequence pair and additional informations:
@@ -764,6 +769,8 @@ class PreTrainedTokenizer(object):
                 from the main sequence returned. The value of this argument defined the number of additional tokens.
             truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence
                 will be truncated.
+            always_truncate: if set to True, will always truncate the sequences when overflowing, even if one of the
+                sequences may be lost in the process.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
             **kwargs: passed to the `self.tokenize()` method
@@ -788,11 +795,12 @@ class PreTrainedTokenizer(object):
                                       add_special_tokens=add_special_tokens,
                                       stride=stride,
                                       truncate_first_sequence=truncate_first_sequence,
+                                      always_truncate=always_truncate,
                                       return_tensors=return_tensors)
 
 
     def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0,
-                          truncate_first_sequence=True, return_tensors=None):
+                          truncate_first_sequence=True, always_truncate=False, return_tensors=None):
         """
         Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
         It adds special tokens, truncates
@@ -812,6 +820,8 @@ class PreTrainedTokenizer(object):
             truncate_first_sequence: if set to `True` and an optional second list of input ids is provided,
                 alongside a specified `max_length`, will truncate the first sequence if the total size is superior
                 than the specified `max_length`. If set to `False`, will truncate the second sequence instead.
+            always_truncate: if set to True, will always truncate the sequences when overflowing, even if one of the
+                sequences may be lost in the process.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
 
@@ -826,9 +836,14 @@ class PreTrainedTokenizer(object):
         if max_length:
             n_added_tokens = self.num_added_tokens(pair=pair) if add_special_tokens else 0
             if pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length:
-                logger.warning(
-                    "You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length."
-                    "This pair of sequences will not be truncated.")
+                if always_truncate:
+                    logger.warning(
+                        "You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length. "
+                        "This pair of sequences will be truncated but one of the sequences may not be present in the resulting list of ids.")
+                else:
+                    logger.warning(
+                        "You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length. "
+                        "This pair of sequences will not be truncated.")
             else:
                 if n_added_tokens + len_ids + len_pair_ids > max_length:
                     if truncate_first_sequence or not pair:
@@ -860,6 +875,10 @@ class PreTrainedTokenizer(object):
         encoded_inputs["input_ids"] = sequence
         encoded_inputs["token_type_ids"] = token_type_ids
 
+        if always_truncate and len(encoded_inputs["input_ids"]) > max_length:
+            encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length]
+            encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
+
         return encoded_inputs
 
     def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):

From 2f259b228e9901e1bd46b9cb0692a40a9a7e98e7 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Mon, 30 Sep 2019 11:48:18 -0400
Subject: [PATCH 04/11] Sequence IDS

---
 .../tests/tokenization_tests_commons.py       | 30 +++++++++++++++++++
 transformers/tokenization_bert.py             | 18 +++++++++++
 transformers/tokenization_roberta.py          | 18 +++++++++++
 transformers/tokenization_utils.py            | 20 ++++++++++++-
 transformers/tokenization_xlm.py              | 18 +++++++++++
 transformers/tokenization_xlnet.py            | 18 +++++++++++
 6 files changed, 121 insertions(+), 1 deletion(-)

diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py
index 73a2cb44b3..f44be1f97b 100644
--- a/transformers/tests/tokenization_tests_commons.py
+++ b/transformers/tests/tokenization_tests_commons.py
@@ -292,3 +292,33 @@ class CommonTestCases:
 
             assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input
             assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input
+
+        def test_sequence_ids(self):
+            tokenizer = self.get_tokenizer()
+
+            sequence_0 = "Encode this."
+            sequence_1 = "This one too please."
+
+            # Testing single inputs
+            encoded_sequence = tokenizer.encode(sequence_0)
+            encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
+            encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
+            sequence_ids = encoded_sequence_dict["sequence_ids"]
+            assert len(sequence_ids) == len(encoded_sequence_w_special)
+
+            filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
+            filtered_sequence = [x for x in filtered_sequence if x is not None]
+            assert encoded_sequence == filtered_sequence
+
+            # Testing inputs pairs
+            encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1)
+            encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
+            encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
+            sequence_ids = encoded_sequence_dict["sequence_ids"]
+            assert len(sequence_ids) == len(encoded_sequence_w_special)
+
+            filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
+            filtered_sequence = [x for x in filtered_sequence if x is not None]
+            assert encoded_sequence == filtered_sequence
+
+
diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py
index 42163cb8ec..b05f88b7a8 100644
--- a/transformers/tokenization_bert.py
+++ b/transformers/tokenization_bert.py
@@ -204,6 +204,24 @@ class BertTokenizer(PreTrainedTokenizer):
 
         return cls + token_ids_0 + sep + token_ids_1 + sep
 
+    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+
+        Args:
+            token_ids_0: list of ids (must not contain special tokens)
+            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
+            for sequence pairs
+
+        Returns:
+            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
+        """
+        if token_ids_1:
+            return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0]
+        else:
+            return [0] + ([1] * len(token_ids_0)) + [0]
+
     def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
diff --git a/transformers/tokenization_roberta.py b/transformers/tokenization_roberta.py
index 7adeea689e..c371ae3904 100644
--- a/transformers/tokenization_roberta.py
+++ b/transformers/tokenization_roberta.py
@@ -100,6 +100,24 @@ class RobertaTokenizer(GPT2Tokenizer):
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
 
+    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+
+        Args:
+            token_ids_0: list of ids (must not contain special tokens)
+            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
+            for sequence pairs
+
+        Returns:
+            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
+        """
+        if token_ids_1:
+            return [0] + ([1] * len(token_ids_0)) + [0, 0] + ([1] * len(token_ids_1)) + [0]
+        else:
+            return [0] + ([1] * len(token_ids_0)) + [0]
+
     def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index fc7fe1df67..5aa7f55730 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -826,7 +826,21 @@ class PreTrainedTokenizer(object):
                 or PyTorch torch.Tensor instead of a list of python integers.
 
         Return:
-            a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given.
+            A Dictionary of shape::
+
+                {
+                    input_ids: list[int],
+                    overflowing_tokens: list[int] if a ``max_length`` is specified, else None
+                    sequence_ids: list[int] if ``add_special_tokens`` if set to ``True``
+                }
+
+            With the fields:
+                ``input_ids``: list of tokens to be fed to a model
+
+                ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
+
+                ``sequence_ids``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
+                tokens and 1 specifying sequence tokens.
         """
         pair = bool(pair_ids is not None)
         len_ids = len(ids)
@@ -859,6 +873,7 @@ class PreTrainedTokenizer(object):
         if add_special_tokens:
             sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids)
             token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence)
+            encoded_inputs["sequence_ids"] = self.get_sequence_ids(ids, pair_ids)
         else:
             sequence = ids + pair_ids if pair else ids
             token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
@@ -893,6 +908,9 @@ class PreTrainedTokenizer(object):
         logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
         return token_ids_0 + token_ids_1
 
+    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
+        return [1] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
+
     def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
         """ Converts a single index or a sequence of indices (integers) in a token "
             (resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens.
diff --git a/transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py
index f1e49416a4..b6e27ee59e 100644
--- a/transformers/tokenization_xlm.py
+++ b/transformers/tokenization_xlm.py
@@ -770,6 +770,24 @@ class XLMTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + token_ids_1 + sep
 
+    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+
+        Args:
+            token_ids_0: list of ids (must not contain special tokens)
+            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
+            for sequence pairs
+
+        Returns:
+            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
+        """
+        if token_ids_1:
+            return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0]
+        else:
+            return [0] + ([1] * len(token_ids_0)) + [0]
+
     def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py
index ad9efdf043..c8d59decee 100644
--- a/transformers/tokenization_xlnet.py
+++ b/transformers/tokenization_xlnet.py
@@ -200,6 +200,24 @@ class XLNetTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return token_ids_0 + sep + token_ids_1 + sep + cls
 
+    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+
+        Args:
+            token_ids_0: list of ids (must not contain special tokens)
+            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
+            for sequence pairs
+
+        Returns:
+            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
+        """
+        if token_ids_1:
+            return ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0, 0]
+        else:
+            return ([1] * len(token_ids_0)) + [0, 0]
+
     def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.

From cc412edd42102e6e068b7cbb34d0cf11856d6ae8 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Mon, 30 Sep 2019 14:11:41 -0400
Subject: [PATCH 05/11] Supports already existing special tokens

---
 transformers/tests/tokenization_tests_commons.py | 12 ++++++++++++
 transformers/tokenization_bert.py                |  6 +++++-
 transformers/tokenization_roberta.py             |  6 +++++-
 transformers/tokenization_utils.py               |  2 +-
 transformers/tokenization_xlm.py                 |  6 +++++-
 transformers/tokenization_xlnet.py               |  6 +++++-
 6 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py
index f44be1f97b..d656e165aa 100644
--- a/transformers/tests/tokenization_tests_commons.py
+++ b/transformers/tests/tokenization_tests_commons.py
@@ -321,4 +321,16 @@ class CommonTestCases:
             filtered_sequence = [x for x in filtered_sequence if x is not None]
             assert encoded_sequence == filtered_sequence
 
+            # Testing with already existing special tokens
+            if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.cls_token_id == tokenizer.unk_token_id:
+                tokenizer.add_special_tokens({'cls_token': '</s>', 'sep_token': '<s>'})
+            encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
+            encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
+            sequence_ids_orig = encoded_sequence_dict["sequence_ids"]
+            sequence_ids = tokenizer.get_sequence_ids(encoded_sequence_w_special, special_tokens_present=True)
+            assert len(sequence_ids) == len(encoded_sequence_w_special)
+            print(sequence_ids_orig, sequence_ids)
+            assert sequence_ids_orig == sequence_ids
+
+
 
diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py
index b05f88b7a8..72720bfa4e 100644
--- a/transformers/tokenization_bert.py
+++ b/transformers/tokenization_bert.py
@@ -204,7 +204,7 @@ class BertTokenizer(PreTrainedTokenizer):
 
         return cls + token_ids_0 + sep + token_ids_1 + sep
 
-    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
+    def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
@@ -217,6 +217,10 @@ class BertTokenizer(PreTrainedTokenizer):
         Returns:
             A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
         """
+
+        if special_tokens_present:
+            return list(map(lambda x: 0 if x in [self.sep_token_id, self.cls_token_id] else 1, token_ids_0))
+
         if token_ids_1:
             return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0]
         else:
diff --git a/transformers/tokenization_roberta.py b/transformers/tokenization_roberta.py
index c371ae3904..6f19ee699c 100644
--- a/transformers/tokenization_roberta.py
+++ b/transformers/tokenization_roberta.py
@@ -100,7 +100,7 @@ class RobertaTokenizer(GPT2Tokenizer):
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
 
-    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
+    def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
@@ -113,6 +113,10 @@ class RobertaTokenizer(GPT2Tokenizer):
         Returns:
             A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
         """
+
+        if special_tokens_present:
+            return list(map(lambda x: 0 if x in [self.sep_token_id, self.cls_token_id] else 1, token_ids_0))
+
         if token_ids_1:
             return [0] + ([1] * len(token_ids_0)) + [0, 0] + ([1] * len(token_ids_1)) + [0]
         else:
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 5aa7f55730..a0f1382aa2 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -908,7 +908,7 @@ class PreTrainedTokenizer(object):
         logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
         return token_ids_0 + token_ids_1
 
-    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
+    def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
         return [1] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
 
     def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
diff --git a/transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py
index b6e27ee59e..3f398853dc 100644
--- a/transformers/tokenization_xlm.py
+++ b/transformers/tokenization_xlm.py
@@ -770,7 +770,7 @@ class XLMTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + token_ids_1 + sep
 
-    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
+    def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
@@ -783,6 +783,10 @@ class XLMTokenizer(PreTrainedTokenizer):
         Returns:
             A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
         """
+
+        if special_tokens_present:
+            return list(map(lambda x: 0 if x in [self.sep_token_id, self.cls_token_id] else 1, token_ids_0))
+
         if token_ids_1:
             return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0]
         else:
diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py
index c8d59decee..325a32ef6f 100644
--- a/transformers/tokenization_xlnet.py
+++ b/transformers/tokenization_xlnet.py
@@ -200,7 +200,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return token_ids_0 + sep + token_ids_1 + sep + cls
 
-    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
+    def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
@@ -213,6 +213,10 @@ class XLNetTokenizer(PreTrainedTokenizer):
         Returns:
             A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
         """
+
+        if special_tokens_present:
+            return list(map(lambda x: 0 if x in [self.sep_token_id, self.cls_token_id] else 1, token_ids_0))
+
         if token_ids_1:
             return ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0, 0]
         else:

From 5ed50a93fb4fc4ed554a54835060ab4721123d07 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Mon, 30 Sep 2019 14:14:27 -0400
Subject: [PATCH 06/11] LM finetuning won't mask special tokens anymore

---
 examples/run_lm_finetuning.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index a91deebb6c..024b254b56 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -108,7 +108,12 @@ def mask_tokens(inputs, tokenizer, args):
     """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
     labels = inputs.clone()
     # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
-    masked_indices = torch.bernoulli(torch.full(labels.shape, args.mlm_probability)).bool()
+    probability_matrix = torch.full(labels.shape, args.mlm_probability)
+    probability_matrix *= torch.tensor(
+        [tokenizer.get_sequence_ids(val, special_tokens_present=True) for val in labels.tolist()],
+        dtype=torch.float
+    )
+    masked_indices = torch.bernoulli(probability_matrix).bool()
     labels[~masked_indices] = -1  # We only compute loss on masked tokens
 
     # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])

From 651bfb7ad57387b7d645420f56195b9134fff99f Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Mon, 30 Sep 2019 17:27:40 -0400
Subject: [PATCH 07/11] always_truncate by default

---
 .../tests/tokenization_tests_commons.py       | 18 -------------
 transformers/tokenization_utils.py            | 26 ++++---------------
 2 files changed, 5 insertions(+), 39 deletions(-)

diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py
index d656e165aa..f4805598cd 100644
--- a/transformers/tests/tokenization_tests_commons.py
+++ b/transformers/tests/tokenization_tests_commons.py
@@ -232,23 +232,6 @@ class CommonTestCases:
             assert len(truncated_sequence) == total_length - 2
             assert truncated_sequence == tokenizer.add_special_tokens_single_sequence(sequence[:-2])
 
-        def test_always_truncate(self):
-            tokenizer = self.get_tokenizer()
-
-            seq_0 = "This is a sentence to be encoded."
-            length_single_sequence = len(tokenizer.encode(seq_0))
-            length = len(tokenizer.encode(seq_0, seq_0, add_special_tokens=True))
-
-            not_truncated = tokenizer.encode(seq_0, seq_0, add_special_tokens=True, max_length=length_single_sequence)
-            truncated = tokenizer.encode(
-                seq_0, seq_0,
-                max_length=length_single_sequence,
-                add_special_tokens=True,
-                always_truncate=True
-            )
-
-            assert truncated == not_truncated[:length_single_sequence - length]
-
         def test_maximum_encoding_length_pair_input(self):
             tokenizer = self.get_tokenizer()
 
@@ -329,7 +312,6 @@ class CommonTestCases:
             sequence_ids_orig = encoded_sequence_dict["sequence_ids"]
             sequence_ids = tokenizer.get_sequence_ids(encoded_sequence_w_special, special_tokens_present=True)
             assert len(sequence_ids) == len(encoded_sequence_w_special)
-            print(sequence_ids_orig, sequence_ids)
             assert sequence_ids_orig == sequence_ids
 
 
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index a0f1382aa2..527ee8422d 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -700,7 +700,6 @@ class PreTrainedTokenizer(object):
                stride=0,
                truncate_first_sequence=True,
                return_tensors=None,
-               always_truncate=False,
                **kwargs):
         """
         Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
@@ -722,8 +721,6 @@ class PreTrainedTokenizer(object):
                 from the main sequence returned. The value of this argument defined the number of additional tokens.
             truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence
                 will be truncated.
-            always_truncate: if set to True, will always truncate the sequences when overflowing, even if one of the
-                sequences may be lost in the process.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
             **kwargs: passed to the `self.tokenize()` method
@@ -735,7 +732,6 @@ class PreTrainedTokenizer(object):
                                           stride=stride,
                                           truncate_first_sequence=truncate_first_sequence,
                                           return_tensors=return_tensors,
-                                          always_truncate=always_truncate,
                                           **kwargs)
 
         return encoded_inputs["input_ids"]
@@ -748,7 +744,6 @@ class PreTrainedTokenizer(object):
                     stride=0,
                     truncate_first_sequence=True,
                     return_tensors=None,
-                    always_truncate=False,
                     **kwargs):
         """
         Returns a dictionary containing the encoded sequence or sequence pair and additional informations:
@@ -769,8 +764,6 @@ class PreTrainedTokenizer(object):
                 from the main sequence returned. The value of this argument defined the number of additional tokens.
             truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence
                 will be truncated.
-            always_truncate: if set to True, will always truncate the sequences when overflowing, even if one of the
-                sequences may be lost in the process.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
             **kwargs: passed to the `self.tokenize()` method
@@ -795,12 +788,10 @@ class PreTrainedTokenizer(object):
                                       add_special_tokens=add_special_tokens,
                                       stride=stride,
                                       truncate_first_sequence=truncate_first_sequence,
-                                      always_truncate=always_truncate,
                                       return_tensors=return_tensors)
 
-
     def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0,
-                          truncate_first_sequence=True, always_truncate=False, return_tensors=None):
+                          truncate_first_sequence=True, return_tensors=None):
         """
         Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
         It adds special tokens, truncates
@@ -820,8 +811,6 @@ class PreTrainedTokenizer(object):
             truncate_first_sequence: if set to `True` and an optional second list of input ids is provided,
                 alongside a specified `max_length`, will truncate the first sequence if the total size is superior
                 than the specified `max_length`. If set to `False`, will truncate the second sequence instead.
-            always_truncate: if set to True, will always truncate the sequences when overflowing, even if one of the
-                sequences may be lost in the process.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
 
@@ -850,14 +839,9 @@ class PreTrainedTokenizer(object):
         if max_length:
             n_added_tokens = self.num_added_tokens(pair=pair) if add_special_tokens else 0
             if pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length:
-                if always_truncate:
-                    logger.warning(
-                        "You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length. "
-                        "This pair of sequences will be truncated but one of the sequences may not be present in the resulting list of ids.")
-                else:
-                    logger.warning(
-                        "You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length. "
-                        "This pair of sequences will not be truncated.")
+                logger.warning(
+                    "You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length. "
+                    "This pair of sequences will be truncated with no regard to the special tokens")
             else:
                 if n_added_tokens + len_ids + len_pair_ids > max_length:
                     if truncate_first_sequence or not pair:
@@ -890,7 +874,7 @@ class PreTrainedTokenizer(object):
         encoded_inputs["input_ids"] = sequence
         encoded_inputs["token_type_ids"] = token_type_ids
 
-        if always_truncate and len(encoded_inputs["input_ids"]) > max_length:
+        if max_length and len(encoded_inputs["input_ids"]) > max_length:
             encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length]
             encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
 

From aebd83230f4a70df386ea2104c21ec46e9320f7f Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 2 Oct 2019 18:04:38 -0400
Subject: [PATCH 08/11] Update naming + remove f string in run_lm_finetuning
 example

---
 examples/run_lm_finetuning.py                 |  4 ++--
 .../tests/tokenization_tests_commons.py       | 22 +++++++++----------
 transformers/tokenization_bert.py             |  2 +-
 transformers/tokenization_roberta.py          |  2 +-
 transformers/tokenization_utils.py            |  9 ++++----
 transformers/tokenization_xlm.py              |  2 +-
 transformers/tokenization_xlnet.py            |  2 +-
 7 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index 024b254b56..585bbc8d75 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -59,7 +59,7 @@ class TextDataset(Dataset):
     def __init__(self, tokenizer, file_path='train', block_size=512):
         assert os.path.isfile(file_path)
         directory, filename = os.path.split(file_path)
-        cached_features_file = os.path.join(directory, 'cached_lm_{}_{}'.format(block_size, filename))
+        cached_features_file = os.path.join(directory, 'cached_lm_' + block_size + '_' + filename)
 
         if os.path.exists(cached_features_file):
             logger.info("Loading features from cached file %s", cached_features_file)
@@ -110,7 +110,7 @@ def mask_tokens(inputs, tokenizer, args):
     # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
     probability_matrix = torch.full(labels.shape, args.mlm_probability)
     probability_matrix *= torch.tensor(
-        [tokenizer.get_sequence_ids(val, special_tokens_present=True) for val in labels.tolist()],
+        [tokenizer.get_special_tokens_mask(val, special_tokens_present=True) for val in labels.tolist()],
         dtype=torch.float
     )
     masked_indices = torch.bernoulli(probability_matrix).bool()
diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py
index f4805598cd..c902347fe5 100644
--- a/transformers/tests/tokenization_tests_commons.py
+++ b/transformers/tests/tokenization_tests_commons.py
@@ -276,7 +276,7 @@ class CommonTestCases:
             assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input
             assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input
 
-        def test_sequence_ids(self):
+        def test_special_tokens_mask(self):
             tokenizer = self.get_tokenizer()
 
             sequence_0 = "Encode this."
@@ -286,10 +286,10 @@ class CommonTestCases:
             encoded_sequence = tokenizer.encode(sequence_0)
             encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
             encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
-            sequence_ids = encoded_sequence_dict["sequence_ids"]
-            assert len(sequence_ids) == len(encoded_sequence_w_special)
+            special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
+            assert len(special_tokens_mask) == len(encoded_sequence_w_special)
 
-            filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
+            filtered_sequence = [(x if special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
             filtered_sequence = [x for x in filtered_sequence if x is not None]
             assert encoded_sequence == filtered_sequence
 
@@ -297,10 +297,10 @@ class CommonTestCases:
             encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1)
             encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
             encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
-            sequence_ids = encoded_sequence_dict["sequence_ids"]
-            assert len(sequence_ids) == len(encoded_sequence_w_special)
+            special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
+            assert len(special_tokens_mask) == len(encoded_sequence_w_special)
 
-            filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
+            filtered_sequence = [(x if special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
             filtered_sequence = [x for x in filtered_sequence if x is not None]
             assert encoded_sequence == filtered_sequence
 
@@ -309,10 +309,10 @@ class CommonTestCases:
                 tokenizer.add_special_tokens({'cls_token': '</s>', 'sep_token': '<s>'})
             encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
             encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
-            sequence_ids_orig = encoded_sequence_dict["sequence_ids"]
-            sequence_ids = tokenizer.get_sequence_ids(encoded_sequence_w_special, special_tokens_present=True)
-            assert len(sequence_ids) == len(encoded_sequence_w_special)
-            assert sequence_ids_orig == sequence_ids
+            special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
+            special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, special_tokens_present=True)
+            assert len(special_tokens_mask) == len(encoded_sequence_w_special)
+            assert special_tokens_mask_orig == special_tokens_mask
 
 
 
diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py
index 72720bfa4e..cf6ab51d02 100644
--- a/transformers/tokenization_bert.py
+++ b/transformers/tokenization_bert.py
@@ -204,7 +204,7 @@ class BertTokenizer(PreTrainedTokenizer):
 
         return cls + token_ids_0 + sep + token_ids_1 + sep
 
-    def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
diff --git a/transformers/tokenization_roberta.py b/transformers/tokenization_roberta.py
index 6f19ee699c..ceb1245169 100644
--- a/transformers/tokenization_roberta.py
+++ b/transformers/tokenization_roberta.py
@@ -100,7 +100,7 @@ class RobertaTokenizer(GPT2Tokenizer):
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
 
-    def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 527ee8422d..bb69df8698 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -820,7 +820,7 @@ class PreTrainedTokenizer(object):
                 {
                     input_ids: list[int],
                     overflowing_tokens: list[int] if a ``max_length`` is specified, else None
-                    sequence_ids: list[int] if ``add_special_tokens`` if set to ``True``
+                    special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True``
                 }
 
             With the fields:
@@ -828,7 +828,7 @@ class PreTrainedTokenizer(object):
 
                 ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
 
-                ``sequence_ids``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
+                ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
                 tokens and 1 specifying sequence tokens.
         """
         pair = bool(pair_ids is not None)
@@ -857,7 +857,7 @@ class PreTrainedTokenizer(object):
         if add_special_tokens:
             sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids)
             token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence)
-            encoded_inputs["sequence_ids"] = self.get_sequence_ids(ids, pair_ids)
+            encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
         else:
             sequence = ids + pair_ids if pair else ids
             token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
@@ -877,6 +877,7 @@ class PreTrainedTokenizer(object):
         if max_length and len(encoded_inputs["input_ids"]) > max_length:
             encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length]
             encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
+            encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]
 
         return encoded_inputs
 
@@ -892,7 +893,7 @@ class PreTrainedTokenizer(object):
         logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
         return token_ids_0 + token_ids_1
 
-    def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
         return [1] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
 
     def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
diff --git a/transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py
index 3f398853dc..817c6d8c44 100644
--- a/transformers/tokenization_xlm.py
+++ b/transformers/tokenization_xlm.py
@@ -770,7 +770,7 @@ class XLMTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + token_ids_1 + sep
 
-    def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py
index 325a32ef6f..37f975abef 100644
--- a/transformers/tokenization_xlnet.py
+++ b/transformers/tokenization_xlnet.py
@@ -200,7 +200,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return token_ids_0 + sep + token_ids_1 + sep + cls
 
-    def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

From 9e136ff57c268bfe0c0bfb322401684aaba12d15 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Fri, 4 Oct 2019 15:00:56 -0400
Subject: [PATCH 09/11] Honor args.overwrite_cache (h/t @erenup)

---
 examples/run_glue.py            | 2 +-
 examples/run_multiple_choice.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index fc3b617da0..3a6eac63ad 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -270,7 +270,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
         list(filter(None, args.model_name_or_path.split('/'))).pop(),
         str(args.max_seq_length),
         str(task)))
-    if os.path.exists(cached_features_file):
+    if os.path.exists(cached_features_file) and not args.overwrite_cache:
         logger.info("Loading features from cached file %s", cached_features_file)
         features = torch.load(cached_features_file)
     else:
diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py
index 2ce1327c98..223316cbcb 100644
--- a/examples/run_multiple_choice.py
+++ b/examples/run_multiple_choice.py
@@ -293,7 +293,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
         list(filter(None, args.model_name_or_path.split('/'))).pop(),
         str(args.max_seq_length),
         str(task)))
-    if os.path.exists(cached_features_file):
+    if os.path.exists(cached_features_file) and not args.overwrite_cache:
         logger.info("Loading features from cached file %s", cached_features_file)
         features = torch.load(cached_features_file)
     else:

From 6c1d0bc0665ef01710db301fb1a0a3c23778714a Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 4 Oct 2019 17:38:38 -0400
Subject: [PATCH 10/11] update encode_plus - add truncation strategies

---
 examples/run_lm_finetuning.py                 |   8 +-
 transformers/tests/tokenization_bert_test.py  |   4 +-
 .../tests/tokenization_distilbert_test.py     |   4 +-
 .../tests/tokenization_roberta_test.py        |   4 +-
 .../tests/tokenization_tests_commons.py       |  51 +++---
 transformers/tests/tokenization_xlm_test.py   |   4 +-
 transformers/tests/tokenization_xlnet_test.py |   4 +-
 transformers/tokenization_bert.py             |  48 +++---
 transformers/tokenization_roberta.py          |  49 +++---
 transformers/tokenization_utils.py            | 147 ++++++++++--------
 transformers/tokenization_xlm.py              |  45 +++---
 transformers/tokenization_xlnet.py            |  47 +++---
 12 files changed, 222 insertions(+), 193 deletions(-)

diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index 585bbc8d75..6c5e749868 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -75,7 +75,7 @@ class TextDataset(Dataset):
             tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
 
             for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
-                self.examples.append(tokenizer.add_special_tokens_single_sequence(tokenized_text[i:i+block_size]))
+                self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i+block_size]))
             # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
             # If your dataset is small, first you should loook for a bigger one :-) and second you
             # can change this behavior by adding (model specific) padding.
@@ -109,10 +109,8 @@ def mask_tokens(inputs, tokenizer, args):
     labels = inputs.clone()
     # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
     probability_matrix = torch.full(labels.shape, args.mlm_probability)
-    probability_matrix *= torch.tensor(
-        [tokenizer.get_special_tokens_mask(val, special_tokens_present=True) for val in labels.tolist()],
-        dtype=torch.float
-    )
+    special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()]
+    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
     masked_indices = torch.bernoulli(probability_matrix).bool()
     labels[~masked_indices] = -1  # We only compute loss on masked tokens
 
diff --git a/transformers/tests/tokenization_bert_test.py b/transformers/tests/tokenization_bert_test.py
index b70941f884..5e49e2915b 100644
--- a/transformers/tests/tokenization_bert_test.py
+++ b/transformers/tests/tokenization_bert_test.py
@@ -131,8 +131,8 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         text = tokenizer.encode("sequence builders")
         text_2 = tokenizer.encode("multi-sequence build")
 
-        encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
-        encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
 
         assert encoded_sentence == [101] + text + [102]
         assert encoded_pair == [101] + text + [102] + text_2 + [102]
diff --git a/transformers/tests/tokenization_distilbert_test.py b/transformers/tests/tokenization_distilbert_test.py
index 64a88df99f..a18d644fe8 100644
--- a/transformers/tests/tokenization_distilbert_test.py
+++ b/transformers/tests/tokenization_distilbert_test.py
@@ -36,8 +36,8 @@ class DistilBertTokenizationTest(BertTokenizationTest):
         text = tokenizer.encode("sequence builders")
         text_2 = tokenizer.encode("multi-sequence build")
 
-        encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
-        encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
 
         assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
         assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + \
diff --git a/transformers/tests/tokenization_roberta_test.py b/transformers/tests/tokenization_roberta_test.py
index f14b26a2e4..a731ac26c9 100644
--- a/transformers/tests/tokenization_roberta_test.py
+++ b/transformers/tests/tokenization_roberta_test.py
@@ -87,8 +87,8 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
         encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
         encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True)
 
-        encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
-        encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
 
         assert encoded_sentence == encoded_text_from_decode
         assert encoded_pair == encoded_pair_from_decode
diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py
index c902347fe5..b8f9295633 100644
--- a/transformers/tests/tokenization_tests_commons.py
+++ b/transformers/tests/tokenization_tests_commons.py
@@ -193,12 +193,12 @@ class CommonTestCases:
 
             tokenizer = self.get_tokenizer()
 
-            if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
+            if tokenizer.build_inputs_with_special_tokens.__qualname__.split('.')[0] != "PreTrainedTokenizer":
                 seq_0 = "Test this method."
                 seq_1 = "With these inputs."
                 information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
                 sequences, mask = information["input_ids"], information["token_type_ids"]
-                assert len(sequences) == len(mask)
+                self.assertEqual(len(sequences), len(mask))
 
         def test_number_of_added_tokens(self):
             tokenizer = self.get_tokenizer()
@@ -211,7 +211,7 @@ class CommonTestCases:
 
             # Method is implemented (e.g. not GPT-2)
             if len(attached_sequences) != 2:
-                assert tokenizer.num_added_tokens(pair=True) == len(attached_sequences) - len(sequences)
+                self.assertEqual(tokenizer.num_added_tokens(pair=True), len(attached_sequences) - len(sequences))
 
         def test_maximum_encoding_length_single_input(self):
             tokenizer = self.get_tokenizer()
@@ -227,10 +227,10 @@ class CommonTestCases:
             truncated_sequence = information["input_ids"]
             overflowing_tokens = information["overflowing_tokens"]
 
-            assert len(overflowing_tokens) == 2 + stride
-            assert overflowing_tokens == sequence[-(2 + stride):]
-            assert len(truncated_sequence) == total_length - 2
-            assert truncated_sequence == tokenizer.add_special_tokens_single_sequence(sequence[:-2])
+            self.assertEqual(len(overflowing_tokens), 2 + stride)
+            self.assertEqual(overflowing_tokens, sequence[-(2 + stride):])
+            self.assertEqual(len(truncated_sequence), total_length - 2)
+            self.assertEqual(truncated_sequence, tokenizer.build_inputs_with_special_tokens(sequence[:-2]))
 
         def test_maximum_encoding_length_pair_input(self):
             tokenizer = self.get_tokenizer()
@@ -243,7 +243,7 @@ class CommonTestCases:
             sequence_1_no_special_tokens = tokenizer.encode(seq_1)
 
             sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
-            truncated_second_sequence = tokenizer.add_special_tokens_sequence_pair(
+            truncated_second_sequence = tokenizer.build_inputs_with_special_tokens(
                 tokenizer.encode(seq_0),
                 tokenizer.encode(seq_1)[:-2]
             )
@@ -258,11 +258,11 @@ class CommonTestCases:
             overflowing_tokens = information["overflowing_tokens"]
             overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]
 
-            assert len(overflowing_tokens) == 2 + stride
-            assert overflowing_tokens == sequence_1_no_special_tokens[-(2 + stride):]
-            assert overflowing_tokens_first_truncated == sequence_0_no_special_tokens[-(2 + stride):]
-            assert len(truncated_sequence) == len(sequence) - 2
-            assert truncated_sequence == truncated_second_sequence
+            self.assertEqual(len(overflowing_tokens), 2 + stride)
+            self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride):])
+            self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride):])
+            self.assertEqual(len(truncated_sequence), len(sequence) - 2)
+            self.assertEqual(truncated_sequence, truncated_second_sequence)
 
         def test_encode_input_type(self):
             tokenizer = self.get_tokenizer()
@@ -273,8 +273,8 @@ class CommonTestCases:
             input_ids = tokenizer.convert_tokens_to_ids(tokens)
             formatted_input = tokenizer.encode(sequence, add_special_tokens=True)
 
-            assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input
-            assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input
+            self.assertEqual(tokenizer.encode(tokens, add_special_tokens=True), formatted_input)
+            self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input)
 
         def test_special_tokens_mask(self):
             tokenizer = self.get_tokenizer()
@@ -287,22 +287,22 @@ class CommonTestCases:
             encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
             encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
             special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
-            assert len(special_tokens_mask) == len(encoded_sequence_w_special)
+            self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
 
-            filtered_sequence = [(x if special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
+            filtered_sequence = [(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
             filtered_sequence = [x for x in filtered_sequence if x is not None]
-            assert encoded_sequence == filtered_sequence
+            self.assertEqual(encoded_sequence, filtered_sequence)
 
             # Testing inputs pairs
             encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1)
             encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
             encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
             special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
-            assert len(special_tokens_mask) == len(encoded_sequence_w_special)
+            self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
 
-            filtered_sequence = [(x if special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
+            filtered_sequence = [(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
             filtered_sequence = [x for x in filtered_sequence if x is not None]
-            assert encoded_sequence == filtered_sequence
+            self.assertEqual(encoded_sequence, filtered_sequence)
 
             # Testing with already existing special tokens
             if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.cls_token_id == tokenizer.unk_token_id:
@@ -310,9 +310,6 @@ class CommonTestCases:
             encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
             encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
             special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
-            special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, special_tokens_present=True)
-            assert len(special_tokens_mask) == len(encoded_sequence_w_special)
-            assert special_tokens_mask_orig == special_tokens_mask
-
-
-
+            special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, already_has_special_tokens=True)
+            self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
+            self.assertEqual(special_tokens_mask_orig, special_tokens_mask)
diff --git a/transformers/tests/tokenization_xlm_test.py b/transformers/tests/tokenization_xlm_test.py
index b1a71ede59..0949b0cce4 100644
--- a/transformers/tests/tokenization_xlm_test.py
+++ b/transformers/tests/tokenization_xlm_test.py
@@ -72,8 +72,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
         text = tokenizer.encode("sequence builders")
         text_2 = tokenizer.encode("multi-sequence build")
 
-        encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
-        encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
 
         assert encoded_sentence == [1] + text + [1]
         assert encoded_pair == [1] + text + [1] + text_2 + [1]
diff --git a/transformers/tests/tokenization_xlnet_test.py b/transformers/tests/tokenization_xlnet_test.py
index f4418c7fe5..1a5dbcf6df 100644
--- a/transformers/tests/tokenization_xlnet_test.py
+++ b/transformers/tests/tokenization_xlnet_test.py
@@ -95,8 +95,8 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
         text = tokenizer.encode("sequence builders")
         text_2 = tokenizer.encode("multi-sequence build")
 
-        encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
-        encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
 
         assert encoded_sentence == text + [4, 3]
         assert encoded_pair == text + [4] + text_2 + [4, 3]
diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py
index cf6ab51d02..d256f27a58 100644
--- a/transformers/tokenization_bert.py
+++ b/transformers/tokenization_bert.py
@@ -187,24 +187,21 @@ class BertTokenizer(PreTrainedTokenizer):
         out_string = ' '.join(tokens).replace(' ##', '').strip()
         return out_string
 
-    def add_special_tokens_single_sequence(self, token_ids):
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         """
-        Adds special tokens to the a sequence for sequence classification tasks.
-        A BERT sequence has the following format: [CLS] X [SEP]
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
+        by concatenating and adding special tokens.
+        A BERT sequence has the following format:
+            single sequence: [CLS] X [SEP]
+            pair of sequences: [CLS] A [SEP] B [SEP]
         """
-        return [self.cls_token_id] + token_ids + [self.sep_token_id]
-
-    def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
-        """
-        Adds special tokens to a sequence pair for sequence classification tasks.
-        A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]
-        """
-        sep = [self.sep_token_id]
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
         cls = [self.cls_token_id]
-
+        sep = [self.sep_token_id]
         return cls + token_ids_0 + sep + token_ids_1 + sep
 
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
@@ -212,30 +209,37 @@ class BertTokenizer(PreTrainedTokenizer):
         Args:
             token_ids_0: list of ids (must not contain special tokens)
             token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-            for sequence pairs
+                for sequence pairs
+            already_has_special_tokens: (default False) Set to True if the token list is already formated with
+                special tokens for the model
 
         Returns:
             A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
         """
 
-        if special_tokens_present:
-            return list(map(lambda x: 0 if x in [self.sep_token_id, self.cls_token_id] else 1, token_ids_0))
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError("You should not supply a second sequence if the provided sequence of "
+                                 "ids is already formated with special tokens for the model.")
+            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
-        if token_ids_1:
-            return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0]
-        else:
-            return [0] + ([1] * len(token_ids_0)) + [0]
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
 
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         A BERT sequence pair mask has the following format:
         0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
         | first sequence    | second sequence
+
+        if token_ids_1 is None, only returns the first portion of the mask (0's).
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
-
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
         return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
 
     def save_vocabulary(self, vocab_path):
diff --git a/transformers/tokenization_roberta.py b/transformers/tokenization_roberta.py
index ceb1245169..9cc8a9af6e 100644
--- a/transformers/tokenization_roberta.py
+++ b/transformers/tokenization_roberta.py
@@ -84,23 +84,21 @@ class RobertaTokenizer(GPT2Tokenizer):
         self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
         self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
 
-    def add_special_tokens_single_sequence(self, token_ids):
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         """
-        Adds special tokens to a sequence for sequence classification tasks.
-        A RoBERTa sequence has the following format: <s> X </s>
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
+        by concatenating and adding special tokens.
+        A RoBERTa sequence has the following format:
+            single sequence: <s> X </s>
+            pair of sequences: <s> A </s></s> B </s>
         """
-        return [self.cls_token_id] + token_ids + [self.sep_token_id]
-
-    def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
-        """
-        Adds special tokens to a sequence pair for sequence classification tasks.
-        A RoBERTa sequence pair has the following format: <s> A </s></s> B </s>
-        """
-        sep = [self.sep_token_id]
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
         cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
 
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
@@ -108,28 +106,35 @@ class RobertaTokenizer(GPT2Tokenizer):
         Args:
             token_ids_0: list of ids (must not contain special tokens)
             token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-            for sequence pairs
+                for sequence pairs
+            already_has_special_tokens: (default False) Set to True if the token list is already formated with
+                special tokens for the model
 
         Returns:
             A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
         """
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError("You should not supply a second sequence if the provided sequence of "
+                                 "ids is already formated with special tokens for the model.")
+            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
-        if special_tokens_present:
-            return list(map(lambda x: 0 if x in [self.sep_token_id, self.cls_token_id] else 1, token_ids_0))
+        if token_ids_1 is None:
+            return [1] + ([0] * len(token_ids_0)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
 
-        if token_ids_1:
-            return [0] + ([1] * len(token_ids_0)) + [0, 0] + ([1] * len(token_ids_1)) + [0]
-        else:
-            return [0] + ([1] * len(token_ids_0)) + [0]
-
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         A RoBERTa sequence pair mask has the following format:
         0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
         | first sequence    | second sequence
+
+        if token_ids_1 is None, only returns the first portion of the mask (0's).
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
 
-        return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
\ No newline at end of file
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 2f25e95f4b..ce5811b96f 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -538,15 +538,9 @@ class PreTrainedTokenizer(object):
         Returns:
             Number of tokens added to sequences
         """
-
-        if pair:
-            initial_tokens_len = len(self.encode("This is a sequence") + self.encode("This is another"))
-            final_tokens_len = len(self.encode("This is a sequence", "This is another", add_special_tokens=True))
-        else:
-            initial_tokens_len = len(self.encode("This is a sequence"))
-            final_tokens_len = len(self.encode("This is a sequence", add_special_tokens=True))
-
-        return final_tokens_len - initial_tokens_len
+        token_ids_0 = []
+        token_ids_1 = []
+        return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))
 
     def add_special_tokens(self, special_tokens_dict):
         """
@@ -795,7 +789,7 @@ class PreTrainedTokenizer(object):
                                       return_tensors=return_tensors)
 
     def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0,
-                          truncate_first_sequence=True, truncate_both_sequences=True, return_tensors=None):
+                          truncation_strategy='longest_first', return_tensors=None):
         """
         Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
         It adds special tokens, truncates
@@ -812,6 +806,12 @@ class PreTrainedTokenizer(object):
                 to their model.
             stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
                 list of inputs.
+            truncation_strategy: string selected in the following options:
+                - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
+                    starting from the longest one at each token (when there is a pair of input sequences)
+                - 'only_first': Only truncate the first sequence
+                - 'only_second': Only truncate the second sequence
+                - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
             truncate_first_sequence: if set to `True` and an optional second list of input ids is provided,
                 alongside a specified `max_length`, will truncate the first sequence if the total size is superior
                 than the specified `max_length`. If set to `False`, will truncate the second sequence instead.
@@ -840,37 +840,17 @@ class PreTrainedTokenizer(object):
         len_pair_ids = len(pair_ids) if pair else 0
 
         encoded_inputs = {}
-        if max_length:
-            n_added_tokens = self.num_added_tokens(pair=pair) if add_special_tokens else 0
-
-            if n_added_tokens + len_ids + len_pair_ids > max_length:
-                if truncate_both_sequences:
-                    tokens_a, tokens_b = self._truncate_seq_pair(
-                        copy.deepcopy(ids),
-                        copy.deepcopy(pair_ids),
-                        max_length=max_length - n_added_tokens
-                    )
-                    truncated_tokens = ids[- (len_ids - len(tokens_a)):] + pair_ids[- (len_pair_ids - len(tokens_b)):]
-                    encoded_inputs["num_truncated_tokens"] = len(truncated_tokens)
-                    ids = tokens_a
-                    pair_ids = tokens_b
-                elif pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length:
-                    logger.warning(
-                        "You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length."
-                        "This pair of sequences will not be truncated.")
-                elif truncate_first_sequence or not pair:
-                    encoded_inputs["overflowing_tokens"] = ids[max_length - len_pair_ids - n_added_tokens - stride:]
-                    ids = ids[:max_length - len_pair_ids - n_added_tokens]
-                elif not truncate_first_sequence and pair:
-                    encoded_inputs["overflowing_tokens"] = pair_ids[max_length - len_ids - n_added_tokens - stride:]
-                    pair_ids = pair_ids[:max_length - len_ids - n_added_tokens]
-                else:
-                    logger.warning(
-                        "Cannot truncate second sequence as it is not provided. No truncation.")
+        total_len = len_ids + len_pair_ids + (self.num_added_tokens(pair=pair) if add_special_tokens else 0)
+        if max_length and total_len > max_length:
+            ids, pair_ids, overflowing_tokens = self.truncate_sequences(ids, pair_ids=pair_ids,
+                                                                        num_tokens_to_remove=total_len-max_length,
+                                                                        truncation_strategy=truncation_strategy)
+            encoded_inputs["overflowing_tokens"] = overflowing_tokens
+            encoded_inputs["num_truncated_tokens"] = total_len - max_length
 
         if add_special_tokens:
-            sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids)
-            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence)
+            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
+            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
             encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
         else:
             sequence = ids + pair_ids if pair else ids
@@ -895,39 +875,76 @@ class PreTrainedTokenizer(object):
 
         return encoded_inputs
 
-    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
-        """Truncates a sequence pair in place to the maximum length."""
+    def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first'):
+        """Truncates a sequence pair in place to the maximum length.
+            truncation_strategy: string selected in the following options:
+                - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
+                    starting from the longest one at each token (when there is a pair of input sequences).
+                    Overflowing tokens only contains overflow from the first sequence.
+                - 'only_first': Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove.
+                - 'only_second': Only truncate the second sequence
+                - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
+        """
+        if num_tokens_to_remove <= 0:
+            return ids, pair_ids, []
 
-        # This is a simple heuristic which will always truncate the longer sequence
-        # one token at a time. This makes more sense than truncating an equal percent
-        # of tokens from each, since if one sequence is very short then each token
-        # that's truncated likely contains more information than a longer sequence.
+        if truncation_strategy == 'longest_first':
+            overflowing_tokens = []
+            for _ in range(num_tokens_to_remove):
+                if pair_ids is None or len(ids) > len(pair_ids):
+                    overflowing_tokens.append(ids[-1])
+                    ids = ids[:-1]
+                else:
+                    pair_ids = pair_ids[:-1]
+        elif truncation_strategy == 'only_first':
+            assert len(ids) > num_tokens_to_remove
+            overflowing_tokens = ids[-num_tokens_to_remove:]
+            ids = ids[:-num_tokens_to_remove]
+        elif truncation_strategy == 'only_second':
+            assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove
+            overflowing_tokens = pair_ids[-num_tokens_to_remove:]
+            pair_ids = pair_ids[:-num_tokens_to_remove]
+        elif truncation_strategy == 'do_not_truncate':
+            raise ValueError("Input sequence are too long for max_length. Please select a truncation strategy.")
+        else:
+            raise ValueError("Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']")
+        return (ids, pair_ids, overflowing_tokens)
 
-        # However, since we'd better not to remove tokens of options and questions, you can choose to use a bigger
-        # length or only pop from context
-        while True:
-            total_length = len(tokens_a) + len(tokens_b)
-            if total_length <= max_length:
-                return (tokens_a, tokens_b)
-            if len(tokens_a) > len(tokens_b):
-                tokens_a.pop()
-            else:
-                tokens_b.pop()
-
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
         logger.warning("This tokenizer does not make use of special tokens.")
+        if token_ids_1 is None:
+            return len(token_ids_0) * [0]
         return [0] * len(token_ids_0) + [1] * len(token_ids_1)
 
-    def add_special_tokens_single_sequence(self, token_ids):
-        logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
-        return token_ids
-
-    def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
-        logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
+        by concatenating and adding special tokens.
+        A RoBERTa sequence has the following format:
+            single sequence: <s> X </s>
+            pair of sequences: <s> A </s></s> B </s>
+        """
+        logger.warning("This tokenizer does not make use of special tokens. Input is returned with no modification.")
+        if token_ids_1 is None:
+            return token_ids_0
         return token_ids_0 + token_ids_1
 
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
-        return [1] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+
+        Args:
+            token_ids_0: list of ids (must not contain special tokens)
+            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
+                for sequence pairs
+            already_has_special_tokens: (default False) Set to True if the token list is already formated with
+                special tokens for the model
+
+        Returns:
+            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
+        """
+        return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
 
     def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
         """ Converts a single index or a sequence of indices (integers) in a token "
diff --git a/transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py
index 817c6d8c44..d09ce6b9dc 100644
--- a/transformers/tokenization_xlm.py
+++ b/transformers/tokenization_xlm.py
@@ -754,23 +754,21 @@ class XLMTokenizer(PreTrainedTokenizer):
         out_string = ''.join(tokens).replace('</w>', ' ').strip()
         return out_string
 
-    def add_special_tokens_single_sequence(self, token_ids):
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         """
-        Adds special tokens to a sequence for sequence classification tasks.
-        An XLM sequence has the following format: [CLS] X [SEP]
-        """
-        return [self.cls_token_id] + token_ids + [self.sep_token_id]
-
-    def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
-        """
-        Adds special tokens to a sequence pair for sequence classification tasks.
-        An XLM sequence pair has the following format: [CLS] A [SEP] B [SEP]
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
+        by concatenating and adding special tokens.
+        A RoBERTa sequence has the following format:
+            single sequence: <s> X </s>
+            pair of sequences: <s> A </s></s> B </s>
         """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + token_ids_1 + sep
 
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
@@ -778,30 +776,37 @@ class XLMTokenizer(PreTrainedTokenizer):
         Args:
             token_ids_0: list of ids (must not contain special tokens)
             token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-            for sequence pairs
+                for sequence pairs
+            already_has_special_tokens: (default False) Set to True if the token list is already formated with
+                special tokens for the model
 
         Returns:
             A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
         """
 
-        if special_tokens_present:
-            return list(map(lambda x: 0 if x in [self.sep_token_id, self.cls_token_id] else 1, token_ids_0))
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError("You should not supply a second sequence if the provided sequence of "
+                                 "ids is already formated with special tokens for the model.")
+            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
-        if token_ids_1:
-            return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0]
-        else:
-            return [0] + ([1] * len(token_ids_0)) + [0]
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
 
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         An XLM sequence pair mask has the following format:
         0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
         | first sequence    | second sequence
+
+        if token_ids_1 is None, only returns the first portion of the mask (0's).
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
-
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
         return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
 
     def save_vocabulary(self, save_directory):
diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py
index 37f975abef..deae8de336 100644
--- a/transformers/tokenization_xlnet.py
+++ b/transformers/tokenization_xlnet.py
@@ -181,26 +181,21 @@ class XLNetTokenizer(PreTrainedTokenizer):
         out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
         return out_string
 
-    def add_special_tokens_single_sequence(self, token_ids):
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         """
-        Adds special tokens to a sequence for sequence classification tasks.
-        An XLNet sequence has the following format: X [SEP][CLS]
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
+        by concatenating and adding special tokens.
+        A RoBERTa sequence has the following format:
+            single sequence: <s> X </s>
+            pair of sequences: <s> A </s></s> B </s>
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
-        return token_ids + sep + cls
-
-    def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
-        """
-        Adds special tokens to a sequence pair for sequence classification tasks.
-        An XLNet sequence pair has the following format: A [SEP] B [SEP][CLS]
-        """
-
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return token_ids_0 + sep + cls
         return token_ids_0 + sep + token_ids_1 + sep + cls
 
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
@@ -208,31 +203,39 @@ class XLNetTokenizer(PreTrainedTokenizer):
         Args:
             token_ids_0: list of ids (must not contain special tokens)
             token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-            for sequence pairs
+                for sequence pairs
+            already_has_special_tokens: (default False) Set to True if the token list is already formated with
+                special tokens for the model
 
         Returns:
             A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
         """
 
-        if special_tokens_present:
-            return list(map(lambda x: 0 if x in [self.sep_token_id, self.cls_token_id] else 1, token_ids_0))
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError("You should not supply a second sequence if the provided sequence of "
+                                 "ids is already formated with special tokens for the model.")
+            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
-        if token_ids_1:
-            return ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0, 0]
-        else:
-            return ([1] * len(token_ids_0)) + [0, 0]
+        if token_ids_1 is not None:
+            return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
+        return ([0] * len(token_ids_0)) + [1, 1]
 
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         A BERT sequence pair mask has the following format:
         0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
         | first sequence    | second sequence     | CLS segment ID
+        
+        if token_ids_1 is None, only returns the first portion of the mask (0's).
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
         cls_segment_id = [2]
 
+        if token_ids_1 is None:
+            return len(token_ids_0 + sep + cls) * [0]
         return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
 
     def save_vocabulary(self, save_directory):

From 78ef1a99306213599d4eab8ae48f17a81d1ee2b8 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 4 Oct 2019 17:59:44 -0400
Subject: [PATCH 11/11] fixes

---
 examples/utils_multiple_choice.py             |  1 -
 transformers/data/processors/glue.py          |  1 -
 .../tests/tokenization_tests_commons.py       |  4 +-
 transformers/tokenization_utils.py            | 47 +++++++++++--------
 4 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/examples/utils_multiple_choice.py b/examples/utils_multiple_choice.py
index a7fc1b1222..a131a63924 100644
--- a/examples/utils_multiple_choice.py
+++ b/examples/utils_multiple_choice.py
@@ -336,7 +336,6 @@ def convert_examples_to_features(
                 text_b,
                 add_special_tokens=True,
                 max_length=max_length,
-                truncate_both_sequences=True
             )
             if 'num_truncated_tokens' in inputs and inputs['num_truncated_tokens'] > 0:
                 logger.info('Attention! you are cropping tokens (swag task is ok). '
diff --git a/transformers/data/processors/glue.py b/transformers/data/processors/glue.py
index 61bca8c11b..741569ea30 100644
--- a/transformers/data/processors/glue.py
+++ b/transformers/data/processors/glue.py
@@ -86,7 +86,6 @@ def glue_convert_examples_to_features(examples, tokenizer,
             example.text_b,
             add_special_tokens=True,
             max_length=max_length,
-            truncate_first_sequence=True  # We're truncating the first sequence in priority
         )
         input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
 
diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py
index b8f9295633..b2801d5f41 100644
--- a/transformers/tests/tokenization_tests_commons.py
+++ b/transformers/tests/tokenization_tests_commons.py
@@ -249,10 +249,10 @@ class CommonTestCases:
             )
 
             information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True,
-                                                stride=stride, truncate_first_sequence=False)
+                                                stride=stride, truncation_strategy='only_second')
             information_first_truncated = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2,
                                                                 add_special_tokens=True, stride=stride,
-                                                                truncate_first_sequence=True)
+                                                                truncation_strategy='only_first')
 
             truncated_sequence = information["input_ids"]
             overflowing_tokens = information["overflowing_tokens"]
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index ce5811b96f..c7b55f3a9c 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -692,8 +692,7 @@ class PreTrainedTokenizer(object):
                 add_special_tokens=False,
                 max_length=None,
                 stride=0,
-                truncate_first_sequence=True,
-                truncate_both_sequences=False,
+                truncation_strategy='longest_first',
                 return_tensors=None,
                 **kwargs):
         """
@@ -714,8 +713,12 @@ class PreTrainedTokenizer(object):
                 If there are overflowing tokens, those will be added to the returned dictionary
             stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
                 from the main sequence returned. The value of this argument defines the number of additional tokens.
-            truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence
-                will be truncated.
+            truncation_strategy: string selected in the following options:
+                - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
+                    starting from the longest one at each token (when there is a pair of input sequences)
+                - 'only_first': Only truncate the first sequence
+                - 'only_second': Only truncate the second sequence
+                - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
             **kwargs: passed to the `self.tokenize()` method
@@ -725,8 +728,7 @@ class PreTrainedTokenizer(object):
                                           max_length=max_length,
                                           add_special_tokens=add_special_tokens,
                                           stride=stride,
-                                          truncate_first_sequence=truncate_first_sequence,
-                                          truncate_both_sequences=truncate_both_sequences,
+                                          truncation_strategy=truncation_strategy,
                                           return_tensors=return_tensors,
                                           **kwargs)
 
@@ -738,8 +740,7 @@ class PreTrainedTokenizer(object):
                     add_special_tokens=False,
                     max_length=None,
                     stride=0,
-                    truncate_first_sequence=True,
-                    truncate_both_sequences=False,
+                    truncation_strategy='longest_first',
                     return_tensors=None,
                     **kwargs):
         """
@@ -759,8 +760,12 @@ class PreTrainedTokenizer(object):
                 If there are overflowing tokens, those will be added to the returned dictionary
             stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
                 from the main sequence returned. The value of this argument defines the number of additional tokens.
-            truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence
-                will be truncated.
+            truncation_strategy: string selected in the following options:
+                - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
+                    starting from the longest one at each token (when there is a pair of input sequences)
+                - 'only_first': Only truncate the first sequence
+                - 'only_second': Only truncate the second sequence
+                - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
             **kwargs: passed to the `self.tokenize()` method
@@ -784,8 +789,7 @@ class PreTrainedTokenizer(object):
                                       max_length=max_length,
                                       add_special_tokens=add_special_tokens,
                                       stride=stride,
-                                      truncate_first_sequence=truncate_first_sequence,
-                                      truncate_both_sequences=truncate_both_sequences,
+                                      truncation_strategy=truncation_strategy,
                                       return_tensors=return_tensors)
 
     def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0,
@@ -812,9 +816,6 @@ class PreTrainedTokenizer(object):
                 - 'only_first': Only truncate the first sequence
                 - 'only_second': Only truncate the second sequence
                 - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
-            truncate_first_sequence: if set to `True` and an optional second list of input ids is provided,
-                alongside a specified `max_length`, will truncate the first sequence if the total size is superior
-                than the specified `max_length`. If set to `False`, will truncate the second sequence instead.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
 
@@ -844,7 +845,8 @@ class PreTrainedTokenizer(object):
         if max_length and total_len > max_length:
             ids, pair_ids, overflowing_tokens = self.truncate_sequences(ids, pair_ids=pair_ids,
                                                                         num_tokens_to_remove=total_len-max_length,
-                                                                        truncation_strategy=truncation_strategy)
+                                                                        truncation_strategy=truncation_strategy,
+                                                                        stride=stride)
             encoded_inputs["overflowing_tokens"] = overflowing_tokens
             encoded_inputs["num_truncated_tokens"] = total_len - max_length
 
@@ -875,7 +877,7 @@ class PreTrainedTokenizer(object):
 
         return encoded_inputs
 
-    def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first'):
+    def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):
         """Truncates a sequence pair in place to the maximum length.
             truncation_strategy: string selected in the following options:
                 - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
@@ -892,17 +894,22 @@ class PreTrainedTokenizer(object):
             overflowing_tokens = []
             for _ in range(num_tokens_to_remove):
                 if pair_ids is None or len(ids) > len(pair_ids):
-                    overflowing_tokens.append(ids[-1])
+                    overflowing_tokens = [ids[-1]] + overflowing_tokens
                     ids = ids[:-1]
                 else:
                     pair_ids = pair_ids[:-1]
+            window_len = min(len(ids), stride)
+            if window_len > 0:
+                overflowing_tokens = ids[-window_len:] + overflowing_tokens
         elif truncation_strategy == 'only_first':
             assert len(ids) > num_tokens_to_remove
-            overflowing_tokens = ids[-num_tokens_to_remove:]
+            window_len = min(len(ids), stride + num_tokens_to_remove)
+            overflowing_tokens = ids[-window_len:]
             ids = ids[:-num_tokens_to_remove]
         elif truncation_strategy == 'only_second':
             assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove
-            overflowing_tokens = pair_ids[-num_tokens_to_remove:]
+            window_len = min(len(pair_ids), stride + num_tokens_to_remove)
+            overflowing_tokens = pair_ids[-window_len:]
             pair_ids = pair_ids[:-num_tokens_to_remove]
         elif truncation_strategy == 'do_not_truncate':
             raise ValueError("Input sequence are too long for max_length. Please select a truncation strategy.")