From 5164ea91a7b4d35cb03867233527fa383a651775 Mon Sep 17 00:00:00 2001
From: Lysandre Debut <lysandre@huggingface.co>
Date: Mon, 9 Mar 2020 13:48:58 -0400
Subject: [PATCH] Skipping outputs (#3116)

* Minimal example

* Proposal 2

* Proposal 2 for fast tokenizers

* Typings

* Docs

* Revert "Docs" for easier review

This reverts commit eaf0f97062e809887704a542144c537f769d5223.

* Remove unnecessary assignments

* Tests

* Fix faulty type

* Remove prints

* return_outputs -> model_input_names

* Revert "Revert "Docs" for easier review"

This reverts commit 6fdc69408102bf695797f2dfddbb6350c6b9e722.

* code quality
---
 src/transformers/tokenization_distilbert.py |   2 +
 src/transformers/tokenization_roberta.py    |   2 +
 src/transformers/tokenization_utils.py      | 348 ++++++++++++--------
 tests/test_tokenization_common.py           |  62 ++--
 4 files changed, 251 insertions(+), 163 deletions(-)

diff --git a/src/transformers/tokenization_distilbert.py b/src/transformers/tokenization_distilbert.py
index 7a5bf34367..626e65486b 100644
--- a/src/transformers/tokenization_distilbert.py
+++ b/src/transformers/tokenization_distilbert.py
@@ -69,6 +69,7 @@ class DistilBertTokenizer(BertTokenizer):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    model_input_names = ["attention_mask"]
 
 
 class DistilBertTokenizerFast(BertTokenizerFast):
@@ -76,3 +77,4 @@ class DistilBertTokenizerFast(BertTokenizerFast):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    model_input_names = ["attention_mask"]
diff --git a/src/transformers/tokenization_roberta.py b/src/transformers/tokenization_roberta.py
index 7275ceb4ca..5076f8764c 100644
--- a/src/transformers/tokenization_roberta.py
+++ b/src/transformers/tokenization_roberta.py
@@ -119,6 +119,7 @@ class RobertaTokenizer(GPT2Tokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]
 
     def __init__(
         self,
@@ -244,6 +245,7 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]
 
     def __init__(
         self,
diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index 75119e9285..cdb5e2839a 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -22,6 +22,7 @@ import os
 import re
 from collections import defaultdict
 from contextlib import contextmanager
+from typing import List, Optional, Tuple, Union
 
 from tokenizers.implementations import BaseTokenizer
 
@@ -138,6 +139,7 @@ class PreTrainedTokenizer(object):
     pretrained_vocab_files_map = {}
     pretrained_init_configuration = {}
     max_model_input_sizes = {}
+    model_input_names = ["token_type_ids", "attention_mask"]
 
     SPECIAL_TOKENS_ATTRIBUTES = [
         "bos_token",
@@ -316,6 +318,7 @@ class PreTrainedTokenizer(object):
 
         # Padding side is right by default and over-riden in subclasses. If specified in the kwargs, it is changed.
         self.padding_side = kwargs.pop("padding_side", self.padding_side)
+        self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
 
         # Added tokens
         self.added_tokens_encoder = {}
@@ -849,14 +852,14 @@ class PreTrainedTokenizer(object):
 
     def encode(
         self,
-        text,
-        text_pair=None,
-        add_special_tokens=True,
-        max_length=None,
-        stride=0,
-        truncation_strategy="longest_first",
-        pad_to_max_length=False,
-        return_tensors=None,
+        text: str,
+        text_pair: Optional[str] = None,
+        add_special_tokens: bool = True,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        truncation_strategy: str = "longest_first",
+        pad_to_max_length: bool = False,
+        return_tensors: Optional[str] = None,
         **kwargs
     ):
         """
@@ -865,34 +868,43 @@ class PreTrainedTokenizer(object):
         Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
 
         Args:
-            text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
+            text (:obj:`str` or :obj:`List[str]`):
+                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
                 the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                 method)
-            text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
+            text_pair (:obj:`str` or :obj:`List[str]`, `optional`, defaults to :obj:`None`):
+                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
                 string using the `tokenize` method) or a list of integers (tokenized string ids using the
                 `convert_tokens_to_ids` method)
-            add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
+            add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
+                If set to ``True``, the sequences will be encoded with the special tokens relative
                 to their model.
-            max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
+            max_length (:obj:`int`, `optional`, defaults to :obj:`None`):
+                If set to a number, will limit the total sequence returned so that it has a maximum length.
                 If there are overflowing tokens, those will be added to the returned dictionary
-            stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
+            stride (:obj:`int`, `optional`, defaults to ``0``):
+                If set to a number along with max_length, the overflowing tokens returned will contain some tokens
                 from the main sequence returned. The value of this argument defines the number of additional tokens.
-            truncation_strategy: string selected in the following options:
+            truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`):
+                String selected in the following options:
+
                 - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
-                    starting from the longest one at each token (when there is a pair of input sequences)
+                  starting from the longest one at each token (when there is a pair of input sequences)
                 - 'only_first': Only truncate the first sequence
                 - 'only_second': Only truncate the second sequence
                 - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
-            pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
-                padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
-                The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings:
+            pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                If set to True, the returned sequences will be padded according to the model's padding side and
+                padding index, up to their max length. If no max length is specified, the padding is done up to the
+                model's max length. The tokenizer padding sides are handled by the class attribute `padding_side`
+                which can be set to the following strings:
+
                 - 'left': pads on the left of the sequences
                 - 'right': pads on the right of the sequences
                 Defaults to False: no padding.
-            return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
-                or PyTorch torch.Tensor instead of a list of python integers.
-            add_prefix_space: Only applies to GPT-2 and RoBERTa tokenizers. When `True`, this ensures that the sequence
-                begins with an empty space. False by default except for when using RoBERTa with `add_special_tokens=True`.
+            return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`):
+                Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant`
+                or PyTorch :obj:`torch.Tensor` instead of a list of python integers.
             **kwargs: passed to the `self.tokenize()` method
         """
         encoded_inputs = self.encode_plus(
@@ -911,59 +923,79 @@ class PreTrainedTokenizer(object):
 
     def encode_plus(
         self,
-        text,
-        text_pair=None,
-        add_special_tokens=True,
-        max_length=None,
-        stride=0,
-        truncation_strategy="longest_first",
-        pad_to_max_length=False,
-        return_tensors=None,
-        return_token_type_ids=True,
-        return_attention_mask=True,
-        return_overflowing_tokens=False,
-        return_special_tokens_mask=False,
-        return_offsets_mapping=False,
+        text: str,
+        text_pair: Optional[str] = None,
+        add_special_tokens: bool = True,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        truncation_strategy: str = "longest_first",
+        pad_to_max_length: bool = False,
+        return_tensors: Optional[str] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
         **kwargs
     ):
         """
-        Returns a dictionary containing the encoded sequence or sequence pair and additional informations:
+        Returns a dictionary containing the encoded sequence or sequence pair and additional information:
         the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
 
         Args:
-            text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
+            text (:obj:`str` or :obj:`List[str]`):
+                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
                 the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                 method)
-            text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
+            text_pair (:obj:`str` or :obj:`List[str]`, `optional`, defaults to :obj:`None`):
+                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
                 string using the `tokenize` method) or a list of integers (tokenized string ids using the
                 `convert_tokens_to_ids` method)
-            add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
+            add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
+                If set to ``True``, the sequences will be encoded with the special tokens relative
                 to their model.
-            max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
+            max_length (:obj:`int`, `optional`, defaults to :obj:`None`):
+                If set to a number, will limit the total sequence returned so that it has a maximum length.
                 If there are overflowing tokens, those will be added to the returned dictionary
-            stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
+            stride (:obj:`int`, `optional`, defaults to ``0``):
+                If set to a number along with max_length, the overflowing tokens returned will contain some tokens
                 from the main sequence returned. The value of this argument defines the number of additional tokens.
-            truncation_strategy: string selected in the following options:
+            truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`):
+                String selected in the following options:
+
                 - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
-                    starting from the longest one at each token (when there is a pair of input sequences)
+                  starting from the longest one at each token (when there is a pair of input sequences)
                 - 'only_first': Only truncate the first sequence
                 - 'only_second': Only truncate the second sequence
                 - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
-            pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
-                padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
-                The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings:
+            pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                If set to True, the returned sequences will be padded according to the model's padding side and
+                padding index, up to their max length. If no max length is specified, the padding is done up to the
+                model's max length. The tokenizer padding sides are handled by the class attribute `padding_side`
+                which can be set to the following strings:
+
                 - 'left': pads on the left of the sequences
                 - 'right': pads on the right of the sequences
                 Defaults to False: no padding.
-            return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
-                or PyTorch torch.Tensor instead of a list of python integers.
-            add_prefix_space: Only applies to GPT-2 and RoBERTa tokenizers. When `True`, this ensures that the sequence
-                begins with an empty space. False by default except for when using RoBERTa with `add_special_tokens=True`.
-            return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
-            return_attention_mask: (optional) Set to False to avoid returning attention mask (default True)
-            return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False).
-            return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False).
-            return_offsets_mapping: (optional) Set to True to return (char_start, char_end) for each token (default False).
+            return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`):
+                Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant`
+                or PyTorch :obj:`torch.Tensor` instead of a list of python integers.
+            return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`):
+                Whether to return token type IDs. If left to the default, will return the token type IDs according
+                to the specific tokenizer's default, defined by the :obj:`model_input_names` attribute.
+
+                `What are token type IDs? <../glossary.html#token-type-ids>`__
+            return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`None`):
+                Whether to return the attention mask. If left to the default, will return the attention mask according
+                to the specific tokenizer's default, defined by the :obj:`model_input_names` attribute.
+
+                `What are attention masks? <../glossary.html#attention-mask>`__
+            return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True to return overflowing token information (default False).
+            return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True to return special tokens mask information (default False).
+            return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True to return (char_start, char_end) for each token (default False).
                 If using Python's tokenizer, this method will raise NotImplementedError. This one is only available on
                 Rust-based tokenizers inheriting from PreTrainedTokenizerFast.
             **kwargs: passed to the `self.tokenize()` method
@@ -981,13 +1013,14 @@ class PreTrainedTokenizer(object):
                 }
 
             With the fields:
-                ``input_ids``: list of token ids to be fed to a model
-                ``token_type_ids``: list of token type ids to be fed to a model
-                ``attention_mask``: list of indices specifying which tokens should be attended to by the model
-                ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
-                ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified
-                ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
-                tokens and 1 specifying sequence tokens.
+
+            - ``input_ids``: list of token ids to be fed to a model
+            - ``token_type_ids``: list of token type ids to be fed to a model
+            - ``attention_mask``: list of indices specifying which tokens should be attended to by the model
+            - ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
+            - ``num_truncated_tokens``: number of overflowing tokens if a ``max_length`` is specified
+            - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
+              tokens and 1 specifying sequence tokens.
         """
 
         def get_input_ids(text):
@@ -1038,19 +1071,19 @@ class PreTrainedTokenizer(object):
 
     def batch_encode_plus(
         self,
-        batch_text_or_text_pairs=None,
-        add_special_tokens=True,
-        max_length=None,
-        stride=0,
-        truncation_strategy="longest_first",
-        pad_to_max_length=False,
-        return_tensors=None,
-        return_token_type_ids=True,
-        return_attention_masks=True,
-        return_overflowing_tokens=False,
-        return_special_tokens_masks=False,
-        return_offsets_mapping=False,
-        return_input_lengths=False,
+        batch_text_or_text_pairs: Union[str, List[str]],
+        add_special_tokens: bool = True,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        truncation_strategy: str = "longest_first",
+        pad_to_max_length: bool = False,
+        return_tensors: Optional[str] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_masks: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_masks: bool = False,
+        return_offsets_mapping: bool = False,
+        return_input_lengths: bool = False,
         **kwargs
     ):
         """
@@ -1058,32 +1091,59 @@ class PreTrainedTokenizer(object):
         the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
 
         Args:
-            batch_text_or_text_pairs: Batch of sequences or pair of sequences to be encoded.
+            batch_text_or_text_pairs (:obj:`List[str]` or :obj:`List[List[str]]`):
+                Batch of sequences or pair of sequences to be encoded.
                 This can be a list of string/string-sequences/int-sequences or a list of pair of
                 string/string-sequences/int-sequence (see details in encode_plus)
-            add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
+            add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
+                If set to ``True``, the sequences will be encoded with the special tokens relative
                 to their model.
-            max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
-                If there are overflowing tokens, those will be added to the returned dictionary`
-            stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
+            max_length (:obj:`int`, `optional`, defaults to :obj:`None`):
+                If set to a number, will limit the total sequence returned so that it has a maximum length.
+                If there are overflowing tokens, those will be added to the returned dictionary
+            stride (:obj:`int`, `optional`, defaults to ``0``):
+                If set to a number along with max_length, the overflowing tokens returned will contain some tokens
                 from the main sequence returned. The value of this argument defines the number of additional tokens.
-            truncation_strategy: string selected in the following options:
+            truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`):
+                String selected in the following options:
+
                 - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
-                    starting from the longest one at each token (when there is a pair of input sequences)
+                  starting from the longest one at each token (when there is a pair of input sequences)
                 - 'only_first': Only truncate the first sequence
                 - 'only_second': Only truncate the second sequence
                 - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
-            pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
-                padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
-                The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings:
+            pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                If set to True, the returned sequences will be padded according to the model's padding side and
+                padding index, up to their max length. If no max length is specified, the padding is done up to the
+                model's max length. The tokenizer padding sides are handled by the class attribute `padding_side`
+                which can be set to the following strings:
+
                 - 'left': pads on the left of the sequences
                 - 'right': pads on the right of the sequences
                 Defaults to False: no padding.
-            return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
-                or PyTorch torch.Tensor instead of a list of python integers.
-            return_input_lengths: (optional) If set the resulting dictionary will include the length of each sample
-            return_attention_masks: (optional) Set to True to return the attention mask (default False)
-            return_offsets_mapping: (optional) Not available, should be set to False or it will throw NotImplementError
+            return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`):
+                Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant`
+                or PyTorch :obj:`torch.Tensor` instead of a list of python integers.
+            return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`):
+                Whether to return token type IDs. If left to the default, will return the token type IDs according
+                to the specific tokenizer's default, defined by the :obj:`model_input_names` attribute.
+
+                `What are token type IDs? <../glossary.html#token-type-ids>`__
+            return_attention_masks (:obj:`bool`, `optional`, defaults to :obj:`None`):
+                Whether to return the attention mask. If left to the default, will return the attention mask according
+                to the specific tokenizer's default, defined by the :obj:`model_input_names` attribute.
+
+                `What are attention masks? <../glossary.html#attention-mask>`__
+            return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True to return overflowing token information (default False).
+            return_special_tokens_masks (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True to return special tokens mask information (default False).
+            return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True to return (char_start, char_end) for each token (default False).
+                If using Python's tokenizer, this method will raise NotImplementedError. This one is only available on
+                Rust-based tokenizers inheriting from PreTrainedTokenizerFast.
+            return_input_lengths (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                If set, the resulting dictionary will include the length of each sample.
             **kwargs: passed to the `self.tokenize()` method
 
         Return:
@@ -1099,13 +1159,14 @@ class PreTrainedTokenizer(object):
                 }
 
             With the fields:
-                ``input_ids``: list of token ids to be fed to a model
-                ``token_type_ids``: list of token type ids to be fed to a model
-                ``attention_mask``: list of indices specifying which tokens should be attended to by the model
-                ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
-                ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified
-                ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
-                tokens and 1 specifying sequence tokens.
+
+            - ``input_ids``: list of token ids to be fed to a model
+            - ``token_type_ids``: list of token type ids to be fed to a model
+            - ``attention_mask``: list of indices specifying which tokens should be attended to by the model
+            - ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
+            - ``num_truncated_tokens``: number of overflowing tokens if a ``max_length`` is specified
+            - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
+              tokens and 1 specifying sequence tokens.
         """
 
         def get_input_ids(text):
@@ -1220,18 +1281,18 @@ class PreTrainedTokenizer(object):
 
     def prepare_for_model(
         self,
-        ids,
-        pair_ids=None,
-        max_length=None,
-        add_special_tokens=True,
-        stride=0,
-        truncation_strategy="longest_first",
-        pad_to_max_length=False,
-        return_tensors=None,
-        return_token_type_ids=True,
-        return_attention_mask=True,
-        return_overflowing_tokens=False,
-        return_special_tokens_mask=False,
+        ids: List[int],
+        pair_ids: Optional[List[int]] = None,
+        max_length: Optional[int] = None,
+        add_special_tokens: bool = True,
+        stride: int = 0,
+        truncation_strategy: str = "longest_first",
+        pad_to_max_length: bool = False,
+        return_tensors: Optional[str] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
     ):
         """
         Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
@@ -1292,6 +1353,11 @@ class PreTrainedTokenizer(object):
         len_ids = len(ids)
         len_pair_ids = len(pair_ids) if pair else 0
 
+        if return_token_type_ids is None:
+            return_token_type_ids = "token_type_ids" in self.model_input_names
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+
         encoded_inputs = {}
 
         # Handle max sequence length
@@ -1617,6 +1683,9 @@ class PreTrainedTokenizer(object):
 
 
 class PreTrainedTokenizerFast(PreTrainedTokenizer):
+
+    model_input_names = ["token_type_ids", "attention_mask"]
+
     def __init__(self, tokenizer: BaseTokenizer, **kwargs):
         if tokenizer is None:
             raise ValueError("Provided tokenizer cannot be None")
@@ -1685,16 +1754,21 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
         if self._tokenizer is not None:
             self._tokenizer.add_special_tokens(self.all_special_tokens)
 
-    @staticmethod
     def _convert_encoding(
+        self,
         encoding,
         return_tensors=None,
-        return_token_type_ids=True,
-        return_attention_mask=True,
+        return_token_type_ids=None,
+        return_attention_mask=None,
         return_overflowing_tokens=False,
         return_special_tokens_mask=False,
         return_offsets_mapping=False,
     ):
+        if return_token_type_ids is None:
+            return_token_type_ids = "token_type_ids" in self.model_input_names
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+
         if return_overflowing_tokens and encoding.overflowing is not None:
             encodings = [encoding] + encoding.overflowing
         else:
@@ -1774,18 +1848,18 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
 
     def batch_encode_plus(
         self,
-        batch_text_or_text_pairs=None,
-        add_special_tokens=True,
-        max_length=None,
-        stride=0,
-        truncation_strategy="longest_first",
-        pad_to_max_length=False,
-        return_tensors=None,
-        return_token_type_ids=True,
-        return_attention_mask=True,
-        return_overflowing_tokens=False,
-        return_special_tokens_mask=False,
-        return_offsets_mapping=False,
+        batch_text_or_text_pairs: Optional[Union[List[str], List[Tuple[str]]]] = None,
+        add_special_tokens: bool = True,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        truncation_strategy: str = "longest_first",
+        pad_to_max_length: bool = False,
+        return_tensors: Optional[str] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
         **kwargs
     ):
         if not add_special_tokens:
@@ -1868,19 +1942,19 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
 
     def encode_plus(
         self,
-        text,
-        text_pair=None,
-        add_special_tokens=False,
-        max_length=None,
-        pad_to_max_length=False,
-        stride=0,
-        truncation_strategy="longest_first",
-        return_tensors=None,
-        return_token_type_ids=True,
-        return_attention_mask=True,
-        return_overflowing_tokens=False,
-        return_special_tokens_mask=False,
-        return_offsets_mapping=False,
+        text: str,
+        text_pair: Optional[str] = None,
+        add_special_tokens: bool = False,
+        max_length: Optional[int] = None,
+        pad_to_max_length: bool = False,
+        stride: int = 0,
+        truncation_strategy: str = "longest_first",
+        return_tensors: Optional[str] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
         **kwargs
     ):
         batched_input = [(text, text_pair)] if text_pair else [text]
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index b1f69fbfc1..3534eeef06 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -48,7 +48,7 @@ class TokenizerTesterMixin:
         # to the concatenated encode_plus format: [{'input_ids': [...], ...}, {'input_ids': [...], ...}]
         return [
             {value: batch_encode_plus_sequences[value][i] for value in batch_encode_plus_sequences.keys()}
-            for i in range(len(batch_encode_plus_sequences))
+            for i in range(len(batch_encode_plus_sequences["input_ids"]))
         ]
 
     def test_tokenizers_common_properties(self):
@@ -261,7 +261,10 @@ class TokenizerTesterMixin:
     def test_mask_output(self):
         tokenizer = self.get_tokenizer()
 
-        if tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer":
+        if (
+            tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer"
+            and "token_type_ids" in tokenizer.model_input_names
+        ):
             seq_0 = "Test this method."
             seq_1 = "With these inputs."
             information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
@@ -504,51 +507,58 @@ class TokenizerTesterMixin:
 
         encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True)
         input_ids = encoded_sequence["input_ids"]
-        token_type_ids = encoded_sequence["token_type_ids"]
-        attention_mask = encoded_sequence["attention_mask"]
         special_tokens_mask = encoded_sequence["special_tokens_mask"]
         sequence_length = len(input_ids)
 
         # Test right padding
         tokenizer.padding_side = "right"
 
-        padded_sequence = tokenizer.encode_plus(
+        right_padded_sequence = tokenizer.encode_plus(
             sequence,
             max_length=sequence_length + padding_size,
             pad_to_max_length=True,
             return_special_tokens_mask=True,
         )
-        padded_input_ids = padded_sequence["input_ids"]
-        padded_token_type_ids = padded_sequence["token_type_ids"]
-        padded_attention_mask = padded_sequence["attention_mask"]
-        padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
-        padded_sequence_length = len(padded_input_ids)
+        right_padded_input_ids = right_padded_sequence["input_ids"]
 
-        assert sequence_length + padding_size == padded_sequence_length
-        assert input_ids + [padding_idx] * padding_size == padded_input_ids
-        assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids
-        assert attention_mask + [0] * padding_size == padded_attention_mask
-        assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask
+        right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
+        right_padded_sequence_length = len(right_padded_input_ids)
+
+        assert sequence_length + padding_size == right_padded_sequence_length
+        assert input_ids + [padding_idx] * padding_size == right_padded_input_ids
+        assert special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask
 
         # Test left padding
         tokenizer.padding_side = "left"
-        padded_sequence = tokenizer.encode_plus(
+        left_padded_sequence = tokenizer.encode_plus(
             sequence,
             max_length=sequence_length + padding_size,
             pad_to_max_length=True,
             return_special_tokens_mask=True,
         )
-        padded_input_ids = padded_sequence["input_ids"]
-        padded_token_type_ids = padded_sequence["token_type_ids"]
-        padded_attention_mask = padded_sequence["attention_mask"]
-        padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
-        padded_sequence_length = len(padded_input_ids)
+        left_padded_input_ids = left_padded_sequence["input_ids"]
+        left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
+        left_padded_sequence_length = len(left_padded_input_ids)
 
-        assert sequence_length + padding_size == padded_sequence_length
-        assert [padding_idx] * padding_size + input_ids == padded_input_ids
-        assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids
-        assert [0] * padding_size + attention_mask == padded_attention_mask
-        assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask
+        assert sequence_length + padding_size == left_padded_sequence_length
+        assert [padding_idx] * padding_size + input_ids == left_padded_input_ids
+        assert [1] * padding_size + special_tokens_mask == left_padded_special_tokens_mask
+
+        if "token_type_ids" in tokenizer.model_input_names:
+            token_type_ids = encoded_sequence["token_type_ids"]
+            left_padded_token_type_ids = left_padded_sequence["token_type_ids"]
+            right_padded_token_type_ids = right_padded_sequence["token_type_ids"]
+
+            assert token_type_ids + [token_type_padding_idx] * padding_size == right_padded_token_type_ids
+            assert [token_type_padding_idx] * padding_size + token_type_ids == left_padded_token_type_ids
+
+        if "attention_mask" in tokenizer.model_input_names:
+            attention_mask = encoded_sequence["attention_mask"]
+            right_padded_attention_mask = right_padded_sequence["attention_mask"]
+            left_padded_attention_mask = left_padded_sequence["attention_mask"]
+
+            assert attention_mask + [0] * padding_size == right_padded_attention_mask
+            assert [0] * padding_size + attention_mask == left_padded_attention_mask
 
     def test_separate_tokenizers(self):
         # This tests that tokenizers don't impact others. Unfortunately the case where it fails is when