diff --git a/examples/adversarial/utils_hans.py b/examples/adversarial/utils_hans.py
index 8a4e118f9f..ffe6145e29 100644
--- a/examples/adversarial/utils_hans.py
+++ b/examples/adversarial/utils_hans.py
@@ -112,7 +112,10 @@ if is_torch_available():
             cached_features_file = os.path.join(
                 data_dir,
                 "cached_{}_{}_{}_{}".format(
-                    "dev" if evaluate else "train", tokenizer.__class__.__name__, str(max_seq_length), task,
+                    "dev" if evaluate else "train",
+                    tokenizer.__class__.__name__,
+                    str(max_seq_length),
+                    task,
                 ),
             )
             label_list = processor.get_labels()
@@ -278,7 +281,10 @@ class HansProcessor(DataProcessor):
 
 
 def hans_convert_examples_to_features(
-    examples: List[InputExample], label_list: List[str], max_length: int, tokenizer: PreTrainedTokenizer,
+    examples: List[InputExample],
+    label_list: List[str],
+    max_length: int,
+    tokenizer: PreTrainedTokenizer,
 ):
     """
     Loads a data file into a list of ``InputFeatures``
diff --git a/examples/benchmarking/plot_csv_file.py b/examples/benchmarking/plot_csv_file.py
index bc78a07bfe..7d3f1fd4b8 100644
--- a/examples/benchmarking/plot_csv_file.py
+++ b/examples/benchmarking/plot_csv_file.py
@@ -20,7 +20,9 @@ class PlotArguments:
     Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
     """
 
-    csv_file: str = field(metadata={"help": "The csv file to plot."},)
+    csv_file: str = field(
+        metadata={"help": "The csv file to plot."},
+    )
     plot_along_batch: bool = field(
         default=False,
         metadata={"help": "Whether to plot along batch size or sequence lengh. Defaults to sequence length."},
@@ -30,7 +32,8 @@ class PlotArguments:
         metadata={"help": "Whether the csv file has time results or memory results. Defaults to memory results."},
     )
     no_log_scale: bool = field(
-        default=False, metadata={"help": "Disable logarithmic scale when plotting"},
+        default=False,
+        metadata={"help": "Disable logarithmic scale when plotting"},
     )
     is_train: bool = field(
         default=False,
@@ -39,7 +42,8 @@ class PlotArguments:
         },
     )
     figure_png_file: Optional[str] = field(
-        default=None, metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
+        default=None,
+        metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
     )
     short_model_names: Optional[List[str]] = list_field(
         default=None, metadata={"help": "List of model names that are used instead of the ones in the csv file."}
diff --git a/examples/bert-loses-patience/pabee/modeling_pabee_albert.py b/examples/bert-loses-patience/pabee/modeling_pabee_albert.py
index 7beccabd57..539d55a9af 100644
--- a/examples/bert-loses-patience/pabee/modeling_pabee_albert.py
+++ b/examples/bert-loses-patience/pabee/modeling_pabee_albert.py
@@ -101,30 +101,30 @@ class AlbertModelWithPabee(AlbertModel):
         regression=False,
     ):
         r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
-        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
-            Last layer hidden-state of the first token of the sequence (classification token)
-            further processed by a Linear layer and a Tanh activation function. The Linear
-            layer weights are trained from the next sentence prediction (classification)
-            objective during pre-training.
+        Return:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
+            last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+                Sequence of hidden-states at the output of the last layer of the model.
+            pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
+                Last layer hidden-state of the first token of the sequence (classification token)
+                further processed by a Linear layer and a Tanh activation function. The Linear
+                layer weights are trained from the next sentence prediction (classification)
+                objective during pre-training.
 
-            This output is usually *not* a good summary
-            of the semantic content of the input, you're often better with averaging or pooling
-            the sequence of hidden-states for the whole input sequence.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+                This output is usually *not* a good summary
+                of the semantic content of the input, you're often better with averaging or pooling
+                the sequence of hidden-states for the whole input sequence.
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
 
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
 
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
+                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.
         """
 
         if input_ids is not None and inputs_embeds is not None:
@@ -157,7 +157,10 @@ class AlbertModelWithPabee(AlbertModel):
             res = []
             for i in range(self.config.num_hidden_layers):
                 encoder_outputs = self.encoder.adaptive_forward(
-                    encoder_outputs, current_layer=i, attention_mask=extended_attention_mask, head_mask=head_mask,
+                    encoder_outputs,
+                    current_layer=i,
+                    attention_mask=extended_attention_mask,
+                    head_mask=head_mask,
                 )
 
                 pooled_output = self.pooler_activation(self.pooler(encoder_outputs[0][:, 0]))
@@ -174,7 +177,10 @@ class AlbertModelWithPabee(AlbertModel):
             for i in range(self.config.num_hidden_layers):
                 calculated_layer_num += 1
                 encoder_outputs = self.encoder.adaptive_forward(
-                    encoder_outputs, current_layer=i, attention_mask=extended_attention_mask, head_mask=head_mask,
+                    encoder_outputs,
+                    current_layer=i,
+                    attention_mask=extended_attention_mask,
+                    head_mask=head_mask,
                 )
 
                 pooled_output = self.pooler_activation(self.pooler(encoder_outputs[0][:, 0]))
@@ -236,42 +242,42 @@ class AlbertForSequenceClassificationWithPabee(AlbertPreTrainedModel):
         labels=None,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the sequence classification/regression loss.
-            Indices should be in ``[0, ..., config.num_labels - 1]``.
-            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
-            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the sequence classification/regression loss.
+                Indices should be in ``[0, ..., config.num_labels - 1]``.
+                If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
+                If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
 
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
-        loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Classification (or regression if config.num_labels==1) loss.
-        logits ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+        Returns:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
+            loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+                Classification (or regression if config.num_labels==1) loss.
+            logits ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
+                Classification (or regression if config.num_labels==1) scores (before SoftMax).
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
 
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
 
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
+                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.
 
-        Examples::
+            Examples::
 
-            from transformers import AlbertTokenizer
-            from pabee import AlbertForSequenceClassificationWithPabee
-            import torch
+                from transformers import AlbertTokenizer
+                from pabee import AlbertForSequenceClassificationWithPabee
+                import torch
 
-            tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-            model = AlbertForSequenceClassificationWithPabee.from_pretrained('albert-base-v2')
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-            labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=labels)
-            loss, logits = outputs[:2]
+                tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
+                model = AlbertForSequenceClassificationWithPabee.from_pretrained('albert-base-v2')
+                input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+                labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+                outputs = model(input_ids, labels=labels)
+                loss, logits = outputs[:2]
 
         """
 
diff --git a/examples/bert-loses-patience/pabee/modeling_pabee_bert.py b/examples/bert-loses-patience/pabee/modeling_pabee_bert.py
index 5bf3a81f66..dc310122e5 100644
--- a/examples/bert-loses-patience/pabee/modeling_pabee_bert.py
+++ b/examples/bert-loses-patience/pabee/modeling_pabee_bert.py
@@ -108,30 +108,30 @@ class BertModelWithPabee(BertModel):
         regression=False,
     ):
         r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
-            Last layer hidden-state of the first token of the sequence (classification token)
-            further processed by a Linear layer and a Tanh activation function. The Linear
-            layer weights are trained from the next sentence prediction (classification)
-            objective during pre-training.
+        Return:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
+            last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+                Sequence of hidden-states at the output of the last layer of the model.
+            pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
+                Last layer hidden-state of the first token of the sequence (classification token)
+                further processed by a Linear layer and a Tanh activation function. The Linear
+                layer weights are trained from the next sentence prediction (classification)
+                objective during pre-training.
 
-            This output is usually *not* a good summary
-            of the semantic content of the input, you're often better with averaging or pooling
-            the sequence of hidden-states for the whole input sequence.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+                This output is usually *not* a good summary
+                of the semantic content of the input, you're often better with averaging or pooling
+                the sequence of hidden-states for the whole input sequence.
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
 
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
 
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
+                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.
         """
 
         if input_ids is not None and inputs_embeds is not None:
@@ -266,44 +266,44 @@ class BertForSequenceClassificationWithPabee(BertPreTrainedModel):
         labels=None,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the sequence classification/regression loss.
-            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
-            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
-            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the sequence classification/regression loss.
+                Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
+                If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+                If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
 
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
-            Classification (or regression if config.num_labels==1) loss.
-        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+        Returns:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
+            loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
+                Classification (or regression if config.num_labels==1) loss.
+            logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+                Classification (or regression if config.num_labels==1) scores (before SoftMax).
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
 
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
 
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
+                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.
 
-    Examples::
+        Examples::
 
-        from transformers import BertTokenizer, BertForSequenceClassification
-        from pabee import BertForSequenceClassificationWithPabee
-        import torch
+            from transformers import BertTokenizer, BertForSequenceClassification
+            from pabee import BertForSequenceClassificationWithPabee
+            import torch
 
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForSequenceClassificationWithPabee.from_pretrained('bert-base-uncased')
+            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+            model = BertForSequenceClassificationWithPabee.from_pretrained('bert-base-uncased')
 
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
+            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+            labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+            outputs = model(input_ids, labels=labels)
 
-        loss, logits = outputs[:2]
+            loss, logits = outputs[:2]
 
         """
 
diff --git a/examples/bert-loses-patience/run_glue_with_pabee.py b/examples/bert-loses-patience/run_glue_with_pabee.py
index bddd5adb26..87970fcd15 100755
--- a/examples/bert-loses-patience/run_glue_with_pabee.py
+++ b/examples/bert-loses-patience/run_glue_with_pabee.py
@@ -120,7 +120,10 @@ def train(args, train_dataset, model, tokenizer):
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
         model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
+            model,
+            device_ids=[args.local_rank],
+            output_device=args.local_rank,
+            find_unused_parameters=True,
         )
 
     # Train!
@@ -151,13 +154,17 @@ def train(args, train_dataset, model, tokenizer):
         logger.info("  Continuing training from epoch %d", epochs_trained)
         logger.info("  Continuing training from global step %d", global_step)
         logger.info(
-            "  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch,
+            "  Will skip the first %d steps in the first epoch",
+            steps_trained_in_current_epoch,
         )
 
     tr_loss, logging_loss = 0.0, 0.0
     model.zero_grad()
     train_iterator = trange(
-        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
+        epochs_trained,
+        int(args.num_train_epochs),
+        desc="Epoch",
+        disable=args.local_rank not in [-1, 0],
     )
     set_seed(args)  # Added here for reproductibility
     for _ in train_iterator:
@@ -372,7 +379,11 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
             processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
         )
         features = convert_examples_to_features(
-            examples, tokenizer, label_list=label_list, max_length=args.max_seq_length, output_mode=output_mode,
+            examples,
+            tokenizer,
+            label_list=label_list,
+            max_length=args.max_seq_length,
+            output_mode=output_mode,
         )
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
@@ -434,15 +445,24 @@ def main():
         help="The output directory where the model predictions and checkpoints will be written.",
     )
     parser.add_argument(
-        "--patience", default="0", type=str, required=False,
+        "--patience",
+        default="0",
+        type=str,
+        required=False,
     )
     parser.add_argument(
-        "--regression_threshold", default=0, type=float, required=False,
+        "--regression_threshold",
+        default=0,
+        type=float,
+        required=False,
     )
 
     # Other parameters
     parser.add_argument(
-        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name",
+        "--config_name",
+        default="",
+        type=str,
+        help="Pretrained config name or path if not the same as model_name",
     )
     parser.add_argument(
         "--tokenizer_name",
@@ -466,17 +486,27 @@ def main():
     parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
     parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
     parser.add_argument(
-        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.",
+        "--evaluate_during_training",
+        action="store_true",
+        help="Run evaluation during training at each logging step.",
     )
     parser.add_argument(
-        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.",
+        "--do_lower_case",
+        action="store_true",
+        help="Set this flag if you are using an uncased model.",
     )
 
     parser.add_argument(
-        "--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.",
+        "--per_gpu_train_batch_size",
+        default=8,
+        type=int,
+        help="Batch size per GPU/CPU for training.",
     )
     parser.add_argument(
-        "--per_gpu_eval_batch_size", default=1, type=int, help="Batch size per GPU/CPU for evaluation.",
+        "--per_gpu_eval_batch_size",
+        default=1,
+        type=int,
+        help="Batch size per GPU/CPU for evaluation.",
     )
     parser.add_argument(
         "--gradient_accumulation_steps",
@@ -485,13 +515,19 @@ def main():
         help="Number of updates steps to accumulate before performing a backward/update pass.",
     )
     parser.add_argument(
-        "--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.",
+        "--learning_rate",
+        default=5e-5,
+        type=float,
+        help="The initial learning rate for Adam.",
     )
     parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
     parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
     parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
     parser.add_argument(
-        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
+        "--num_train_epochs",
+        default=3.0,
+        type=float,
+        help="Total number of training epochs to perform.",
     )
     parser.add_argument(
         "--max_steps",
@@ -503,7 +539,10 @@ def main():
 
     parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
     parser.add_argument(
-        "--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.",
+        "--save_steps",
+        type=int,
+        default=500,
+        help="Save checkpoint every X updates steps.",
     )
     parser.add_argument(
         "--eval_all_checkpoints",
@@ -512,10 +551,14 @@ def main():
     )
     parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
     parser.add_argument(
-        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory",
+        "--overwrite_output_dir",
+        action="store_true",
+        help="Overwrite the content of the output directory",
     )
     parser.add_argument(
-        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets",
+        "--overwrite_cache",
+        action="store_true",
+        help="Overwrite the cached training and evaluation sets",
     )
     parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
 
@@ -532,7 +575,10 @@ def main():
         "See details at https://nvidia.github.io/apex/amp.html",
     )
     parser.add_argument(
-        "--local_rank", type=int, default=-1, help="For distributed training: local_rank",
+        "--local_rank",
+        type=int,
+        default=-1,
+        help="For distributed training: local_rank",
     )
     parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
     parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
@@ -634,7 +680,8 @@ def main():
     print("Output Layers Parameters:", output_layers_param_num)
     single_output_layer_param_num = sum(param.numel() for param in model.classifiers[0].parameters())
     print(
-        "Added Output Layers Parameters:", output_layers_param_num - single_output_layer_param_num,
+        "Added Output Layers Parameters:",
+        output_layers_param_num - single_output_layer_param_num,
     )
 
     logger.info("Training/evaluation parameters %s", args)
diff --git a/examples/bertology/run_bertology.py b/examples/bertology/run_bertology.py
index 990354b894..92653da4dd 100644
--- a/examples/bertology/run_bertology.py
+++ b/examples/bertology/run_bertology.py
@@ -66,9 +66,9 @@ def print_2d_tensor(tensor):
 def compute_heads_importance(
     args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None, actually_pruned=False
 ):
-    """ This method shows how to compute:
-        - head attention entropy
-        - head importance scores according to http://arxiv.org/abs/1905.10650
+    """This method shows how to compute:
+    - head attention entropy
+    - head importance scores according to http://arxiv.org/abs/1905.10650
     """
     # Prepare our tensors
     n_layers, n_heads = model.config.num_hidden_layers, model.config.num_attention_heads
@@ -150,8 +150,8 @@ def compute_heads_importance(
 
 
 def mask_heads(args, model, eval_dataloader):
-    """ This method shows how to mask head (set some heads to zero), to test the effect on the network,
-        based on the head importance scores, as described in Michel et al. (http://arxiv.org/abs/1905.10650)
+    """This method shows how to mask head (set some heads to zero), to test the effect on the network,
+    based on the head importance scores, as described in Michel et al. (http://arxiv.org/abs/1905.10650)
     """
     _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False)
     preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
@@ -201,8 +201,8 @@ def mask_heads(args, model, eval_dataloader):
 
 
 def prune_heads(args, model, eval_dataloader, head_mask):
-    """ This method shows how to prune head (remove heads weights) based on
-        the head importance scores as described in Michel et al. (http://arxiv.org/abs/1905.10650)
+    """This method shows how to prune head (remove heads weights) based on
+    the head importance scores as described in Michel et al. (http://arxiv.org/abs/1905.10650)
     """
     # Try pruning and test time speedup
     # Pruning is like masking but we actually remove the masked weights
@@ -395,7 +395,8 @@ def main():
         cache_dir=args.cache_dir,
     )
     tokenizer = AutoTokenizer.from_pretrained(
-        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, cache_dir=args.cache_dir,
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        cache_dir=args.cache_dir,
     )
     model = AutoModelForSequenceClassification.from_pretrained(
         args.model_name_or_path,
diff --git a/examples/contrib/mm-imdb/utils_mmimdb.py b/examples/contrib/mm-imdb/utils_mmimdb.py
index 5df0a886ec..cabc85edbb 100644
--- a/examples/contrib/mm-imdb/utils_mmimdb.py
+++ b/examples/contrib/mm-imdb/utils_mmimdb.py
@@ -138,6 +138,9 @@ def get_image_transforms():
             transforms.Resize(256),
             transforms.CenterCrop(224),
             transforms.ToTensor(),
-            transforms.Normalize(mean=[0.46777044, 0.44531429, 0.40661017], std=[0.12221994, 0.12145835, 0.14380469],),
+            transforms.Normalize(
+                mean=[0.46777044, 0.44531429, 0.40661017],
+                std=[0.12221994, 0.12145835, 0.14380469],
+            ),
         ]
     )
diff --git a/examples/contrib/run_camembert.py b/examples/contrib/run_camembert.py
index 3da66d419b..32ccce0e7e 100644
--- a/examples/contrib/run_camembert.py
+++ b/examples/contrib/run_camembert.py
@@ -30,7 +30,11 @@ def fill_mask(masked_input, model, tokenizer, topk=5):
             )
         else:
             topk_filled_outputs.append(
-                (masked_input.replace(masked_token, predicted_token), values[index].item(), predicted_token,)
+                (
+                    masked_input.replace(masked_token, predicted_token),
+                    values[index].item(),
+                    predicted_token,
+                )
             )
     return topk_filled_outputs
 
diff --git a/examples/contrib/run_openai_gpt.py b/examples/contrib/run_openai_gpt.py
index c054d3b5c5..7febad29b2 100644
--- a/examples/contrib/run_openai_gpt.py
+++ b/examples/contrib/run_openai_gpt.py
@@ -71,10 +71,10 @@ def load_rocstories_dataset(dataset_path):
 
 
 def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
-    """ Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)
+    """Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)
 
-        To Transformer inputs of shape (n_batch, n_alternative, length) comprising for each batch, continuation:
-        input_ids[batch, alternative, :] = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
+    To Transformer inputs of shape (n_batch, n_alternative, length) comprising for each batch, continuation:
+    input_ids[batch, alternative, :] = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
     """
     tensor_datasets = []
     for dataset in encoded_datasets:
@@ -83,7 +83,10 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
         mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
         lm_labels = np.full((n_batch, 2, input_len), fill_value=-100, dtype=np.int64)
         mc_labels = np.zeros((n_batch,), dtype=np.int64)
-        for i, (story, cont1, cont2, mc_label), in enumerate(dataset):
+        for (
+            i,
+            (story, cont1, cont2, mc_label),
+        ) in enumerate(dataset):
             with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
             with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
             input_ids[i, 0, : len(with_cont1)] = with_cont1
diff --git a/examples/contrib/run_swag.py b/examples/contrib/run_swag.py
index 962c2990d1..2724e74822 100644
--- a/examples/contrib/run_swag.py
+++ b/examples/contrib/run_swag.py
@@ -629,7 +629,9 @@ def main():
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
 
     config = AutoConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
-    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,)
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+    )
     model = AutoModelForMultipleChoice.from_pretrained(
         args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
     )
diff --git a/examples/deebert/run_glue_deebert.py b/examples/deebert/run_glue_deebert.py
index 8eadbd282d..8b4270490e 100644
--- a/examples/deebert/run_glue_deebert.py
+++ b/examples/deebert/run_glue_deebert.py
@@ -358,7 +358,11 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
             processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
         )
         features = convert_examples_to_features(
-            examples, tokenizer, label_list=label_list, max_length=args.max_seq_length, output_mode=output_mode,
+            examples,
+            tokenizer,
+            label_list=label_list,
+            max_length=args.max_seq_length,
+            output_mode=output_mode,
         )
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
diff --git a/examples/deebert/src/modeling_highway_bert.py b/examples/deebert/src/modeling_highway_bert.py
index 0e548c96c8..d5f8488ae2 100644
--- a/examples/deebert/src/modeling_highway_bert.py
+++ b/examples/deebert/src/modeling_highway_bert.py
@@ -14,8 +14,7 @@ from transformers.modeling_bert import (
 
 
 def entropy(x):
-    """ Calculate entropy of a pre-softmax logit Tensor
-    """
+    """Calculate entropy of a pre-softmax logit Tensor"""
     exp_x = torch.exp(x)
     A = torch.sum(exp_x, dim=1)  # sum of exp(x_i)
     B = torch.sum(x * exp_x, dim=1)  # sum of x_i * exp(x_i)
@@ -104,7 +103,8 @@ class DeeBertEncoder(nn.Module):
 
 
 @add_start_docstrings(
-    "The Bert Model transformer with early exiting (DeeBERT). ", BERT_START_DOCSTRING,
+    "The Bert Model transformer with early exiting (DeeBERT). ",
+    BERT_START_DOCSTRING,
 )
 class DeeBertModel(BertPreTrainedModel):
     def __init__(self, config):
@@ -127,9 +127,9 @@ class DeeBertModel(BertPreTrainedModel):
         self.embeddings.word_embeddings = value
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        See base class PreTrainedModel
         """
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
@@ -147,33 +147,33 @@ class DeeBertModel(BertPreTrainedModel):
         encoder_attention_mask=None,
     ):
         r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
-            Last layer hidden-state of the first token of the sequence (classification token)
-            further processed by a Linear layer and a Tanh activation function. The Linear
-            layer weights are trained from the next sentence prediction (classification)
-            objective during pre-training.
+        Return:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
+            last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+                Sequence of hidden-states at the output of the last layer of the model.
+            pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
+                Last layer hidden-state of the first token of the sequence (classification token)
+                further processed by a Linear layer and a Tanh activation function. The Linear
+                layer weights are trained from the next sentence prediction (classification)
+                objective during pre-training.
 
-            This output is usually *not* a good summary
-            of the semantic content of the input, you're often better with averaging or pooling
-            the sequence of hidden-states for the whole input sequence.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+                This output is usually *not* a good summary
+                of the semantic content of the input, you're often better with averaging or pooling
+                the sequence of hidden-states for the whole input sequence.
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
 
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
 
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-        highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
-            Tuple of each early exit's results (total length: number of layers)
-            Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
+                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.
+            highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
+                Tuple of each early exit's results (total length: number of layers)
+                Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
         """
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -302,32 +302,32 @@ class DeeBertForSequenceClassification(BertPreTrainedModel):
         train_highway=False,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the sequence classification/regression loss.
-            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
-            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
-            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the sequence classification/regression loss.
+                Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
+                If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+                If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
 
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
-            Classification (or regression if config.num_labels==1) loss.
-        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+        Returns:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
+            loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
+                Classification (or regression if config.num_labels==1) loss.
+            logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+                Classification (or regression if config.num_labels==1) scores (before SoftMax).
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
 
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
 
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-        highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
-            Tuple of each early exit's results (total length: number of layers)
-            Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
+                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.
+            highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
+                Tuple of each early exit's results (total length: number of layers)
+                Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
         """
 
         exit_layer = self.num_layers
diff --git a/examples/deebert/src/modeling_highway_roberta.py b/examples/deebert/src/modeling_highway_roberta.py
index b953cabb11..bc869c3469 100644
--- a/examples/deebert/src/modeling_highway_roberta.py
+++ b/examples/deebert/src/modeling_highway_roberta.py
@@ -11,7 +11,8 @@ from .modeling_highway_bert import BertPreTrainedModel, DeeBertModel, HighwayExc
 
 
 @add_start_docstrings(
-    "The RoBERTa Model transformer with early exiting (DeeRoBERTa). ", ROBERTA_START_DOCSTRING,
+    "The RoBERTa Model transformer with early exiting (DeeRoBERTa). ",
+    ROBERTA_START_DOCSTRING,
 )
 class DeeRobertaModel(DeeBertModel):
 
@@ -58,32 +59,32 @@ class DeeRobertaForSequenceClassification(BertPreTrainedModel):
         train_highway=False,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the sequence classification/regression loss.
-            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
-            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
-            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the sequence classification/regression loss.
+                Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
+                If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+                If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
 
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
-            Classification (or regression if config.num_labels==1) loss.
-        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+        Returns:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
+            loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
+                Classification (or regression if config.num_labels==1) loss.
+            logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+                Classification (or regression if config.num_labels==1) scores (before SoftMax).
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
 
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
 
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-        highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
-            Tuple of each early exit's results (total length: number of layers)
-            Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
+                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.
+            highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
+                Tuple of each early exit's results (total length: number of layers)
+                Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
         """
 
         exit_layer = self.num_layers
diff --git a/examples/distillation/run_squad_w_distillation.py b/examples/distillation/run_squad_w_distillation.py
index f7a8ec02ba..dd83e30b82 100644
--- a/examples/distillation/run_squad_w_distillation.py
+++ b/examples/distillation/run_squad_w_distillation.py
@@ -228,14 +228,20 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                 assert end_logits_tea.size() == end_logits_stu.size()
 
                 loss_fct = nn.KLDivLoss(reduction="batchmean")
-                loss_start = loss_fct(
-                    F.log_softmax(start_logits_stu / args.temperature, dim=-1),
-                    F.softmax(start_logits_tea / args.temperature, dim=-1),
-                ) * (args.temperature ** 2)
-                loss_end = loss_fct(
-                    F.log_softmax(end_logits_stu / args.temperature, dim=-1),
-                    F.softmax(end_logits_tea / args.temperature, dim=-1),
-                ) * (args.temperature ** 2)
+                loss_start = (
+                    loss_fct(
+                        F.log_softmax(start_logits_stu / args.temperature, dim=-1),
+                        F.softmax(start_logits_tea / args.temperature, dim=-1),
+                    )
+                    * (args.temperature ** 2)
+                )
+                loss_end = (
+                    loss_fct(
+                        F.log_softmax(end_logits_stu / args.temperature, dim=-1),
+                        F.softmax(end_logits_tea / args.temperature, dim=-1),
+                    )
+                    * (args.temperature ** 2)
+                )
                 loss_ce = (loss_start + loss_end) / 2.0
 
                 loss = args.alpha_ce * loss_ce + args.alpha_squad * loss
diff --git a/examples/distillation/utils.py b/examples/distillation/utils.py
index 211e7c61da..6d439453fe 100644
--- a/examples/distillation/utils.py
+++ b/examples/distillation/utils.py
@@ -118,7 +118,8 @@ def init_gpu_params(params):
     if params.multi_gpu:
         logger.info("Initializing PyTorch distributed")
         torch.distributed.init_process_group(
-            init_method="env://", backend="nccl",
+            init_method="env://",
+            backend="nccl",
         )
 
 
diff --git a/examples/language-modeling/run_language_modeling.py b/examples/language-modeling/run_language_modeling.py
index 740cb6364a..3377b9d9cb 100644
--- a/examples/language-modeling/run_language_modeling.py
+++ b/examples/language-modeling/run_language_modeling.py
@@ -233,7 +233,9 @@ def main():
     eval_dataset = get_dataset(data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None
     if config.model_type == "xlnet":
         data_collator = DataCollatorForPermutationLanguageModeling(
-            tokenizer=tokenizer, plm_probability=data_args.plm_probability, max_span_length=data_args.max_span_length,
+            tokenizer=tokenizer,
+            plm_probability=data_args.plm_probability,
+            max_span_length=data_args.max_span_length,
         )
     else:
         data_collator = DataCollatorForLanguageModeling(
diff --git a/examples/lightning_base.py b/examples/lightning_base.py
index 1c0591a9ec..d23757a9bc 100644
--- a/examples/lightning_base.py
+++ b/examples/lightning_base.py
@@ -226,10 +226,14 @@ class BaseTransformer(pl.LightningModule):
             help="Decoder layer dropout probability (Optional). Goes into model.config",
         )
         parser.add_argument(
-            "--dropout", type=float, help="Dropout probability (Optional). Goes into model.config",
+            "--dropout",
+            type=float,
+            help="Dropout probability (Optional). Goes into model.config",
         )
         parser.add_argument(
-            "--attention_dropout", type=float, help="Attention dropout probability (Optional). Goes into model.config",
+            "--attention_dropout",
+            type=float,
+            help="Attention dropout probability (Optional). Goes into model.config",
         )
         parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
         parser.add_argument(
diff --git a/examples/longform-qa/eli5_app.py b/examples/longform-qa/eli5_app.py
index 675ffbf63d..66420b4c02 100644
--- a/examples/longform-qa/eli5_app.py
+++ b/examples/longform-qa/eli5_app.py
@@ -95,7 +95,10 @@ def make_support(question, source="wiki40b", method="dense", n_results=10):
             )
         else:
             support_doc, hit_lst = query_es_index(
-                question, es_client, index_name="english_wiki40b_snippets_100w", n_results=n_results,
+                question,
+                es_client,
+                index_name="english_wiki40b_snippets_100w",
+                n_results=n_results,
             )
     support_list = [
         (res["article_title"], res["section_title"].strip(), res["score"], res["passage_text"]) for res in hit_lst
@@ -154,7 +157,8 @@ header_full = """
     header_html,
 )
 st.sidebar.markdown(
-    header_full, unsafe_allow_html=True,
+    header_full,
+    unsafe_allow_html=True,
 )
 
 # Long Form QA with ELI5 and Wikipedia
@@ -173,9 +177,17 @@ action_list = [
 ]
 demo_options = st.sidebar.checkbox("Demo options")
 if demo_options:
-    action_st = st.sidebar.selectbox("", action_list, index=3,)
+    action_st = st.sidebar.selectbox(
+        "",
+        action_list,
+        index=3,
+    )
     action = action_list.index(action_st)
-    show_type = st.sidebar.selectbox("", ["Show full text of passages", "Show passage section titles"], index=0,)
+    show_type = st.sidebar.selectbox(
+        "",
+        ["Show full text of passages", "Show passage section titles"],
+        index=0,
+    )
     show_passages = show_type == "Show full text of passages"
 else:
     action = 3
@@ -250,7 +262,9 @@ questions_list = [
     "How does New Zealand have so many large bird predators?",
 ]
 question_s = st.selectbox(
-    "What would you like to ask? ---- select <MY QUESTION> to enter a new query", questions_list, index=1,
+    "What would you like to ask? ---- select <MY QUESTION> to enter a new query",
+    questions_list,
+    index=1,
 )
 if question_s == "<MY QUESTION>":
     question = st.text_input("Enter your question here:", "")
diff --git a/examples/longform-qa/eli5_utils.py b/examples/longform-qa/eli5_utils.py
index f6e417a8dd..5ef7ce0cc6 100644
--- a/examples/longform-qa/eli5_utils.py
+++ b/examples/longform-qa/eli5_utils.py
@@ -48,7 +48,11 @@ def make_es_index_snippets(es_client, passages_dset, index_name="english_wiki_ki
             yield passage
 
     # create the ES index
-    for ok, action in streaming_bulk(client=es_client, index=index_name, actions=passage_generator(),):
+    for ok, action in streaming_bulk(
+        client=es_client,
+        index=index_name,
+        actions=passage_generator(),
+    ):
         progress.update(1)
         successes += ok
     print("Indexed %d documents" % (successes,))
@@ -137,7 +141,11 @@ class RetrievalQAEmbedder(torch.nn.Module):
 
             # define function for checkpointing
             def partial_encode(*inputs):
-                encoder_outputs = self.sent_encoder.encoder(inputs[0], attention_mask=inputs[1], head_mask=head_mask,)
+                encoder_outputs = self.sent_encoder.encoder(
+                    inputs[0],
+                    attention_mask=inputs[1],
+                    head_mask=head_mask,
+                )
                 sequence_output = encoder_outputs[0]
                 pooled_output = self.sent_encoder.pooler(sequence_output)
                 return pooled_output
@@ -234,7 +242,11 @@ def train_qa_retriever_epoch(model, dataset, tokenizer, optimizer, scheduler, ar
         if step % args.print_freq == 0 or step == 1:
             print(
                 "{:2d} {:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
-                    e, step, len(dataset) // args.batch_size, loc_loss / loc_steps, time() - st_time,
+                    e,
+                    step,
+                    len(dataset) // args.batch_size,
+                    loc_loss / loc_steps,
+                    time() - st_time,
                 )
             )
             loc_loss = 0
@@ -273,7 +285,11 @@ def train_qa_retriever_joint_epoch(model, dataset_list, tokenizer, optimizer, sc
         if step % args.print_freq == 0:
             print(
                 "{:2d} {:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
-                    e, step, len(dataset_list[0]) // args.batch_size, loc_loss / loc_steps, time() - st_time,
+                    e,
+                    step,
+                    len(dataset_list[0]) // args.batch_size,
+                    loc_loss / loc_steps,
+                    time() - st_time,
                 )
             )
             loc_loss = 0
@@ -354,7 +370,8 @@ class ELI5DatasetS2S(Dataset):
             self.document_cache[q_id] = self.document_cache.get(q_id, self.make_doc_function(example["title"]))
         document = self.document_cache[q_id]
         in_st = "question: {} context: {}".format(
-            question.lower().replace(" --t--", "").strip(), document.lower().strip(),
+            question.lower().replace(" --t--", "").strip(),
+            document.lower().strip(),
         )
         out_st = answer
         return (in_st, out_st)
@@ -427,7 +444,11 @@ def train_qa_s2s_epoch(model, dataset, tokenizer, optimizer, scheduler, args, e=
         if step % args.print_freq == 0 or step == 1:
             print(
                 "{:2d} {:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
-                    e, step, len(dataset) // args.batch_size, loc_loss / loc_steps, time() - st_time,
+                    e,
+                    step,
+                    len(dataset) // args.batch_size,
+                    loc_loss / loc_steps,
+                    time() - st_time,
                 )
             )
             loc_loss = 0
@@ -456,10 +477,18 @@ def eval_qa_s2s_epoch(model, dataset, tokenizer, args):
             if step % args.print_freq == 0:
                 print(
                     "{:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
-                        step, len(dataset) // args.batch_size, loc_loss / loc_steps, time() - st_time,
+                        step,
+                        len(dataset) // args.batch_size,
+                        loc_loss / loc_steps,
+                        time() - st_time,
                     )
                 )
-    print("Total \t L: {:.3f} \t -- {:.3f}".format(loc_loss / loc_steps, time() - st_time,))
+    print(
+        "Total \t L: {:.3f} \t -- {:.3f}".format(
+            loc_loss / loc_steps,
+            time() - st_time,
+        )
+    )
 
 
 def train_qa_s2s(qa_s2s_model, qa_s2s_tokenizer, s2s_train_dset, s2s_valid_dset, s2s_args):
@@ -506,7 +535,12 @@ def qa_s2s_generate(
     max_input_length=512,
     device="cuda:0",
 ):
-    model_inputs = make_qa_s2s_batch([(question_doc, "A")], qa_s2s_tokenizer, max_input_length, device=device,)
+    model_inputs = make_qa_s2s_batch(
+        [(question_doc, "A")],
+        qa_s2s_tokenizer,
+        max_input_length,
+        device=device,
+    )
     n_beams = num_answers if num_beams is None else max(num_beams, num_answers)
     generated_ids = qa_s2s_model.generate(
         input_ids=model_inputs["input_ids"],
diff --git a/examples/movement-pruning/emmental/modeling_bert_masked.py b/examples/movement-pruning/emmental/modeling_bert_masked.py
index 294db863ee..d59a4487af 100644
--- a/examples/movement-pruning/emmental/modeling_bert_masked.py
+++ b/examples/movement-pruning/emmental/modeling_bert_masked.py
@@ -37,8 +37,7 @@ logger = logging.getLogger(__name__)
 
 
 class BertEmbeddings(nn.Module):
-    """Construct the embeddings from word, position and token_type embeddings.
-    """
+    """Construct the embeddings from word, position and token_type embeddings."""
 
     def __init__(self, config):
         super().__init__()
@@ -385,8 +384,8 @@ class BertPooler(nn.Module):
 
 
 class MaskedBertPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = MaskedBertConfig
@@ -492,9 +491,9 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
         self.embeddings.word_embeddings = value
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        See base class PreTrainedModel
         """
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
@@ -685,31 +684,31 @@ class MaskedBertForSequenceClassification(MaskedBertPreTrainedModel):
         threshold=None,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the sequence classification/regression loss.
-            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
-            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
-            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-        threshold (:obj:`float`):
-            Threshold value (see :class:`~emmental.MaskedLinear`).
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the sequence classification/regression loss.
+                Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
+                If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+                If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+            threshold (:obj:`float`):
+                Threshold value (see :class:`~emmental.MaskedLinear`).
 
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
-            Classification (or regression if config.num_labels==1) loss.
-        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+        Returns:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
+            loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
+                Classification (or regression if config.num_labels==1) loss.
+            logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+                Classification (or regression if config.num_labels==1) scores (before SoftMax).
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
 
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
 
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
+                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.
         """
 
         outputs = self.bert(
@@ -770,32 +769,32 @@ class MaskedBertForMultipleChoice(MaskedBertPreTrainedModel):
         threshold=None,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the multiple choice classification loss.
-            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
-            of the input tensors. (see `input_ids` above)
-        threshold (:obj:`float`):
-            Threshold value (see :class:`~emmental.MaskedLinear`).
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the multiple choice classification loss.
+                Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
+                of the input tensors. (see `input_ids` above)
+            threshold (:obj:`float`):
+                Threshold value (see :class:`~emmental.MaskedLinear`).
 
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
-            Classification loss.
-        classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
-            `num_choices` is the second dimension of the input tensors. (see `input_ids` above).
+        Returns:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
+            loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
+                Classification loss.
+            classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
+                `num_choices` is the second dimension of the input tensors. (see `input_ids` above).
 
-            Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+                Classification scores (before SoftMax).
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
 
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
 
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
+                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.
 
         """
         num_choices = input_ids.shape[1]
@@ -860,29 +859,29 @@ class MaskedBertForTokenClassification(MaskedBertPreTrainedModel):
         threshold=None,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the token classification loss.
-            Indices should be in ``[0, ..., config.num_labels - 1]``.
-        threshold (:obj:`float`):
-            Threshold value (see :class:`~emmental.MaskedLinear`).
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the token classification loss.
+                Indices should be in ``[0, ..., config.num_labels - 1]``.
+            threshold (:obj:`float`):
+                Threshold value (see :class:`~emmental.MaskedLinear`).
 
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
-            Classification loss.
-        scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
-            Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+        Returns:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
+            loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
+                Classification loss.
+            scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
+                Classification scores (before SoftMax).
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
 
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
 
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
+                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.
         """
 
         outputs = self.bert(
@@ -947,36 +946,36 @@ class MaskedBertForQuestionAnswering(MaskedBertPreTrainedModel):
         threshold=None,
     ):
         r"""
-        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-        threshold (:obj:`float`):
-            Threshold value (see :class:`~emmental.MaskedLinear`).
+            start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for position (index) of the start of the labelled span for computing the token classification loss.
+                Positions are clamped to the length of the sequence (`sequence_length`).
+                Position outside of the sequence are not taken into account for computing the loss.
+            end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for position (index) of the end of the labelled span for computing the token classification loss.
+                Positions are clamped to the length of the sequence (`sequence_length`).
+                Position outside of the sequence are not taken into account for computing the loss.
+            threshold (:obj:`float`):
+                Threshold value (see :class:`~emmental.MaskedLinear`).
 
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
-            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
-        start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-start scores (before SoftMax).
-        end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-end scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+        Returns:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
+            loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+                Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
+            start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
+                Span-start scores (before SoftMax).
+            end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
+                Span-end scores (before SoftMax).
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
 
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
 
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
+                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.
         """
 
         outputs = self.bert(
@@ -996,7 +995,10 @@ class MaskedBertForQuestionAnswering(MaskedBertPreTrainedModel):
         start_logits = start_logits.squeeze(-1)
         end_logits = end_logits.squeeze(-1)
 
-        outputs = (start_logits, end_logits,) + outputs[2:]
+        outputs = (
+            start_logits,
+            end_logits,
+        ) + outputs[2:]
         if start_positions is not None and end_positions is not None:
             # If we are on multi-GPU, split add a dimension
             if len(start_positions.size()) > 1:
diff --git a/examples/movement-pruning/masked_run_glue.py b/examples/movement-pruning/masked_run_glue.py
index 218864051f..876b9fc6e3 100644
--- a/examples/movement-pruning/masked_run_glue.py
+++ b/examples/movement-pruning/masked_run_glue.py
@@ -173,7 +173,10 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
         model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
+            model,
+            device_ids=[args.local_rank],
+            output_device=args.local_rank,
+            find_unused_parameters=True,
         )
 
     # Train!
@@ -217,7 +220,10 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
     tr_loss, logging_loss = 0.0, 0.0
     model.zero_grad()
     train_iterator = trange(
-        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
+        epochs_trained,
+        int(args.num_train_epochs),
+        desc="Epoch",
+        disable=args.local_rank not in [-1, 0],
     )
     set_seed(args)  # Added here for reproductibility
     for _ in train_iterator:
@@ -280,11 +286,14 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                         attention_mask=inputs["attention_mask"],
                     )
 
-                loss_logits = F.kl_div(
-                    input=F.log_softmax(logits_stu / args.temperature, dim=-1),
-                    target=F.softmax(logits_tea / args.temperature, dim=-1),
-                    reduction="batchmean",
-                ) * (args.temperature ** 2)
+                loss_logits = (
+                    F.kl_div(
+                        input=F.log_softmax(logits_stu / args.temperature, dim=-1),
+                        target=F.softmax(logits_tea / args.temperature, dim=-1),
+                        reduction="batchmean",
+                    )
+                    * (args.temperature ** 2)
+                )
 
                 loss = args.alpha_distil * loss_logits + args.alpha_ce * loss
 
@@ -529,7 +538,11 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
             processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
         )
         features = convert_examples_to_features(
-            examples, tokenizer, max_length=args.max_seq_length, label_list=label_list, output_mode=output_mode,
+            examples,
+            tokenizer,
+            max_length=args.max_seq_length,
+            label_list=label_list,
+            output_mode=output_mode,
         )
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
@@ -592,7 +605,10 @@ def main():
     )
     # Other parameters
     parser.add_argument(
-        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name",
+        "--config_name",
+        default="",
+        type=str,
+        help="Pretrained config name or path if not the same as model_name",
     )
     parser.add_argument(
         "--tokenizer_name",
@@ -616,17 +632,27 @@ def main():
     parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
     parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
     parser.add_argument(
-        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.",
+        "--evaluate_during_training",
+        action="store_true",
+        help="Run evaluation during training at each logging step.",
     )
     parser.add_argument(
-        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.",
+        "--do_lower_case",
+        action="store_true",
+        help="Set this flag if you are using an uncased model.",
     )
 
     parser.add_argument(
-        "--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.",
+        "--per_gpu_train_batch_size",
+        default=8,
+        type=int,
+        help="Batch size per GPU/CPU for training.",
     )
     parser.add_argument(
-        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.",
+        "--per_gpu_eval_batch_size",
+        default=8,
+        type=int,
+        help="Batch size per GPU/CPU for evaluation.",
     )
     parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
 
@@ -723,7 +749,10 @@ def main():
     parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
     parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
     parser.add_argument(
-        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
+        "--num_train_epochs",
+        default=3.0,
+        type=float,
+        help="Total number of training epochs to perform.",
     )
     parser.add_argument(
         "--max_steps",
@@ -742,10 +771,14 @@ def main():
     )
     parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
     parser.add_argument(
-        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory",
+        "--overwrite_output_dir",
+        action="store_true",
+        help="Overwrite the content of the output directory",
     )
     parser.add_argument(
-        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets",
+        "--overwrite_cache",
+        action="store_true",
+        help="Overwrite the cached training and evaluation sets",
     )
     parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
 
diff --git a/examples/movement-pruning/masked_run_squad.py b/examples/movement-pruning/masked_run_squad.py
index 309fb9ea9d..30806ff117 100644
--- a/examples/movement-pruning/masked_run_squad.py
+++ b/examples/movement-pruning/masked_run_squad.py
@@ -181,7 +181,10 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
         model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
+            model,
+            device_ids=[args.local_rank],
+            output_device=args.local_rank,
+            find_unused_parameters=True,
         )
 
     # Train!
@@ -304,16 +307,22 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                         attention_mask=inputs["attention_mask"],
                     )
 
-                loss_start = F.kl_div(
-                    input=F.log_softmax(start_logits_stu / args.temperature, dim=-1),
-                    target=F.softmax(start_logits_tea / args.temperature, dim=-1),
-                    reduction="batchmean",
-                ) * (args.temperature ** 2)
-                loss_end = F.kl_div(
-                    input=F.log_softmax(end_logits_stu / args.temperature, dim=-1),
-                    target=F.softmax(end_logits_tea / args.temperature, dim=-1),
-                    reduction="batchmean",
-                ) * (args.temperature ** 2)
+                loss_start = (
+                    F.kl_div(
+                        input=F.log_softmax(start_logits_stu / args.temperature, dim=-1),
+                        target=F.softmax(start_logits_tea / args.temperature, dim=-1),
+                        reduction="batchmean",
+                    )
+                    * (args.temperature ** 2)
+                )
+                loss_end = (
+                    F.kl_div(
+                        input=F.log_softmax(end_logits_stu / args.temperature, dim=-1),
+                        target=F.softmax(end_logits_tea / args.temperature, dim=-1),
+                        reduction="batchmean",
+                    )
+                    * (args.temperature ** 2)
+                )
                 loss_logits = (loss_start + loss_end) / 2.0
 
                 loss = args.alpha_distil * loss_logits + args.alpha_ce * loss
@@ -859,7 +868,10 @@ def main():
     parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
     parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
     parser.add_argument(
-        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
+        "--num_train_epochs",
+        default=3.0,
+        type=float,
+        help="Total number of training epochs to perform.",
     )
     parser.add_argument(
         "--max_steps",
diff --git a/examples/multiple-choice/utils_multiple_choice.py b/examples/multiple-choice/utils_multiple_choice.py
index d236aef061..784a7578d3 100644
--- a/examples/multiple-choice/utils_multiple_choice.py
+++ b/examples/multiple-choice/utils_multiple_choice.py
@@ -100,7 +100,12 @@ if is_torch_available():
 
             cached_features_file = os.path.join(
                 data_dir,
-                "cached_{}_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length), task,),
+                "cached_{}_{}_{}_{}".format(
+                    mode.value,
+                    tokenizer.__class__.__name__,
+                    str(max_seq_length),
+                    task,
+                ),
             )
 
             # Make sure only the first process in distributed training processes the dataset,
@@ -121,7 +126,12 @@ if is_torch_available():
                     else:
                         examples = processor.get_train_examples(data_dir)
                     logger.info("Training examples: %s", len(examples))
-                    self.features = convert_examples_to_features(examples, label_list, max_seq_length, tokenizer,)
+                    self.features = convert_examples_to_features(
+                        examples,
+                        label_list,
+                        max_seq_length,
+                        tokenizer,
+                    )
                     logger.info("Saving features into cached file %s", cached_features_file)
                     torch.save(self.features, cached_features_file)
 
@@ -164,7 +174,12 @@ if is_tf_available():
                 examples = processor.get_train_examples(data_dir)
             logger.info("Training examples: %s", len(examples))
 
-            self.features = convert_examples_to_features(examples, label_list, max_seq_length, tokenizer,)
+            self.features = convert_examples_to_features(
+                examples,
+                label_list,
+                max_seq_length,
+                tokenizer,
+            )
 
             def gen():
                 for (ex_index, ex) in tqdm.tqdm(enumerate(self.features), desc="convert examples to features"):
@@ -491,7 +506,10 @@ class ArcProcessor(DataProcessor):
 
 
 def convert_examples_to_features(
-    examples: List[InputExample], label_list: List[str], max_length: int, tokenizer: PreTrainedTokenizer,
+    examples: List[InputExample],
+    label_list: List[str],
+    max_length: int,
+    tokenizer: PreTrainedTokenizer,
 ) -> List[InputFeatures]:
     """
     Loads a data file into a list of `InputFeatures`
diff --git a/examples/question-answering/run_squad_trainer.py b/examples/question-answering/run_squad_trainer.py
index 04c06acda1..d5fc072316 100644
--- a/examples/question-answering/run_squad_trainer.py
+++ b/examples/question-answering/run_squad_trainer.py
@@ -137,7 +137,12 @@ def main():
     )
 
     # Initialize our Trainer
-    trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset,)
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+    )
 
     # Training
     if training_args.do_train:
diff --git a/examples/question-answering/run_tf_squad.py b/examples/question-answering/run_tf_squad.py
index 7e90416bf5..1382e7f033 100644
--- a/examples/question-answering/run_tf_squad.py
+++ b/examples/question-answering/run_tf_squad.py
@@ -231,7 +231,12 @@ def main():
     eval_dataset = eval_dataset.apply(tf.data.experimental.assert_cardinality(len(eval_examples)))
 
     # Initialize our Trainer
-    trainer = TFTrainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset,)
+    trainer = TFTrainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+    )
 
     # Training
     if training_args.do_train:
diff --git a/examples/seq2seq/bertabs/configuration_bertabs.py b/examples/seq2seq/bertabs/configuration_bertabs.py
index ac79d20062..29dd46362f 100644
--- a/examples/seq2seq/bertabs/configuration_bertabs.py
+++ b/examples/seq2seq/bertabs/configuration_bertabs.py
@@ -28,7 +28,7 @@ BERTABS_FINETUNED_CONFIG_MAP = {
 
 
 class BertAbsConfig(PretrainedConfig):
-    r""" Class to store the configuration of the BertAbs model.
+    r"""Class to store the configuration of the BertAbs model.
 
     Arguments:
         vocab_size: int
diff --git a/examples/seq2seq/bertabs/convert_bertabs_original_pytorch_checkpoint.py b/examples/seq2seq/bertabs/convert_bertabs_original_pytorch_checkpoint.py
index d94f6f0176..ed2bb11f77 100644
--- a/examples/seq2seq/bertabs/convert_bertabs_original_pytorch_checkpoint.py
+++ b/examples/seq2seq/bertabs/convert_bertabs_original_pytorch_checkpoint.py
@@ -62,7 +62,7 @@ BertAbsConfig = namedtuple(
 
 
 def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
-    """ Copy/paste and tweak the pre-trained weights provided by the creators
+    """Copy/paste and tweak the pre-trained weights provided by the creators
     of BertAbs for the internal architecture.
     """
 
@@ -164,13 +164,22 @@ def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "--bertabs_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump.",
+        "--bertabs_checkpoint_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path the official PyTorch dump.",
     )
     parser.add_argument(
-        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model.",
+        "--pytorch_dump_folder_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to the output PyTorch model.",
     )
     args = parser.parse_args()
 
     convert_bertabs_checkpoints(
-        args.bertabs_checkpoint_path, args.pytorch_dump_folder_path,
+        args.bertabs_checkpoint_path,
+        args.pytorch_dump_folder_path,
     )
diff --git a/examples/seq2seq/bertabs/modeling_bertabs.py b/examples/seq2seq/bertabs/modeling_bertabs.py
index c1d04e8c71..103c0b4d5b 100644
--- a/examples/seq2seq/bertabs/modeling_bertabs.py
+++ b/examples/seq2seq/bertabs/modeling_bertabs.py
@@ -105,10 +105,17 @@ class BertAbs(BertAbsPreTrainedModel):
                 p.data.zero_()
 
     def forward(
-        self, encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask,
+        self,
+        encoder_input_ids,
+        decoder_input_ids,
+        token_type_ids,
+        encoder_attention_mask,
+        decoder_attention_mask,
     ):
         encoder_output = self.bert(
-            input_ids=encoder_input_ids, token_type_ids=token_type_ids, attention_mask=encoder_attention_mask,
+            input_ids=encoder_input_ids,
+            token_type_ids=token_type_ids,
+            attention_mask=encoder_attention_mask,
         )
         encoder_hidden_states = encoder_output[0]
         dec_state = self.decoder.init_decoder_state(encoder_input_ids, encoder_hidden_states)
@@ -117,8 +124,7 @@ class BertAbs(BertAbsPreTrainedModel):
 
 
 class Bert(nn.Module):
-    """ This class is not really necessary and should probably disappear.
-    """
+    """This class is not really necessary and should probably disappear."""
 
     def __init__(self):
         super().__init__()
@@ -307,7 +313,14 @@ class TransformerDecoderLayer(nn.Module):
         self.register_buffer("mask", mask)
 
     def forward(
-        self, inputs, memory_bank, src_pad_mask, tgt_pad_mask, previous_input=None, layer_cache=None, step=None,
+        self,
+        inputs,
+        memory_bank,
+        src_pad_mask,
+        tgt_pad_mask,
+        previous_input=None,
+        layer_cache=None,
+        step=None,
     ):
         """
         Args:
@@ -331,13 +344,25 @@ class TransformerDecoderLayer(nn.Module):
             all_input = torch.cat((previous_input, input_norm), dim=1)
             dec_mask = None
 
-        query = self.self_attn(all_input, all_input, input_norm, mask=dec_mask, layer_cache=layer_cache, type="self",)
+        query = self.self_attn(
+            all_input,
+            all_input,
+            input_norm,
+            mask=dec_mask,
+            layer_cache=layer_cache,
+            type="self",
+        )
 
         query = self.drop(query) + inputs
 
         query_norm = self.layer_norm_2(query)
         mid = self.context_attn(
-            memory_bank, memory_bank, query_norm, mask=src_pad_mask, layer_cache=layer_cache, type="context",
+            memory_bank,
+            memory_bank,
+            query_norm,
+            mask=src_pad_mask,
+            layer_cache=layer_cache,
+            type="context",
         )
         output = self.feed_forward(self.drop(mid) + query)
 
@@ -422,7 +447,14 @@ class MultiHeadedAttention(nn.Module):
             self.final_linear = nn.Linear(model_dim, model_dim)
 
     def forward(
-        self, key, value, query, mask=None, layer_cache=None, type=None, predefined_graph_1=None,
+        self,
+        key,
+        value,
+        query,
+        mask=None,
+        layer_cache=None,
+        type=None,
+        predefined_graph_1=None,
     ):
         """
         Compute the context vector and the attention vectors.
@@ -628,7 +660,7 @@ def gelu(x):
 
 
 class PositionwiseFeedForward(nn.Module):
-    """ A two-layer Feed-Forward-Network with residual layer norm.
+    """A two-layer Feed-Forward-Network with residual layer norm.
 
     Args:
         d_model (int): the size of input for the first-layer of the FFN.
@@ -770,8 +802,7 @@ class Translator(object):
         self.max_length = args.max_length
 
     def translate(self, batch, step, attn_debug=False):
-        """ Generates summaries from one batch of data.
-        """
+        """Generates summaries from one batch of data."""
         self.model.eval()
         with torch.no_grad():
             batch_data = self.translate_batch(batch)
@@ -798,8 +829,7 @@ class Translator(object):
     # Where the beam search lives
     # I have no idea why it is being called from the method above
     def _fast_translate_batch(self, batch, max_length, min_length=0):
-        """ Beam Search using the encoder inputs contained in `batch`.
-        """
+        """Beam Search using the encoder inputs contained in `batch`."""
 
         # The batch object is funny
         # Instead of just looking at the size of the arguments we encapsulate
@@ -981,7 +1011,7 @@ def tile(x, count, dim=0):
 
 
 class BertSumOptimizer(object):
-    """ Specific optimizer for BertSum.
+    """Specific optimizer for BertSum.
 
     As described in [1], the authors fine-tune BertSum for abstractive
     summarization using two Adam Optimizers with different warm-up steps and
@@ -999,10 +1029,16 @@ class BertSumOptimizer(object):
 
         self.optimizers = {
             "encoder": torch.optim.Adam(
-                model.encoder.parameters(), lr=lr["encoder"], betas=(beta_1, beta_2), eps=eps,
+                model.encoder.parameters(),
+                lr=lr["encoder"],
+                betas=(beta_1, beta_2),
+                eps=eps,
             ),
             "decoder": torch.optim.Adam(
-                model.decoder.parameters(), lr=lr["decoder"], betas=(beta_1, beta_2), eps=eps,
+                model.decoder.parameters(),
+                lr=lr["decoder"],
+                betas=(beta_1, beta_2),
+                eps=eps,
             ),
         }
 
diff --git a/examples/seq2seq/bertabs/run_summarization.py b/examples/seq2seq/bertabs/run_summarization.py
index 542bb8d098..33be67233f 100644
--- a/examples/seq2seq/bertabs/run_summarization.py
+++ b/examples/seq2seq/bertabs/run_summarization.py
@@ -99,7 +99,7 @@ def evaluate(args):
 
 
 def save_summaries(summaries, path, original_document_name):
-    """ Write the summaries in fies that are prefixed by the original
+    """Write the summaries in fies that are prefixed by the original
     files' name with the `_summary` appended.
 
     Attributes:
@@ -125,7 +125,7 @@ def save_summaries(summaries, path, original_document_name):
 
 
 def format_summary(translation):
-    """ Transforms the output of the `from_batch` function
+    """Transforms the output of the `from_batch` function
     into nicely formatted summaries.
     """
     raw_summary, _, _ = translation
@@ -190,7 +190,12 @@ def build_data_iterator(args, tokenizer):
     def collate_fn(data):
         return collate(data, tokenizer, block_size=512, device=args.device)
 
-    iterator = DataLoader(dataset, sampler=sampler, batch_size=args.batch_size, collate_fn=collate_fn,)
+    iterator = DataLoader(
+        dataset,
+        sampler=sampler,
+        batch_size=args.batch_size,
+        collate_fn=collate_fn,
+    )
 
     return iterator
 
@@ -201,7 +206,7 @@ def load_and_cache_examples(args, tokenizer):
 
 
 def collate(data, tokenizer, block_size, device):
-    """ Collate formats the data passed to the data loader.
+    """Collate formats the data passed to the data loader.
 
     In particular we tokenize the data batch after batch to avoid keeping them
     all in memory. We output the data as a namedtuple to fit the original BertAbs's
@@ -231,7 +236,7 @@ def collate(data, tokenizer, block_size, device):
 
 
 def decode_summary(summary_tokens, tokenizer):
-    """ Decode the summary and return it in a format
+    """Decode the summary and return it in a format
     suitable for evaluation.
     """
     summary_tokens = summary_tokens.to("cpu").numpy()
@@ -242,8 +247,7 @@ def decode_summary(summary_tokens, tokenizer):
 
 
 def main():
-    """ The main function defines the interface with the users.
-    """
+    """The main function defines the interface with the users."""
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--documents_dir",
@@ -268,23 +272,41 @@ def main():
     )
     # EVALUATION options
     parser.add_argument(
-        "--no_cuda", default=False, type=bool, help="Whether to force the execution on CPU.",
+        "--no_cuda",
+        default=False,
+        type=bool,
+        help="Whether to force the execution on CPU.",
     )
     parser.add_argument(
-        "--batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.",
+        "--batch_size",
+        default=4,
+        type=int,
+        help="Batch size per GPU/CPU for training.",
     )
     # BEAM SEARCH arguments
     parser.add_argument(
-        "--min_length", default=50, type=int, help="Minimum number of tokens for the summaries.",
+        "--min_length",
+        default=50,
+        type=int,
+        help="Minimum number of tokens for the summaries.",
     )
     parser.add_argument(
-        "--max_length", default=200, type=int, help="Maixmum number of tokens for the summaries.",
+        "--max_length",
+        default=200,
+        type=int,
+        help="Maixmum number of tokens for the summaries.",
     )
     parser.add_argument(
-        "--beam_size", default=5, type=int, help="The number of beams to start with for each example.",
+        "--beam_size",
+        default=5,
+        type=int,
+        help="The number of beams to start with for each example.",
     )
     parser.add_argument(
-        "--alpha", default=0.95, type=float, help="The value of alpha for the length penalty in the beam search.",
+        "--alpha",
+        default=0.95,
+        type=float,
+        help="The value of alpha for the length penalty in the beam search.",
     )
     parser.add_argument(
         "--block_trigram",
diff --git a/examples/seq2seq/bertabs/test_utils_summarization.py b/examples/seq2seq/bertabs/test_utils_summarization.py
index 1205543d17..5af0898246 100644
--- a/examples/seq2seq/bertabs/test_utils_summarization.py
+++ b/examples/seq2seq/bertabs/test_utils_summarization.py
@@ -43,8 +43,7 @@ class SummarizationDataProcessingTest(unittest.TestCase):
         self.assertEqual(truncate_or_pad(sequence, self.block_size, 0), expected_output)
 
     def test_process_story_no_highlights(self):
-        """ Processing a story with no highlights returns an empty list for the summary.
-        """
+        """Processing a story with no highlights returns an empty list for the summary."""
         raw_story = """It was the year of Our Lord one thousand seven hundred and
         seventy-five.\n\nSpiritual revelations were conceded to England at that
         favoured period, as at this."""
@@ -52,8 +51,7 @@ class SummarizationDataProcessingTest(unittest.TestCase):
         self.assertEqual(summary_lines, [])
 
     def test_process_empty_story(self):
-        """ An empty story returns an empty collection of lines.
-        """
+        """An empty story returns an empty collection of lines."""
         raw_story = ""
         story_lines, summary_lines = process_story(raw_story)
         self.assertEqual(story_lines, [])
diff --git a/examples/seq2seq/bertabs/utils_summarization.py b/examples/seq2seq/bertabs/utils_summarization.py
index a6193339e5..11ce599429 100644
--- a/examples/seq2seq/bertabs/utils_summarization.py
+++ b/examples/seq2seq/bertabs/utils_summarization.py
@@ -11,7 +11,7 @@ from torch.utils.data import Dataset
 
 
 class CNNDMDataset(Dataset):
-    """ Abstracts the dataset used to train seq2seq models.
+    """Abstracts the dataset used to train seq2seq models.
 
     The class will process the documents that are located in the specified
     folder. The preprocessing will work on any document that is reasonably
@@ -31,7 +31,7 @@ class CNNDMDataset(Dataset):
     """
 
     def __init__(self, path="", prefix="train"):
-        """ We initialize the class by listing all the documents to summarize.
+        """We initialize the class by listing all the documents to summarize.
         Files are not read in memory due to the size of some datasets (like CNN/DailyMail).
         """
         assert os.path.isdir(path)
@@ -60,7 +60,7 @@ class CNNDMDataset(Dataset):
 
 
 def process_story(raw_story):
-    """ Extract the story and summary from a story file.
+    """Extract the story and summary from a story file.
 
     Arguments:
         raw_story (str): content of the story file as an utf-8 encoded string.
@@ -108,7 +108,7 @@ def _add_missing_period(line):
 
 
 def truncate_or_pad(sequence, block_size, pad_token_id):
-    """ Adapt the source and target sequences' lengths to the block size.
+    """Adapt the source and target sequences' lengths to the block size.
     If the sequence is shorter we append padding token to the right of the sequence.
     """
     if len(sequence) > block_size:
@@ -119,8 +119,8 @@ def truncate_or_pad(sequence, block_size, pad_token_id):
 
 
 def build_mask(sequence, pad_token_id):
-    """ Builds the mask. The attention mechanism will only attend to positions
-    with value 1. """
+    """Builds the mask. The attention mechanism will only attend to positions
+    with value 1."""
     mask = torch.ones_like(sequence)
     idx_pad_tokens = sequence == pad_token_id
     mask[idx_pad_tokens] = 0
@@ -128,7 +128,7 @@ def build_mask(sequence, pad_token_id):
 
 
 def encode_for_summarization(story_lines, summary_lines, tokenizer):
-    """ Encode the story and summary lines, and join them
+    """Encode the story and summary lines, and join them
     as specified in [1] by using `[SEP] [CLS]` tokens to separate
     sentences.
     """
@@ -141,7 +141,7 @@ def encode_for_summarization(story_lines, summary_lines, tokenizer):
 
 
 def compute_token_type_ids(batch, separator_token_id):
-    """ Segment embeddings as described in [1]
+    """Segment embeddings as described in [1]
 
     The values {0,1} were found in the repository [2].
 
diff --git a/examples/seq2seq/callbacks.py b/examples/seq2seq/callbacks.py
index 68e06a5f48..39a9cbc9f1 100644
--- a/examples/seq2seq/callbacks.py
+++ b/examples/seq2seq/callbacks.py
@@ -97,4 +97,9 @@ def get_checkpoint_callback(output_dir, metric):
 
 
 def get_early_stopping_callback(metric, patience):
-    return EarlyStopping(monitor=f"val_{metric}", mode="max", patience=patience, verbose=True,)
+    return EarlyStopping(
+        monitor=f"val_{metric}",
+        mode="max",
+        patience=patience,
+        verbose=True,
+    )
diff --git a/examples/seq2seq/distillation.py b/examples/seq2seq/distillation.py
index 82194981a0..9cf6cbd818 100644
--- a/examples/seq2seq/distillation.py
+++ b/examples/seq2seq/distillation.py
@@ -348,7 +348,10 @@ class T5SummarizationDistiller(BartSummarizationDistiller):
         if self.different_encoder:
             with torch.no_grad():
                 teacher_enc_outputs, teacher_enc_hid = self.teacher.encoder(
-                    source_ids, attention_mask=source_mask, output_hidden_states=True, use_cache=False,
+                    source_ids,
+                    attention_mask=source_mask,
+                    output_hidden_states=True,
+                    use_cache=False,
                 )
             if self.hparams.alpha_encoder_loss > 0:
                 loss_encoder = self.calc_mse_loss(enc_outputs, teacher_enc_outputs, source_mask)
diff --git a/examples/seq2seq/test_seq2seq_examples.py b/examples/seq2seq/test_seq2seq_examples.py
index 4d02190c82..f3ef051590 100644
--- a/examples/seq2seq/test_seq2seq_examples.py
+++ b/examples/seq2seq/test_seq2seq_examples.py
@@ -117,7 +117,12 @@ class TestSummarizationDistiller(unittest.TestCase):
 
     @require_multigpu
     def test_multigpu(self):
-        updates = dict(no_teacher=True, freeze_encoder=True, gpus=2, sortish_sampler=False,)
+        updates = dict(
+            no_teacher=True,
+            freeze_encoder=True,
+            gpus=2,
+            sortish_sampler=False,
+        )
         self._test_distiller_cli(updates)
 
     def test_distill_no_teacher(self):
@@ -261,7 +266,8 @@ def test_run_eval_bart(model):
 
 
 @pytest.mark.parametrize(
-    ["model"], [pytest.param(T5_TINY), pytest.param(BART_TINY), pytest.param(MBART_TINY), pytest.param(MARIAN_TINY)],
+    ["model"],
+    [pytest.param(T5_TINY), pytest.param(BART_TINY), pytest.param(MBART_TINY), pytest.param(MARIAN_TINY)],
 )
 def test_finetune(model):
     args_d: dict = CHEAP_ARGS.copy()
@@ -329,7 +335,8 @@ def test_finetune_extra_model_args():
     output_dir = tempfile.mkdtemp(prefix="output_1_")
     args_d1 = args_d.copy()
     args_d1.update(
-        model_name_or_path=model, output_dir=output_dir,
+        model_name_or_path=model,
+        output_dir=output_dir,
     )
     extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout")
     for p in extra_model_params:
@@ -344,7 +351,8 @@ def test_finetune_extra_model_args():
     output_dir = tempfile.mkdtemp(prefix="output_2_")
     args_d2 = args_d.copy()
     args_d2.update(
-        model_name_or_path=model, output_dir=output_dir,
+        model_name_or_path=model,
+        output_dir=output_dir,
     )
     unsupported_param = "encoder_layerdrop"
     args_d2[unsupported_param] = 0.5
@@ -478,7 +486,11 @@ def test_summarization_dataset_truncation(tok):
     max_len_target = max(len(tokenizer.encode(a)) for a in SUMMARIES)
     trunc_target = 4
     train_dataset = Seq2SeqDataset(
-        tokenizer, data_dir=tmp_dir, type_path="train", max_source_length=20, max_target_length=trunc_target,
+        tokenizer,
+        data_dir=tmp_dir,
+        type_path="train",
+        max_source_length=20,
+        max_target_length=trunc_target,
     )
     dataloader = DataLoader(train_dataset, batch_size=2, collate_fn=train_dataset.collate_fn)
     for batch in dataloader:
diff --git a/examples/seq2seq/utils.py b/examples/seq2seq/utils.py
index 80a31f462a..48375c6854 100644
--- a/examples/seq2seq/utils.py
+++ b/examples/seq2seq/utils.py
@@ -63,7 +63,9 @@ def calculate_bleu(output_lns, refs_lns, **kwargs) -> dict:
 
 
 def trim_batch(
-    input_ids, pad_token_id, attention_mask=None,
+    input_ids,
+    pad_token_id,
+    attention_mask=None,
 ):
     """Remove columns that are populated exclusively by pad_token_id"""
     keep_column_mask = input_ids.ne(pad_token_id).any(dim=0)
diff --git a/examples/text-classification/run_pl_glue.py b/examples/text-classification/run_pl_glue.py
index a4d229e216..80315abc56 100644
--- a/examples/text-classification/run_pl_glue.py
+++ b/examples/text-classification/run_pl_glue.py
@@ -153,7 +153,11 @@ class GLUETransformer(BaseTransformer):
         )
 
         parser.add_argument(
-            "--task", default="", type=str, required=True, help="The GLUE task to run",
+            "--task",
+            default="",
+            type=str,
+            required=True,
+            help="The GLUE task to run",
         )
         parser.add_argument(
             "--gpus",
@@ -177,7 +181,10 @@ def main():
 
     # If output_dir not provided, a folder will be generated in pwd
     if args.output_dir is None:
-        args.output_dir = os.path.join("./results", f"{args.task}_{time.strftime('%Y%m%d_%H%M%S')}",)
+        args.output_dir = os.path.join(
+            "./results",
+            f"{args.task}_{time.strftime('%Y%m%d_%H%M%S')}",
+        )
         os.makedirs(args.output_dir)
 
     model = GLUETransformer(args)
diff --git a/examples/text-classification/run_xnli.py b/examples/text-classification/run_xnli.py
index c1db6fa37c..1e4a587f5d 100644
--- a/examples/text-classification/run_xnli.py
+++ b/examples/text-classification/run_xnli.py
@@ -328,7 +328,11 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
             processor.get_test_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
         )
         features = convert_examples_to_features(
-            examples, tokenizer, max_length=args.max_seq_length, label_list=label_list, output_mode=output_mode,
+            examples,
+            tokenizer,
+            max_length=args.max_seq_length,
+            label_list=label_list,
+            output_mode=output_mode,
         )
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
diff --git a/examples/text-generation/pplm/run_pplm.py b/examples/text-generation/pplm/run_pplm.py
index bfbb79e438..55a2a94627 100644
--- a/examples/text-generation/pplm/run_pplm.py
+++ b/examples/text-generation/pplm/run_pplm.py
@@ -698,7 +698,9 @@ def run_pplm_example(
                 for word_id in pert_gen_tok_text.tolist()[0]:
                     if word_id in bow_word_ids:
                         pert_gen_text += "{}{}{}".format(
-                            colorama.Fore.RED, tokenizer.decode([word_id]), colorama.Style.RESET_ALL,
+                            colorama.Fore.RED,
+                            tokenizer.decode([word_id]),
+                            colorama.Style.RESET_ALL,
                         )
                     else:
                         pert_gen_text += tokenizer.decode([word_id])
@@ -729,7 +731,10 @@ if __name__ == "__main__":
     parser.add_argument("--cond_text", type=str, default="The lake", help="Prefix texts to condition on")
     parser.add_argument("--uncond", action="store_true", help="Generate from end-of-text as prefix")
     parser.add_argument(
-        "--num_samples", type=int, default=1, help="Number of samples to generate from the modified latents",
+        "--num_samples",
+        type=int,
+        default=1,
+        help="Number of samples to generate from the modified latents",
     )
     parser.add_argument(
         "--bag_of_words",
@@ -751,13 +756,22 @@ if __name__ == "__main__":
         help="Discriminator to use",
     )
     parser.add_argument(
-        "--discrim_weights", type=str, default=None, help="Weights for the generic discriminator",
+        "--discrim_weights",
+        type=str,
+        default=None,
+        help="Weights for the generic discriminator",
     )
     parser.add_argument(
-        "--discrim_meta", type=str, default=None, help="Meta information for the generic discriminator",
+        "--discrim_meta",
+        type=str,
+        default=None,
+        help="Meta information for the generic discriminator",
     )
     parser.add_argument(
-        "--class_label", type=int, default=-1, help="Class label used for the discriminator",
+        "--class_label",
+        type=int,
+        default=-1,
+        help="Class label used for the discriminator",
     )
     parser.add_argument("--length", type=int, default=100)
     parser.add_argument("--stepsize", type=float, default=0.02)
@@ -773,7 +787,10 @@ if __name__ == "__main__":
         help="Length of past which is being optimized; 0 corresponds to infinite window length",
     )
     parser.add_argument(
-        "--horizon_length", type=int, default=1, help="Length of future to optimize over",
+        "--horizon_length",
+        type=int,
+        default=1,
+        help="Length of future to optimize over",
     )
     parser.add_argument("--decay", action="store_true", help="whether to decay or not")
     parser.add_argument("--gamma", type=float, default=1.5)
@@ -783,7 +800,10 @@ if __name__ == "__main__":
     parser.add_argument("--no_cuda", action="store_true", help="no cuda")
     parser.add_argument("--colorama", action="store_true", help="colors keywords")
     parser.add_argument(
-        "--repetition_penalty", type=float, default=1.0, help="Penalize repetition. More than 1.0 -> less repetition",
+        "--repetition_penalty",
+        type=float,
+        default=1.0,
+        help="Penalize repetition. More than 1.0 -> less repetition",
     )
 
     args = parser.parse_args()
diff --git a/examples/text-generation/pplm/run_pplm_discrim_train.py b/examples/text-generation/pplm/run_pplm_discrim_train.py
index ce6f583dc6..306e519b52 100644
--- a/examples/text-generation/pplm/run_pplm_discrim_train.py
+++ b/examples/text-generation/pplm/run_pplm_discrim_train.py
@@ -242,7 +242,12 @@ def train_discriminator(
 
         text = torchtext_data.Field()
         label = torchtext_data.Field(sequential=False)
-        train_data, val_data, test_data = datasets.SST.splits(text, label, fine_grained=True, train_subtrees=True,)
+        train_data, val_data, test_data = datasets.SST.splits(
+            text,
+            label,
+            fine_grained=True,
+            train_subtrees=True,
+        )
 
         x = []
         y = []
diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py
index 12acd0cb0d..1b4b6f1e5f 100644
--- a/examples/text-generation/run_generation.py
+++ b/examples/text-generation/run_generation.py
@@ -41,7 +41,9 @@ from transformers import (
 
 
 logging.basicConfig(
-    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+    datefmt="%m/%d/%Y %H:%M:%S",
+    level=logging.INFO,
 )
 logger = logging.getLogger(__name__)
 
@@ -197,7 +199,10 @@ def main():
     args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
 
     logger.warning(
-        "device: %s, n_gpu: %s, 16-bits training: %s", args.device, args.n_gpu, args.fp16,
+        "device: %s, n_gpu: %s, 16-bits training: %s",
+        args.device,
+        args.n_gpu,
+        args.fp16,
     )
 
     set_seed(args)
diff --git a/examples/token-classification/utils_ner.py b/examples/token-classification/utils_ner.py
index 6a708055fc..616ce20550 100644
--- a/examples/token-classification/utils_ner.py
+++ b/examples/token-classification/utils_ner.py
@@ -90,11 +90,11 @@ class TokenClassificationTask:
         sequence_a_segment_id=0,
         mask_padding_with_zero=True,
     ) -> List[InputFeatures]:
-        """ Loads a data file into a list of `InputFeatures`
-            `cls_token_at_end` define the location of the CLS token:
-                - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
-                - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
-            `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
+        """Loads a data file into a list of `InputFeatures`
+        `cls_token_at_end` define the location of the CLS token:
+            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
+            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
+        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
         """
         # TODO clean up all this to leverage built-in features of tokenizers
 
@@ -230,7 +230,8 @@ if is_torch_available():
         ):
             # Load data features from cache or dataset file
             cached_features_file = os.path.join(
-                data_dir, "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
+                data_dir,
+                "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
             )
 
             # Make sure only the first process in distributed training processes the dataset,
diff --git a/src/transformers/activations.py b/src/transformers/activations.py
index a50f7defe5..c2618180d3 100644
--- a/src/transformers/activations.py
+++ b/src/transformers/activations.py
@@ -14,18 +14,18 @@ def swish(x):
 
 
 def _gelu_python(x):
-    """ Original Implementation of the gelu activation function in Google Bert repo when initially created.
-        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
-        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
-        This is now written in C in torch.nn.functional
-        Also see https://arxiv.org/abs/1606.08415
+    """Original Implementation of the gelu activation function in Google Bert repo when initially created.
+    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
+    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+    This is now written in C in torch.nn.functional
+    Also see https://arxiv.org/abs/1606.08415
     """
     return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
 
 
 def gelu_new(x):
-    """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
-        Also see https://arxiv.org/abs/1606.08415
+    """Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
+    Also see https://arxiv.org/abs/1606.08415
     """
     return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
 
diff --git a/src/transformers/benchmark/benchmark.py b/src/transformers/benchmark/benchmark.py
index 92b39f436e..d36f589ed1 100644
--- a/src/transformers/benchmark/benchmark.py
+++ b/src/transformers/benchmark/benchmark.py
@@ -199,11 +199,17 @@ class PyTorchBenchmark(Benchmark):
                 # run additional 10 times to stabilize compilation for tpu and torchscript
                 logger.info("Do inference on TPU or torchscript. Running model 5 times to stabilize compilation")
                 timeit.repeat(
-                    func, repeat=1, number=5,
+                    func,
+                    repeat=1,
+                    number=5,
                 )
 
             # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
-            runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,)
+            runtimes = timeit.repeat(
+                func,
+                repeat=self.args.repeat,
+                number=10,
+            )
 
             if self.args.is_tpu and self.args.torch_xla_tpu_print_metrics:
                 import torch_xla.debug.metrics as met
diff --git a/src/transformers/benchmark/benchmark_args_tf.py b/src/transformers/benchmark/benchmark_args_tf.py
index 0adbe6e4c6..2b005df68b 100644
--- a/src/transformers/benchmark/benchmark_args_tf.py
+++ b/src/transformers/benchmark/benchmark_args_tf.py
@@ -32,10 +32,12 @@ logger = logging.get_logger(__name__)
 @dataclass
 class TensorFlowBenchmarkArguments(BenchmarkArguments):
     tpu_name: str = field(
-        default=None, metadata={"help": "Name of TPU"},
+        default=None,
+        metadata={"help": "Name of TPU"},
     )
     device_idx: int = field(
-        default=0, metadata={"help": "CPU / GPU device index. Defaults to 0."},
+        default=0,
+        metadata={"help": "CPU / GPU device index. Defaults to 0."},
     )
     eager_mode: bool = field(default=False, metadata={"help": "Benchmark models in eager model."})
     use_xla: bool = field(
diff --git a/src/transformers/benchmark/benchmark_tf.py b/src/transformers/benchmark/benchmark_tf.py
index be74df675f..93e0e35a83 100644
--- a/src/transformers/benchmark/benchmark_tf.py
+++ b/src/transformers/benchmark/benchmark_tf.py
@@ -219,7 +219,11 @@ class TensorFlowBenchmark(Benchmark):
                     timeit.repeat(func, repeat=1, number=5)
 
                 # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
-                runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,)
+                runtimes = timeit.repeat(
+                    func,
+                    repeat=self.args.repeat,
+                    number=10,
+                )
 
                 return min(runtimes) / 10.0
             except ResourceExhaustedError as e:
diff --git a/src/transformers/benchmark/benchmark_utils.py b/src/transformers/benchmark/benchmark_utils.py
index d7194fd62b..46ab489e86 100644
--- a/src/transformers/benchmark/benchmark_utils.py
+++ b/src/transformers/benchmark/benchmark_utils.py
@@ -63,15 +63,15 @@ BenchmarkOutput = namedtuple(
 
 def separate_process_wrapper_fn(func: Callable[[], None], do_multi_processing: bool) -> Callable[[], None]:
     """
-        This function wraps another function into its own separated process.
-        In order to ensure accurate memory measurements it is important that the function
-        is executed in a separate process
+    This function wraps another function into its own separated process.
+    In order to ensure accurate memory measurements it is important that the function
+    is executed in a separate process
 
-        Args:
-            - `func`: (`callable`): function() -> ...
-                generic function which will be executed in its own separate process
-            - `do_multi_processing`: (`bool`)
-                Whether to run function on separate process or not
+    Args:
+        - `func`: (`callable`): function() -> ...
+            generic function which will be executed in its own separate process
+        - `do_multi_processing`: (`bool`)
+            Whether to run function on separate process or not
     """
 
     def multi_process_func(*args, **kwargs):
@@ -106,13 +106,13 @@ def is_memory_tracing_enabled():
 
 
 class Frame(NamedTuple):
-    """ `Frame` is a NamedTuple used to gather the current frame state.
-            `Frame` has the following fields:
-            - 'filename' (string): Name of the file currently executed
-            - 'module' (string): Name of the module currently executed
-            - 'line_number' (int): Number of the line currently executed
-            - 'event' (string): Event that triggered the tracing (default will be "line")
-            - 'line_text' (string): Text of the line in the python script
+    """`Frame` is a NamedTuple used to gather the current frame state.
+    `Frame` has the following fields:
+    - 'filename' (string): Name of the file currently executed
+    - 'module' (string): Name of the module currently executed
+    - 'line_number' (int): Number of the line currently executed
+    - 'event' (string): Event that triggered the tracing (default will be "line")
+    - 'line_text' (string): Text of the line in the python script
     """
 
     filename: str
@@ -123,10 +123,10 @@ class Frame(NamedTuple):
 
 
 class UsedMemoryState(NamedTuple):
-    """ `UsedMemoryState` are named tuples with the following fields:
-        - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
-        - 'cpu_memory': CPU RSS memory state *before* executing the line
-        - 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
+    """`UsedMemoryState` are named tuples with the following fields:
+    - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
+    - 'cpu_memory': CPU RSS memory state *before* executing the line
+    - 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
     """
 
     frame: Frame
@@ -135,9 +135,9 @@ class UsedMemoryState(NamedTuple):
 
 
 class Memory(NamedTuple):
-    """ `Memory` NamedTuple have a single field `bytes` and
-        you can get a human readable str of the number of mega bytes by calling `__repr__`
-            - `byte` (integer): number of bytes,
+    """`Memory` NamedTuple have a single field `bytes` and
+    you can get a human readable str of the number of mega bytes by calling `__repr__`
+        - `byte` (integer): number of bytes,
     """
 
     bytes: int
@@ -147,11 +147,11 @@ class Memory(NamedTuple):
 
 
 class MemoryState(NamedTuple):
-    """ `MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
-        - `frame` (`Frame`): the current frame (see above)
-        - `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple
-        - `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple
-        - `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple
+    """`MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
+    - `frame` (`Frame`): the current frame (see above)
+    - `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple
+    - `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple
+    - `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple
     """
 
     frame: Frame
@@ -161,14 +161,14 @@ class MemoryState(NamedTuple):
 
 
 class MemorySummary(NamedTuple):
-    """ `MemorySummary` namedtuple otherwise with the fields:
-        - `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
-            by substracting the memory after executing each line from the memory before executing said line.
-        - `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
-            obtained by summing repeated memory increase for a line if it's executed several times.
-            The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
-        - `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
-            Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
+    """`MemorySummary` namedtuple otherwise with the fields:
+    - `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
+        by substracting the memory after executing each line from the memory before executing said line.
+    - `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
+        obtained by summing repeated memory increase for a line if it's executed several times.
+        The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
+    - `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
+        Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
     """
 
     sequential: List[MemoryState]
@@ -182,38 +182,38 @@ MemoryTrace = List[UsedMemoryState]
 
 def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_idx=None) -> int:
     """
-        measures peak cpu memory consumption of a given `function`
-        running the function for at least interval seconds
-        and at most 20 * interval seconds.
-        This function is heavily inspired by: `memory_usage`
-        of the package `memory_profiler`: https://github.com/pythonprofilers/memory_profiler/blob/895c4ac7a08020d66ae001e24067da6dcea42451/memory_profiler.py#L239
+    measures peak cpu memory consumption of a given `function`
+    running the function for at least interval seconds
+    and at most 20 * interval seconds.
+    This function is heavily inspired by: `memory_usage`
+    of the package `memory_profiler`: https://github.com/pythonprofilers/memory_profiler/blob/895c4ac7a08020d66ae001e24067da6dcea42451/memory_profiler.py#L239
 
-        Args:
-            - `function`: (`callable`): function() -> ...
-                function without any arguments to measure for which to measure the peak memory
+    Args:
+        - `function`: (`callable`): function() -> ...
+            function without any arguments to measure for which to measure the peak memory
 
-            - `interval`: (`float`, `optional`, defaults to `0.5`)
-                interval in second for which to measure the memory usage
+        - `interval`: (`float`, `optional`, defaults to `0.5`)
+            interval in second for which to measure the memory usage
 
-            - `device_idx`: (`int`, `optional`, defaults to `None`)
-                device id for which to measure gpu usage
+        - `device_idx`: (`int`, `optional`, defaults to `None`)
+            device id for which to measure gpu usage
 
-        Returns:
-            - `max_memory`: (`int`)
-                cosumed memory peak in Bytes
+    Returns:
+        - `max_memory`: (`int`)
+            cosumed memory peak in Bytes
     """
 
     def get_cpu_memory(process_id: int) -> int:
         """
-            measures current cpu memory usage of a given `process_id`
+        measures current cpu memory usage of a given `process_id`
 
-            Args:
-                - `process_id`: (`int`)
-                    process_id for which to measure memory
+        Args:
+            - `process_id`: (`int`)
+                process_id for which to measure memory
 
-            Returns
-                - `memory`: (`int`)
-                    cosumed memory in Bytes
+        Returns
+            - `memory`: (`int`)
+                cosumed memory in Bytes
         """
         process = psutil.Process(process_id)
         try:
@@ -234,8 +234,8 @@ def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_i
         class MemoryMeasureProcess(Process):
 
             """
-                `MemoryMeasureProcess` inherits from `Process` and overwrites
-                its `run()` method. Used to measure the memory usage of a process
+            `MemoryMeasureProcess` inherits from `Process` and overwrites
+            its `run()` method. Used to measure the memory usage of a process
             """
 
             def __init__(self, process_id: int, child_connection: Connection, interval: float):
@@ -309,37 +309,37 @@ def start_memory_tracing(
     events_to_trace: str = "line",
     gpus_to_trace: Optional[List[int]] = None,
 ) -> MemoryTrace:
-    """ Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
-        See `./benchmark.py` for usage examples.
-        Current memory consumption is returned using psutil and in particular is the RSS memory
-            "Resident Set Size” (the non-swapped physical memory the process is using).
-            See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info
+    """Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
+    See `./benchmark.py` for usage examples.
+    Current memory consumption is returned using psutil and in particular is the RSS memory
+        "Resident Set Size” (the non-swapped physical memory the process is using).
+        See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info
 
-        Args:
-            - `modules_to_trace`: (None, string, list/tuple of string)
-                if None, all events are recorded
-                if string or list of strings: only events from the listed module/sub-module will be recorded (e.g. 'fairseq' or 'transformers.modeling_gpt2')
-            - `modules_not_to_trace`: (None, string, list/tuple of string)
-                if None, no module is avoided
-                if string or list of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch')
-            - `events_to_trace`: string or list of string of events to be recorded (see official python doc for `sys.settrace` for the list of events)
-                default to line
-            - `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Default to tracing all GPUs
+    Args:
+        - `modules_to_trace`: (None, string, list/tuple of string)
+            if None, all events are recorded
+            if string or list of strings: only events from the listed module/sub-module will be recorded (e.g. 'fairseq' or 'transformers.modeling_gpt2')
+        - `modules_not_to_trace`: (None, string, list/tuple of string)
+            if None, no module is avoided
+            if string or list of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch')
+        - `events_to_trace`: string or list of string of events to be recorded (see official python doc for `sys.settrace` for the list of events)
+            default to line
+        - `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Default to tracing all GPUs
 
-        Return:
-            - `memory_trace` is a list of `UsedMemoryState` for each event (default each line of the traced script).
-                - `UsedMemoryState` are named tuples with the following fields:
-                    - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
-                    - 'cpu_memory': CPU RSS memory state *before* executing the line
-                    - 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
+    Return:
+        - `memory_trace` is a list of `UsedMemoryState` for each event (default each line of the traced script).
+            - `UsedMemoryState` are named tuples with the following fields:
+                - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
+                - 'cpu_memory': CPU RSS memory state *before* executing the line
+                - 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
 
-        `Frame` is a namedtuple used by `UsedMemoryState` to list the current frame state.
-            `Frame` has the following fields:
-            - 'filename' (string): Name of the file currently executed
-            - 'module' (string): Name of the module currently executed
-            - 'line_number' (int): Number of the line currently executed
-            - 'event' (string): Event that triggered the tracing (default will be "line")
-            - 'line_text' (string): Text of the line in the python script
+    `Frame` is a namedtuple used by `UsedMemoryState` to list the current frame state.
+        `Frame` has the following fields:
+        - 'filename' (string): Name of the file currently executed
+        - 'module' (string): Name of the module currently executed
+        - 'line_number' (int): Number of the line currently executed
+        - 'event' (string): Event that triggered the tracing (default will be "line")
+        - 'line_text' (string): Text of the line in the python script
 
     """
     if is_psutil_available():
@@ -371,8 +371,8 @@ def start_memory_tracing(
     memory_trace = []
 
     def traceit(frame, event, args):
-        """ Tracing method executed before running each line in a module or sub-module
-            Record memory allocated in a list with debugging information
+        """Tracing method executed before running each line in a module or sub-module
+        Record memory allocated in a list with debugging information
         """
         global _is_memory_tracing_enabled
 
@@ -456,39 +456,39 @@ def start_memory_tracing(
 def stop_memory_tracing(
     memory_trace: Optional[MemoryTrace] = None, ignore_released_memory: bool = True
 ) -> Optional[MemorySummary]:
-    """ Stop memory tracing cleanly and return a summary of the memory trace if a trace is given.
+    """Stop memory tracing cleanly and return a summary of the memory trace if a trace is given.
 
-        Args:
-            - `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary
-            - `ignore_released_memory` (boolean, default: None): if True we only sum memory increase to compute total memory
+    Args:
+        - `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary
+        - `ignore_released_memory` (boolean, default: None): if True we only sum memory increase to compute total memory
 
-        Return:
-            - None if `memory_trace` is None
-            - `MemorySummary` namedtuple otherwise with the fields:
-                - `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
-                    by substracting the memory after executing each line from the memory before executing said line.
-                - `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
-                    obtained by summing repeated memory increase for a line if it's executed several times.
-                    The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
-                - `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
-                    Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
+    Return:
+        - None if `memory_trace` is None
+        - `MemorySummary` namedtuple otherwise with the fields:
+            - `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
+                by substracting the memory after executing each line from the memory before executing said line.
+            - `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
+                obtained by summing repeated memory increase for a line if it's executed several times.
+                The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
+            - `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
+                Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
 
-        `Memory` named tuple have fields
-            - `byte` (integer): number of bytes,
-            - `string` (string): same as human readable string (ex: "3.5MB")
+    `Memory` named tuple have fields
+        - `byte` (integer): number of bytes,
+        - `string` (string): same as human readable string (ex: "3.5MB")
 
-        `Frame` are namedtuple used to list the current frame state and have the following fields:
-            - 'filename' (string): Name of the file currently executed
-            - 'module' (string): Name of the module currently executed
-            - 'line_number' (int): Number of the line currently executed
-            - 'event' (string): Event that triggered the tracing (default will be "line")
-            - 'line_text' (string): Text of the line in the python script
+    `Frame` are namedtuple used to list the current frame state and have the following fields:
+        - 'filename' (string): Name of the file currently executed
+        - 'module' (string): Name of the module currently executed
+        - 'line_number' (int): Number of the line currently executed
+        - 'event' (string): Event that triggered the tracing (default will be "line")
+        - 'line_text' (string): Text of the line in the python script
 
-        `MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
-            - `frame` (`Frame`): the current frame (see above)
-            - `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple
-            - `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple
-            - `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple
+    `MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
+        - `frame` (`Frame`): the current frame (see above)
+        - `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple
+        - `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple
+        - `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple
     """
     global _is_memory_tracing_enabled
     _is_memory_tracing_enabled = False
@@ -499,15 +499,19 @@ def stop_memory_tracing(
 
         cumulative_memory_dict = defaultdict(lambda: [0, 0, 0])
 
-        for ((frame, cpu_mem, gpu_mem), (next_frame, next_cpu_mem, next_gpu_mem),) in zip(
-            memory_trace[:-1], memory_trace[1:]
-        ):
+        for (
+            (frame, cpu_mem, gpu_mem),
+            (next_frame, next_cpu_mem, next_gpu_mem),
+        ) in zip(memory_trace[:-1], memory_trace[1:]):
             cpu_mem_inc = next_cpu_mem - cpu_mem
             gpu_mem_inc = next_gpu_mem - gpu_mem
             cpu_gpu_mem_inc = cpu_mem_inc + gpu_mem_inc
             memory_diff_trace.append(
                 MemoryState(
-                    frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc),
+                    frame=frame,
+                    cpu=Memory(cpu_mem_inc),
+                    gpu=Memory(gpu_mem_inc),
+                    cpu_gpu=Memory(cpu_gpu_mem_inc),
                 )
             )
 
@@ -529,7 +533,10 @@ def stop_memory_tracing(
         )  # order by the total CPU + GPU memory increase
         cumulative_memory = list(
             MemoryState(
-                frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc),
+                frame=frame,
+                cpu=Memory(cpu_mem_inc),
+                gpu=Memory(gpu_mem_inc),
+                cpu_gpu=Memory(cpu_gpu_mem_inc),
             )
             for frame, (cpu_mem_inc, gpu_mem_inc, cpu_gpu_mem_inc) in cumulative_memory
         )
@@ -544,15 +551,17 @@ def stop_memory_tracing(
         total_memory = Memory(total_memory)
 
         return MemorySummary(
-            sequential=memory_diff_trace, cumulative=cumulative_memory, current=memory_curr_trace, total=total_memory,
+            sequential=memory_diff_trace,
+            cumulative=cumulative_memory,
+            current=memory_curr_trace,
+            total=total_memory,
         )
 
     return None
 
 
 def bytes_to_mega_bytes(memory_amount: int) -> int:
-    """ Utility to convert a number of bytes (int) into a number of mega bytes (int)
-    """
+    """Utility to convert a number of bytes (int) into a number of mega bytes (int)"""
     return memory_amount >> 20
 
 
diff --git a/src/transformers/configuration_albert.py b/src/transformers/configuration_albert.py
index 5dc43019f8..789d690b17 100644
--- a/src/transformers/configuration_albert.py
+++ b/src/transformers/configuration_albert.py
@@ -32,71 +32,71 @@ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class AlbertConfig(PretrainedConfig):
     r"""
-        This is the configuration class to store the configuration of a :class:`~transformers.AlbertModel`.
-        It is used to instantiate an ALBERT model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the ALBERT `xxlarge <https://huggingface.co/albert-xxlarge-v2>`__ architecture.
+    This is the configuration class to store the configuration of a :class:`~transformers.AlbertModel`.
+    It is used to instantiate an ALBERT model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the ALBERT `xxlarge <https://huggingface.co/albert-xxlarge-v2>`__ architecture.
 
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
 
 
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 30000):
-                Vocabulary size of the ALBERT model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
-            embedding_size (:obj:`int`, optional, defaults to 128):
-                Dimensionality of vocabulary embeddings.
-            hidden_size (:obj:`int`, optional, defaults to 4096):
-                Dimensionality of the encoder layers and the pooler layer.
-            num_hidden_layers (:obj:`int`, optional, defaults to 12):
-                Number of hidden layers in the Transformer encoder.
-            num_hidden_groups (:obj:`int`, optional, defaults to 1):
-                Number of groups for the hidden layers, parameters in the same group are shared.
-            num_attention_heads (:obj:`int`, optional, defaults to 64):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            intermediate_size (:obj:`int`, optional, defaults to 16384):
-                The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-            inner_group_num (:obj:`int`, optional, defaults to 1):
-                The number of inner repetition of attention and ffn.
-            hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
-                The non-linear activation function (function or string) in the encoder and pooler.
-                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-            hidden_dropout_prob (:obj:`float`, optional, defaults to 0):
-                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-            attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0):
-                The dropout ratio for the attention probabilities.
-            max_position_embeddings (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might ever be used with. Typically set this to something
-                large (e.g., 512 or 1024 or 2048).
-            type_vocab_size (:obj:`int`, optional, defaults to 2):
-                The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`.
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
-                The epsilon used by the layer normalization layers.
-            classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for attached classifiers.
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 30000):
+            Vocabulary size of the ALBERT model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
+        embedding_size (:obj:`int`, optional, defaults to 128):
+            Dimensionality of vocabulary embeddings.
+        hidden_size (:obj:`int`, optional, defaults to 4096):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (:obj:`int`, optional, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_hidden_groups (:obj:`int`, optional, defaults to 1):
+            Number of groups for the hidden layers, parameters in the same group are shared.
+        num_attention_heads (:obj:`int`, optional, defaults to 64):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (:obj:`int`, optional, defaults to 16384):
+            The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        inner_group_num (:obj:`int`, optional, defaults to 1):
+            The number of inner repetition of attention and ffn.
+        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
+            The non-linear activation function (function or string) in the encoder and pooler.
+            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+        hidden_dropout_prob (:obj:`float`, optional, defaults to 0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (:obj:`int`, optional, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something
+            large (e.g., 512 or 1024 or 2048).
+        type_vocab_size (:obj:`int`, optional, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`.
+        initializer_range (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for attached classifiers.
 
-        Example::
+    Example::
 
-            >>> from transformers import AlbertConfig, AlbertModel
-            >>> # Initializing an ALBERT-xxlarge style configuration
-            >>> albert_xxlarge_configuration = AlbertConfig()
+        >>> from transformers import AlbertConfig, AlbertModel
+        >>> # Initializing an ALBERT-xxlarge style configuration
+        >>> albert_xxlarge_configuration = AlbertConfig()
 
-            >>> # Initializing an ALBERT-base style configuration
-            >>> albert_base_configuration = AlbertConfig(
-            ...      hidden_size=768,
-            ...      num_attention_heads=12,
-            ...      intermediate_size=3072,
-            ...  )
+        >>> # Initializing an ALBERT-base style configuration
+        >>> albert_base_configuration = AlbertConfig(
+        ...      hidden_size=768,
+        ...      num_attention_heads=12,
+        ...      intermediate_size=3072,
+        ...  )
 
-            >>> # Initializing a model from the ALBERT-base style configuration
-            >>> model = AlbertModel(albert_xxlarge_configuration)
+        >>> # Initializing a model from the ALBERT-base style configuration
+        >>> model = AlbertModel(albert_xxlarge_configuration)
 
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
     """
 
     model_type = "albert"
diff --git a/src/transformers/configuration_auto.py b/src/transformers/configuration_auto.py
index 96916488e0..75e3c36bd5 100644
--- a/src/transformers/configuration_auto.py
+++ b/src/transformers/configuration_auto.py
@@ -73,43 +73,112 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
 
 CONFIG_MAPPING = OrderedDict(
     [
-        ("retribert", RetriBertConfig,),
-        ("t5", T5Config,),
-        ("mobilebert", MobileBertConfig,),
-        ("distilbert", DistilBertConfig,),
-        ("albert", AlbertConfig,),
-        ("camembert", CamembertConfig,),
-        ("xlm-roberta", XLMRobertaConfig,),
+        (
+            "retribert",
+            RetriBertConfig,
+        ),
+        (
+            "t5",
+            T5Config,
+        ),
+        (
+            "mobilebert",
+            MobileBertConfig,
+        ),
+        (
+            "distilbert",
+            DistilBertConfig,
+        ),
+        (
+            "albert",
+            AlbertConfig,
+        ),
+        (
+            "camembert",
+            CamembertConfig,
+        ),
+        (
+            "xlm-roberta",
+            XLMRobertaConfig,
+        ),
         ("pegasus", PegasusConfig),
-        ("marian", MarianConfig,),
-        ("mbart", MBartConfig,),
-        ("bart", BartConfig,),
-        ("reformer", ReformerConfig,),
-        ("longformer", LongformerConfig,),
-        ("roberta", RobertaConfig,),
-        ("flaubert", FlaubertConfig,),
-        ("bert", BertConfig,),
-        ("openai-gpt", OpenAIGPTConfig,),
-        ("gpt2", GPT2Config,),
-        ("transfo-xl", TransfoXLConfig,),
-        ("xlnet", XLNetConfig,),
-        ("xlm", XLMConfig,),
-        ("ctrl", CTRLConfig,),
-        ("electra", ElectraConfig,),
-        ("encoder-decoder", EncoderDecoderConfig,),
+        (
+            "marian",
+            MarianConfig,
+        ),
+        (
+            "mbart",
+            MBartConfig,
+        ),
+        (
+            "bart",
+            BartConfig,
+        ),
+        (
+            "reformer",
+            ReformerConfig,
+        ),
+        (
+            "longformer",
+            LongformerConfig,
+        ),
+        (
+            "roberta",
+            RobertaConfig,
+        ),
+        (
+            "flaubert",
+            FlaubertConfig,
+        ),
+        (
+            "bert",
+            BertConfig,
+        ),
+        (
+            "openai-gpt",
+            OpenAIGPTConfig,
+        ),
+        (
+            "gpt2",
+            GPT2Config,
+        ),
+        (
+            "transfo-xl",
+            TransfoXLConfig,
+        ),
+        (
+            "xlnet",
+            XLNetConfig,
+        ),
+        (
+            "xlm",
+            XLMConfig,
+        ),
+        (
+            "ctrl",
+            CTRLConfig,
+        ),
+        (
+            "electra",
+            ElectraConfig,
+        ),
+        (
+            "encoder-decoder",
+            EncoderDecoderConfig,
+        ),
     ]
 )
 
 
 class AutoConfig:
     r"""
-        :class:`~transformers.AutoConfig` is a generic configuration class
-        that will be instantiated as one of the configuration classes of the library
-        when created with the :func:`~transformers.AutoConfig.from_pretrained` class method.
+    :class:`~transformers.AutoConfig` is a generic configuration class
+    that will be instantiated as one of the configuration classes of the library
+    when created with the :func:`~transformers.AutoConfig.from_pretrained` class method.
 
-        The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct model class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string.
+    The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct model class instance
+    based on the `model_type` property of the config object, or when it's missing,
+    falling back to using pattern matching on the `pretrained_model_name_or_path` string.
     """
 
     def __init__(self):
diff --git a/src/transformers/configuration_bart.py b/src/transformers/configuration_bart.py
index 999dd1c60e..3a28dd9e37 100644
--- a/src/transformers/configuration_bart.py
+++ b/src/transformers/configuration_bart.py
@@ -102,7 +102,7 @@ BART_CONFIG_ARGS_DOC = r"""
 @add_start_docstrings_to_callable(BART_CONFIG_ARGS_DOC)
 class BartConfig(PretrainedConfig):
     r"""
-        Configuration class for Bart. Parameters are renamed from the fairseq implementation
+    Configuration class for Bart. Parameters are renamed from the fairseq implementation
     """
     model_type = "bart"
 
@@ -141,14 +141,14 @@ class BartConfig(PretrainedConfig):
         **common_kwargs
     ):
         r"""
-            :class:`~transformers.BartConfig` is the configuration class for `BartModel`.
+        :class:`~transformers.BartConfig` is the configuration class for `BartModel`.
 
-            Examples::
+        Examples::
 
-                >>> from transformers import BartConfig, BartModel
+            >>> from transformers import BartConfig, BartModel
 
-                >>> config = BartConfig.from_pretrained('facebook/bart-large')
-                >>> model = BartModel(config)
+            >>> config = BartConfig.from_pretrained('facebook/bart-large')
+            >>> model = BartModel(config)
 
         """
         if "hidden_size" in common_kwargs:
diff --git a/src/transformers/configuration_bert.py b/src/transformers/configuration_bert.py
index c0c00bedcd..14dafb4341 100644
--- a/src/transformers/configuration_bert.py
+++ b/src/transformers/configuration_bert.py
@@ -50,59 +50,59 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class BertConfig(PretrainedConfig):
     r"""
-        This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
-        It is used to instantiate an BERT model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
+    This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
+    It is used to instantiate an BERT model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
 
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
 
 
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 30522):
-                Vocabulary size of the BERT model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
-            hidden_size (:obj:`int`, optional, defaults to 768):
-                Dimensionality of the encoder layers and the pooler layer.
-            num_hidden_layers (:obj:`int`, optional, defaults to 12):
-                Number of hidden layers in the Transformer encoder.
-            num_attention_heads (:obj:`int`, optional, defaults to 12):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            intermediate_size (:obj:`int`, optional, defaults to 3072):
-                Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-            hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
-                The non-linear activation function (function or string) in the encoder and pooler.
-                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-            hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-            attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention probabilities.
-            max_position_embeddings (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            type_vocab_size (:obj:`int`, optional, defaults to 2):
-                The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
-                The epsilon used by the layer normalization layers.
-            gradient_checkpointing (:obj:`bool`, optional, defaults to False):
-                If True, use gradient checkpointing to save memory at the expense of slower backward pass.
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 30522):
+            Vocabulary size of the BERT model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
+        hidden_size (:obj:`int`, optional, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (:obj:`int`, optional, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (:obj:`int`, optional, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (:obj:`int`, optional, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
+            The non-linear activation function (function or string) in the encoder and pooler.
+            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+        hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (:obj:`int`, optional, defaults to 512):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (:obj:`int`, optional, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
+        initializer_range (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        gradient_checkpointing (:obj:`bool`, optional, defaults to False):
+            If True, use gradient checkpointing to save memory at the expense of slower backward pass.
 
-        Example::
+    Example::
 
-            >>> from transformers import BertModel, BertConfig
+        >>> from transformers import BertModel, BertConfig
 
-            >>> # Initializing a BERT bert-base-uncased style configuration
-            >>> configuration = BertConfig()
+        >>> # Initializing a BERT bert-base-uncased style configuration
+        >>> configuration = BertConfig()
 
-            >>> # Initializing a model from the bert-base-uncased style configuration
-            >>> model = BertModel(configuration)
+        >>> # Initializing a model from the bert-base-uncased style configuration
+        >>> model = BertModel(configuration)
 
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
     """
     model_type = "bert"
 
diff --git a/src/transformers/configuration_ctrl.py b/src/transformers/configuration_ctrl.py
index 139799e057..304aa06b0d 100644
--- a/src/transformers/configuration_ctrl.py
+++ b/src/transformers/configuration_ctrl.py
@@ -25,55 +25,55 @@ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://s3.amazonaws.com/models.h
 
 class CTRLConfig(PretrainedConfig):
     """
-        This is the configuration class to store the configuration of a :class:`~transformers.CTRLModel`.
-        It is used to instantiate an CTRL model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the `ctrl <https://huggingface.co/ctrl>`__ architecture from SalesForce.
+    This is the configuration class to store the configuration of a :class:`~transformers.CTRLModel`.
+    It is used to instantiate an CTRL model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the `ctrl <https://huggingface.co/ctrl>`__ architecture from SalesForce.
 
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
 
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 246534):
-                Vocabulary size of the CTRL model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
-            n_positions (:obj:`int`, optional, defaults to 256):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            n_ctx (:obj:`int`, optional, defaults to 256):
-                Dimensionality of the causal mask (usually same as n_positions).
-            n_embd (:obj:`int`, optional, defaults to 1280):
-                Dimensionality of the embeddings and hidden states.
-            dff (:obj:`int`, optional, defaults to 8192):
-                Dimensionality of the inner dimension of the FFN.
-            n_layer (:obj:`int`, optional, defaults to 48):
-                Number of hidden layers in the Transformer encoder.
-            n_head (:obj:`int`, optional, defaults to 16):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            resid_pdrop (:obj:`float`, optional, defaults to 0.1):
-                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-            embd_pdrop (:obj:`int`, optional, defaults to 0.1):
-                The dropout ratio for the embeddings.
-            attn_pdrop (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention.
-            layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6):
-                The epsilon to use in the layer normalization layers
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 246534):
+            Vocabulary size of the CTRL model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
+        n_positions (:obj:`int`, optional, defaults to 256):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        n_ctx (:obj:`int`, optional, defaults to 256):
+            Dimensionality of the causal mask (usually same as n_positions).
+        n_embd (:obj:`int`, optional, defaults to 1280):
+            Dimensionality of the embeddings and hidden states.
+        dff (:obj:`int`, optional, defaults to 8192):
+            Dimensionality of the inner dimension of the FFN.
+        n_layer (:obj:`int`, optional, defaults to 48):
+            Number of hidden layers in the Transformer encoder.
+        n_head (:obj:`int`, optional, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        resid_pdrop (:obj:`float`, optional, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        embd_pdrop (:obj:`int`, optional, defaults to 0.1):
+            The dropout ratio for the embeddings.
+        attn_pdrop (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention.
+        layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6):
+            The epsilon to use in the layer normalization layers
+        initializer_range (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
 
-        Example::
+    Example::
 
-            >>> from transformers import CTRLModel, CTRLConfig
+        >>> from transformers import CTRLModel, CTRLConfig
 
-            >>> # Initializing a CTRL configuration
-            >>> configuration = CTRLConfig()
+        >>> # Initializing a CTRL configuration
+        >>> configuration = CTRLConfig()
 
-            >>> # Initializing a model from the configuration
-            >>> model = CTRLModel(configuration)
+        >>> # Initializing a model from the configuration
+        >>> model = CTRLModel(configuration)
 
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
     """
 
     model_type = "ctrl"
diff --git a/src/transformers/configuration_distilbert.py b/src/transformers/configuration_distilbert.py
index 949b9e0b6d..2f0b2763b6 100644
--- a/src/transformers/configuration_distilbert.py
+++ b/src/transformers/configuration_distilbert.py
@@ -33,61 +33,61 @@ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class DistilBertConfig(PretrainedConfig):
     r"""
-        This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`.
-        It is used to instantiate a DistilBERT model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the DistilBERT `distilbert-base-uncased <https://huggingface.co/distilbert-base-uncased>`__ architecture.
+    This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`.
+    It is used to instantiate a DistilBERT model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the DistilBERT `distilbert-base-uncased <https://huggingface.co/distilbert-base-uncased>`__ architecture.
 
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
 
 
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 30522):
-                Vocabulary size of the DistilBERT model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
-            max_position_embeddings (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Whether to use sinusoidal positional embeddings.
-            n_layers (:obj:`int`, optional, defaults to 6):
-                Number of hidden layers in the Transformer encoder.
-            n_heads (:obj:`int`, optional, defaults to 12):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            dim (:obj:`int`, optional, defaults to 768):
-                Dimensionality of the encoder layers and the pooler layer.
-            hidden_dim (:obj:`int`, optional, defaults to 3072):
-                The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-            dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-            attention_dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention probabilities.
-            activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
-                The non-linear activation function (function or string) in the encoder and pooler.
-                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            qa_dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout probabilities used in the question answering model
-                :class:`~transformers.DistilBertForQuestionAnswering`.
-            seq_classif_dropout (:obj:`float`, optional, defaults to 0.2):
-                The dropout probabilities used in the sequence classification and the multiple choice model
-                :class:`~transformers.DistilBertForSequenceClassification`.
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 30522):
+            Vocabulary size of the DistilBERT model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
+        max_position_embeddings (:obj:`int`, optional, defaults to 512):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`):
+            Whether to use sinusoidal positional embeddings.
+        n_layers (:obj:`int`, optional, defaults to 6):
+            Number of hidden layers in the Transformer encoder.
+        n_heads (:obj:`int`, optional, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        dim (:obj:`int`, optional, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        hidden_dim (:obj:`int`, optional, defaults to 3072):
+            The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        dropout (:obj:`float`, optional, defaults to 0.1):
+            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
+            The non-linear activation function (function or string) in the encoder and pooler.
+            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+        initializer_range (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        qa_dropout (:obj:`float`, optional, defaults to 0.1):
+            The dropout probabilities used in the question answering model
+            :class:`~transformers.DistilBertForQuestionAnswering`.
+        seq_classif_dropout (:obj:`float`, optional, defaults to 0.2):
+            The dropout probabilities used in the sequence classification and the multiple choice model
+            :class:`~transformers.DistilBertForSequenceClassification`.
 
-        Example::
+    Example::
 
-            >>> from transformers import DistilBertModel, DistilBertConfig
+        >>> from transformers import DistilBertModel, DistilBertConfig
 
-            >>> # Initializing a DistilBERT configuration
-            >>> configuration = DistilBertConfig()
+        >>> # Initializing a DistilBERT configuration
+        >>> configuration = DistilBertConfig()
 
-            >>> # Initializing a model from the configuration
-            >>> model = DistilBertModel(configuration)
+        >>> # Initializing a model from the configuration
+        >>> model = DistilBertModel(configuration)
 
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
     """
     model_type = "distilbert"
 
diff --git a/src/transformers/configuration_dpr.py b/src/transformers/configuration_dpr.py
index a012a8a7b9..ea6a6e5954 100644
--- a/src/transformers/configuration_dpr.py
+++ b/src/transformers/configuration_dpr.py
@@ -29,16 +29,16 @@ DPR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class DPRConfig(BertConfig):
     r"""
-        :class:`~transformers.DPRConfig` is the configuration class to store the configuration of a
-        `DPRModel`.
+    :class:`~transformers.DPRConfig` is the configuration class to store the configuration of a
+    `DPRModel`.
 
-        This is the configuration class to store the configuration of a `DPRContextEncoder`, `DPRQuestionEncoder`, or a `DPRReader`.
-        It is used to instantiate the components of the DPR model.
+    This is the configuration class to store the configuration of a `DPRContextEncoder`, `DPRQuestionEncoder`, or a `DPRReader`.
+    It is used to instantiate the components of the DPR model.
 
-        Args:
-            projection_dim (:obj:`int`, optional, defaults to 0):
-                Dimension of the projection for the context and question encoders.
-                If it is set to zero (default), then no projection is done.
+    Args:
+        projection_dim (:obj:`int`, optional, defaults to 0):
+            Dimension of the projection for the context and question encoders.
+            If it is set to zero (default), then no projection is done.
     """
     model_type = "dpr"
 
diff --git a/src/transformers/configuration_electra.py b/src/transformers/configuration_electra.py
index 7295a5941e..be81bc4977 100644
--- a/src/transformers/configuration_electra.py
+++ b/src/transformers/configuration_electra.py
@@ -33,82 +33,82 @@ ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class ElectraConfig(PretrainedConfig):
     r"""
-        This is the configuration class to store the configuration of a :class:`~transformers.ElectraModel`.
-        It is used to instantiate an ELECTRA model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the ELECTRA `google/electra-small-discriminator <https://huggingface.co/google/electra-small-discriminator>`__
-        architecture.
+    This is the configuration class to store the configuration of a :class:`~transformers.ElectraModel`.
+    It is used to instantiate an ELECTRA model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the ELECTRA `google/electra-small-discriminator <https://huggingface.co/google/electra-small-discriminator>`__
+    architecture.
 
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
 
 
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 30522):
-                Vocabulary size of the ELECTRA model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ElectraModel`.
-            embedding_size (:obj:`int`, optional, defaults to 128):
-                Dimensionality of the encoder layers and the pooler layer.
-            hidden_size (:obj:`int`, optional, defaults to 256):
-                Dimensionality of the encoder layers and the pooler layer.
-            num_hidden_layers (:obj:`int`, optional, defaults to 12):
-                Number of hidden layers in the Transformer encoder.
-            num_attention_heads (:obj:`int`, optional, defaults to 4):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            intermediate_size (:obj:`int`, optional, defaults to 1024):
-                Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-            hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
-                The non-linear activation function (function or string) in the encoder and pooler.
-                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-            hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-            attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention probabilities.
-            max_position_embeddings (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            type_vocab_size (:obj:`int`, optional, defaults to 2):
-                The vocabulary size of the `token_type_ids` passed into :class:`~transformers.ElectraModel`.
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
-                The epsilon used by the layer normalization layers.
-            summary_type (:obj:`string`, optional, defaults to "first"):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.ElectraForMultipleChoice`.
-                Is one of the following options:
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 30522):
+            Vocabulary size of the ELECTRA model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ElectraModel`.
+        embedding_size (:obj:`int`, optional, defaults to 128):
+            Dimensionality of the encoder layers and the pooler layer.
+        hidden_size (:obj:`int`, optional, defaults to 256):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (:obj:`int`, optional, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (:obj:`int`, optional, defaults to 4):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (:obj:`int`, optional, defaults to 1024):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
+            The non-linear activation function (function or string) in the encoder and pooler.
+            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+        hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (:obj:`int`, optional, defaults to 512):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (:obj:`int`, optional, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.ElectraModel`.
+        initializer_range (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        summary_type (:obj:`string`, optional, defaults to "first"):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.ElectraForMultipleChoice`.
+            Is one of the following options:
 
-                    - 'last' => take the last token hidden state (like XLNet)
-                    - 'first' => take the first token hidden state (like Bert)
-                    - 'mean' => take the mean of all tokens hidden states
-                    - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
-                    - 'attn' => Not implemented now, use multi-head attention
-            summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.ElectraForMultipleChoice`.
-                Add a projection after the vector extraction
-            summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.ElectraForMultipleChoice`.
-                'gelu' => add a gelu activation to the output, Other => no activation.
-            summary_last_dropout (:obj:`float`, optional, defaults to 0.0):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.ElectraForMultipleChoice`.
-                Add a dropout after the projection and activation
+                - 'last' => take the last token hidden state (like XLNet)
+                - 'first' => take the first token hidden state (like Bert)
+                - 'mean' => take the mean of all tokens hidden states
+                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
+                - 'attn' => Not implemented now, use multi-head attention
+        summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.ElectraForMultipleChoice`.
+            Add a projection after the vector extraction
+        summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.ElectraForMultipleChoice`.
+            'gelu' => add a gelu activation to the output, Other => no activation.
+        summary_last_dropout (:obj:`float`, optional, defaults to 0.0):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.ElectraForMultipleChoice`.
+            Add a dropout after the projection and activation
 
-        Example::
+    Example::
 
-            >>> from transformers import ElectraModel, ElectraConfig
+        >>> from transformers import ElectraModel, ElectraConfig
 
-            >>> # Initializing a ELECTRA electra-base-uncased style configuration
-            >>> configuration = ElectraConfig()
+        >>> # Initializing a ELECTRA electra-base-uncased style configuration
+        >>> configuration = ElectraConfig()
 
-            >>> # Initializing a model from the electra-base-uncased style configuration
-            >>> model = ElectraModel(configuration)
+        >>> # Initializing a model from the electra-base-uncased style configuration
+        >>> model = ElectraModel(configuration)
 
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
     """
     model_type = "electra"
 
diff --git a/src/transformers/configuration_encoder_decoder.py b/src/transformers/configuration_encoder_decoder.py
index af9f21a49b..7ce3bd327c 100644
--- a/src/transformers/configuration_encoder_decoder.py
+++ b/src/transformers/configuration_encoder_decoder.py
@@ -25,47 +25,47 @@ logger = logging.get_logger(__name__)
 
 class EncoderDecoderConfig(PretrainedConfig):
     r"""
-        :class:`~transformers.EncoderDecoderConfig` is the configuration class to store the configuration of a `EncoderDecoderModel`.
+    :class:`~transformers.EncoderDecoderConfig` is the configuration class to store the configuration of a `EncoderDecoderModel`.
 
-        It is used to instantiate an Encoder Decoder model according to the specified arguments, defining the encoder and decoder configs.
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig`
-        and can be used to control the model outputs.
-        See the documentation for :class:`~transformers.PretrainedConfig` for more information.
+    It is used to instantiate an Encoder Decoder model according to the specified arguments, defining the encoder and decoder configs.
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig`
+    and can be used to control the model outputs.
+    See the documentation for :class:`~transformers.PretrainedConfig` for more information.
 
-        Args:
-            kwargs (`optional`):
-                Remaining dictionary of keyword arguments. Notably:
-                    encoder (:class:`PretrainedConfig`, optional, defaults to `None`):
-                        An instance of a configuration object that defines the encoder config.
-                    decoder (:class:`PretrainedConfig`, optional, defaults to `None`):
-                        An instance of a configuration object that defines the decoder config.
+    Args:
+        kwargs (`optional`):
+            Remaining dictionary of keyword arguments. Notably:
+                encoder (:class:`PretrainedConfig`, optional, defaults to `None`):
+                    An instance of a configuration object that defines the encoder config.
+                decoder (:class:`PretrainedConfig`, optional, defaults to `None`):
+                    An instance of a configuration object that defines the decoder config.
 
-        Example::
+    Example::
 
-            >>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
+        >>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
 
-            >>> # Initializing a BERT bert-base-uncased style configuration
-            >>> config_encoder = BertConfig()
-            >>> config_decoder = BertConfig()
+        >>> # Initializing a BERT bert-base-uncased style configuration
+        >>> config_encoder = BertConfig()
+        >>> config_decoder = BertConfig()
 
-            >>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
+        >>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
 
-            >>> # Initializing a Bert2Bert model from the bert-base-uncased style configurations
-            >>> model = EncoderDecoderModel(config=config)
+        >>> # Initializing a Bert2Bert model from the bert-base-uncased style configurations
+        >>> model = EncoderDecoderModel(config=config)
 
-            >>> # Accessing the model configuration
-            >>> config_encoder = model.config.encoder
-            >>> config_decoder  = model.config.decoder
-            >>> # set decoder config to causal lm
-            >>> config_decoder.is_decoder = True
-            >>> config_decoder.add_cross_attention = True
+        >>> # Accessing the model configuration
+        >>> config_encoder = model.config.encoder
+        >>> config_decoder  = model.config.decoder
+        >>> # set decoder config to causal lm
+        >>> config_decoder.is_decoder = True
+        >>> config_decoder.add_cross_attention = True
 
-            >>> # Saving the model, including its configuration
-            >>> model.save_pretrained('my-model')
+        >>> # Saving the model, including its configuration
+        >>> model.save_pretrained('my-model')
 
-            >>> # loading model and config from pretrained folder
-            >>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained('my-model')
-            >>> model = EncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
+        >>> # loading model and config from pretrained folder
+        >>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained('my-model')
+        >>> model = EncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
     """
     model_type = "encoder_decoder"
 
diff --git a/src/transformers/configuration_flaubert.py b/src/transformers/configuration_flaubert.py
index bb5dc114ba..42a182368d 100644
--- a/src/transformers/configuration_flaubert.py
+++ b/src/transformers/configuration_flaubert.py
@@ -30,121 +30,120 @@ FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class FlaubertConfig(XLMConfig):
     """
-        Configuration class to store the configuration of a `FlaubertModel`.
-        This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
-        It is used to instantiate an XLM model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
+    Configuration class to store the configuration of a `FlaubertModel`.
+    This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
+    It is used to instantiate an XLM model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
 
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
 
-        Args:
-            pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether to apply the layer normalization before or after the feed forward layer following the
-                attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
-            layerdrop (:obj:`float`, `optional`, defaults to 0.0):
-                Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand
-                with Structured Dropout. ICLR 2020)
-            vocab_size (:obj:`int`, optional, defaults to 30145):
-                Vocabulary size of the Flaubert model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.FlaubertModel`.
-            emb_dim (:obj:`int`, optional, defaults to 2048):
-                Dimensionality of the encoder layers and the pooler layer.
-            n_layer (:obj:`int`, optional, defaults to 12):
-                Number of hidden layers in the Transformer encoder.
-            n_head (:obj:`int`, optional, defaults to 16):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout probability for all fully connected
-                layers in the embeddings, encoder, and pooler.
-            attention_dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout probability for the attention mechanism
-            gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
-                The non-linear activation function (function or string) in the
-                encoder and pooler. If set to `True`, "gelu" will be used instead of "relu".
-            sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
-            causal (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Set this to `True` for the model to behave in a causal manner.
-                Causal models use a triangular attention mask in order to only attend to the left-side context instead
-                if a bidirectional context.
-            asm (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
-                layer.
-            n_langs (:obj:`int`, optional, defaults to 1):
-                The number of languages the model handles. Set to 1 for monolingual models.
-            use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`)
-                Whether to use language embeddings. Some models use additional language embeddings, see
-                `the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
-                for information on how to use them.
-            max_position_embeddings (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might
-                ever be used with. Typically set this to something large just in case
-                (e.g., 512 or 1024 or 2048).
-            embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
-                The standard deviation of the truncated_normal_initializer for
-                initializing the embedding matrices.
-            init_std (:obj:`int`, optional, defaults to 50257):
-                The standard deviation of the truncated_normal_initializer for
-                initializing all weight matrices except the embedding matrices.
-            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
-                The epsilon used by the layer normalization layers.
-            bos_index (:obj:`int`, optional, defaults to 0):
-                The index of the beginning of sentence token in the vocabulary.
-            eos_index (:obj:`int`, optional, defaults to 1):
-                The index of the end of sentence token in the vocabulary.
-            pad_index (:obj:`int`, optional, defaults to 2):
-                The index of the padding token in the vocabulary.
-            unk_index (:obj:`int`, optional, defaults to 3):
-                The index of the unknown token in the vocabulary.
-            mask_index (:obj:`int`, optional, defaults to 5):
-                The index of the masking token in the vocabulary.
-            is_encoder(:obj:`boolean`, optional, defaults to :obj:`True`):
-                Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
-            summary_type (:obj:`string`, optional, defaults to "first"):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                Is one of the following options:
+    Args:
+        pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to apply the layer normalization before or after the feed forward layer following the
+            attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
+        layerdrop (:obj:`float`, `optional`, defaults to 0.0):
+            Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand
+            with Structured Dropout. ICLR 2020)
+        vocab_size (:obj:`int`, optional, defaults to 30145):
+            Vocabulary size of the Flaubert model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.FlaubertModel`.
+        emb_dim (:obj:`int`, optional, defaults to 2048):
+            Dimensionality of the encoder layers and the pooler layer.
+        n_layer (:obj:`int`, optional, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        n_head (:obj:`int`, optional, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        dropout (:obj:`float`, optional, defaults to 0.1):
+            The dropout probability for all fully connected
+            layers in the embeddings, encoder, and pooler.
+        attention_dropout (:obj:`float`, optional, defaults to 0.1):
+            The dropout probability for the attention mechanism
+        gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
+            The non-linear activation function (function or string) in the
+            encoder and pooler. If set to `True`, "gelu" will be used instead of "relu".
+        sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
+            Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
+        causal (:obj:`boolean`, optional, defaults to :obj:`False`):
+            Set this to `True` for the model to behave in a causal manner.
+            Causal models use a triangular attention mask in order to only attend to the left-side context instead
+            if a bidirectional context.
+        asm (:obj:`boolean`, optional, defaults to :obj:`False`):
+            Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
+            layer.
+        n_langs (:obj:`int`, optional, defaults to 1):
+            The number of languages the model handles. Set to 1 for monolingual models.
+        use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`)
+            Whether to use language embeddings. Some models use additional language embeddings, see
+            `the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
+            for information on how to use them.
+        max_position_embeddings (:obj:`int`, optional, defaults to 512):
+            The maximum sequence length that this model might
+            ever be used with. Typically set this to something large just in case
+            (e.g., 512 or 1024 or 2048).
+        embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
+            The standard deviation of the truncated_normal_initializer for
+            initializing the embedding matrices.
+        init_std (:obj:`int`, optional, defaults to 50257):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices except the embedding matrices.
+        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        bos_index (:obj:`int`, optional, defaults to 0):
+            The index of the beginning of sentence token in the vocabulary.
+        eos_index (:obj:`int`, optional, defaults to 1):
+            The index of the end of sentence token in the vocabulary.
+        pad_index (:obj:`int`, optional, defaults to 2):
+            The index of the padding token in the vocabulary.
+        unk_index (:obj:`int`, optional, defaults to 3):
+            The index of the unknown token in the vocabulary.
+        mask_index (:obj:`int`, optional, defaults to 5):
+            The index of the masking token in the vocabulary.
+        is_encoder(:obj:`boolean`, optional, defaults to :obj:`True`):
+            Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
+        summary_type (:obj:`string`, optional, defaults to "first"):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.XLMForSequenceClassification`.
+            Is one of the following options:
 
-                - 'last' => take the last token hidden state (like XLNet)
-                - 'first' => take the first token hidden state (like Bert)
-                - 'mean' => take the mean of all tokens hidden states
-                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
-                - 'attn' => Not implemented now, use multi-head attention
-            summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                Add a projection after the vector extraction
-            summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                'tanh' => add a tanh activation to the output, Other => no activation.
-            summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
-            summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                Add a dropout before the projection and activation
-            start_n_top (:obj:`int`, optional, defaults to 5):
-                Used in the SQuAD evaluation script for XLM and XLNet.
-            end_n_top (:obj:`int`, optional, defaults to 5):
-                Used in the SQuAD evaluation script for XLM and XLNet.
-            mask_token_id (:obj:`int`, optional, defaults to 0):
-                Model agnostic parameter to identify masked tokens when generating text in an MLM context.
-            lang_id (:obj:`int`, optional, defaults to 1):
-                The ID of the language used by the model. This parameter is used when generating
-                text in a given language.
+            - 'last' => take the last token hidden state (like XLNet)
+            - 'first' => take the first token hidden state (like Bert)
+            - 'mean' => take the mean of all tokens hidden states
+            - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
+            - 'attn' => Not implemented now, use multi-head attention
+        summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.XLMForSequenceClassification`.
+            Add a projection after the vector extraction
+        summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.XLMForSequenceClassification`.
+            'tanh' => add a tanh activation to the output, Other => no activation.
+        summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.XLMForSequenceClassification`.
+            If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
+        summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.XLMForSequenceClassification`.
+            Add a dropout before the projection and activation
+        start_n_top (:obj:`int`, optional, defaults to 5):
+            Used in the SQuAD evaluation script for XLM and XLNet.
+        end_n_top (:obj:`int`, optional, defaults to 5):
+            Used in the SQuAD evaluation script for XLM and XLNet.
+        mask_token_id (:obj:`int`, optional, defaults to 0):
+            Model agnostic parameter to identify masked tokens when generating text in an MLM context.
+        lang_id (:obj:`int`, optional, defaults to 1):
+            The ID of the language used by the model. This parameter is used when generating
+            text in a given language.
     """
 
     model_type = "flaubert"
 
     def __init__(self, layerdrop=0.0, pre_norm=False, pad_token_id=2, bos_token_id=0, **kwargs):
-        """Constructs FlaubertConfig.
-        """
+        """Constructs FlaubertConfig."""
         super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
         self.layerdrop = layerdrop
         self.pre_norm = pre_norm
diff --git a/src/transformers/configuration_gpt2.py b/src/transformers/configuration_gpt2.py
index d2ebdb4034..b809b3e1df 100644
--- a/src/transformers/configuration_gpt2.py
+++ b/src/transformers/configuration_gpt2.py
@@ -32,84 +32,84 @@ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class GPT2Config(PretrainedConfig):
     """
-        This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`.
-        It is used to instantiate an GPT-2 model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.
+    This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`.
+    It is used to instantiate an GPT-2 model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.
 
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
 
 
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 50257):
-                Vocabulary size of the GPT-2 model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.GPT2Model`.
-            n_positions (:obj:`int`, optional, defaults to 1024):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            n_ctx (:obj:`int`, optional, defaults to 1024):
-                Dimensionality of the causal mask (usually same as n_positions).
-            n_embd (:obj:`int`, optional, defaults to 768):
-                Dimensionality of the embeddings and hidden states.
-            n_layer (:obj:`int`, optional, defaults to 12):
-                Number of hidden layers in the Transformer encoder.
-            n_head (:obj:`int`, optional, defaults to 12):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            n_inner (:obj:`int`, optional, defaults to None):
-                Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd
-            activation_function (:obj:`str`, optional, defaults to 'gelu'):
-                Activation function selected in the list ["relu", "swish", "gelu", "tanh", "gelu_new"].
-            resid_pdrop (:obj:`float`, optional, defaults to 0.1):
-                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-            embd_pdrop (:obj:`int`, optional, defaults to 0.1):
-                The dropout ratio for the embeddings.
-            attn_pdrop (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention.
-            layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
-                The epsilon to use in the layer normalization layers
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            summary_type (:obj:`string`, optional, defaults to "cls_index"):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.GPT2DoubleHeadsModel`.
-                Is one of the following options:
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 50257):
+            Vocabulary size of the GPT-2 model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.GPT2Model`.
+        n_positions (:obj:`int`, optional, defaults to 1024):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        n_ctx (:obj:`int`, optional, defaults to 1024):
+            Dimensionality of the causal mask (usually same as n_positions).
+        n_embd (:obj:`int`, optional, defaults to 768):
+            Dimensionality of the embeddings and hidden states.
+        n_layer (:obj:`int`, optional, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        n_head (:obj:`int`, optional, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        n_inner (:obj:`int`, optional, defaults to None):
+            Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd
+        activation_function (:obj:`str`, optional, defaults to 'gelu'):
+            Activation function selected in the list ["relu", "swish", "gelu", "tanh", "gelu_new"].
+        resid_pdrop (:obj:`float`, optional, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        embd_pdrop (:obj:`int`, optional, defaults to 0.1):
+            The dropout ratio for the embeddings.
+        attn_pdrop (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention.
+        layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
+            The epsilon to use in the layer normalization layers
+        initializer_range (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        summary_type (:obj:`string`, optional, defaults to "cls_index"):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.GPT2DoubleHeadsModel`.
+            Is one of the following options:
 
-                - 'last' => take the last token hidden state (like XLNet)
-                - 'first' => take the first token hidden state (like Bert)
-                - 'mean' => take the mean of all tokens hidden states
-                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
-                - 'attn' => Not implemented now, use multi-head attention
-            summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.GPT2DoubleHeadsModel`.
-                Add a projection after the vector extraction
-            summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.GPT2DoubleHeadsModel`.
-                'tanh' => add a tanh activation to the output, Other => no activation.
-            summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.GPT2DoubleHeadsModel`.
-                If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
-            summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.GPT2DoubleHeadsModel`.
-                Add a dropout before the projection and activation
+            - 'last' => take the last token hidden state (like XLNet)
+            - 'first' => take the first token hidden state (like Bert)
+            - 'mean' => take the mean of all tokens hidden states
+            - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
+            - 'attn' => Not implemented now, use multi-head attention
+        summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.GPT2DoubleHeadsModel`.
+            Add a projection after the vector extraction
+        summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.GPT2DoubleHeadsModel`.
+            'tanh' => add a tanh activation to the output, Other => no activation.
+        summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.GPT2DoubleHeadsModel`.
+            If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
+        summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.GPT2DoubleHeadsModel`.
+            Add a dropout before the projection and activation
 
-        Example::
+    Example::
 
-            >>> from transformers import GPT2Model, GPT2Config
+        >>> from transformers import GPT2Model, GPT2Config
 
-            >>> # Initializing a GPT2 configuration
-            >>> configuration = GPT2Config()
+        >>> # Initializing a GPT2 configuration
+        >>> configuration = GPT2Config()
 
-            >>> # Initializing a model from the configuration
-            >>> model = GPT2Model(configuration)
+        >>> # Initializing a model from the configuration
+        >>> model = GPT2Model(configuration)
 
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
     """
 
     model_type = "gpt2"
diff --git a/src/transformers/configuration_longformer.py b/src/transformers/configuration_longformer.py
index 21a8f37a4c..e69eef440e 100644
--- a/src/transformers/configuration_longformer.py
+++ b/src/transformers/configuration_longformer.py
@@ -33,32 +33,32 @@ LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class LongformerConfig(RobertaConfig):
     r"""
-        This is the configuration class to store the configuration of a :class:`~transformers.LongformerModel`.
-        It is used to instantiate an Longformer model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the RoBERTa `roberta-base <https://huggingface.co/roberta-base>`__ architecture with a sequence length 4,096.
+    This is the configuration class to store the configuration of a :class:`~transformers.LongformerModel`.
+    It is used to instantiate an Longformer model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the RoBERTa `roberta-base <https://huggingface.co/roberta-base>`__ architecture with a sequence length 4,096.
 
-        The :class:`~transformers.LongformerConfig` class directly inherits :class:`~transformers.RobertaConfig`.
-        It reuses the same defaults. Please check the parent class for more information.
+    The :class:`~transformers.LongformerConfig` class directly inherits :class:`~transformers.RobertaConfig`.
+    It reuses the same defaults. Please check the parent class for more information.
 
-        Args:
-            attention_window (:obj:`int` or :obj:`List[int]`, optional, defaults to 512):
-                Size of an attention window around each token. If :obj:`int`, use the same size for all layers.
-                To specify a different window size for each layer, use a :obj:`List[int]` where
-                ``len(attention_window) == num_hidden_layers``.
+    Args:
+        attention_window (:obj:`int` or :obj:`List[int]`, optional, defaults to 512):
+            Size of an attention window around each token. If :obj:`int`, use the same size for all layers.
+            To specify a different window size for each layer, use a :obj:`List[int]` where
+            ``len(attention_window) == num_hidden_layers``.
 
-        Example::
+    Example::
 
-            >>> from transformers import LongformerConfig, LongformerModel
+        >>> from transformers import LongformerConfig, LongformerModel
 
-            >>> # Initializing a Longformer configuration
-            >>> configuration = LongformerConfig()
+        >>> # Initializing a Longformer configuration
+        >>> configuration = LongformerConfig()
 
-            >>> # Initializing a model from the configuration
-            >>> model = LongformerModel(configuration)
+        >>> # Initializing a model from the configuration
+        >>> model = LongformerModel(configuration)
 
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
     """
     model_type = "longformer"
 
diff --git a/src/transformers/configuration_mobilebert.py b/src/transformers/configuration_mobilebert.py
index 83216243b3..c3d05a7b81 100644
--- a/src/transformers/configuration_mobilebert.py
+++ b/src/transformers/configuration_mobilebert.py
@@ -25,79 +25,79 @@ MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class MobileBertConfig(PretrainedConfig):
     r"""
-        This is the configuration class to store the configuration of a :class:`~transformers.MobileBertModel`.
-        It is used to instantiate a MobileBERT model according to the specified arguments, defining the model
-        architecture.
+    This is the configuration class to store the configuration of a :class:`~transformers.MobileBertModel`.
+    It is used to instantiate a MobileBERT model according to the specified arguments, defining the model
+    architecture.
 
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
 
 
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 30522):
-                Vocabulary size of the MobileBERT model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.MobileBertModel`.
-            hidden_size (:obj:`int`, optional, defaults to 512):
-                Dimensionality of the encoder layers and the pooler layer.
-            num_hidden_layers (:obj:`int`, optional, defaults to 24):
-                Number of hidden layers in the Transformer encoder.
-            num_attention_heads (:obj:`int`, optional, defaults to 4):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            intermediate_size (:obj:`int`, optional, defaults to 512):
-                Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-            hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"):
-                The non-linear activation function (function or string) in the encoder and pooler.
-                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-            hidden_dropout_prob (:obj:`float`, optional, defaults to 0.0):
-                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-            attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention probabilities.
-            max_position_embeddings (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            type_vocab_size (:obj:`int`, optional, defaults to 2):
-                The vocabulary size of the `token_type_ids` passed into :class:`~transformers.MobileBertModel`.
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
-                The epsilon used by the layer normalization layers.
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 30522):
+            Vocabulary size of the MobileBERT model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.MobileBertModel`.
+        hidden_size (:obj:`int`, optional, defaults to 512):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (:obj:`int`, optional, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (:obj:`int`, optional, defaults to 4):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (:obj:`int`, optional, defaults to 512):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"):
+            The non-linear activation function (function or string) in the encoder and pooler.
+            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+        hidden_dropout_prob (:obj:`float`, optional, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (:obj:`int`, optional, defaults to 512):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (:obj:`int`, optional, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.MobileBertModel`.
+        initializer_range (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
 
-            pad_token_id (:obj:`int`, optional, defaults to 0):
-                The ID of the token in the word embedding to use as padding.
-            embedding_size (:obj:`int`, optional, defaults to 128):
-                The dimension of the word embedding vectors.
-            trigram_input (:obj:`bool`, optional, defaults to True):
-                Use a convolution of trigram as input.
-            use_bottleneck (:obj:`bool`, optional, defaults to True):
-                Whether to use bottleneck in BERT.
-            intra_bottleneck_size (:obj:`int`, optional, defaults to 128):
-                Size of bottleneck layer output.
-            use_bottleneck_attention (:obj:`bool`, optional, defaults to False):
-                Whether to use attention inputs from the bottleneck transformation.
-            key_query_shared_bottleneck (:obj:`bool`, optional, defaults to True):
-                Whether to use the same linear transformation for query&key in the bottleneck.
-            num_feedforward_networks (:obj:`int`, optional, defaults to 4):
-                Number of FFNs in a block.
-            normalization_type (:obj:`str`, optional, defaults to "no_norm"):
-                The normalization type in BERT.
+        pad_token_id (:obj:`int`, optional, defaults to 0):
+            The ID of the token in the word embedding to use as padding.
+        embedding_size (:obj:`int`, optional, defaults to 128):
+            The dimension of the word embedding vectors.
+        trigram_input (:obj:`bool`, optional, defaults to True):
+            Use a convolution of trigram as input.
+        use_bottleneck (:obj:`bool`, optional, defaults to True):
+            Whether to use bottleneck in BERT.
+        intra_bottleneck_size (:obj:`int`, optional, defaults to 128):
+            Size of bottleneck layer output.
+        use_bottleneck_attention (:obj:`bool`, optional, defaults to False):
+            Whether to use attention inputs from the bottleneck transformation.
+        key_query_shared_bottleneck (:obj:`bool`, optional, defaults to True):
+            Whether to use the same linear transformation for query&key in the bottleneck.
+        num_feedforward_networks (:obj:`int`, optional, defaults to 4):
+            Number of FFNs in a block.
+        normalization_type (:obj:`str`, optional, defaults to "no_norm"):
+            The normalization type in BERT.
 
-        Example:
+    Example:
 
-            >>> from transformers import MobileBertModel, MobileBertConfig
+        >>> from transformers import MobileBertModel, MobileBertConfig
 
-            >>> # Initializing a MobileBERT configuration
-            >>> configuration = MobileBertConfig()
+        >>> # Initializing a MobileBERT configuration
+        >>> configuration = MobileBertConfig()
 
-            >>> # Initializing a model from the configuration above
-            >>> model = MobileBertModel(configuration)
+        >>> # Initializing a model from the configuration above
+        >>> model = MobileBertModel(configuration)
 
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
 
-        Attributes:
-            pretrained_config_archive_map (Dict[str, str]):
-                A dictionary containing all the available pre-trained checkpoints.
+    Attributes:
+        pretrained_config_archive_map (Dict[str, str]):
+            A dictionary containing all the available pre-trained checkpoints.
     """
     pretrained_config_archive_map = MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
     model_type = "mobilebert"
diff --git a/src/transformers/configuration_openai.py b/src/transformers/configuration_openai.py
index 4ec26a93bb..9d5b57109e 100644
--- a/src/transformers/configuration_openai.py
+++ b/src/transformers/configuration_openai.py
@@ -28,84 +28,84 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class OpenAIGPTConfig(PretrainedConfig):
     """
-        This is the configuration class to store the configuration of a :class:`~transformers.OpenAIGPTModel`.
-        It is used to instantiate an GPT model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI.
+    This is the configuration class to store the configuration of a :class:`~transformers.OpenAIGPTModel`.
+    It is used to instantiate an GPT model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI.
 
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
 
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 40478):
-                Vocabulary size of the GPT model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
-            n_positions (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            n_ctx (:obj:`int`, optional, defaults to 512):
-                Dimensionality of the causal mask (usually same as n_positions).
-            n_embd (:obj:`int`, optional, defaults to 768):
-                Dimensionality of the embeddings and hidden states.
-            n_layer (:obj:`int`, optional, defaults to 12):
-                Number of hidden layers in the Transformer encoder.
-            n_head (:obj:`int`, optional, defaults to 12):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
-                The non-linear activation function (function or string) in the encoder and pooler.
-                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-            resid_pdrop (:obj:`float`, optional, defaults to 0.1):
-                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-            embd_pdrop (:obj:`int`, optional, defaults to 0.1):
-                The dropout ratio for the embeddings.
-            attn_pdrop (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention.
-            layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
-                The epsilon to use in the layer normalization layers
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Whether special tokens should be predicted when the model is has a language modeling head.
-            summary_type (:obj:`string`, optional, defaults to "cls_index"):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
-                Is one of the following options:
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 40478):
+            Vocabulary size of the GPT model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
+        n_positions (:obj:`int`, optional, defaults to 512):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        n_ctx (:obj:`int`, optional, defaults to 512):
+            Dimensionality of the causal mask (usually same as n_positions).
+        n_embd (:obj:`int`, optional, defaults to 768):
+            Dimensionality of the embeddings and hidden states.
+        n_layer (:obj:`int`, optional, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        n_head (:obj:`int`, optional, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
+            The non-linear activation function (function or string) in the encoder and pooler.
+            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+        resid_pdrop (:obj:`float`, optional, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        embd_pdrop (:obj:`int`, optional, defaults to 0.1):
+            The dropout ratio for the embeddings.
+        attn_pdrop (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention.
+        layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
+            The epsilon to use in the layer normalization layers
+        initializer_range (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Whether special tokens should be predicted when the model is has a language modeling head.
+        summary_type (:obj:`string`, optional, defaults to "cls_index"):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+            Is one of the following options:
 
-                - 'last' => take the last token hidden state (like XLNet)
-                - 'first' => take the first token hidden state (like Bert)
-                - 'mean' => take the mean of all tokens hidden states
-                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
-                - 'attn' => Not implemented now, use multi-head attention
-            summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
-                Add a projection after the vector extraction
-            summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
-                'tanh' => add a tanh activation to the output, Other => no activation.
-            summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
-                If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
-            summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
-                Add a dropout before the projection and activation
+            - 'last' => take the last token hidden state (like XLNet)
+            - 'first' => take the first token hidden state (like Bert)
+            - 'mean' => take the mean of all tokens hidden states
+            - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
+            - 'attn' => Not implemented now, use multi-head attention
+        summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+            Add a projection after the vector extraction
+        summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+            'tanh' => add a tanh activation to the output, Other => no activation.
+        summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+            If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
+        summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+            Add a dropout before the projection and activation
 
-        Example::
+    Example::
 
-            >>> from transformers import OpenAIGPTConfig, OpenAIGPTModel
+        >>> from transformers import OpenAIGPTConfig, OpenAIGPTModel
 
-            >>> # Initializing a GPT configuration
-            >>> configuration = OpenAIGPTConfig()
+        >>> # Initializing a GPT configuration
+        >>> configuration = OpenAIGPTConfig()
 
-            >>> # Initializing a model from the configuration
-            >>> model = OpenAIGPTModel(configuration)
+        >>> # Initializing a model from the configuration
+        >>> model = OpenAIGPTModel(configuration)
 
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
     """
 
     model_type = "openai-gpt"
diff --git a/src/transformers/configuration_pegasus.py b/src/transformers/configuration_pegasus.py
index a9c92e6a6d..4c3564fd10 100644
--- a/src/transformers/configuration_pegasus.py
+++ b/src/transformers/configuration_pegasus.py
@@ -92,8 +92,8 @@ expected_alpha = {
 @add_start_docstrings_to_callable(BART_CONFIG_ARGS_DOC)
 class PegasusConfig(BartConfig):
     r"""
-        :class:`~transformers.PegasusConfig` is the configuration class to store the configuration of a
-        `PegasusModel`.
+    :class:`~transformers.PegasusConfig` is the configuration class to store the configuration of a
+    `PegasusModel`.
     """
     model_type = "pegasus"
     # The implementation of the config object is in BartConfig
diff --git a/src/transformers/configuration_reformer.py b/src/transformers/configuration_reformer.py
index 9e501144d7..6f48508e77 100755
--- a/src/transformers/configuration_reformer.py
+++ b/src/transformers/configuration_reformer.py
@@ -29,105 +29,105 @@ REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class ReformerConfig(PretrainedConfig):
     r"""
-        This is the configuration class to store the configuration of a :class:`~transformers.ReformerModel`.
-        It is used to instantiate an Reformer model according to the specified arguments, defining the model
-        architecture.
+    This is the configuration class to store the configuration of a :class:`~transformers.ReformerModel`.
+    It is used to instantiate an Reformer model according to the specified arguments, defining the model
+    architecture.
 
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
 
-        Args:
-            attention_head_size (:obj:`int`, optional, defaults to 64):
-                Dimensionality of the projected key, query and value vectors
-            attn_layers (:obj:`list(str)`, optional, defaults to ["local", "lsh", "local", "lsh", "local", "lsh"]):
-                List of attention layer types in ascending order. It can be chosen between a
-                LSHSelfAttention layer ("lsh") and a LocalSelfAttention layer ("local").
-                For more information on LSHSelfAttention layer, see `LSH Self Attention <reformer.html#lsh-self-attention>`__ .
-                For more information on LocalSelfAttention layer, see `Local Self Attention <reformer.html#local-sensitive-hashing-self-attention>`__ .
-            axial_pos_embds (:obj:`bool`, optional, defaults to True):
-                If `True` use axial position embeddings. For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__
-            axial_norm_std (:obj:`float`, optional, defaluts to 1.0):
-                The standard deviation of the normal_initializer for initializing the weight matrices of the axial positional encodings.
-            axial_pos_shape (:obj:`list(int)`, optional, defaults to `[64, 64]`):
-                The position dims of the axial position encodings.
-                During training the product of the position dims has to equal the sequence length.
-                For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__.
-            axial_pos_embds_dim (:obj:`list(int)`, optional, defaults to `[64, 192]`):
-                The embedding dims of the axial position encodings.
-                The sum of the embedding dims has to equal the hidden size.
-                For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__.
-            chunk_size_lm_head (:obj:`int`, optional, defaults to 0):
-                The chunk size of the final language model feed forward head layer.
-                A chunk size of 0 means that the feed forward layer is not chunked.
-                A chunk size of n means that the feed forward layer processes n < sequence_length embeddings at a time.
-                For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ .
-            eos_token_id (:obj:`int`, optional, defaults to 2):
-                The token id for the <EOS> token.
-            feed_forward_size (:obj:`int`, optional, defaults to 512):
-                Dimensionality of the "feed_forward" (i.e., feed-forward) layer in the residual attention block.
-            hash_seed (:obj:`int`, optional, defaults to `None`):
-                Seed that can be used to make local sensitive hashing in LSHSelfAttention deterministic. This should only be set for testing purposed. For evaluation and training purposes `hash_seed` should be set to `None` to ensure fully random rotations in local sensitive hashing scheme.
-            hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"):
-                The non-linear activation function (function or string) in the feed forward layer in the residual attention block.
-                If string, "gelu", "relu", "swish", "gelu_new" and "gelu_fast" are supported.
-            hidden_dropout_prob (:obj:`float`, optional, defaults to 0.05):
-                The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-            hidden_size (:obj:`int`, optional, defaults to 256):
-                Dimensionality of the output hidden states of the residual attention blocks.
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            is_decoder (:obj:`bool`, optional, defaults to False):
-                If `is_decoder` is True, a causal mask is used in addition to `attention_mask`.
-                When using the Reformer for causal language modeling, `is_decoder` is set to `True`.
-            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
-                The epsilon used by the layer normalization layers.
-            local_chunk_length (:obj:`int`, optional, defaults to 64):
-                Length of chunk which attends to itself in LocalSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention).
-            local_num_chunks_before (:obj:`int`, optional, defaults to 1):
-                Number of previous neighbouring chunks to attend to in LocalSelfAttention layer to itself.
-            local_num_chunks_after (:obj:`int`, optional, defaults to 0):
-                Number of following neighbouring chunks to attend to in LocalSelfAttention layer in addition to itself.
-            local_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention probabilities in LocalSelfAttention.
-            lsh_attn_chunk_length (:obj:`int`, optional, defaults to 64):
-                Length of chunk which attends to itself in LSHSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention).
-            lsh_num_chunks_before (:obj:`int`, optional, defaults to 1):
-                Number of previous neighbouring chunks to attend to in LSHSelfAttention layer to itself.
-            lsh_num_chunks_after (:obj:`int`, optional, defaults to 0):
-                Number of following neighbouring chunks to attend to in LSHSelfAttention layer to itself.
-            lsh_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention probabilities in LSHSelfAttention.
-            max_position_embeddings (:obj:`int`, optional, defaults to 4096):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            num_attention_heads (:obj:`int`, optional, defaults to 12):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            num_buckets (:obj:`int` or :obj:`list(int)`, optional, defaults to `None`):
-                Number of buckets, the key query vectors can be "hashed into" using the locality sensitive hashing scheme. Each query key vector is hashed into a hash in `1, ..., num_buckets`.
-                The number of buckets can also be factorized into a list for improved memory complexity. In this case, each query key vector is hashed into a hash in `1-1, 1-2, ..., num_buckets[0]-1, ..., num_buckets[0]-num_buckets[1]` if `num_buckets` is factorized into two factors.
-                The number of buckets (or the product the factors) should approximately equal sequence length / lsh_chunk_length. If `num_buckets` is set to `None`, a good value for `num_buckets` is calculated on the fly.
-            num_hashes (:obj:`int`, optional, defaults to 1):
-                Number of hashing rounds (e.g. number of random rotations) in Local Sensitive Hashing scheme.
-                The higher `num_hashes`, the more accurate the `LSHSelfAttention` becomes, but also the more memory and time intensive the hashing becomes.
-            pad_token_id (:obj:`int`, optional, defaults to 0):
-                The token id for the <PAD> token.
-            vocab_size (:obj:`int`, optional, defaults to 320):
-                Vocabulary size of the Reformer model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ReformerModel`.
+    Args:
+        attention_head_size (:obj:`int`, optional, defaults to 64):
+            Dimensionality of the projected key, query and value vectors
+        attn_layers (:obj:`list(str)`, optional, defaults to ["local", "lsh", "local", "lsh", "local", "lsh"]):
+            List of attention layer types in ascending order. It can be chosen between a
+            LSHSelfAttention layer ("lsh") and a LocalSelfAttention layer ("local").
+            For more information on LSHSelfAttention layer, see `LSH Self Attention <reformer.html#lsh-self-attention>`__ .
+            For more information on LocalSelfAttention layer, see `Local Self Attention <reformer.html#local-sensitive-hashing-self-attention>`__ .
+        axial_pos_embds (:obj:`bool`, optional, defaults to True):
+            If `True` use axial position embeddings. For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__
+        axial_norm_std (:obj:`float`, optional, defaluts to 1.0):
+            The standard deviation of the normal_initializer for initializing the weight matrices of the axial positional encodings.
+        axial_pos_shape (:obj:`list(int)`, optional, defaults to `[64, 64]`):
+            The position dims of the axial position encodings.
+            During training the product of the position dims has to equal the sequence length.
+            For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__.
+        axial_pos_embds_dim (:obj:`list(int)`, optional, defaults to `[64, 192]`):
+            The embedding dims of the axial position encodings.
+            The sum of the embedding dims has to equal the hidden size.
+            For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__.
+        chunk_size_lm_head (:obj:`int`, optional, defaults to 0):
+            The chunk size of the final language model feed forward head layer.
+            A chunk size of 0 means that the feed forward layer is not chunked.
+            A chunk size of n means that the feed forward layer processes n < sequence_length embeddings at a time.
+            For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ .
+        eos_token_id (:obj:`int`, optional, defaults to 2):
+            The token id for the <EOS> token.
+        feed_forward_size (:obj:`int`, optional, defaults to 512):
+            Dimensionality of the "feed_forward" (i.e., feed-forward) layer in the residual attention block.
+        hash_seed (:obj:`int`, optional, defaults to `None`):
+            Seed that can be used to make local sensitive hashing in LSHSelfAttention deterministic. This should only be set for testing purposed. For evaluation and training purposes `hash_seed` should be set to `None` to ensure fully random rotations in local sensitive hashing scheme.
+        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"):
+            The non-linear activation function (function or string) in the feed forward layer in the residual attention block.
+            If string, "gelu", "relu", "swish", "gelu_new" and "gelu_fast" are supported.
+        hidden_dropout_prob (:obj:`float`, optional, defaults to 0.05):
+            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+        hidden_size (:obj:`int`, optional, defaults to 256):
+            Dimensionality of the output hidden states of the residual attention blocks.
+        initializer_range (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        is_decoder (:obj:`bool`, optional, defaults to False):
+            If `is_decoder` is True, a causal mask is used in addition to `attention_mask`.
+            When using the Reformer for causal language modeling, `is_decoder` is set to `True`.
+        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        local_chunk_length (:obj:`int`, optional, defaults to 64):
+            Length of chunk which attends to itself in LocalSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention).
+        local_num_chunks_before (:obj:`int`, optional, defaults to 1):
+            Number of previous neighbouring chunks to attend to in LocalSelfAttention layer to itself.
+        local_num_chunks_after (:obj:`int`, optional, defaults to 0):
+            Number of following neighbouring chunks to attend to in LocalSelfAttention layer in addition to itself.
+        local_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention probabilities in LocalSelfAttention.
+        lsh_attn_chunk_length (:obj:`int`, optional, defaults to 64):
+            Length of chunk which attends to itself in LSHSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention).
+        lsh_num_chunks_before (:obj:`int`, optional, defaults to 1):
+            Number of previous neighbouring chunks to attend to in LSHSelfAttention layer to itself.
+        lsh_num_chunks_after (:obj:`int`, optional, defaults to 0):
+            Number of following neighbouring chunks to attend to in LSHSelfAttention layer to itself.
+        lsh_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention probabilities in LSHSelfAttention.
+        max_position_embeddings (:obj:`int`, optional, defaults to 4096):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        num_attention_heads (:obj:`int`, optional, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_buckets (:obj:`int` or :obj:`list(int)`, optional, defaults to `None`):
+            Number of buckets, the key query vectors can be "hashed into" using the locality sensitive hashing scheme. Each query key vector is hashed into a hash in `1, ..., num_buckets`.
+            The number of buckets can also be factorized into a list for improved memory complexity. In this case, each query key vector is hashed into a hash in `1-1, 1-2, ..., num_buckets[0]-1, ..., num_buckets[0]-num_buckets[1]` if `num_buckets` is factorized into two factors.
+            The number of buckets (or the product the factors) should approximately equal sequence length / lsh_chunk_length. If `num_buckets` is set to `None`, a good value for `num_buckets` is calculated on the fly.
+        num_hashes (:obj:`int`, optional, defaults to 1):
+            Number of hashing rounds (e.g. number of random rotations) in Local Sensitive Hashing scheme.
+            The higher `num_hashes`, the more accurate the `LSHSelfAttention` becomes, but also the more memory and time intensive the hashing becomes.
+        pad_token_id (:obj:`int`, optional, defaults to 0):
+            The token id for the <PAD> token.
+        vocab_size (:obj:`int`, optional, defaults to 320):
+            Vocabulary size of the Reformer model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ReformerModel`.
 
-        Example::
+    Example::
 
-            >>> from transformers import ReformerModel, ReformerConfig
+        >>> from transformers import ReformerModel, ReformerConfig
 
-            >>> # Initializing a Reformer configuration
-            >>> configuration = ReformerConfig()
+        >>> # Initializing a Reformer configuration
+        >>> configuration = ReformerConfig()
 
-            >>> # Initializing a Reformer model
-            >>> model = ReformerModel(configuration)
+        >>> # Initializing a Reformer model
+        >>> model = ReformerModel(configuration)
 
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
     """
     model_type = "reformer"
 
diff --git a/src/transformers/configuration_retribert.py b/src/transformers/configuration_retribert.py
index 8df4749f51..e4caa33b64 100644
--- a/src/transformers/configuration_retribert.py
+++ b/src/transformers/configuration_retribert.py
@@ -28,47 +28,47 @@ RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class RetriBertConfig(PretrainedConfig):
     r"""
-        This is the configuration class to store the configuration of a :class:`~transformers.RetriBertModel`.
-        It is used to instantiate a RetriBertModel model according to the specified arguments, defining the model
-        architecture.
+    This is the configuration class to store the configuration of a :class:`~transformers.RetriBertModel`.
+    It is used to instantiate a RetriBertModel model according to the specified arguments, defining the model
+    architecture.
 
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
 
 
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 30522):
-                Vocabulary size of the BERT model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
-            hidden_size (:obj:`int`, optional, defaults to 768):
-                Dimensionality of the encoder layers and the pooler layer.
-            num_hidden_layers (:obj:`int`, optional, defaults to 12):
-                Number of hidden layers in the Transformer encoder.
-            num_attention_heads (:obj:`int`, optional, defaults to 12):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            intermediate_size (:obj:`int`, optional, defaults to 3072):
-                Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-            hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
-                The non-linear activation function (function or string) in the encoder and pooler.
-                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-            hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-            attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention probabilities.
-            max_position_embeddings (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            type_vocab_size (:obj:`int`, optional, defaults to 2):
-                The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
-                The epsilon used by the layer normalization layers.
-            share_encoders (:obj:`bool`, optional, defaults to True):
-                Whether to use the same Bert-type encoder for the queries and document
-            projection_dim (:obj:`int`, optional, defaults to 128):
-                Final dimension of the query and document representation after projection
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 30522):
+            Vocabulary size of the BERT model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
+        hidden_size (:obj:`int`, optional, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (:obj:`int`, optional, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (:obj:`int`, optional, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (:obj:`int`, optional, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
+            The non-linear activation function (function or string) in the encoder and pooler.
+            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+        hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (:obj:`int`, optional, defaults to 512):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (:obj:`int`, optional, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
+        initializer_range (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        share_encoders (:obj:`bool`, optional, defaults to True):
+            Whether to use the same Bert-type encoder for the queries and document
+        projection_dim (:obj:`int`, optional, defaults to 128):
+            Final dimension of the query and document representation after projection
 
     """
     model_type = "retribert"
diff --git a/src/transformers/configuration_roberta.py b/src/transformers/configuration_roberta.py
index ddb6bad3f2..2283158efc 100644
--- a/src/transformers/configuration_roberta.py
+++ b/src/transformers/configuration_roberta.py
@@ -33,34 +33,33 @@ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class RobertaConfig(BertConfig):
     r"""
-        This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel`.
-        It is used to instantiate an RoBERTa model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
+    This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel`.
+    It is used to instantiate an RoBERTa model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
 
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
 
-        The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`.
-        It reuses the same defaults. Please check the parent class for more information.
+    The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`.
+    It reuses the same defaults. Please check the parent class for more information.
 
-        Example::
+    Example::
 
-            >>> from transformers import RobertaConfig, RobertaModel
+        >>> from transformers import RobertaConfig, RobertaModel
 
-            >>> # Initializing a RoBERTa configuration
-            >>> configuration = RobertaConfig()
+        >>> # Initializing a RoBERTa configuration
+        >>> configuration = RobertaConfig()
 
-            >>> # Initializing a model from the configuration
-            >>> model = RobertaModel(configuration)
+        >>> # Initializing a model from the configuration
+        >>> model = RobertaModel(configuration)
 
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
     """
     model_type = "roberta"
 
     def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs):
-        """Constructs RobertaConfig.
-        """
+        """Constructs RobertaConfig."""
         super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
diff --git a/src/transformers/configuration_t5.py b/src/transformers/configuration_t5.py
index 63ffd9d420..80e624c052 100644
--- a/src/transformers/configuration_t5.py
+++ b/src/transformers/configuration_t5.py
@@ -31,33 +31,33 @@ T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class T5Config(PretrainedConfig):
     r"""
-        :class:`~transformers.T5Config` is the configuration class to store the configuration of a
-        `T5Model`.
+    :class:`~transformers.T5Config` is the configuration class to store the configuration of a
+    `T5Model`.
 
 
-        Arguments:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `T5Model`.
-            d_model: Size of the encoder layers and the pooler layer. `d_model` can also accesed via the property `hidden_size`.
-            num_layers: Number of hidden layers in the Transformer encoder. `num_layers` can also be accessed via the property `num_hidden_layers`.
-            d_kv: Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model // num_heads`.
-            d_ff: Size of the intermediate feed forward layer in each `T5Block`.
-            num_heads: Number of attention heads for each attention layer in
-                the Transformer encoder. `num_heads` can also be accessed via the property `num_attention_heads`.
-            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
-                layer in the Transformer encoder.
-            hidden_act: The non-linear activation function (function or string) in the
-                encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-            hidden_dropout_prob: The dropout probabilitiy for all fully connected
-                layers in the embeddings, encoder, and pooler.
-            attention_probs_dropout_prob: The dropout ratio for the attention
-                probabilities.
-            n_positions: The maximum sequence length that this model might
-                ever be used with. Typically set this to something large just in case
-                (e.g., 512 or 1024 or 2048). `n_positions` can also be accessed via the property `max_position_embeddings`.
-            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
-                `T5Model`.
-            initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing).
-            layer_norm_eps: The epsilon used by LayerNorm.
+    Arguments:
+        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `T5Model`.
+        d_model: Size of the encoder layers and the pooler layer. `d_model` can also accesed via the property `hidden_size`.
+        num_layers: Number of hidden layers in the Transformer encoder. `num_layers` can also be accessed via the property `num_hidden_layers`.
+        d_kv: Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model // num_heads`.
+        d_ff: Size of the intermediate feed forward layer in each `T5Block`.
+        num_heads: Number of attention heads for each attention layer in
+            the Transformer encoder. `num_heads` can also be accessed via the property `num_attention_heads`.
+        intermediate_size: The size of the "intermediate" (i.e., feed-forward)
+            layer in the Transformer encoder.
+        hidden_act: The non-linear activation function (function or string) in the
+            encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+        hidden_dropout_prob: The dropout probabilitiy for all fully connected
+            layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob: The dropout ratio for the attention
+            probabilities.
+        n_positions: The maximum sequence length that this model might
+            ever be used with. Typically set this to something large just in case
+            (e.g., 512 or 1024 or 2048). `n_positions` can also be accessed via the property `max_position_embeddings`.
+        type_vocab_size: The vocabulary size of the `token_type_ids` passed into
+            `T5Model`.
+        initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing).
+        layer_norm_eps: The epsilon used by LayerNorm.
     """
     model_type = "t5"
 
@@ -80,7 +80,10 @@ class T5Config(PretrainedConfig):
         **kwargs
     ):
         super().__init__(
-            pad_token_id=pad_token_id, eos_token_id=eos_token_id, is_encoder_decoder=is_encoder_decoder, **kwargs,
+            pad_token_id=pad_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            **kwargs,
         )
         self.vocab_size = vocab_size
         self.n_positions = n_positions
diff --git a/src/transformers/configuration_transfo_xl.py b/src/transformers/configuration_transfo_xl.py
index 9214449637..4fbf599fe1 100644
--- a/src/transformers/configuration_transfo_xl.py
+++ b/src/transformers/configuration_transfo_xl.py
@@ -31,84 +31,84 @@ TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class TransfoXLConfig(PretrainedConfig):
     """
-        This is the configuration class to store the configuration of a :class:`~transformers.TransfoXLModel`.
-        It is used to instantiate a Transformer XL model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the `Transformer XL <https://huggingface.co/transfo-xl-wt103>`__ architecture.
+    This is the configuration class to store the configuration of a :class:`~transformers.TransfoXLModel`.
+    It is used to instantiate a Transformer XL model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the `Transformer XL <https://huggingface.co/transfo-xl-wt103>`__ architecture.
 
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
 
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 267735):
-                Vocabulary size of the Transformer XL model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.TransfoXLModel`.
-            cutoffs (:obj:`List[int]`, optional, defaults to :obj:`[20000, 40000, 200000]`):
-                Cutoffs for the adaptive softmax
-            d_model (:obj:`int`, optional, defaults to 1024):
-                Dimensionality of the model's hidden states.
-            d_embed (:obj:`int`, optional, defaults to 1024):
-                Dimensionality of the embeddings
-            n_head (:obj:`int`, optional, defaults to 16):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            d_head (:obj:`int`, optional, defaults to 64):
-                Dimensionality of the model's heads.
-            d_inner (:obj:`int`, optional, defaults to 4096):
-                Inner dimension in FF
-            div_val (:obj:`int`, optional, defaults to 4):
-                Divident value for adapative input and softmax
-            pre_lnorm (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Apply LayerNorm to the input instead of the output
-            n_layer (:obj:`int`, optional, defaults to 18):
-                Number of hidden layers in the Transformer encoder.
-            tgt_len (:obj:`int`, optional, defaults to 128):
-                Number of tokens to predict
-            ext_len (:obj:`int`, optional, defaults to 0):
-                Length of the extended context
-            mem_len (:obj:`int`, optional, defaults to 1600):
-                Length of the retained previous heads
-            clamp_len (:obj:`int`, optional, defaults to 1000):
-                use the same pos embeddings after clamp_len
-            same_length (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Use the same attn length for all tokens
-            proj_share_all_but_first (:obj:`boolean`, optional, defaults to :obj:`True`):
-                True to share all but first projs, False not to share.
-            attn_type (:obj:`int`, optional, defaults to 0):
-                Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
-            sample_softmax (:obj:`int`, optional, defaults to -1):
-                number of samples in sampled softmax
-            adaptive (:obj:`boolean`, optional, defaults to :obj:`True`):
-                use adaptive softmax
-            dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-            dropatt (:obj:`float`, optional, defaults to 0):
-                The dropout ratio for the attention probabilities.
-            untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Untie relative position biases
-            init (:obj:`string`, optional, defaults to `normal`):
-                Parameter initializer to use
-            init_range (:obj:`float`, optional, defaults to 0.01):
-                Parameters initialized by U(-init_range, init_range).
-            proj_init_std (:obj:`float`, optional, defaults to 0.01):
-                Parameters initialized by N(0, init_std)
-            init_std (:obj:`float`, optional, defaults to 0.02):
-                Parameters initialized by N(0, init_std)
-            layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
-                The epsilon to use in the layer normalization layers
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 267735):
+            Vocabulary size of the Transformer XL model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.TransfoXLModel`.
+        cutoffs (:obj:`List[int]`, optional, defaults to :obj:`[20000, 40000, 200000]`):
+            Cutoffs for the adaptive softmax
+        d_model (:obj:`int`, optional, defaults to 1024):
+            Dimensionality of the model's hidden states.
+        d_embed (:obj:`int`, optional, defaults to 1024):
+            Dimensionality of the embeddings
+        n_head (:obj:`int`, optional, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        d_head (:obj:`int`, optional, defaults to 64):
+            Dimensionality of the model's heads.
+        d_inner (:obj:`int`, optional, defaults to 4096):
+            Inner dimension in FF
+        div_val (:obj:`int`, optional, defaults to 4):
+            Divident value for adapative input and softmax
+        pre_lnorm (:obj:`boolean`, optional, defaults to :obj:`False`):
+            Apply LayerNorm to the input instead of the output
+        n_layer (:obj:`int`, optional, defaults to 18):
+            Number of hidden layers in the Transformer encoder.
+        tgt_len (:obj:`int`, optional, defaults to 128):
+            Number of tokens to predict
+        ext_len (:obj:`int`, optional, defaults to 0):
+            Length of the extended context
+        mem_len (:obj:`int`, optional, defaults to 1600):
+            Length of the retained previous heads
+        clamp_len (:obj:`int`, optional, defaults to 1000):
+            use the same pos embeddings after clamp_len
+        same_length (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Use the same attn length for all tokens
+        proj_share_all_but_first (:obj:`boolean`, optional, defaults to :obj:`True`):
+            True to share all but first projs, False not to share.
+        attn_type (:obj:`int`, optional, defaults to 0):
+            Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
+        sample_softmax (:obj:`int`, optional, defaults to -1):
+            number of samples in sampled softmax
+        adaptive (:obj:`boolean`, optional, defaults to :obj:`True`):
+            use adaptive softmax
+        dropout (:obj:`float`, optional, defaults to 0.1):
+            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+        dropatt (:obj:`float`, optional, defaults to 0):
+            The dropout ratio for the attention probabilities.
+        untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Untie relative position biases
+        init (:obj:`string`, optional, defaults to `normal`):
+            Parameter initializer to use
+        init_range (:obj:`float`, optional, defaults to 0.01):
+            Parameters initialized by U(-init_range, init_range).
+        proj_init_std (:obj:`float`, optional, defaults to 0.01):
+            Parameters initialized by N(0, init_std)
+        init_std (:obj:`float`, optional, defaults to 0.02):
+            Parameters initialized by N(0, init_std)
+        layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
+            The epsilon to use in the layer normalization layers
 
-        Example::
+    Example::
 
-            >>> from transformers import TransfoXLConfig, TransfoXLModel
+        >>> from transformers import TransfoXLConfig, TransfoXLModel
 
-            >>> # Initializing a Transformer XL configuration
-            >>> configuration = TransfoXLConfig()
+        >>> # Initializing a Transformer XL configuration
+        >>> configuration = TransfoXLConfig()
 
-            >>> # Initializing a model from the configuration
-            >>> model = TransfoXLModel(configuration)
+        >>> # Initializing a model from the configuration
+        >>> model = TransfoXLModel(configuration)
 
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
     """
 
     model_type = "transfo-xl"
diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index 7eeca428f1..4b92d198eb 100755
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -29,116 +29,116 @@ logger = logging.get_logger(__name__)
 
 
 class PretrainedConfig(object):
-    r""" Base class for all configuration classes.
-        Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving
-        configurations.
+    r"""Base class for all configuration classes.
+    Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving
+    configurations.
 
-        Note:
-            A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to
-            initialize a model does **not** load the model weights.
-            It only affects the model's configuration.
+    Note:
+        A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to
+        initialize a model does **not** load the model weights.
+        It only affects the model's configuration.
 
-        Class attributes (overridden by derived classes)
-            - **model_type** (:obj:`str`): An identifier for the model type, serialized into the JSON file, and used to
-              recreate the correct object in :class:`~transformers.AutoConfig`.
+    Class attributes (overridden by derived classes)
+        - **model_type** (:obj:`str`): An identifier for the model type, serialized into the JSON file, and used to
+          recreate the correct object in :class:`~transformers.AutoConfig`.
 
-        Args:
-            output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the model should return all hidden-states.
-            output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the model should returns all attentions.
-            use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
-                Whether or not the model should return the last key/values attentions (not used by all models).
-            return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the model should return a :class:`~transformers.file_utils.ModelOutput` instead of a
-                plain tuple.
-            is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether the model is used as an encoder/decoder or not.
-            is_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether the model is used as decoder or not (in which case it's used as an encoder).
-            add_cross_attention (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether cross-attention layers should be added to the model. Note, this option is only relevant for models that can be used as decoder models within the `:class:~transformers.EncoderDecoderModel` class, which consists of all models in ``AUTO_MODELS_FOR_CAUSAL_LM``.
-            tie_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`)
-                Whether all encoder weights should be tied to their equivalent decoder weights. This requires the encoder and decoder model to have the exact same parameter names.
-            prune_heads (:obj:`Dict[int, List[int]]`, `optional`, defaults to :obj:`{}`):
-                Pruned heads of the model. The keys are the selected layer indices and the associated values, the list
-                of heads to prune in said layer.
+    Args:
+        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether or not the model should return all hidden-states.
+        output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether or not the model should returns all attentions.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether or not the model should return a :class:`~transformers.file_utils.ModelOutput` instead of a
+            plain tuple.
+        is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether the model is used as an encoder/decoder or not.
+        is_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether the model is used as decoder or not (in which case it's used as an encoder).
+        add_cross_attention (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether cross-attention layers should be added to the model. Note, this option is only relevant for models that can be used as decoder models within the `:class:~transformers.EncoderDecoderModel` class, which consists of all models in ``AUTO_MODELS_FOR_CAUSAL_LM``.
+        tie_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`)
+            Whether all encoder weights should be tied to their equivalent decoder weights. This requires the encoder and decoder model to have the exact same parameter names.
+        prune_heads (:obj:`Dict[int, List[int]]`, `optional`, defaults to :obj:`{}`):
+            Pruned heads of the model. The keys are the selected layer indices and the associated values, the list
+            of heads to prune in said layer.
 
-                For instance ``{1: [0, 2], 2: [2, 3]}`` will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer
-                2.
-            xla_device (:obj:`bool`, `optional`):
-                A flag to indicate if TPU are available or not.
-            chunk_size_feed_forward (:obj:`int`, `optional`, defaults to :obj:`0`):
-                The chunk size of all feed forward layers in the residual attention blocks.
-                A chunk size of :obj:`0` means that the feed forward layer is not chunked.
-                A chunk size of n means that the feed forward layer processes :obj:`n` < sequence_length embeddings at a time.
-                For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ .
+            For instance ``{1: [0, 2], 2: [2, 3]}`` will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer
+            2.
+        xla_device (:obj:`bool`, `optional`):
+            A flag to indicate if TPU are available or not.
+        chunk_size_feed_forward (:obj:`int`, `optional`, defaults to :obj:`0`):
+            The chunk size of all feed forward layers in the residual attention blocks.
+            A chunk size of :obj:`0` means that the feed forward layer is not chunked.
+            A chunk size of n means that the feed forward layer processes :obj:`n` < sequence_length embeddings at a time.
+            For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ .
 
-        Parameters for sequence generation
-            - **max_length** (:obj:`int`, `optional`, defaults to 20) -- Maximum length that will be used by
-              default in the :obj:`generate` method of the model.
-            - **min_length** (:obj:`int`, `optional`, defaults to 10) -- Minimum length that will be used by
-              default in the :obj:`generate` method of the model.
-            - **do_sample** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by default in
-              the :obj:`generate` method of the model. Whether or not to use sampling ; use greedy decoding otherwise.
-            - **early_stopping** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by
-              default in the :obj:`generate` method of the model. Whether to stop the beam search when at least
-              ``num_beams`` sentences are finished per batch or not.
-            - **num_beams** (:obj:`int`, `optional`, defaults to 1) -- Number of beams for beam search that will be
-              used by default in the :obj:`generate` method of the model. 1 means no beam search.
-            - **temperature** (:obj:`float`, `optional`, defaults to 1) -- The value used to module the next token
-              probabilities that will be used by default in the :obj:`generate` method of the model. Must be strictly
-              positive.
-            - **top_k** (:obj:`int`, `optional`, defaults to 50) -- Number of highest probability vocabulary tokens to
-              keep for top-k-filtering that will be used by default in the :obj:`generate` method of the model.
-            - **top_p** (:obj:`float`, `optional`, defaults to 1) --  Value that will be used by default in the
-              :obj:`generate` method of the model for ``top_p``. If set to float < 1, only the most probable tokens
-              with probabilities that add up to ``top_p`` or higher are kept for generation.
-            - **repetition_penalty** (:obj:`float`, `optional`, defaults to 1) -- Parameter for repetition penalty
-              that will be used by default in the :obj:`generate` method of the model. 1.0 means no penalty.
-            - **length_penalty** (:obj:`float`, `optional`, defaults to 1) -- Exponential penalty to the length that
-              will be used by default in the :obj:`generate` method of the model.
-            - **no_repeat_ngram_size** (:obj:`int`, `optional`, defaults to 0) -- Value that will be used by default
-              in the :obj:`generate` method of the model for ``no_repeat_ngram_size``. If set to int > 0, all ngrams of
-              that size can only occur once.
-            - **bad_words_ids** (:obj:`List[int]`, `optional`) -- List of token ids that are not allowed to be
-              generated that will be used by default in the :obj:`generate` method of the model. In order to get the
-              tokens of the words that should not appear in the generated text, use
-              :obj:`tokenizer.encode(bad_word, add_prefix_space=True)`.
-            - **num_return_sequences** (:obj:`int`, `optional`, defaults to 1) -- Number of independently computed
-              returned sequences for each element in the batch that will be used by default in the :obj:`generate`
-              method of the model.
+    Parameters for sequence generation
+        - **max_length** (:obj:`int`, `optional`, defaults to 20) -- Maximum length that will be used by
+          default in the :obj:`generate` method of the model.
+        - **min_length** (:obj:`int`, `optional`, defaults to 10) -- Minimum length that will be used by
+          default in the :obj:`generate` method of the model.
+        - **do_sample** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by default in
+          the :obj:`generate` method of the model. Whether or not to use sampling ; use greedy decoding otherwise.
+        - **early_stopping** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by
+          default in the :obj:`generate` method of the model. Whether to stop the beam search when at least
+          ``num_beams`` sentences are finished per batch or not.
+        - **num_beams** (:obj:`int`, `optional`, defaults to 1) -- Number of beams for beam search that will be
+          used by default in the :obj:`generate` method of the model. 1 means no beam search.
+        - **temperature** (:obj:`float`, `optional`, defaults to 1) -- The value used to module the next token
+          probabilities that will be used by default in the :obj:`generate` method of the model. Must be strictly
+          positive.
+        - **top_k** (:obj:`int`, `optional`, defaults to 50) -- Number of highest probability vocabulary tokens to
+          keep for top-k-filtering that will be used by default in the :obj:`generate` method of the model.
+        - **top_p** (:obj:`float`, `optional`, defaults to 1) --  Value that will be used by default in the
+          :obj:`generate` method of the model for ``top_p``. If set to float < 1, only the most probable tokens
+          with probabilities that add up to ``top_p`` or higher are kept for generation.
+        - **repetition_penalty** (:obj:`float`, `optional`, defaults to 1) -- Parameter for repetition penalty
+          that will be used by default in the :obj:`generate` method of the model. 1.0 means no penalty.
+        - **length_penalty** (:obj:`float`, `optional`, defaults to 1) -- Exponential penalty to the length that
+          will be used by default in the :obj:`generate` method of the model.
+        - **no_repeat_ngram_size** (:obj:`int`, `optional`, defaults to 0) -- Value that will be used by default
+          in the :obj:`generate` method of the model for ``no_repeat_ngram_size``. If set to int > 0, all ngrams of
+          that size can only occur once.
+        - **bad_words_ids** (:obj:`List[int]`, `optional`) -- List of token ids that are not allowed to be
+          generated that will be used by default in the :obj:`generate` method of the model. In order to get the
+          tokens of the words that should not appear in the generated text, use
+          :obj:`tokenizer.encode(bad_word, add_prefix_space=True)`.
+        - **num_return_sequences** (:obj:`int`, `optional`, defaults to 1) -- Number of independently computed
+          returned sequences for each element in the batch that will be used by default in the :obj:`generate`
+          method of the model.
 
-        Parameters for fine-tuning tasks
-            - **architectures** (:obj:`List[str]`, `optional`) -- Model architectures that can be used with the
-              model pretrained weights.
-            - **finetuning_task** (:obj:`str`, `optional`) -- Name of the task used to fine-tune the model. This can be
-              used when converting from an original (TensorFlow or PyTorch) checkpoint.
-            - **id2label** (:obj:`List[str]`, `optional`) -- A map from index (for instance prediction index, or target
-              index) to label.
-            - **label2id** (:obj:`Dict[str, int]`, `optional`) -- A map from label to index for the model.
-            - **num_labels** (:obj:`int`, `optional`) -- Number of labels to use in the last layer added to the model,
-              typically for a classification task.
-            - **task_specific_params** (:obj:`Dict[str, Any]`, `optional`) -- Additional keyword arguments to store for
-              the current task.
+    Parameters for fine-tuning tasks
+        - **architectures** (:obj:`List[str]`, `optional`) -- Model architectures that can be used with the
+          model pretrained weights.
+        - **finetuning_task** (:obj:`str`, `optional`) -- Name of the task used to fine-tune the model. This can be
+          used when converting from an original (TensorFlow or PyTorch) checkpoint.
+        - **id2label** (:obj:`List[str]`, `optional`) -- A map from index (for instance prediction index, or target
+          index) to label.
+        - **label2id** (:obj:`Dict[str, int]`, `optional`) -- A map from label to index for the model.
+        - **num_labels** (:obj:`int`, `optional`) -- Number of labels to use in the last layer added to the model,
+          typically for a classification task.
+        - **task_specific_params** (:obj:`Dict[str, Any]`, `optional`) -- Additional keyword arguments to store for
+          the current task.
 
-        Parameters linked to the tokenizer
-            - **prefix** (:obj:`str`, `optional`) -- A specific prompt that should be added at the beginning of each
-              text before calling the model.
-            - **bos_token_id** (:obj:`int`, `optional`)) -- The id of the `beginning-of-stream` token.
-            - **pad_token_id** (:obj:`int`, `optional`)) -- The id of the `padding` token.
-            - **eos_token_id** (:obj:`int`, `optional`)) -- The id of the `end-of-stream` token.
-            - **decoder_start_token_id** (:obj:`int`, `optional`)) -- If an encoder-decoder model starts decoding with
-              a different token than `bos`, the id of that token.
+    Parameters linked to the tokenizer
+        - **prefix** (:obj:`str`, `optional`) -- A specific prompt that should be added at the beginning of each
+          text before calling the model.
+        - **bos_token_id** (:obj:`int`, `optional`)) -- The id of the `beginning-of-stream` token.
+        - **pad_token_id** (:obj:`int`, `optional`)) -- The id of the `padding` token.
+        - **eos_token_id** (:obj:`int`, `optional`)) -- The id of the `end-of-stream` token.
+        - **decoder_start_token_id** (:obj:`int`, `optional`)) -- If an encoder-decoder model starts decoding with
+          a different token than `bos`, the id of that token.
 
-        PyTorch specific parameters
-            - **torchscript** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should be
-              used with Torchscript.
-            - **tie_word_embeddings** (:obj:`bool`, `optional`, defaults to :obj:`True`) -- Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the model has a output word embedding layer.
+    PyTorch specific parameters
+        - **torchscript** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should be
+          used with Torchscript.
+        - **tie_word_embeddings** (:obj:`bool`, `optional`, defaults to :obj:`True`) -- Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the model has a output word embedding layer.
 
-        TensorFlow specific parameters
-            - **use_bfloat16** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should
-              use BFloat16 scalars (only used by some TensorFlow models).
+    TensorFlow specific parameters
+        - **use_bfloat16** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should
+          use BFloat16 scalars (only used by some TensorFlow models).
     """
     model_type: str = ""
 
diff --git a/src/transformers/configuration_xlm.py b/src/transformers/configuration_xlm.py
index 8ee44b56ae..201d434e2c 100644
--- a/src/transformers/configuration_xlm.py
+++ b/src/transformers/configuration_xlm.py
@@ -36,120 +36,120 @@ XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class XLMConfig(PretrainedConfig):
     """
-        This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
-        It is used to instantiate an XLM model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
+    This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
+    It is used to instantiate an XLM model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
 
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
 
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 30145):
-                Vocabulary size of the XLM model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`.
-            emb_dim (:obj:`int`, optional, defaults to 2048):
-                Dimensionality of the encoder layers and the pooler layer.
-            n_layer (:obj:`int`, optional, defaults to 12):
-                Number of hidden layers in the Transformer encoder.
-            n_head (:obj:`int`, optional, defaults to 16):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout probability for all fully connected
-                layers in the embeddings, encoder, and pooler.
-            attention_dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout probability for the attention mechanism
-            gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
-                The non-linear activation function (function or string) in the
-                encoder and pooler. If set to `True`, "gelu" will be used instead of "relu".
-            sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
-            causal (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Set this to `True` for the model to behave in a causal manner.
-                Causal models use a triangular attention mask in order to only attend to the left-side context instead
-                if a bidirectional context.
-            asm (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
-                layer.
-            n_langs (:obj:`int`, optional, defaults to 1):
-                The number of languages the model handles. Set to 1 for monolingual models.
-            use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`)
-                Whether to use language embeddings. Some models use additional language embeddings, see
-                `the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
-                for information on how to use them.
-            max_position_embeddings (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might
-                ever be used with. Typically set this to something large just in case
-                (e.g., 512 or 1024 or 2048).
-            embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
-                The standard deviation of the truncated_normal_initializer for
-                initializing the embedding matrices.
-            init_std (:obj:`int`, optional, defaults to 50257):
-                The standard deviation of the truncated_normal_initializer for
-                initializing all weight matrices except the embedding matrices.
-            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
-                The epsilon used by the layer normalization layers.
-            bos_index (:obj:`int`, optional, defaults to 0):
-                The index of the beginning of sentence token in the vocabulary.
-            eos_index (:obj:`int`, optional, defaults to 1):
-                The index of the end of sentence token in the vocabulary.
-            pad_index (:obj:`int`, optional, defaults to 2):
-                The index of the padding token in the vocabulary.
-            unk_index (:obj:`int`, optional, defaults to 3):
-                The index of the unknown token in the vocabulary.
-            mask_index (:obj:`int`, optional, defaults to 5):
-                The index of the masking token in the vocabulary.
-            is_encoder(:obj:`boolean`, optional, defaults to :obj:`True`):
-                Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
-            summary_type (:obj:`string`, optional, defaults to "first"):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                Is one of the following options:
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 30145):
+            Vocabulary size of the XLM model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`.
+        emb_dim (:obj:`int`, optional, defaults to 2048):
+            Dimensionality of the encoder layers and the pooler layer.
+        n_layer (:obj:`int`, optional, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        n_head (:obj:`int`, optional, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        dropout (:obj:`float`, optional, defaults to 0.1):
+            The dropout probability for all fully connected
+            layers in the embeddings, encoder, and pooler.
+        attention_dropout (:obj:`float`, optional, defaults to 0.1):
+            The dropout probability for the attention mechanism
+        gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
+            The non-linear activation function (function or string) in the
+            encoder and pooler. If set to `True`, "gelu" will be used instead of "relu".
+        sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
+            Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
+        causal (:obj:`boolean`, optional, defaults to :obj:`False`):
+            Set this to `True` for the model to behave in a causal manner.
+            Causal models use a triangular attention mask in order to only attend to the left-side context instead
+            if a bidirectional context.
+        asm (:obj:`boolean`, optional, defaults to :obj:`False`):
+            Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
+            layer.
+        n_langs (:obj:`int`, optional, defaults to 1):
+            The number of languages the model handles. Set to 1 for monolingual models.
+        use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`)
+            Whether to use language embeddings. Some models use additional language embeddings, see
+            `the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
+            for information on how to use them.
+        max_position_embeddings (:obj:`int`, optional, defaults to 512):
+            The maximum sequence length that this model might
+            ever be used with. Typically set this to something large just in case
+            (e.g., 512 or 1024 or 2048).
+        embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
+            The standard deviation of the truncated_normal_initializer for
+            initializing the embedding matrices.
+        init_std (:obj:`int`, optional, defaults to 50257):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices except the embedding matrices.
+        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        bos_index (:obj:`int`, optional, defaults to 0):
+            The index of the beginning of sentence token in the vocabulary.
+        eos_index (:obj:`int`, optional, defaults to 1):
+            The index of the end of sentence token in the vocabulary.
+        pad_index (:obj:`int`, optional, defaults to 2):
+            The index of the padding token in the vocabulary.
+        unk_index (:obj:`int`, optional, defaults to 3):
+            The index of the unknown token in the vocabulary.
+        mask_index (:obj:`int`, optional, defaults to 5):
+            The index of the masking token in the vocabulary.
+        is_encoder(:obj:`boolean`, optional, defaults to :obj:`True`):
+            Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
+        summary_type (:obj:`string`, optional, defaults to "first"):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.XLMForSequenceClassification`.
+            Is one of the following options:
 
-                - 'last' => take the last token hidden state (like XLNet)
-                - 'first' => take the first token hidden state (like Bert)
-                - 'mean' => take the mean of all tokens hidden states
-                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
-                - 'attn' => Not implemented now, use multi-head attention
-            summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                Add a projection after the vector extraction
-            summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                'tanh' => add a tanh activation to the output, Other => no activation.
-            summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
-            summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                Add a dropout before the projection and activation
-            start_n_top (:obj:`int`, optional, defaults to 5):
-                Used in the SQuAD evaluation script for XLM and XLNet.
-            end_n_top (:obj:`int`, optional, defaults to 5):
-                Used in the SQuAD evaluation script for XLM and XLNet.
-            mask_token_id (:obj:`int`, optional, defaults to 0):
-                Model agnostic parameter to identify masked tokens when generating text in an MLM context.
-            lang_id (:obj:`int`, optional, defaults to 1):
-                The ID of the language used by the model. This parameter is used when generating
-                text in a given language.
+            - 'last' => take the last token hidden state (like XLNet)
+            - 'first' => take the first token hidden state (like Bert)
+            - 'mean' => take the mean of all tokens hidden states
+            - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
+            - 'attn' => Not implemented now, use multi-head attention
+        summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.XLMForSequenceClassification`.
+            Add a projection after the vector extraction
+        summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.XLMForSequenceClassification`.
+            'tanh' => add a tanh activation to the output, Other => no activation.
+        summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.XLMForSequenceClassification`.
+            If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
+        summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.XLMForSequenceClassification`.
+            Add a dropout before the projection and activation
+        start_n_top (:obj:`int`, optional, defaults to 5):
+            Used in the SQuAD evaluation script for XLM and XLNet.
+        end_n_top (:obj:`int`, optional, defaults to 5):
+            Used in the SQuAD evaluation script for XLM and XLNet.
+        mask_token_id (:obj:`int`, optional, defaults to 0):
+            Model agnostic parameter to identify masked tokens when generating text in an MLM context.
+        lang_id (:obj:`int`, optional, defaults to 1):
+            The ID of the language used by the model. This parameter is used when generating
+            text in a given language.
 
-        Example::
+    Example::
 
-            >>> from transformers import XLMConfig, XLMModel
+        >>> from transformers import XLMConfig, XLMModel
 
-            >>> # Initializing a XLM configuration
-            >>> configuration = XLMConfig()
+        >>> # Initializing a XLM configuration
+        >>> configuration = XLMConfig()
 
-            >>> # Initializing a model from the configuration
-            >>> model = XLMModel(configuration)
+        >>> # Initializing a model from the configuration
+        >>> model = XLMModel(configuration)
 
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
     """
 
     model_type = "xlm"
@@ -191,8 +191,7 @@ class XLMConfig(PretrainedConfig):
         bos_token_id=0,
         **kwargs
     ):
-        """Constructs XLMConfig.
-        """
+        """Constructs XLMConfig."""
         super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
         self.vocab_size = vocab_size
         self.emb_dim = emb_dim
diff --git a/src/transformers/configuration_xlnet.py b/src/transformers/configuration_xlnet.py
index 0fd04b0c4f..f362957ede 100644
--- a/src/transformers/configuration_xlnet.py
+++ b/src/transformers/configuration_xlnet.py
@@ -31,104 +31,104 @@ XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class XLNetConfig(PretrainedConfig):
     """
-        This is the configuration class to store the configuration of a :class:`~transformers.XLNetModel`.
-        It is used to instantiate an XLNet model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the `xlnet-large-cased <https://huggingface.co/xlnet-large-cased>`__ architecture.
+    This is the configuration class to store the configuration of a :class:`~transformers.XLNetModel`.
+    It is used to instantiate an XLNet model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the `xlnet-large-cased <https://huggingface.co/xlnet-large-cased>`__ architecture.
 
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
 
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 32000):
-                Vocabulary size of the XLNet model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLNetModel`.
-            d_model (:obj:`int`, optional, defaults to 1024):
-                Dimensionality of the encoder layers and the pooler layer.
-            n_layer (:obj:`int`, optional, defaults to 24):
-                Number of hidden layers in the Transformer encoder.
-            n_head (:obj:`int`, optional, defaults to 16):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            d_inner (:obj:`int`, optional, defaults to 4096):
-                Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-            ff_activation (:obj:`string`, optional, defaults to "gelu"):
-                The non-linear activation function (function or string) in the
-                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
-            untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Untie relative position biases
-            attn_type (:obj:`string`, optional, defaults to "bi"):
-                The attention type used by the model. Set 'bi' for XLNet, 'uni' for Transformer-XL.
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
-                The epsilon used by the layer normalization layers.
-            dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-            mem_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
-                The number of tokens to cache. The key/value pairs that have already been pre-computed
-                in a previous forward pass won't be re-computed. See the
-                `quickstart <https://huggingface.co/transformers/quickstart.html#using-the-past>`__
-                for more information.
-            reuse_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
-                The number of tokens in the current batch to be cached and reused in the future.
-            bi_data (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Whether to use bidirectional input pipeline. Usually set to `True` during
-                pretraining and `False` during finetuning.
-            clamp_len (:obj:`int`, optional, defaults to -1):
-                Clamp all relative distances larger than clamp_len.
-                Setting this attribute to -1 means no clamping.
-            same_length (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Whether to use the same attention length for each token.
-            summary_type (:obj:`string`, optional, defaults to "last"):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
-                Is one of the following options:
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 32000):
+            Vocabulary size of the XLNet model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLNetModel`.
+        d_model (:obj:`int`, optional, defaults to 1024):
+            Dimensionality of the encoder layers and the pooler layer.
+        n_layer (:obj:`int`, optional, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        n_head (:obj:`int`, optional, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        d_inner (:obj:`int`, optional, defaults to 4096):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        ff_activation (:obj:`string`, optional, defaults to "gelu"):
+            The non-linear activation function (function or string) in the
+            encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+        untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Untie relative position biases
+        attn_type (:obj:`string`, optional, defaults to "bi"):
+            The attention type used by the model. Set 'bi' for XLNet, 'uni' for Transformer-XL.
+        initializer_range (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        dropout (:obj:`float`, optional, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        mem_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
+            The number of tokens to cache. The key/value pairs that have already been pre-computed
+            in a previous forward pass won't be re-computed. See the
+            `quickstart <https://huggingface.co/transformers/quickstart.html#using-the-past>`__
+            for more information.
+        reuse_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
+            The number of tokens in the current batch to be cached and reused in the future.
+        bi_data (:obj:`boolean`, optional, defaults to :obj:`False`):
+            Whether to use bidirectional input pipeline. Usually set to `True` during
+            pretraining and `False` during finetuning.
+        clamp_len (:obj:`int`, optional, defaults to -1):
+            Clamp all relative distances larger than clamp_len.
+            Setting this attribute to -1 means no clamping.
+        same_length (:obj:`boolean`, optional, defaults to :obj:`False`):
+            Whether to use the same attention length for each token.
+        summary_type (:obj:`string`, optional, defaults to "last"):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
+            Is one of the following options:
 
-                - 'last' => take the last token hidden state (like XLNet)
-                - 'first' => take the first token hidden state (like Bert)
-                - 'mean' => take the mean of all tokens hidden states
-                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
-                - 'attn' => Not implemented now, use multi-head attention
-            summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
-                Add a projection after the vector extraction
-            summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
-                'tanh' => add a tanh activation to the output, Other => no activation.
-            summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
-                If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
-            summary_last_dropout (:obj:`float`, optional, defaults to 0.1):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
-                Add a dropout after the projection and activation
-            start_n_top (:obj:`int`, optional, defaults to 5):
-                Used in the SQuAD evaluation script for XLM and XLNet.
-            end_n_top (:obj:`int`, optional, defaults to 5):
-                Used in the SQuAD evaluation script for XLM and XLNet.
-            use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
-                Whether or not the model should return the last pre-computed hidden states.
+            - 'last' => take the last token hidden state (like XLNet)
+            - 'first' => take the first token hidden state (like Bert)
+            - 'mean' => take the mean of all tokens hidden states
+            - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
+            - 'attn' => Not implemented now, use multi-head attention
+        summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
+            Add a projection after the vector extraction
+        summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
+            'tanh' => add a tanh activation to the output, Other => no activation.
+        summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
+            If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
+        summary_last_dropout (:obj:`float`, optional, defaults to 0.1):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
+            Add a dropout after the projection and activation
+        start_n_top (:obj:`int`, optional, defaults to 5):
+            Used in the SQuAD evaluation script for XLM and XLNet.
+        end_n_top (:obj:`int`, optional, defaults to 5):
+            Used in the SQuAD evaluation script for XLM and XLNet.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last pre-computed hidden states.
 
-                .. note::
-                    This flag behaves differently from with other models: it just controls the inference behavior, during
-                    training the model always uses ``use_cache=True``.
+            .. note::
+                This flag behaves differently from with other models: it just controls the inference behavior, during
+                training the model always uses ``use_cache=True``.
 
-        Example::
+    Example::
 
-            >>> from transformers import XLNetConfig, XLNetModel
+        >>> from transformers import XLNetConfig, XLNetModel
 
-            >>> # Initializing a XLNet configuration
-            >>> configuration = XLNetConfig()
+        >>> # Initializing a XLNet configuration
+        >>> configuration = XLNetConfig()
 
-            >>> # Initializing a model from the configuration
-            >>> model = XLNetModel(configuration)
+        >>> # Initializing a model from the configuration
+        >>> model = XLNetModel(configuration)
 
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
     """
 
     model_type = "xlnet"
@@ -162,8 +162,7 @@ class XLNetConfig(PretrainedConfig):
         eos_token_id=2,
         **kwargs
     ):
-        """Constructs XLNetConfig.
-        """
+        """Constructs XLNetConfig."""
         super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
         self.vocab_size = vocab_size
         self.d_model = d_model
diff --git a/src/transformers/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py
index 987e2ee78f..f2af5972f8 100644
--- a/src/transformers/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py
@@ -27,5 +27,6 @@ if __name__ == "__main__":
         checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl")
         pytorch_dump_folder_path = f"./DialoGPT-{MODEL}"
         convert_dialogpt_checkpoint(
-            checkpoint_path, pytorch_dump_folder_path,
+            checkpoint_path,
+            pytorch_dump_folder_path,
         )
diff --git a/src/transformers/convert_graph_to_onnx.py b/src/transformers/convert_graph_to_onnx.py
index 990895a501..dbdc00249c 100644
--- a/src/transformers/convert_graph_to_onnx.py
+++ b/src/transformers/convert_graph_to_onnx.py
@@ -38,24 +38,39 @@ class OnnxConverterArgumentParser(ArgumentParser):
         super().__init__("ONNX Converter")
 
         self.add_argument(
-            "--pipeline", type=str, choices=SUPPORTED_PIPELINES, default="feature-extraction",
+            "--pipeline",
+            type=str,
+            choices=SUPPORTED_PIPELINES,
+            default="feature-extraction",
         )
         self.add_argument(
-            "--model", type=str, required=True, help="Model's id or path (ex: bert-base-cased)",
+            "--model",
+            type=str,
+            required=True,
+            help="Model's id or path (ex: bert-base-cased)",
         )
         self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: bert-base-cased)")
         self.add_argument(
-            "--framework", type=str, choices=["pt", "tf"], help="Framework for loading the model",
+            "--framework",
+            type=str,
+            choices=["pt", "tf"],
+            help="Framework for loading the model",
         )
         self.add_argument("--opset", type=int, default=11, help="ONNX opset to use")
         self.add_argument(
-            "--check-loading", action="store_true", help="Check ONNX is able to load the model",
+            "--check-loading",
+            action="store_true",
+            help="Check ONNX is able to load the model",
         )
         self.add_argument(
-            "--use-external-format", action="store_true", help="Allow exporting model >= than 2Gb",
+            "--use-external-format",
+            action="store_true",
+            help="Allow exporting model >= than 2Gb",
         )
         self.add_argument(
-            "--quantize", action="store_true", help="Quantize the neural network to be run with int8",
+            "--quantize",
+            action="store_true",
+            help="Quantize the neural network to be run with int8",
         )
         self.add_argument("output")
 
@@ -376,7 +391,10 @@ def quantize(onnx_model_path: Path) -> Path:
     )
 
     quantized_model = quantize(
-        model=onnx_model, quantization_mode=QuantizationMode.IntegerOps, force_fusions=True, symmetric_weight=True,
+        model=onnx_model,
+        quantization_mode=QuantizationMode.IntegerOps,
+        force_fusions=True,
+        symmetric_weight=True,
     )
 
     # Append "-quantized" at the end of the model's name
diff --git a/src/transformers/convert_marian_to_pytorch.py b/src/transformers/convert_marian_to_pytorch.py
index 8843982870..ce001166a8 100644
--- a/src/transformers/convert_marian_to_pytorch.py
+++ b/src/transformers/convert_marian_to_pytorch.py
@@ -255,7 +255,11 @@ license: apache-2.0
 
 
 def write_model_card(
-    hf_model_name: str, repo_root="OPUS-MT-train", save_dir=Path("marian_converted"), dry_run=False, extra_metadata={},
+    hf_model_name: str,
+    repo_root="OPUS-MT-train",
+    save_dir=Path("marian_converted"),
+    dry_run=False,
+    extra_metadata={},
 ) -> str:
     """Copy the most recent model's readme section from opus, and add metadata.
     upload command: aws s3 sync model_card_dir s3://models.huggingface.co/bert/Helsinki-NLP/ --dryrun
@@ -604,7 +608,9 @@ class OpusState:
 
         assert "hidden_size" not in cfg.to_dict()
         load_layers_(
-            model.model.encoder.layers, state_dict, BART_CONVERTER,
+            model.model.encoder.layers,
+            state_dict,
+            BART_CONVERTER,
         )
         load_layers_(model.model.decoder.layers, state_dict, BART_CONVERTER, is_decoder=True)
 
diff --git a/src/transformers/convert_pytorch_checkpoint_to_tf2.py b/src/transformers/convert_pytorch_checkpoint_to_tf2.py
index 5e27ee62b6..51b57d005f 100755
--- a/src/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/src/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -108,7 +108,12 @@ if is_torch_available():
 logging.set_verbosity_info()
 
 MODEL_CLASSES = {
-    "bert": (BertConfig, TFBertForPreTraining, BertForPreTraining, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,),
+    "bert": (
+        BertConfig,
+        TFBertForPreTraining,
+        BertForPreTraining,
+        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
     "bert-large-uncased-whole-word-masking-finetuned-squad": (
         BertConfig,
         TFBertForQuestionAnswering,
@@ -127,9 +132,24 @@ MODEL_CLASSES = {
         BertForSequenceClassification,
         BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
     ),
-    "gpt2": (GPT2Config, TFGPT2LMHeadModel, GPT2LMHeadModel, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,),
-    "xlnet": (XLNetConfig, TFXLNetLMHeadModel, XLNetLMHeadModel, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,),
-    "xlm": (XLMConfig, TFXLMWithLMHeadModel, XLMWithLMHeadModel, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,),
+    "gpt2": (
+        GPT2Config,
+        TFGPT2LMHeadModel,
+        GPT2LMHeadModel,
+        GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
+    "xlnet": (
+        XLNetConfig,
+        TFXLNetLMHeadModel,
+        XLNetLMHeadModel,
+        XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
+    "xlm": (
+        XLMConfig,
+        TFXLMWithLMHeadModel,
+        XLMWithLMHeadModel,
+        XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
     "xlm-roberta": (
         XLMRobertaConfig,
         TFXLMRobertaForMaskedLM,
@@ -148,7 +168,12 @@ MODEL_CLASSES = {
         OpenAIGPTLMHeadModel,
         OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
     ),
-    "roberta": (RobertaConfig, TFRobertaForMaskedLM, RobertaForMaskedLM, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,),
+    "roberta": (
+        RobertaConfig,
+        TFRobertaForMaskedLM,
+        RobertaForMaskedLM,
+        ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
     "roberta-large-mnli": (
         RobertaConfig,
         TFRobertaForSequenceClassification,
@@ -179,10 +204,30 @@ MODEL_CLASSES = {
         DistilBertForQuestionAnswering,
         DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
     ),
-    "ctrl": (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,),
-    "albert": (AlbertConfig, TFAlbertForPreTraining, AlbertForPreTraining, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,),
-    "t5": (T5Config, TFT5ForConditionalGeneration, T5ForConditionalGeneration, T5_PRETRAINED_CONFIG_ARCHIVE_MAP,),
-    "electra": (ElectraConfig, TFElectraForPreTraining, ElectraForPreTraining, ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP,),
+    "ctrl": (
+        CTRLConfig,
+        TFCTRLLMHeadModel,
+        CTRLLMHeadModel,
+        CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
+    "albert": (
+        AlbertConfig,
+        TFAlbertForPreTraining,
+        AlbertForPreTraining,
+        ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
+    "t5": (
+        T5Config,
+        TFT5ForConditionalGeneration,
+        T5ForConditionalGeneration,
+        T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
+    "electra": (
+        ElectraConfig,
+        TFElectraForPreTraining,
+        ElectraForPreTraining,
+        ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
 }
 
 
diff --git a/src/transformers/convert_reformer_trax_checkpoint_to_pytorch.py b/src/transformers/convert_reformer_trax_checkpoint_to_pytorch.py
index ace650260f..97d1a63f51 100755
--- a/src/transformers/convert_reformer_trax_checkpoint_to_pytorch.py
+++ b/src/transformers/convert_reformer_trax_checkpoint_to_pytorch.py
@@ -49,10 +49,12 @@ def set_layer_weights_in_torch_lsh(weights, torch_layer, hidden_size):
         torch.tensor(np_query_key).transpose(1, 2).contiguous().view(-1, hidden_size),
     )
     set_param(
-        torch_layer.self_attention.value, torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size),
+        torch_layer.self_attention.value,
+        torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size),
     )
     set_param(
-        torch_layer.output.dense, torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1),
+        torch_layer.output.dense,
+        torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1),
     )
 
 
@@ -64,16 +66,20 @@ def set_layer_weights_in_torch_local(weights, torch_layer, hidden_size):
     np_dense = np.asarray(weights[3])
 
     set_param(
-        torch_layer.self_attention.query, torch.tensor(np_query).transpose(1, 2).contiguous().view(-1, hidden_size),
+        torch_layer.self_attention.query,
+        torch.tensor(np_query).transpose(1, 2).contiguous().view(-1, hidden_size),
     )
     set_param(
-        torch_layer.self_attention.key, torch.tensor(np_key).transpose(1, 2).contiguous().view(-1, hidden_size),
+        torch_layer.self_attention.key,
+        torch.tensor(np_key).transpose(1, 2).contiguous().view(-1, hidden_size),
     )
     set_param(
-        torch_layer.self_attention.value, torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size),
+        torch_layer.self_attention.value,
+        torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size),
     )
     set_param(
-        torch_layer.output.dense, torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1),
+        torch_layer.output.dense,
+        torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1),
     )
 
 
@@ -83,7 +89,9 @@ def set_block_weights_in_torch(weights, torch_block, hidden_size):
     layer_norm_1_weight = np.asarray(layer_norm_1[0])
     layer_norm_1_bias = np.asarray(layer_norm_1[1])
     set_param(
-        torch_block.attention.layer_norm, torch.tensor(layer_norm_1_weight), torch.tensor(layer_norm_1_bias),
+        torch_block.attention.layer_norm,
+        torch.tensor(layer_norm_1_weight),
+        torch.tensor(layer_norm_1_bias),
     )
 
     # lsh weights + output
@@ -104,7 +112,9 @@ def set_block_weights_in_torch(weights, torch_block, hidden_size):
     layer_norm_2_weight = np.asarray(intermediate_weights[0][0])
     layer_norm_2_bias = np.asarray(intermediate_weights[0][1])
     set_param(
-        torch_block.feed_forward.layer_norm, torch.tensor(layer_norm_2_weight), torch.tensor(layer_norm_2_bias),
+        torch_block.feed_forward.layer_norm,
+        torch.tensor(layer_norm_2_weight),
+        torch.tensor(layer_norm_2_bias),
     )
 
     # intermediate dense
@@ -133,7 +143,8 @@ def set_model_weights_in_torch(weights, torch_model, hidden_size):
     # word embeds
     word_embeddings = np.asarray(weights[1])
     set_param(
-        torch_model_reformer.embeddings.word_embeddings, torch.tensor(word_embeddings),
+        torch_model_reformer.embeddings.word_embeddings,
+        torch.tensor(word_embeddings),
     )
 
     if isinstance(weights[3], tuple):
diff --git a/src/transformers/data/datasets/glue.py b/src/transformers/data/datasets/glue.py
index 9dcd1477b9..412cd47fcc 100644
--- a/src/transformers/data/datasets/glue.py
+++ b/src/transformers/data/datasets/glue.py
@@ -86,7 +86,10 @@ class GlueDataset(Dataset):
         cached_features_file = os.path.join(
             cache_dir if cache_dir is not None else args.data_dir,
             "cached_{}_{}_{}_{}".format(
-                mode.value, tokenizer.__class__.__name__, str(args.max_seq_length), args.task_name,
+                mode.value,
+                tokenizer.__class__.__name__,
+                str(args.max_seq_length),
+                args.task_name,
             ),
         )
         label_list = self.processor.get_labels()
diff --git a/src/transformers/data/datasets/language_modeling.py b/src/transformers/data/datasets/language_modeling.py
index 50b8b86e22..71a5950031 100644
--- a/src/transformers/data/datasets/language_modeling.py
+++ b/src/transformers/data/datasets/language_modeling.py
@@ -21,7 +21,11 @@ class TextDataset(Dataset):
     """
 
     def __init__(
-        self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, overwrite_cache=False,
+        self,
+        tokenizer: PreTrainedTokenizer,
+        file_path: str,
+        block_size: int,
+        overwrite_cache=False,
     ):
         assert os.path.isfile(file_path), f"Input file path {file_path} not found"
 
@@ -29,7 +33,12 @@ class TextDataset(Dataset):
 
         directory, filename = os.path.split(file_path)
         cached_features_file = os.path.join(
-            directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,),
+            directory,
+            "cached_lm_{}_{}_{}".format(
+                tokenizer.__class__.__name__,
+                str(block_size),
+                filename,
+            ),
         )
 
         # Make sure only the first process in distributed training processes the dataset,
diff --git a/src/transformers/data/datasets/squad.py b/src/transformers/data/datasets/squad.py
index 6d51c80a93..e081ab11d7 100644
--- a/src/transformers/data/datasets/squad.py
+++ b/src/transformers/data/datasets/squad.py
@@ -119,7 +119,10 @@ class SquadDataset(Dataset):
         cached_features_file = os.path.join(
             cache_dir if cache_dir is not None else args.data_dir,
             "cached_{}_{}_{}_{}".format(
-                mode.value, tokenizer.__class__.__name__, str(args.max_seq_length), version_tag,
+                mode.value,
+                tokenizer.__class__.__name__,
+                str(args.max_seq_length),
+                version_tag,
             ),
         )
 
diff --git a/src/transformers/data/metrics/squad_metrics.py b/src/transformers/data/metrics/squad_metrics.py
index b93db890e8..5ab2473fcf 100644
--- a/src/transformers/data/metrics/squad_metrics.py
+++ b/src/transformers/data/metrics/squad_metrics.py
@@ -589,10 +589,10 @@ def compute_predictions_log_probs(
     tokenizer,
     verbose_logging,
 ):
-    """ XLNet write prediction logic (more complex than Bert's).
-        Write final predictions to the json file and log-odds of null if needed.
+    """XLNet write prediction logic (more complex than Bert's).
+    Write final predictions to the json file and log-odds of null if needed.
 
-        Requires utils_squad_evaluate.py
+    Requires utils_squad_evaluate.py
     """
     _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
         "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"]
diff --git a/src/transformers/data/processors/glue.py b/src/transformers/data/processors/glue.py
index 1143e08494..a496991482 100644
--- a/src/transformers/data/processors/glue.py
+++ b/src/transformers/data/processors/glue.py
@@ -69,7 +69,10 @@ def glue_convert_examples_to_features(
 if is_tf_available():
 
     def _tf_glue_convert_examples_to_features(
-        examples: tf.data.Dataset, tokenizer: PreTrainedTokenizer, task=str, max_length: Optional[int] = None,
+        examples: tf.data.Dataset,
+        tokenizer: PreTrainedTokenizer,
+        task=str,
+        max_length: Optional[int] = None,
     ) -> tf.data.Dataset:
         """
         Returns:
diff --git a/src/transformers/data/processors/utils.py b/src/transformers/data/processors/utils.py
index 5eeb9c9b62..a3286439d8 100644
--- a/src/transformers/data/processors/utils.py
+++ b/src/transformers/data/processors/utils.py
@@ -269,7 +269,9 @@ class SingleSentenceClassificationProcessor(DataProcessor):
                 logger.info("Tokenizing example %d", ex_index)
 
             input_ids = tokenizer.encode(
-                example.text_a, add_special_tokens=True, max_length=min(max_length, tokenizer.max_len),
+                example.text_a,
+                add_special_tokens=True,
+                max_length=min(max_length, tokenizer.max_len),
             )
             all_input_ids.append(input_ids)
 
diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py
index bbc41595d1..b6c5a596ab 100644
--- a/src/transformers/generation_tf_utils.py
+++ b/src/transformers/generation_tf_utils.py
@@ -329,7 +329,13 @@ class TFGenerationMixin:
         if self.config.is_encoder_decoder:
 
             # create empty decoder_input_ids
-            input_ids = tf.ones((effective_batch_size * num_beams, 1), dtype=tf.int32,) * decoder_start_token_id
+            input_ids = (
+                tf.ones(
+                    (effective_batch_size * num_beams, 1),
+                    dtype=tf.int32,
+                )
+                * decoder_start_token_id
+            )
             cur_len = 1
 
             assert (
@@ -422,8 +428,8 @@ class TFGenerationMixin:
         attention_mask,
         use_cache,
     ):
-        """ Generate sequences for each example without beam search (num_beams == 1).
-            All returned sequence are generated independantly.
+        """Generate sequences for each example without beam search (num_beams == 1).
+        All returned sequence are generated independantly.
         """
 
         # length of generated sentences / unfinished sentences
@@ -587,8 +593,7 @@ class TFGenerationMixin:
         attention_mask,
         use_cache,
     ):
-        """ Generate sequences for each example with beam search.
-        """
+        """Generate sequences for each example with beam search."""
 
         # generated hypotheses
         generated_hyps = [
@@ -960,14 +965,14 @@ def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids):
 
 
 def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1):
-    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
-        Args:
-            logits: logits distribution shape (batch size, vocabulary size)
-            if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
-            if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
-                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
-            Make sure we keep at least min_tokens_to_keep per batch example in the output
-        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
+    """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
+    Args:
+        logits: logits distribution shape (batch size, vocabulary size)
+        if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
+        if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
+            Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
+        Make sure we keep at least min_tokens_to_keep per batch example in the output
+    From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
     """
     logits_shape = shape_list(logits)
 
@@ -1001,7 +1006,8 @@ def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("In
         # Shift the indices to the right to keep also the first token above the threshold
         sorted_indices_to_remove = tf.roll(sorted_indices_to_remove, 1, axis=-1)
         sorted_indices_to_remove = tf.concat(
-            [tf.zeros_like(sorted_indices_to_remove[:, :1]), sorted_indices_to_remove[:, 1:]], -1,
+            [tf.zeros_like(sorted_indices_to_remove[:, :1]), sorted_indices_to_remove[:, 1:]],
+            -1,
         )
         # scatter sorted tensors to original indexing
         indices_to_remove = scatter_values_on_batch_indices(sorted_indices_to_remove, sorted_indices)
@@ -1027,9 +1033,9 @@ def set_tensor_by_indices_to_value(tensor, indices, value):
 
 def sample_without_replacement(logits, num_samples):
     """
-        categorical sampling witouth replacement is currently not implemented
-        the gumbel-max trick will do for now
-        see https://github.com/tensorflow/tensorflow/issues/9260 for more info
+    categorical sampling witouth replacement is currently not implemented
+    the gumbel-max trick will do for now
+    see https://github.com/tensorflow/tensorflow/issues/9260 for more info
     """
     z = -tf.math.log(tf.random.uniform(shape_list(logits), 0, 1))
     _, indices = tf.nn.top_k(logits + z, num_samples)
diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py
index 18f2230004..302fad2fc4 100644
--- a/src/transformers/generation_utils.py
+++ b/src/transformers/generation_utils.py
@@ -83,7 +83,11 @@ class GenerationMixin:
         # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858)
         if repetition_penalty != 1.0:
             self.enforce_repetition_penalty_(
-                scores, batch_size, num_beams, input_ids, repetition_penalty,
+                scores,
+                batch_size,
+                num_beams,
+                input_ids,
+                repetition_penalty,
             )
 
         # set eos token prob to zero if min_length is not reached
@@ -324,7 +328,10 @@ class GenerationMixin:
                 "or a `bos_token_id` (integer >= 0) as a first token to start the generation."
             )
             input_ids = torch.full(
-                (batch_size, 1), bos_token_id, dtype=torch.long, device=next(self.parameters()).device,
+                (batch_size, 1),
+                bos_token_id,
+                dtype=torch.long,
+                device=next(self.parameters()).device,
             )
         else:
             assert input_ids.dim() == 2, "Input prompt should be of shape (batch_size, sequence length)."
@@ -514,8 +521,8 @@ class GenerationMixin:
         use_cache,
         model_specific_kwargs,
     ):
-        """ Generate sequences for each example without beam search (num_beams == 1).
-            All returned sequence are generated independantly.
+        """Generate sequences for each example without beam search (num_beams == 1).
+        All returned sequence are generated independantly.
         """
         # length of generated sentences / unfinished sentences
         unfinished_sents = input_ids.new(batch_size).fill_(1)
@@ -619,8 +626,7 @@ class GenerationMixin:
         use_cache,
         model_specific_kwargs,
     ):
-        """ Generate sequences for each example with beam search.
-        """
+        """Generate sequences for each example with beam search."""
 
         # generated hypotheses
         generated_hyps = [
@@ -749,7 +755,8 @@ class GenerationMixin:
                         if is_beam_token_worse_than_top_num_beams:
                             continue
                         generated_hyps[batch_idx].add(
-                            input_ids[effective_beam_id].clone(), beam_token_score.item(),
+                            input_ids[effective_beam_id].clone(),
+                            beam_token_score.item(),
                         )
                     else:
                         # add next predicted token since it is not eos_token
@@ -806,7 +813,8 @@ class GenerationMixin:
                 assert torch.all(
                     next_scores[batch_idx, :num_beams] == beam_scores.view(batch_size, num_beams)[batch_idx]
                 ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format(
-                    next_scores[:, :num_beams][batch_idx], beam_scores.view(batch_size, num_beams)[batch_idx],
+                    next_scores[:, :num_beams][batch_idx],
+                    beam_scores.view(batch_size, num_beams)[batch_idx],
                 )
 
             # need to add best num_beams hypotheses to generated hyps
@@ -916,7 +924,7 @@ def calc_banned_bad_words_ids(prev_input_ids: Iterable[int], bad_words_ids: Iter
 
 
 def set_scores_to_inf_for_banned_tokens(scores: torch.Tensor, banned_tokens: List[List[int]]) -> None:
-    """ Modifies the scores in place by setting the banned token positions to `-inf`. Banned token is expected to be
+    """Modifies the scores in place by setting the banned token positions to `-inf`. Banned token is expected to be
     a list of list of banned tokens to ban in the format [[batch index, vocabulary position],...]
         Args:
             scores: logits distribution of shape (batch size, vocabulary size)
@@ -946,14 +954,14 @@ def top_k_top_p_filtering(
     filter_value: float = -float("Inf"),
     min_tokens_to_keep: int = 1,
 ) -> Tensor:
-    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
-        Args:
-            logits: logits distribution shape (batch size, vocabulary size)
-            if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
-            if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
-                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
-            Make sure we keep at least min_tokens_to_keep per batch example in the output
-        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
+    """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
+    Args:
+        logits: logits distribution shape (batch size, vocabulary size)
+        if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
+        if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
+            Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
+        Make sure we keep at least min_tokens_to_keep per batch example in the output
+    From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
     """
     if top_k > 0:
         top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1))  # Safety check
diff --git a/src/transformers/modelcard.py b/src/transformers/modelcard.py
index bc253d9208..df91de3908 100644
--- a/src/transformers/modelcard.py
+++ b/src/transformers/modelcard.py
@@ -36,20 +36,20 @@ logger = logging.get_logger(__name__)
 
 
 class ModelCard:
-    r""" Structured Model Card class.
-        Store model card as well as methods for loading/downloading/saving model cards.
+    r"""Structured Model Card class.
+    Store model card as well as methods for loading/downloading/saving model cards.
 
-        Please read the following paper for details and explanation on the sections:
-            "Model Cards for Model Reporting"
-                by Margaret Mitchell, Simone Wu,
-                Andrew Zaldivar, Parker Barnes, Lucy Vasserman, Ben Hutchinson, Elena Spitzer,
-                Inioluwa Deborah Raji and Timnit Gebru for the proposal behind model cards.
-            Link: https://arxiv.org/abs/1810.03993
+    Please read the following paper for details and explanation on the sections:
+        "Model Cards for Model Reporting"
+            by Margaret Mitchell, Simone Wu,
+            Andrew Zaldivar, Parker Barnes, Lucy Vasserman, Ben Hutchinson, Elena Spitzer,
+            Inioluwa Deborah Raji and Timnit Gebru for the proposal behind model cards.
+        Link: https://arxiv.org/abs/1810.03993
 
-        Note:
-            A model card can be loaded and saved to disk.
+    Note:
+        A model card can be loaded and saved to disk.
 
-        Parameters:
+    Parameters:
     """
 
     def __init__(self, **kwargs):
@@ -73,8 +73,7 @@ class ModelCard:
                 raise err
 
     def save_pretrained(self, save_directory_or_file):
-        """ Save a model card object to the directory or file `save_directory_or_file`.
-        """
+        """Save a model card object to the directory or file `save_directory_or_file`."""
         if os.path.isdir(save_directory_or_file):
             # If we save using the predefined names, we can load using `from_pretrained`
             output_model_card_file = os.path.join(save_directory_or_file, MODEL_CARD_NAME)
@@ -86,7 +85,7 @@ class ModelCard:
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
-        r""" Instantiate a :class:`~transformers.ModelCard` from a pre-trained model model card.
+        r"""Instantiate a :class:`~transformers.ModelCard` from a pre-trained model model card.
 
         Parameters:
             pretrained_model_name_or_path: either:
diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py
index 02b9160e2a..e2a8790ef7 100755
--- a/src/transformers/modeling_albert.py
+++ b/src/transformers/modeling_albert.py
@@ -302,7 +302,10 @@ class AlbertLayer(nn.Module):
         attention_output = self.attention(hidden_states, attention_mask, head_mask, output_attentions)
 
         ffn_output = apply_chunking_to_forward(
-            self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output[0],
+            self.ff_chunk,
+            self.chunk_size_feed_forward,
+            self.seq_len_dim,
+            attention_output[0],
         )
         hidden_states = self.full_layer_layer_norm(ffn_output + attention_output[0])
 
@@ -397,8 +400,8 @@ class AlbertTransformer(nn.Module):
 
 
 class AlbertPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = AlbertConfig
@@ -406,8 +409,7 @@ class AlbertPreTrainedModel(PreTrainedModel):
     authorized_missing_keys = [r"position_ids"]
 
     def _init_weights(self, module):
-        """ Initialize the weights.
-        """
+        """Initialize the weights."""
         if isinstance(module, (nn.Linear, nn.Embedding)):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
@@ -543,17 +545,17 @@ class AlbertModel(AlbertPreTrainedModel):
         return self.embeddings.word_embeddings
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            ALBERT has a different architecture in that its layers are shared across groups, which then has inner groups.
-            If an ALBERT model has 12 hidden layers and 2 hidden groups, with two inner groups, there
-            is a total of 4 different layers.
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        ALBERT has a different architecture in that its layers are shared across groups, which then has inner groups.
+        If an ALBERT model has 12 hidden layers and 2 hidden groups, with two inner groups, there
+        is a total of 4 different layers.
 
-            These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer,
-            while [2,3] correspond to the two inner groups of the second hidden layer.
+        These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer,
+        while [2,3] correspond to the two inner groups of the second hidden layer.
 
-            Any layer with in index other than [0,1,2,3] will result in an error.
-            See base class PreTrainedModel for more information about head pruning
+        Any layer with in index other than [0,1,2,3] will result in an error.
+        See base class PreTrainedModel for more information about head pruning
         """
         for layer, heads in heads_to_prune.items():
             group_idx = int(layer / self.config.inner_group_num)
@@ -672,34 +674,34 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
         **kwargs,
     ):
         r"""
-        labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
-            Labels for computing the masked language modeling loss.
-            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
-            in ``[0, ..., config.vocab_size]``
-        sentence_order_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
-            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
-            Indices should be in ``[0, 1]``.
-            ``0`` indicates original order (sequence A, then sequence B),
-            ``1`` indicates switched order (sequence B, then sequence A).
-        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
-            Used to hide legacy arguments that have been deprecated.
+            labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
+                Labels for computing the masked language modeling loss.
+                Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+                Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+                in ``[0, ..., config.vocab_size]``
+            sentence_order_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
+                Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
+                Indices should be in ``[0, 1]``.
+                ``0`` indicates original order (sequence A, then sequence B),
+                ``1`` indicates switched order (sequence B, then sequence A).
+            kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
+                Used to hide legacy arguments that have been deprecated.
 
-    Returns:
+        Returns:
 
-    Examples::
+        Examples::
 
-        >>> from transformers import AlbertTokenizer, AlbertForPreTraining
-        >>> import torch
+            >>> from transformers import AlbertTokenizer, AlbertForPreTraining
+            >>> import torch
 
-        >>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        >>> model = AlbertForPreTraining.from_pretrained('albert-base-v2', return_dict=True)
+            >>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
+            >>> model = AlbertForPreTraining.from_pretrained('albert-base-v2', return_dict=True)
 
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids)
+            >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+            >>> outputs = model(input_ids)
 
-        >>> prediction_logits = outputs.prediction_logits
-        >>> sop_logits = outputs.sop_logits
+            >>> prediction_logits = outputs.prediction_logits
+            >>> sop_logits = outputs.sop_logits
 
         """
 
@@ -787,7 +789,8 @@ class AlbertSOPHead(nn.Module):
 
 
 @add_start_docstrings(
-    "Albert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING,
+    "Albert Model with a `language modeling` head on top.",
+    ALBERT_START_DOCSTRING,
 )
 class AlbertForMaskedLM(AlbertPreTrainedModel):
     def __init__(self, config):
@@ -952,7 +955,10 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
             return ((loss,) + output) if loss is not None else output
 
         return SequenceClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -1033,7 +1039,10 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
             return ((loss,) + output) if loss is not None else output
 
         return TokenClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -1215,5 +1224,8 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
             return ((loss,) + output) if loss is not None else output
 
         return MultipleChoiceModelOutput(
-            loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
diff --git a/src/transformers/modeling_auto.py b/src/transformers/modeling_auto.py
index 0c647c28b0..287aa06778 100644
--- a/src/transformers/modeling_auto.py
+++ b/src/transformers/modeling_auto.py
@@ -374,12 +374,12 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
 
 class AutoModel:
     r"""
-        :class:`~transformers.AutoModel` is a generic model class
-        that will be instantiated as one of the base model classes of the library
-        when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
-        or the `AutoModel.from_config(config)` class methods.
+    :class:`~transformers.AutoModel` is a generic model class
+    that will be instantiated as one of the base model classes of the library
+    when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
+    or the `AutoModel.from_config(config)` class methods.
 
-        This class cannot be instantiated using `__init__()` (throws an error).
+    This class cannot be instantiated using `__init__()` (throws an error).
     """
 
     def __init__(self):
@@ -391,7 +391,7 @@ class AutoModel:
 
     @classmethod
     def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
+        r"""Instantiates one of the base model classes of the library
         from a configuration.
 
         Note:
@@ -433,7 +433,7 @@ class AutoModel:
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the base model classes of the library
+        r"""Instantiates one of the base model classes of the library
         from a pre-trained model configuration.
 
         The `from_pretrained()` method takes care of returning the correct model class instance
@@ -531,11 +531,11 @@ class AutoModel:
 
 class AutoModelForPreTraining:
     r"""
-        :class:`~transformers.AutoModelForPreTraining` is a generic model class
-        that will be instantiated as one of the model classes of the library -with the architecture used for pretraining this model– when created with the `AutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)`
-        class method.
+    :class:`~transformers.AutoModelForPreTraining` is a generic model class
+    that will be instantiated as one of the model classes of the library -with the architecture used for pretraining this model– when created with the `AutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)`
+    class method.
 
-        This class cannot be instantiated using `__init__()` (throws an error).
+    This class cannot be instantiated using `__init__()` (throws an error).
     """
 
     def __init__(self):
@@ -547,7 +547,7 @@ class AutoModelForPreTraining:
 
     @classmethod
     def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
+        r"""Instantiates one of the base model classes of the library
         from a configuration.
 
         Note:
@@ -589,7 +589,7 @@ class AutoModelForPreTraining:
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the model classes of the library -with the architecture used for pretraining this model– from a pre-trained model configuration.
+        r"""Instantiates one of the model classes of the library -with the architecture used for pretraining this model– from a pre-trained model configuration.
 
         The `from_pretrained()` method takes care of returning the correct model class instance
         based on the `model_type` property of the config object, or when it's missing,
@@ -680,12 +680,12 @@ class AutoModelForPreTraining:
 
 class AutoModelWithLMHead:
     r"""
-        :class:`~transformers.AutoModelWithLMHead` is a generic model class
-        that will be instantiated as one of the language modeling model classes of the library
-        when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
-        class method.
+    :class:`~transformers.AutoModelWithLMHead` is a generic model class
+    that will be instantiated as one of the language modeling model classes of the library
+    when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
+    class method.
 
-        This class cannot be instantiated using `__init__()` (throws an error).
+    This class cannot be instantiated using `__init__()` (throws an error).
     """
 
     def __init__(self):
@@ -697,7 +697,7 @@ class AutoModelWithLMHead:
 
     @classmethod
     def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
+        r"""Instantiates one of the base model classes of the library
         from a configuration.
 
         Note:
@@ -743,7 +743,7 @@ class AutoModelWithLMHead:
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the language modeling model classes of the library
+        r"""Instantiates one of the language modeling model classes of the library
         from a pre-trained model configuration.
 
         The `from_pretrained()` method takes care of returning the correct model class instance
@@ -839,12 +839,12 @@ class AutoModelWithLMHead:
 
 class AutoModelForCausalLM:
     r"""
-        :class:`~transformers.AutoModelForCausalLM` is a generic model class
-        that will be instantiated as one of the language modeling model classes of the library
-        when created with the `AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)`
-        class method.
+    :class:`~transformers.AutoModelForCausalLM` is a generic model class
+    that will be instantiated as one of the language modeling model classes of the library
+    when created with the `AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)`
+    class method.
 
-        This class cannot be instantiated using `__init__()` (throws an error).
+    This class cannot be instantiated using `__init__()` (throws an error).
     """
 
     def __init__(self):
@@ -856,7 +856,7 @@ class AutoModelForCausalLM:
 
     @classmethod
     def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
+        r"""Instantiates one of the base model classes of the library
         from a configuration.
 
         Note:
@@ -893,7 +893,7 @@ class AutoModelForCausalLM:
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the language modeling model classes of the library
+        r"""Instantiates one of the language modeling model classes of the library
         from a pre-trained model configuration.
 
         The `from_pretrained()` method takes care of returning the correct model class instance
@@ -976,12 +976,12 @@ class AutoModelForCausalLM:
 
 class AutoModelForMaskedLM:
     r"""
-        :class:`~transformers.AutoModelForMaskedLM` is a generic model class
-        that will be instantiated as one of the language modeling model classes of the library
-        when created with the `AutoModelForMaskedLM.from_pretrained(pretrained_model_name_or_path)`
-        class method.
+    :class:`~transformers.AutoModelForMaskedLM` is a generic model class
+    that will be instantiated as one of the language modeling model classes of the library
+    when created with the `AutoModelForMaskedLM.from_pretrained(pretrained_model_name_or_path)`
+    class method.
 
-        This class cannot be instantiated using `__init__()` (throws an error).
+    This class cannot be instantiated using `__init__()` (throws an error).
     """
 
     def __init__(self):
@@ -993,7 +993,7 @@ class AutoModelForMaskedLM:
 
     @classmethod
     def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
+        r"""Instantiates one of the base model classes of the library
         from a configuration.
 
         Note:
@@ -1033,7 +1033,7 @@ class AutoModelForMaskedLM:
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the language modeling model classes of the library
+        r"""Instantiates one of the language modeling model classes of the library
         from a pre-trained model configuration.
 
         The `from_pretrained()` method takes care of returning the correct model class instance
@@ -1119,12 +1119,12 @@ class AutoModelForMaskedLM:
 
 class AutoModelForSeq2SeqLM:
     r"""
-        :class:`~transformers.AutoModelForSeq2SeqLM` is a generic model class
-        that will be instantiated as one of the language modeling model classes of the library
-        when created with the `AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path)`
-        class method.
+    :class:`~transformers.AutoModelForSeq2SeqLM` is a generic model class
+    that will be instantiated as one of the language modeling model classes of the library
+    when created with the `AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path)`
+    class method.
 
-        This class cannot be instantiated using `__init__()` (throws an error).
+    This class cannot be instantiated using `__init__()` (throws an error).
     """
 
     def __init__(self):
@@ -1136,7 +1136,7 @@ class AutoModelForSeq2SeqLM:
 
     @classmethod
     def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
+        r"""Instantiates one of the base model classes of the library
         from a configuration.
 
         Note:
@@ -1172,7 +1172,7 @@ class AutoModelForSeq2SeqLM:
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the language modeling model classes of the library
+        r"""Instantiates one of the language modeling model classes of the library
         from a pre-trained model configuration.
 
         The `from_pretrained()` method takes care of returning the correct model class instance
@@ -1254,12 +1254,12 @@ class AutoModelForSeq2SeqLM:
 
 class AutoModelForSequenceClassification:
     r"""
-        :class:`~transformers.AutoModelForSequenceClassification` is a generic model class
-        that will be instantiated as one of the sequence classification model classes of the library
-        when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
-        class method.
+    :class:`~transformers.AutoModelForSequenceClassification` is a generic model class
+    that will be instantiated as one of the sequence classification model classes of the library
+    when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
+    class method.
 
-        This class cannot be instantiated using `__init__()` (throws an error).
+    This class cannot be instantiated using `__init__()` (throws an error).
     """
 
     def __init__(self):
@@ -1271,7 +1271,7 @@ class AutoModelForSequenceClassification:
 
     @classmethod
     def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
+        r"""Instantiates one of the base model classes of the library
         from a configuration.
 
         Note:
@@ -1313,7 +1313,7 @@ class AutoModelForSequenceClassification:
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the sequence classification model classes of the library
+        r"""Instantiates one of the sequence classification model classes of the library
         from a pre-trained model configuration.
 
         The `from_pretrained()` method takes care of returning the correct model class instance
@@ -1406,12 +1406,12 @@ class AutoModelForSequenceClassification:
 
 class AutoModelForQuestionAnswering:
     r"""
-        :class:`~transformers.AutoModelForQuestionAnswering` is a generic model class
-        that will be instantiated as one of the question answering model classes of the library
-        when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
-        class method.
+    :class:`~transformers.AutoModelForQuestionAnswering` is a generic model class
+    that will be instantiated as one of the question answering model classes of the library
+    when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
+    class method.
 
-        This class cannot be instantiated using `__init__()` (throws an error).
+    This class cannot be instantiated using `__init__()` (throws an error).
     """
 
     def __init__(self):
@@ -1423,7 +1423,7 @@ class AutoModelForQuestionAnswering:
 
     @classmethod
     def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
+        r"""Instantiates one of the base model classes of the library
         from a configuration.
 
         Note:
@@ -1462,7 +1462,7 @@ class AutoModelForQuestionAnswering:
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the question answering model classes of the library
+        r"""Instantiates one of the question answering model classes of the library
         from a pre-trained model configuration.
 
         The `from_pretrained()` method takes care of returning the correct model class instance
@@ -1551,12 +1551,12 @@ class AutoModelForQuestionAnswering:
 
 class AutoModelForTokenClassification:
     r"""
-        :class:`~transformers.AutoModelForTokenClassification` is a generic model class
-        that will be instantiated as one of the token classification model classes of the library
-        when created with the `AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)`
-        class method.
+    :class:`~transformers.AutoModelForTokenClassification` is a generic model class
+    that will be instantiated as one of the token classification model classes of the library
+    when created with the `AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)`
+    class method.
 
-        This class cannot be instantiated using `__init__()` (throws an error).
+    This class cannot be instantiated using `__init__()` (throws an error).
     """
 
     def __init__(self):
@@ -1568,7 +1568,7 @@ class AutoModelForTokenClassification:
 
     @classmethod
     def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
+        r"""Instantiates one of the base model classes of the library
         from a configuration.
 
         Note:
@@ -1611,7 +1611,7 @@ class AutoModelForTokenClassification:
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the question answering model classes of the library
+        r"""Instantiates one of the question answering model classes of the library
         from a pre-trained model configuration.
 
         The `from_pretrained()` method takes care of returning the correct model class instance
@@ -1703,12 +1703,12 @@ class AutoModelForTokenClassification:
 
 class AutoModelForMultipleChoice:
     r"""
-        :class:`~transformers.AutoModelForMultipleChoice` is a generic model class
-        that will be instantiated as one of the multiple choice model classes of the library
-        when created with the `AutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)`
-        class method.
+    :class:`~transformers.AutoModelForMultipleChoice` is a generic model class
+    that will be instantiated as one of the multiple choice model classes of the library
+    when created with the `AutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)`
+    class method.
 
-        This class cannot be instantiated using `__init__()` (throws an error).
+    This class cannot be instantiated using `__init__()` (throws an error).
     """
 
     def __init__(self):
diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index e9ec5f7e0b..6220ceed25 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -223,7 +223,9 @@ class EncoderLayer(nn.Module):
         super().__init__()
         self.embed_dim = config.d_model
         self.self_attn = SelfAttention(
-            self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout,
+            self.embed_dim,
+            config.encoder_attention_heads,
+            dropout=config.attention_dropout,
         )
         self.normalize_before = config.normalize_before
         self.self_attn_layer_norm = LayerNorm(self.embed_dim)
@@ -297,7 +299,10 @@ class BartEncoder(nn.Module):
             )
         else:
             self.embed_positions = LearnedPositionalEmbedding(
-                config.max_position_embeddings, embed_dim, self.padding_idx, config.extra_pos_embeddings,
+                config.max_position_embeddings,
+                embed_dim,
+                self.padding_idx,
+                config.extra_pos_embeddings,
             )
         self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.encoder_layers)])
         self.layernorm_embedding = LayerNorm(embed_dim) if config.normalize_embedding else nn.Identity()
@@ -370,7 +375,9 @@ class DecoderLayer(nn.Module):
         super().__init__()
         self.embed_dim = config.d_model
         self.self_attn = SelfAttention(
-            embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout,
+            embed_dim=self.embed_dim,
+            num_heads=config.decoder_attention_heads,
+            dropout=config.attention_dropout,
         )
         self.dropout = config.dropout
         self.activation_fn = ACT2FN[config.activation_function]
@@ -477,7 +484,10 @@ class BartDecoder(nn.Module):
             )
         else:
             self.embed_positions = LearnedPositionalEmbedding(
-                config.max_position_embeddings, config.d_model, self.padding_idx, config.extra_pos_embeddings,
+                config.max_position_embeddings,
+                config.d_model,
+                self.padding_idx,
+                config.extra_pos_embeddings,
             )
         self.layers = nn.ModuleList(
             [DecoderLayer(config) for _ in range(config.decoder_layers)]
@@ -695,7 +705,10 @@ class SelfAttention(nn.Module):
         # This is part of a workaround to get around fork/join parallelism not supporting Optional types.
         if key_padding_mask is not None and key_padding_mask.dim() == 0:
             key_padding_mask = None
-        assert key_padding_mask is None or key_padding_mask.size()[:2] == (bsz, src_len,)
+        assert key_padding_mask is None or key_padding_mask.size()[:2] == (
+            bsz,
+            src_len,
+        )
 
         if key_padding_mask is not None:  # don't attend to padding symbols
             attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
@@ -703,7 +716,11 @@ class SelfAttention(nn.Module):
             attn_weights = attn_weights.masked_fill(reshaped, float("-inf"))
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
         attn_weights = F.softmax(attn_weights, dim=-1)
-        attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training,)
+        attn_probs = F.dropout(
+            attn_weights,
+            p=self.dropout,
+            training=self.training,
+        )
 
         assert v is not None
         attn_output = torch.bmm(attn_probs, v)
@@ -754,7 +771,11 @@ class BartClassificationHead(nn.Module):
     # This can trivially be shared with RobertaClassificationHead
 
     def __init__(
-        self, input_dim, inner_dim, num_classes, pooler_dropout,
+        self,
+        input_dim,
+        inner_dim,
+        num_classes,
+        pooler_dropout,
     ):
         super().__init__()
         self.dense = nn.Linear(input_dim, inner_dim)
@@ -819,7 +840,8 @@ def _get_shape(t):
 
 
 @add_start_docstrings(
-    "The bare BART Model outputting raw hidden-states without any specific head on top.", BART_START_DOCSTRING,
+    "The bare BART Model outputting raw hidden-states without any specific head on top.",
+    BART_START_DOCSTRING,
 )
 class BartModel(PretrainedBartModel):
     def __init__(self, config: BartConfig):
@@ -981,31 +1003,31 @@ class BartForConditionalGeneration(PretrainedBartModel):
         **unused,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the masked language modeling loss.
-            Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring).
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens
-            with labels in ``[0, ..., config.vocab_size]``.
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the masked language modeling loss.
+                Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring).
+                Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens
+                with labels in ``[0, ..., config.vocab_size]``.
 
-    Returns:
+        Returns:
 
-    Conditional generation example::
+        Conditional generation example::
 
-            # Mask filling only works for bart-large
-            from transformers import BartTokenizer, BartForConditionalGeneration
-            tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
-            TXT = "My friends are <mask> but they eat too many carbs."
+                # Mask filling only works for bart-large
+                from transformers import BartTokenizer, BartForConditionalGeneration
+                tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
+                TXT = "My friends are <mask> but they eat too many carbs."
 
-            model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
-            input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
-            logits = model(input_ids).logits
+                model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
+                input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
+                logits = model(input_ids).logits
 
-            masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
-            probs = logits[0, masked_index].softmax(dim=0)
-            values, predictions = probs.topk(5)
+                masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
+                probs = logits[0, masked_index].softmax(dim=0)
+                values, predictions = probs.topk(5)
 
-            tokenizer.decode(predictions).split()
-            # ['good', 'great', 'all', 'really', 'very']
+                tokenizer.decode(predictions).split()
+                # ['good', 'great', 'all', 'really', 'very']
         """
         if "lm_labels" in unused:
             warnings.warn(
@@ -1116,7 +1138,10 @@ class BartForSequenceClassification(PretrainedBartModel):
         super().__init__(config, **kwargs)
         self.model = BartModel(config)
         self.classification_head = BartClassificationHead(
-            config.d_model, config.d_model, config.num_labels, config.classif_dropout,
+            config.d_model,
+            config.d_model,
+            config.num_labels,
+            config.classif_dropout,
         )
         self.model._init_weights(self.classification_head.dense)
         self.model._init_weights(self.classification_head.out_proj)
@@ -1279,7 +1304,10 @@ class BartForQuestionAnswering(PretrainedBartModel):
             total_loss = (start_loss + end_loss) / 2
 
         if not return_dict:
-            output = (start_logits, end_logits,) + outputs[1:]
+            output = (
+                start_logits,
+                end_logits,
+            ) + outputs[1:]
             return ((total_loss,) + output) if total_loss is not None else output
 
         return Seq2SeqQuestionAnsweringModelOutput(
@@ -1307,7 +1335,7 @@ class SinusoidalPositionalEmbedding(nn.Embedding):
     @staticmethod
     def _init_weight(out: nn.Parameter):
         """Identical to the XLM create_sinusoidal_embeddings except features are not interleaved.
-            The cos features are in the 2nd half of the vector. [dim // 2:]
+        The cos features are in the 2nd half of the vector. [dim // 2:]
         """
         n_pos, dim = out.shape
         position_enc = np.array(
diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py
index 3d0b963a83..11eff3e6c7 100755
--- a/src/transformers/modeling_bert.py
+++ b/src/transformers/modeling_bert.py
@@ -89,8 +89,7 @@ BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
 
 
 def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
-    """ Load tf checkpoints in a pytorch model.
-    """
+    """Load tf checkpoints in a pytorch model."""
     try:
         import re
 
@@ -174,8 +173,7 @@ BertLayerNorm = torch.nn.LayerNorm
 
 
 class BertEmbeddings(nn.Module):
-    """Construct the embeddings from word, position and token_type embeddings.
-    """
+    """Construct the embeddings from word, position and token_type embeddings."""
 
     def __init__(self, config):
         super().__init__()
@@ -343,7 +341,12 @@ class BertAttention(nn.Module):
         output_attentions=False,
     ):
         self_outputs = self.self(
-            hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, output_attentions,
+            hidden_states,
+            attention_mask,
+            head_mask,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            output_attentions,
         )
         attention_output = self.output(self_outputs[0], hidden_states)
         outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
@@ -403,7 +406,10 @@ class BertLayer(nn.Module):
         output_attentions=False,
     ):
         self_attention_outputs = self.attention(
-            hidden_states, attention_mask, head_mask, output_attentions=output_attentions,
+            hidden_states,
+            attention_mask,
+            head_mask,
+            output_attentions=output_attentions,
         )
         attention_output = self_attention_outputs[0]
         outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
@@ -582,8 +588,8 @@ class BertPreTrainingHeads(nn.Module):
 
 
 class BertPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = BertConfig
@@ -733,9 +739,9 @@ class BertModel(BertPreTrainedModel):
         self.embeddings.word_embeddings = value
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        See base class PreTrainedModel
         """
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
@@ -877,34 +883,34 @@ class BertForPreTraining(BertPreTrainedModel):
         **kwargs
     ):
         r"""
-        labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
-            Labels for computing the masked language modeling loss.
-            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
-            in ``[0, ..., config.vocab_size]``
-        next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
-            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
-            Indices should be in ``[0, 1]``.
-            ``0`` indicates sequence B is a continuation of sequence A,
-            ``1`` indicates sequence B is a random sequence.
-        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
-            Used to hide legacy arguments that have been deprecated.
+            labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
+                Labels for computing the masked language modeling loss.
+                Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+                Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+                in ``[0, ..., config.vocab_size]``
+            next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
+                Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
+                Indices should be in ``[0, 1]``.
+                ``0`` indicates sequence B is a continuation of sequence A,
+                ``1`` indicates sequence B is a random sequence.
+            kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
+                Used to hide legacy arguments that have been deprecated.
 
-    Returns:
+        Returns:
 
-    Examples::
+        Examples::
 
-        >>> from transformers import BertTokenizer, BertForPreTraining
-        >>> import torch
+            >>> from transformers import BertTokenizer, BertForPreTraining
+            >>> import torch
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> model = BertForPreTraining.from_pretrained('bert-base-uncased', return_dict=True)
+            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+            >>> model = BertForPreTraining.from_pretrained('bert-base-uncased', return_dict=True)
 
-        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
-        >>> outputs = model(**inputs)
+            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+            >>> outputs = model(**inputs)
 
-        >>> prediction_logits = outptus.prediction_logits
-        >>> seq_relationship_logits = outputs.seq_relationship_logits
+            >>> prediction_logits = outptus.prediction_logits
+            >>> seq_relationship_logits = outputs.seq_relationship_logits
         """
         if "masked_lm_labels" in kwargs:
             warnings.warn(
@@ -986,36 +992,36 @@ class BertLMHeadModel(BertPreTrainedModel):
         return_dict=None,
     ):
         r"""
-        encoder_hidden_states  (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
-            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
-            if the model is configured as a decoder.
-        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on the padding token indices of the encoder input. This mask
-            is used in the cross-attention if the model is configured as a decoder.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the left-to-right language modeling loss (next word prediction).
-            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
-            in ``[0, ..., config.vocab_size]``
+            encoder_hidden_states  (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                if the model is configured as a decoder.
+            encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+                Mask to avoid performing attention on the padding token indices of the encoder input. This mask
+                is used in the cross-attention if the model is configured as a decoder.
+                Mask values selected in ``[0, 1]``:
+                ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the left-to-right language modeling loss (next word prediction).
+                Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+                Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+                in ``[0, ..., config.vocab_size]``
 
-    Returns:
+        Returns:
 
-    Example::
+        Example::
 
-        >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
-        >>> import torch
+            >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
+            >>> import torch
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
-        >>> config = BertConfig.from_pretrained("bert-base-cased")
-        >>> config.is_decoder = True
-        >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config, return_dict=True)
+            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+            >>> config = BertConfig.from_pretrained("bert-base-cased")
+            >>> config.is_decoder = True
+            >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config, return_dict=True)
 
-        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
-        >>> outputs = model(**inputs)
+            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+            >>> outputs = model(**inputs)
 
-        >>> prediction_logits = outputs.logits
+            >>> prediction_logits = outputs.logits
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
@@ -1049,7 +1055,10 @@ class BertLMHeadModel(BertPreTrainedModel):
             return ((lm_loss,) + output) if lm_loss is not None else output
 
         return CausalLMOutput(
-            loss=lm_loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=lm_loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
     def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
@@ -1173,7 +1182,8 @@ class BertForMaskedLM(BertPreTrainedModel):
 
 
 @add_start_docstrings(
-    """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING,
+    """Bert Model with a `next sentence prediction (classification)` head on top. """,
+    BERT_START_DOCSTRING,
 )
 class BertForNextSentencePrediction(BertPreTrainedModel):
     def __init__(self, config):
@@ -1200,29 +1210,29 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
         return_dict=None,
     ):
         r"""
-        next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
-            Indices should be in ``[0, 1]``.
-            ``0`` indicates sequence B is a continuation of sequence A,
-            ``1`` indicates sequence B is a random sequence.
+            next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
+                Indices should be in ``[0, 1]``.
+                ``0`` indicates sequence B is a continuation of sequence A,
+                ``1`` indicates sequence B is a random sequence.
 
-    Returns:
+        Returns:
 
-    Example::
+        Example::
 
-        >>> from transformers import BertTokenizer, BertForNextSentencePrediction
-        >>> import torch
+            >>> from transformers import BertTokenizer, BertForNextSentencePrediction
+            >>> import torch
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased', return_dict=True)
+            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+            >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased', return_dict=True)
 
-        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-        >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
+            >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+            >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+            >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
 
-        >>> outputs = model(**encoding, next_sentence_label=torch.LongTensor([1]))
-        >>> logits = outputs.logits
-        >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
+            >>> outputs = model(**encoding, next_sentence_label=torch.LongTensor([1]))
+            >>> logits = outputs.logits
+            >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
@@ -1336,7 +1346,10 @@ class BertForSequenceClassification(BertPreTrainedModel):
             return ((loss,) + output) if loss is not None else output
 
         return SequenceClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -1422,7 +1435,10 @@ class BertForMultipleChoice(BertPreTrainedModel):
             return ((loss,) + output) if loss is not None else output
 
         return MultipleChoiceModelOutput(
-            loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -1505,7 +1521,10 @@ class BertForTokenClassification(BertPreTrainedModel):
             return ((loss,) + output) if loss is not None else output
 
         return TokenClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
diff --git a/src/transformers/modeling_camembert.py b/src/transformers/modeling_camembert.py
index 5c5e9f95dd..0d5e6c2423 100644
--- a/src/transformers/modeling_camembert.py
+++ b/src/transformers/modeling_camembert.py
@@ -68,7 +68,8 @@ class CamembertModel(RobertaModel):
 
 
 @add_start_docstrings(
-    """CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING,
+    """CamemBERT Model with a `language modeling` head on top. """,
+    CAMEMBERT_START_DOCSTRING,
 )
 class CamembertForMaskedLM(RobertaForMaskedLM):
     """
diff --git a/src/transformers/modeling_ctrl.py b/src/transformers/modeling_ctrl.py
index 6bb15e999a..2be39bddc2 100644
--- a/src/transformers/modeling_ctrl.py
+++ b/src/transformers/modeling_ctrl.py
@@ -212,16 +212,15 @@ class EncoderLayer(torch.nn.Module):
 
 
 class CTRLPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = CTRLConfig
     base_model_prefix = "transformer"
 
     def _init_weights(self, module):
-        """ Initialize the weights.
-        """
+        """Initialize the weights."""
         if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
@@ -331,8 +330,8 @@ class CTRLModel(CTRLPreTrainedModel):
         self.w = new_embeddings
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-                heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
         """
         for layer, heads in heads_to_prune.items():
             self.h[layer].multi_head_attention.prune_heads(heads)
diff --git a/src/transformers/modeling_distilbert.py b/src/transformers/modeling_distilbert.py
index 39117c162a..579c47f9fc 100755
--- a/src/transformers/modeling_distilbert.py
+++ b/src/transformers/modeling_distilbert.py
@@ -261,7 +261,12 @@ class TransformerBlock(nn.Module):
         """
         # Self-Attention
         sa_output = self.attention(
-            query=x, key=x, value=x, mask=attn_mask, head_mask=head_mask, output_attentions=output_attentions,
+            query=x,
+            key=x,
+            value=x,
+            mask=attn_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
         )
         if output_attentions:
             sa_output, sa_weights = sa_output  # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length)
@@ -343,8 +348,8 @@ class Transformer(nn.Module):
 
 # INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
 class DistilBertPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = DistilBertConfig
@@ -352,8 +357,7 @@ class DistilBertPreTrainedModel(PreTrainedModel):
     base_model_prefix = "distilbert"
 
     def _init_weights(self, module):
-        """ Initialize the weights.
-        """
+        """Initialize the weights."""
         if isinstance(module, nn.Embedding):
             if module.weight.requires_grad:
                 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
@@ -432,9 +436,9 @@ class DistilBertModel(DistilBertPreTrainedModel):
         self.embeddings.word_embeddings = new_embeddings
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        See base class PreTrainedModel
         """
         for layer, heads in heads_to_prune.items():
             self.transformer.layer[layer].attention.prune_heads(heads)
@@ -493,7 +497,8 @@ class DistilBertModel(DistilBertPreTrainedModel):
 
 
 @add_start_docstrings(
-    """DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING,
+    """DistilBert Model with a `masked language modeling` head on top. """,
+    DISTILBERT_START_DOCSTRING,
 )
 class DistilBertForMaskedLM(DistilBertPreTrainedModel):
     def __init__(self, config):
@@ -829,7 +834,10 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
             return ((loss,) + output) if loss is not None else output
 
         return TokenClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -863,32 +871,32 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
         return_dict=None,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the multiple choice classification loss.
-            Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
-            of the input tensors. (see `input_ids` above)
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the multiple choice classification loss.
+                Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
+                of the input tensors. (see `input_ids` above)
 
-    Returns:
+        Returns:
 
-    Examples::
+        Examples::
 
-        >>> from transformers import DistilBertTokenizer, DistilBertForMultipleChoice
-        >>> import torch
+            >>> from transformers import DistilBertTokenizer, DistilBertForMultipleChoice
+            >>> import torch
 
-        >>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        >>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased', return_dict=True)
+            >>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
+            >>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased', return_dict=True)
 
-        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        >>> choice0 = "It is eaten with a fork and a knife."
-        >>> choice1 = "It is eaten while held in the hand."
-        >>> labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1
+            >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+            >>> choice0 = "It is eaten with a fork and a knife."
+            >>> choice1 = "It is eaten while held in the hand."
+            >>> labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1
 
-        >>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors='pt', padding=True)
-        >>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
+            >>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors='pt', padding=True)
+            >>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
 
-        >>> # the linear classifier still needs to be trained
-        >>> loss = outputs.loss
-        >>> logits = outputs.logits
+            >>> # the linear classifier still needs to be trained
+            >>> loss = outputs.loss
+            >>> logits = outputs.logits
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@@ -930,5 +938,8 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
             return ((loss,) + output) if loss is not None else output
 
         return MultipleChoiceModelOutput(
-            loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
diff --git a/src/transformers/modeling_dpr.py b/src/transformers/modeling_dpr.py
index 5714964586..070c78b433 100644
--- a/src/transformers/modeling_dpr.py
+++ b/src/transformers/modeling_dpr.py
@@ -265,8 +265,8 @@ class DPRSpanPredictor(PreTrainedModel):
 
 
 class DPRPretrainedContextEncoder(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = DPRConfig
@@ -278,8 +278,8 @@ class DPRPretrainedContextEncoder(PreTrainedModel):
 
 
 class DPRPretrainedQuestionEncoder(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = DPRConfig
@@ -291,8 +291,8 @@ class DPRPretrainedQuestionEncoder(PreTrainedModel):
 
 
 class DPRPretrainedReader(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = DPRConfig
@@ -421,15 +421,15 @@ class DPRContextEncoder(DPRPretrainedContextEncoder):
         return_dict=None,
     ) -> Union[DPRContextEncoderOutput, Tuple[Tensor, ...]]:
         r"""
-    Return:
+        Return:
 
-    Examples::
+        Examples::
 
-        from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
-        tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
-        model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', return_dict=True)
-        input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
-        embeddings = model(input_ids).pooler_output
+            from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
+            tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
+            model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', return_dict=True)
+            input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
+            embeddings = model(input_ids).pooler_output
         """
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -499,15 +499,15 @@ class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
         return_dict=None,
     ) -> Union[DPRQuestionEncoderOutput, Tuple[Tensor, ...]]:
         r"""
-    Return:
+        Return:
 
-    Examples::
+        Examples::
 
-        from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
-        tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
-        model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base', return_dict=True)
-        input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
-        embeddings = model(input_ids).pooler_output
+            from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
+            tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
+            model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base', return_dict=True)
+            input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
+            embeddings = model(input_ids).pooler_output
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -553,7 +553,8 @@ class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
 
 
 @add_start_docstrings(
-    "The bare DPRReader transformer outputting span predictions.", DPR_START_DOCSTRING,
+    "The bare DPRReader transformer outputting span predictions.",
+    DPR_START_DOCSTRING,
 )
 class DPRReader(DPRPretrainedReader):
     def __init__(self, config: DPRConfig):
@@ -574,23 +575,23 @@ class DPRReader(DPRPretrainedReader):
         return_dict=None,
     ) -> Union[DPRReaderOutput, Tuple[Tensor, ...]]:
         r"""
-    Return:
+        Return:
 
-    Examples::
+        Examples::
 
-        from transformers import DPRReader, DPRReaderTokenizer
-        tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
-        model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base', return_dict=True)
-        encoded_inputs = tokenizer(
-                questions=["What is love ?"],
-                titles=["Haddaway"],
-                texts=["'What Is Love' is a song recorded by the artist Haddaway"],
-                return_tensors='pt'
-            )
-        outputs = model(**encoded_inputs)
-        start_logits = outputs.stat_logits
-        end_logits = outputs.end_logits
-        relevance_logits = outputs.relevance_logits
+            from transformers import DPRReader, DPRReaderTokenizer
+            tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
+            model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base', return_dict=True)
+            encoded_inputs = tokenizer(
+                    questions=["What is love ?"],
+                    titles=["Haddaway"],
+                    texts=["'What Is Love' is a song recorded by the artist Haddaway"],
+                    return_tensors='pt'
+                )
+            outputs = model(**encoded_inputs)
+            start_logits = outputs.stat_logits
+            end_logits = outputs.end_logits
+            relevance_logits = outputs.relevance_logits
 
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
diff --git a/src/transformers/modeling_electra.py b/src/transformers/modeling_electra.py
index 354c08b5d9..acb2f5b416 100644
--- a/src/transformers/modeling_electra.py
+++ b/src/transformers/modeling_electra.py
@@ -46,8 +46,7 @@ ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
 
 
 def load_tf_weights_in_electra(model, config, tf_checkpoint_path, discriminator_or_generator="discriminator"):
-    """ Load tf checkpoints in a pytorch model.
-    """
+    """Load tf checkpoints in a pytorch model."""
     try:
         import re
 
@@ -179,8 +178,8 @@ class ElectraGeneratorPredictions(nn.Module):
 
 
 class ElectraPreTrainedModel(BertPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = ElectraConfig
@@ -311,9 +310,9 @@ class ElectraModel(ElectraPreTrainedModel):
         self.embeddings.word_embeddings = value
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        See base class PreTrainedModel
         """
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
@@ -512,24 +511,24 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
         return_dict=None,
     ):
         r"""
-        labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
-            Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see :obj:`input_ids` docstring)
-            Indices should be in ``[0, 1]``.
-            ``0`` indicates the token is an original token,
-            ``1`` indicates the token was replaced.
+            labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
+                Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see :obj:`input_ids` docstring)
+                Indices should be in ``[0, 1]``.
+                ``0`` indicates the token is an original token,
+                ``1`` indicates the token was replaced.
 
-    Returns:
+        Returns:
 
-    Examples::
+        Examples::
 
-        >>> from transformers import ElectraTokenizer, ElectraForPreTraining
-        >>> import torch
+            >>> from transformers import ElectraTokenizer, ElectraForPreTraining
+            >>> import torch
 
-        >>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
-        >>> model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
+            >>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
+            >>> model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
 
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        >>> logits = model(input_ids).logits
+            >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+            >>> logits = model(input_ids).logits
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
@@ -836,7 +835,10 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
             total_loss = (start_loss + end_loss) / 2
 
         if not return_dict:
-            output = (start_logits, end_logits,) + discriminator_hidden_states[1:]
+            output = (
+                start_logits,
+                end_logits,
+            ) + discriminator_hidden_states[1:]
             return ((total_loss,) + output) if total_loss is not None else output
 
         return QuestionAnsweringModelOutput(
diff --git a/src/transformers/modeling_encoder_decoder.py b/src/transformers/modeling_encoder_decoder.py
index ed1d651591..b737fa7791 100644
--- a/src/transformers/modeling_encoder_decoder.py
+++ b/src/transformers/modeling_encoder_decoder.py
@@ -28,11 +28,11 @@ logger = logging.get_logger(__name__)
 
 class EncoderDecoderModel(PreTrainedModel):
     r"""
-        :class:`~transformers.EncoderDecoder` is a generic model class that will be
-        instantiated as a transformer architecture with one of the base model
-        classes of the library as encoder and another one as
-        decoder when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
-        class method for the encoder and `AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)` class method for the decoder.
+    :class:`~transformers.EncoderDecoder` is a generic model class that will be
+    instantiated as a transformer architecture with one of the base model
+    classes of the library as encoder and another one as
+    decoder when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
+    class method for the encoder and `AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)` class method for the decoder.
     """
     config_class = EncoderDecoderConfig
     base_model_prefix = "encoder_decoder"
@@ -103,7 +103,7 @@ class EncoderDecoderModel(PreTrainedModel):
         *model_args,
         **kwargs
     ) -> PreTrainedModel:
-        r""" Instantiates an encoder and a decoder from one or two base classes of the library from pre-trained model checkpoints.
+        r"""Instantiates an encoder and a decoder from one or two base classes of the library from pre-trained model checkpoints.
 
 
         The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated).
diff --git a/src/transformers/modeling_flaubert.py b/src/transformers/modeling_flaubert.py
index 694f03f74b..32e8f85fc9 100644
--- a/src/transformers/modeling_flaubert.py
+++ b/src/transformers/modeling_flaubert.py
@@ -240,7 +240,11 @@ class FlaubertModel(XLMModel):
             # self attention
             if not self.pre_norm:
                 attn_outputs = self.attentions[i](
-                    tensor, attn_mask, cache=cache, head_mask=head_mask[i], output_attentions=output_attentions,
+                    tensor,
+                    attn_mask,
+                    cache=cache,
+                    head_mask=head_mask[i],
+                    output_attentions=output_attentions,
                 )
                 attn = attn_outputs[0]
                 if output_attentions:
diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py
index 889c2e2a04..1d4ceb0e2f 100644
--- a/src/transformers/modeling_gpt2.py
+++ b/src/transformers/modeling_gpt2.py
@@ -61,8 +61,7 @@ GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
 
 
 def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
-    """ Load tf checkpoints in a pytorch model
-    """
+    """Load tf checkpoints in a pytorch model"""
     try:
         import re
 
@@ -324,8 +323,8 @@ class Block(nn.Module):
 
 
 class GPT2PreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = GPT2Config
@@ -336,8 +335,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
         super().__init__(*inputs, **kwargs)
 
     def _init_weights(self, module):
-        """ Initialize the weights.
-        """
+        """Initialize the weights."""
         if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
@@ -483,8 +481,8 @@ class GPT2Model(GPT2PreTrainedModel):
         self.wte = new_embeddings
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
         """
         for layer, heads in heads_to_prune.items():
             self.h[layer].attn.prune_heads(heads)
@@ -800,47 +798,47 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         **kwargs,
     ):
         r"""
-        mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
-            Index of the classification token in each input sequence.
-            Selected in the range ``[0, input_ids.size(-1) - 1[``.
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`)
-            Labels for language modeling.
-            Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
-            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
-            computed for labels in ``[0, ..., config.vocab_size]``
-        mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
-            Labels for computing the multiple choice classification loss.
-            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
-            of the input tensors. (see `input_ids` above)
-        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
-            Used to hide legacy arguments that have been deprecated.
+            mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
+                Index of the classification token in each input sequence.
+                Selected in the range ``[0, input_ids.size(-1) - 1[``.
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`)
+                Labels for language modeling.
+                Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
+                Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+                All labels set to ``-100`` are ignored (masked), the loss is only
+                computed for labels in ``[0, ..., config.vocab_size]``
+            mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
+                Labels for computing the multiple choice classification loss.
+                Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
+                of the input tensors. (see `input_ids` above)
+            kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
+                Used to hide legacy arguments that have been deprecated.
 
-    Return:
+        Return:
 
-    Examples::
+        Examples::
 
-        >>> import torch
-        >>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
+            >>> import torch
+            >>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
 
-        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        >>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2, return_dict=True)
+            >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+            >>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2, return_dict=True)
 
-        >>> # Add a [CLS] to the vocabulary (we should train it also!)
-        >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
+            >>> # Add a [CLS] to the vocabulary (we should train it also!)
+            >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
 
-        >>> embedding_layer = model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
+            >>> embedding_layer = model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
 
-        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
-        >>> encoded_choices = [tokenizer.encode(s) for s in choices]
-        >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
+            >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
+            >>> encoded_choices = [tokenizer.encode(s) for s in choices]
+            >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
 
-        >>> input_ids = torch.tensor(encoded_choices).unsqueeze(0)  # Batch size: 1, number of choices: 2
-        >>> mc_token_ids = torch.tensor([cls_token_location])  # Batch size: 1
+            >>> input_ids = torch.tensor(encoded_choices).unsqueeze(0)  # Batch size: 1, number of choices: 2
+            >>> mc_token_ids = torch.tensor([cls_token_location])  # Batch size: 1
 
-        >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
-        >>> lm_logits = outputs.lm_logits
-        >>> mc_logits = outputs.mc_logits
+            >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
+            >>> lm_logits = outputs.lm_logits
+            >>> mc_logits = outputs.mc_logits
 
         """
         if "lm_labels" in kwargs:
diff --git a/src/transformers/modeling_longformer.py b/src/transformers/modeling_longformer.py
index 72bbf3eab9..74623efff7 100755
--- a/src/transformers/modeling_longformer.py
+++ b/src/transformers/modeling_longformer.py
@@ -66,7 +66,7 @@ LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
 
 def _get_question_end_index(input_ids, sep_token_id):
     """
-        Computes the index of the first occurance of `sep_token_id`.
+    Computes the index of the first occurance of `sep_token_id`.
     """
 
     sep_token_indices = (input_ids == sep_token_id).nonzero()
@@ -81,9 +81,9 @@ def _get_question_end_index(input_ids, sep_token_id):
 
 def _compute_global_attention_mask(input_ids, sep_token_id, before_sep_token=True):
     """
-        Computes global attention mask by putting attention on all tokens
-        before `sep_token_id` if `before_sep_token is True` else after
-        `sep_token_id`.
+    Computes global attention mask by putting attention on all tokens
+    before `sep_token_id` if `before_sep_token is True` else after
+    `sep_token_id`.
     """
     question_end_index = _get_question_end_index(input_ids, sep_token_id)
     question_end_index = question_end_index.unsqueeze(dim=1)  # size: batch_size x 1
@@ -135,7 +135,10 @@ class LongformerSelfAttention(nn.Module):
         self.one_sided_attn_window_size = attention_window // 2
 
     def forward(
-        self, hidden_states, attention_mask=None, output_attentions=False,
+        self,
+        hidden_states,
+        attention_mask=None,
+        output_attentions=False,
     ):
         """
         LongformerSelfAttention expects `len(hidden_states)` to be multiple of `attention_window`.
@@ -314,17 +317,17 @@ class LongformerSelfAttention(nn.Module):
     @staticmethod
     def _pad_and_diagonalize(chunked_hidden_states):
         """shift every row 1 step right, converting columns into diagonals.
-           Example:
-                 chunked_hidden_states: [ 0.4983,  2.6918, -0.0071,  1.0492,
-                                          -1.8348,  0.7672,  0.2986,  0.0285,
-                                          -0.7584,  0.4206, -0.0405,  0.1599,
-                                          2.0514, -1.1600,  0.5372,  0.2629 ]
-                 window_overlap = num_rows = 4
-                (pad & diagonilize) =>
-                [ 0.4983,  2.6918, -0.0071,  1.0492, 0.0000,  0.0000,  0.0000
-                  0.0000,  -1.8348,  0.7672,  0.2986,  0.0285, 0.0000,  0.0000
-                  0.0000,  0.0000, -0.7584,  0.4206, -0.0405,  0.1599, 0.0000
-                  0.0000,  0.0000,  0.0000, 2.0514, -1.1600,  0.5372,  0.2629 ]
+        Example:
+              chunked_hidden_states: [ 0.4983,  2.6918, -0.0071,  1.0492,
+                                       -1.8348,  0.7672,  0.2986,  0.0285,
+                                       -0.7584,  0.4206, -0.0405,  0.1599,
+                                       2.0514, -1.1600,  0.5372,  0.2629 ]
+              window_overlap = num_rows = 4
+             (pad & diagonilize) =>
+             [ 0.4983,  2.6918, -0.0071,  1.0492, 0.0000,  0.0000,  0.0000
+               0.0000,  -1.8348,  0.7672,  0.2986,  0.0285, 0.0000,  0.0000
+               0.0000,  0.0000, -0.7584,  0.4206, -0.0405,  0.1599, 0.0000
+               0.0000,  0.0000,  0.0000, 2.0514, -1.1600,  0.5372,  0.2629 ]
         """
         total_num_heads, num_chunks, window_overlap, hidden_dim = chunked_hidden_states.size()
         chunked_hidden_states = F.pad(
@@ -442,7 +445,7 @@ class LongformerSelfAttention(nn.Module):
         self, attn_probs: torch.Tensor, value: torch.Tensor, window_overlap: int
     ):
         """Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors.
-           Returned tensor will be of the same shape as `attn_probs`"""
+        Returned tensor will be of the same shape as `attn_probs`"""
         batch_size, seq_len, num_heads, head_dim = value.size()
 
         assert seq_len % (window_overlap * 2) == 0
@@ -622,7 +625,10 @@ class LongformerSelfAttention(nn.Module):
             is_local_index_no_global_attn_nonzero[0], :, is_local_index_no_global_attn_nonzero[1], :
         ] = -10000.0
 
-        global_attn_scores = global_attn_scores.masked_fill(is_index_masked[:, None, None, :], -10000.0,)
+        global_attn_scores = global_attn_scores.masked_fill(
+            is_index_masked[:, None, None, :],
+            -10000.0,
+        )
 
         global_attn_scores = global_attn_scores.view(batch_size * self.num_heads, max_num_global_attn_indices, seq_len)
 
@@ -676,9 +682,16 @@ class LongformerAttention(nn.Module):
         self.pruned_heads = self.pruned_heads.union(heads)
 
     def forward(
-        self, hidden_states, attention_mask=None, output_attentions=False,
+        self,
+        hidden_states,
+        attention_mask=None,
+        output_attentions=False,
     ):
-        self_outputs = self.self(hidden_states, attention_mask, output_attentions,)
+        self_outputs = self.self(
+            hidden_states,
+            attention_mask,
+            output_attentions,
+        )
         attn_output = self.output(self_outputs[0], hidden_states)
         outputs = (attn_output,) + self_outputs[1:]  # add attentions if we output them
         return outputs
@@ -694,9 +707,16 @@ class LongformerLayer(nn.Module):
         self.seq_len_dim = 1
 
     def forward(
-        self, hidden_states, attention_mask=None, output_attentions=False,
+        self,
+        hidden_states,
+        attention_mask=None,
+        output_attentions=False,
     ):
-        self_attn_outputs = self.attention(hidden_states, attention_mask, output_attentions=output_attentions,)
+        self_attn_outputs = self.attention(
+            hidden_states,
+            attention_mask,
+            output_attentions=output_attentions,
+        )
         attn_output = self_attn_outputs[0]
         outputs = self_attn_outputs[1:]  # add self attentions if we output attention weights
 
@@ -741,10 +761,16 @@ class LongformerEncoder(nn.Module):
                     return custom_forward
 
                 layer_outputs = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(layer_module), hidden_states, attention_mask,
+                    create_custom_forward(layer_module),
+                    hidden_states,
+                    attention_mask,
                 )
             else:
-                layer_outputs = layer_module(hidden_states, attention_mask, output_attentions,)
+                layer_outputs = layer_module(
+                    hidden_states,
+                    attention_mask,
+                    output_attentions,
+                )
             hidden_states = layer_outputs[0]
 
             if output_attentions:
@@ -762,9 +788,9 @@ class LongformerEncoder(nn.Module):
 
 
 class LongformerPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained
-        models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained
+    models.
     """
 
     config_class = LongformerConfig
@@ -896,9 +922,9 @@ class LongformerModel(LongformerPreTrainedModel):
         self.embeddings.word_embeddings = value
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        See base class PreTrainedModel
         """
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
@@ -938,7 +964,9 @@ class LongformerModel(LongformerPreTrainedModel):
                 position_ids = F.pad(position_ids, (0, padding_len), value=pad_token_id)
             if inputs_embeds is not None:
                 input_ids_padding = inputs_embeds.new_full(
-                    (batch_size, padding_len), self.config.pad_token_id, dtype=torch.long,
+                    (batch_size, padding_len),
+                    self.config.pad_token_id,
+                    dtype=torch.long,
                 )
                 inputs_embeds_padding = self.embeddings(input_ids_padding)
                 inputs_embeds = torch.cat([inputs_embeds, inputs_embeds_padding], dim=-2)
@@ -976,28 +1004,28 @@ class LongformerModel(LongformerPreTrainedModel):
     ):
         r"""
 
-    Returns:
+        Returns:
 
-    Examples::
+        Examples::
 
-        >>> import torch
-        >>> from transformers import LongformerModel, LongformerTokenizer
+            >>> import torch
+            >>> from transformers import LongformerModel, LongformerTokenizer
 
-        >>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096', return_dict=True)
-        >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
+            >>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096', return_dict=True)
+            >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
 
-        >>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
-        >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1
+            >>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
+            >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1
 
-        >>> # Attention mask values -- 0: no attention, 1: local attention, 2: global attention
-        >>> attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
-        >>> attention_mask[:, [1, 4, 21,]] = 2  # Set global attention based on the task. For example,
-        ...                                     # classification: the <s> token
-        ...                                     # QA: question tokens
-        ...                                     # LM: potentially on the beginning of sentences and paragraphs
-        >>> outputs = model(input_ids, attention_mask=attention_mask)
-        >>> sequence_output = outputs.last_hidden_state
-        >>> pooled_output = outputs.pooler_output
+            >>> # Attention mask values -- 0: no attention, 1: local attention, 2: global attention
+            >>> attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
+            >>> attention_mask[:, [1, 4, 21,]] = 2  # Set global attention based on the task. For example,
+            ...                                     # classification: the <s> token
+            ...                                     # QA: question tokens
+            ...                                     # LM: potentially on the beginning of sentences and paragraphs
+            >>> outputs = model(input_ids, attention_mask=attention_mask)
+            >>> sequence_output = outputs.last_hidden_state
+            >>> pooled_output = outputs.pooler_output
         """
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1102,32 +1130,32 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
         **kwargs
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the masked language modeling loss.
-            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
-            in ``[0, ..., config.vocab_size]``
-        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
-            Used to hide legacy arguments that have been deprecated.
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the masked language modeling loss.
+                Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+                Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+                in ``[0, ..., config.vocab_size]``
+            kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
+                Used to hide legacy arguments that have been deprecated.
 
-    Returns:
+        Returns:
 
-    Examples::
+        Examples::
 
-        >>> import torch
-        >>> from transformers import LongformerForMaskedLM, LongformerTokenizer
+            >>> import torch
+            >>> from transformers import LongformerForMaskedLM, LongformerTokenizer
 
-        >>> model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096', return_dict=True)
-        >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
+            >>> model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096', return_dict=True)
+            >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
 
-        >>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
-        >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1
+            >>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
+            >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1
 
-        >>> attention_mask = None  # default is local attention everywhere, which is a good choice for MaskedLM
-        ...                        # check ``LongformerModel.forward`` for more details how to set `attention_mask`
-        >>> outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
-        >>> loss = outputs.loss
-        >>> prediction_logits = output.logits
+            >>> attention_mask = None  # default is local attention everywhere, which is a good choice for MaskedLM
+            ...                        # check ``LongformerModel.forward`` for more details how to set `attention_mask`
+            >>> outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
+            >>> loss = outputs.loss
+            >>> prediction_logits = output.logits
         """
 
         if "masked_lm_labels" in kwargs:
@@ -1252,7 +1280,10 @@ class LongformerForSequenceClassification(BertPreTrainedModel):
             return ((loss,) + output) if loss is not None else output
 
         return SequenceClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -1310,39 +1341,39 @@ class LongformerForQuestionAnswering(BertPreTrainedModel):
         return_dict=None,
     ):
         r"""
-        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-    Returns:
+            start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for position (index) of the start of the labelled span for computing the token classification loss.
+                Positions are clamped to the length of the sequence (`sequence_length`).
+                Position outside of the sequence are not taken into account for computing the loss.
+            end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for position (index) of the end of the labelled span for computing the token classification loss.
+                Positions are clamped to the length of the sequence (`sequence_length`).
+                Position outside of the sequence are not taken into account for computing the loss.
+        Returns:
 
-    Examples::
+        Examples::
 
-        >>> from transformers import LongformerTokenizer, LongformerForQuestionAnswering
-        >>> import torch
+            >>> from transformers import LongformerTokenizer, LongformerForQuestionAnswering
+            >>> import torch
 
-        >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
-        >>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa", return_dict=True)
+            >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
+            >>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa", return_dict=True)
 
-        >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        >>> encoding = tokenizer(question, text, return_tensors="pt")
-        >>> input_ids = encoding["input_ids"]
+            >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
+            >>> encoding = tokenizer(question, text, return_tensors="pt")
+            >>> input_ids = encoding["input_ids"]
 
-        >>> # default is local attention everywhere
-        >>> # the forward method will automatically set global attention on question tokens
-        >>> attention_mask = encoding["attention_mask"]
+            >>> # default is local attention everywhere
+            >>> # the forward method will automatically set global attention on question tokens
+            >>> attention_mask = encoding["attention_mask"]
 
-        >>> outputs = model(input_ids, attention_mask=attention_mask)
-        >>> start_logits = outputs.start_logits
-        >>> end_logits = outputs.end_logits
-        >>> all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
+            >>> outputs = model(input_ids, attention_mask=attention_mask)
+            >>> start_logits = outputs.start_logits
+            >>> end_logits = outputs.end_logits
+            >>> all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
 
-        >>> answer_tokens = all_tokens[torch.argmax(start_logits) :torch.argmax(end_logits)+1]
-        >>> answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token
+            >>> answer_tokens = all_tokens[torch.argmax(start_logits) :torch.argmax(end_logits)+1]
+            >>> answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token
 
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1487,7 +1518,10 @@ class LongformerForTokenClassification(BertPreTrainedModel):
             return ((loss,) + output) if loss is not None else output
 
         return TokenClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -1592,5 +1626,8 @@ class LongformerForMultipleChoice(BertPreTrainedModel):
             return ((loss,) + output) if loss is not None else output
 
         return MultipleChoiceModelOutput(
-            loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
diff --git a/src/transformers/modeling_mmbt.py b/src/transformers/modeling_mmbt.py
index faf9155a9e..a4293a3f33 100644
--- a/src/transformers/modeling_mmbt.py
+++ b/src/transformers/modeling_mmbt.py
@@ -32,8 +32,7 @@ _CONFIG_FOR_DOC = "MMBTConfig"
 
 
 class ModalEmbeddings(nn.Module):
-    """Generic Modal Embeddings which takes in an encoder, and a transformer embedding.
-    """
+    """Generic Modal Embeddings which takes in an encoder, and a transformer embedding."""
 
     def __init__(self, config, encoder, embeddings):
         super().__init__()
@@ -154,7 +153,8 @@ MMBT_INPUTS_DOCSTRING = r"""    Inputs:
 
 
 @add_start_docstrings(
-    "The bare MMBT Model outputting raw hidden-states without any specific head on top.", MMBT_START_DOCSTRING,
+    "The bare MMBT Model outputting raw hidden-states without any specific head on top.",
+    MMBT_START_DOCSTRING,
 )
 class MMBTModel(nn.Module, ModuleUtilsMixin):
     def __init__(self, config, transformer, encoder):
@@ -288,34 +288,34 @@ class MMBTModel(nn.Module, ModuleUtilsMixin):
 )
 class MMBTForClassification(nn.Module):
     r"""
-            **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
-                Labels for computing the sequence classification/regression loss.
-                Indices should be in ``[0, ..., config.num_labels - 1]``.
-                If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
-                If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for computing the sequence classification/regression loss.
+            Indices should be in ``[0, ..., config.num_labels - 1]``.
+            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
+            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
 
-        Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-            **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-                Classification (or regression if config.num_labels==1) loss.
-            **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
-                Classification (or regression if config.num_labels==1) scores (before SoftMax).
-            **hidden_states**: (`optional`, returned when ``output_hidden_states=True``)
-                list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
-                of shape ``(batch_size, sequence_length, hidden_size)``:
-                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-            **attentions**: (`optional`, returned when ``output_attentions=True``)
-                list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification (or regression if config.num_labels==1) loss.
+        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
+            Classification (or regression if config.num_labels==1) scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
 
-        Examples::
+    Examples::
 
-            # For example purposes. Not runnable.
-            transformer = BertModel.from_pretrained('bert-base-uncased')
-            encoder = ImageEncoder(args)
-            model = MMBTForClassification(config, transformer, encoder)
-            outputs = model(input_modal, input_ids, labels=labels)
-            loss, logits = outputs[:2]
-        """
+        # For example purposes. Not runnable.
+        transformer = BertModel.from_pretrained('bert-base-uncased')
+        encoder = ImageEncoder(args)
+        model = MMBTForClassification(config, transformer, encoder)
+        outputs = model(input_modal, input_ids, labels=labels)
+        loss, logits = outputs[:2]
+    """
 
     def __init__(self, config, transformer, encoder):
         super().__init__()
@@ -378,5 +378,8 @@ class MMBTForClassification(nn.Module):
             return ((loss,) + output) if loss is not None else output
 
         return SequenceClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
diff --git a/src/transformers/modeling_mobilebert.py b/src/transformers/modeling_mobilebert.py
index 0c39f61a52..aa44afaeab 100644
--- a/src/transformers/modeling_mobilebert.py
+++ b/src/transformers/modeling_mobilebert.py
@@ -64,8 +64,7 @@ MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = ["google/mobilebert-uncased"]
 
 
 def load_tf_weights_in_mobilebert(model, config, tf_checkpoint_path):
-    """ Load tf checkpoints in a pytorch model.
-    """
+    """Load tf checkpoints in a pytorch model."""
     try:
         import re
 
@@ -161,8 +160,7 @@ NORM2FN = {"layer_norm": torch.nn.LayerNorm, "no_norm": NoNorm}
 
 
 class MobileBertEmbeddings(nn.Module):
-    """Construct the embeddings from word, position and token_type embeddings.
-    """
+    """Construct the embeddings from word, position and token_type embeddings."""
 
     def __init__(self, config):
         super().__init__()
@@ -663,8 +661,8 @@ class MobileBertPreTrainingHeads(nn.Module):
 
 
 class MobileBertPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = MobileBertConfig
@@ -788,7 +786,7 @@ MOBILEBERT_INPUTS_DOCSTRING = r"""
 )
 class MobileBertModel(MobileBertPreTrainedModel):
     """
-        https://arxiv.org/pdf/2004.02984.pdf
+    https://arxiv.org/pdf/2004.02984.pdf
     """
 
     authorized_missing_keys = [r"position_ids"]
@@ -809,9 +807,9 @@ class MobileBertModel(MobileBertPreTrainedModel):
         self.embeddings.word_embeddings = value
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        See base class PreTrainedModel
         """
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
@@ -965,31 +963,31 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
         return_dict=None,
     ):
         r"""
-        labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
-            Labels for computing the masked language modeling loss.
-            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
-            in ``[0, ..., config.vocab_size]``
-        next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
-            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
-            Indices should be in ``[0, 1]``.
-            ``0`` indicates sequence B is a continuation of sequence A,
-            ``1`` indicates sequence B is a random sequence.
-    Returns:
+            labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
+                Labels for computing the masked language modeling loss.
+                Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+                Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+                in ``[0, ..., config.vocab_size]``
+            next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
+                Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
+                Indices should be in ``[0, 1]``.
+                ``0`` indicates sequence B is a continuation of sequence A,
+                ``1`` indicates sequence B is a random sequence.
+        Returns:
 
-    Examples::
+        Examples::
 
-        >>> from transformers import MobileBertTokenizer, MobileBertForPreTraining
-        >>> import torch
+            >>> from transformers import MobileBertTokenizer, MobileBertForPreTraining
+            >>> import torch
 
-        >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
-        >>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased", return_dict=True)
+            >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
+            >>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased", return_dict=True)
 
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids)
+            >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+            >>> outputs = model(input_ids)
 
-        >>> prediction_logits = outptus.prediction_logits
-        >>> seq_relationship_logits = outputs.seq_relationship_logits
+            >>> prediction_logits = outptus.prediction_logits
+            >>> seq_relationship_logits = outputs.seq_relationship_logits
 
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1176,29 +1174,29 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel):
         return_dict=None,
     ):
         r"""
-        next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
-            Indices should be in ``[0, 1]``.
-            ``0`` indicates sequence B is a continuation of sequence A,
-            ``1`` indicates sequence B is a random sequence.
+            next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
+                Indices should be in ``[0, 1]``.
+                ``0`` indicates sequence B is a continuation of sequence A,
+                ``1`` indicates sequence B is a random sequence.
 
-    Returns:
+        Returns:
 
-    Examples::
+        Examples::
 
-        >>> from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
-        >>> import torch
+            >>> from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
+            >>> import torch
 
-        >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
-        >>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased', return_dict=True)
+            >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
+            >>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased', return_dict=True)
 
-        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-        >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
+            >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+            >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+            >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
 
-        >>> outputs = model(**encoding, next_sentence_label=torch.LongTensor([1]))
-        >>> loss = outputs.loss
-        >>> logits = outputs.logits
+            >>> outputs = model(**encoding, next_sentence_label=torch.LongTensor([1]))
+            >>> loss = outputs.loss
+            >>> logits = outputs.logits
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
@@ -1308,7 +1306,10 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
             return ((loss,) + output) if loss is not None else output
 
         return SequenceClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -1491,7 +1492,10 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
             return ((loss,) + output) if loss is not None else output
 
         return MultipleChoiceModelOutput(
-            loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -1574,5 +1578,8 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
             return ((loss,) + output) if loss is not None else output
 
         return TokenClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
diff --git a/src/transformers/modeling_openai.py b/src/transformers/modeling_openai.py
index 846c8ff188..1920880b28 100644
--- a/src/transformers/modeling_openai.py
+++ b/src/transformers/modeling_openai.py
@@ -59,8 +59,7 @@ OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
 
 
 def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
-    """ Load tf pre-trained weights in a pytorch model (from NumPy arrays here)
-    """
+    """Load tf pre-trained weights in a pytorch model (from NumPy arrays here)"""
     import re
 
     import numpy as np
@@ -257,7 +256,10 @@ class Block(nn.Module):
 
     def forward(self, x, attention_mask=None, head_mask=None, output_attentions=False):
         attn_outputs = self.attn(
-            x, attention_mask=attention_mask, head_mask=head_mask, output_attentions=output_attentions,
+            x,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
         )
         a = attn_outputs[0]
 
@@ -270,8 +272,8 @@ class Block(nn.Module):
 
 
 class OpenAIGPTPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = OpenAIGPTConfig
@@ -280,8 +282,7 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
     authorized_missing_keys = [r"position_ids"]
 
     def _init_weights(self, module):
-        """ Initialize the weights.
-        """
+        """Initialize the weights."""
         if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
@@ -408,8 +409,8 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         self.tokens_embed = new_embeddings
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
         """
         for layer, heads in heads_to_prune.items():
             self.h[layer].attn.prune_heads(heads)
@@ -506,7 +507,9 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
             return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
 
         return BaseModelOutput(
-            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions,
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
         )
 
 
@@ -632,41 +635,41 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         **kwargs
     ):
         r"""
-        mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
-            Index of the classification token in each input sequence.
-            Selected in the range ``[0, input_ids.size(-1) - 1]``.
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`)
-            Labels for language modeling.
-            Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
-            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
-            computed for labels in ``[0, ..., config.vocab_size]``
-        mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
-            Labels for computing the multiple choice classification loss.
-            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
-            of the input tensors. (see `input_ids` above)
-        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
-            Used to hide legacy arguments that have been deprecated.
+            mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
+                Index of the classification token in each input sequence.
+                Selected in the range ``[0, input_ids.size(-1) - 1]``.
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`)
+                Labels for language modeling.
+                Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
+                Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+                All labels set to ``-100`` are ignored (masked), the loss is only
+                computed for labels in ``[0, ..., config.vocab_size]``
+            mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
+                Labels for computing the multiple choice classification loss.
+                Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
+                of the input tensors. (see `input_ids` above)
+            kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
+                Used to hide legacy arguments that have been deprecated.
 
-    Return:
+        Return:
 
-    Examples::
+        Examples::
 
-        from transformers import OpenAIGPTTokenizer, OpenAIGPTDoubleHeadsModel
-        import torch
+            from transformers import OpenAIGPTTokenizer, OpenAIGPTDoubleHeadsModel
+            import torch
 
-        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt', return_dict=True)
-        tokenizer.add_special_tokens({'cls_token': '[CLS]'})  # Add a [CLS] to the vocabulary (we should train it also!)
-        model.resize_token_embeddings(len(tokenizer))
+            tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+            model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt', return_dict=True)
+            tokenizer.add_special_tokens({'cls_token': '[CLS]'})  # Add a [CLS] to the vocabulary (we should train it also!)
+            model.resize_token_embeddings(len(tokenizer))
 
-        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
-        input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-        mc_token_ids = torch.tensor([input_ids.size(-1)-1, input_ids.size(-1)-1]).unsqueeze(0)  # Batch size 1
+            choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
+            input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+            mc_token_ids = torch.tensor([input_ids.size(-1)-1, input_ids.size(-1)-1]).unsqueeze(0)  # Batch size 1
 
-        outputs = model(input_ids, mc_token_ids=mc_token_ids)
-        lm_logits = outputs.lm_logits
-        mc_logits = outputs.mc_logits
+            outputs = model(input_ids, mc_token_ids=mc_token_ids)
+            lm_logits = outputs.lm_logits
+            mc_logits = outputs.mc_logits
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         if "lm_labels" in kwargs:
diff --git a/src/transformers/modeling_reformer.py b/src/transformers/modeling_reformer.py
index 89971f1da9..8132567533 100755
--- a/src/transformers/modeling_reformer.py
+++ b/src/transformers/modeling_reformer.py
@@ -78,7 +78,8 @@ ReformerBackwardOutput = namedtuple(
     "ReformerBackwardOutput", ["attn_output", "hidden_states", "grad_attn_output", "grad_hidden_states"]
 )
 ReformerEncoderOutput = namedtuple(
-    "ReformerEncoderOutput", ["hidden_states", "all_hidden_states", "all_attentions", "past_buckets_states"],
+    "ReformerEncoderOutput",
+    ["hidden_states", "all_hidden_states", "all_attentions", "past_buckets_states"],
 )
 
 
@@ -192,7 +193,9 @@ class AxialPositionEmbeddings(nn.Module):
             assert (
                 reduce(mul, self.axial_pos_shape) >= sequence_length
             ), "Make sure that config.axial_pos_shape factors: {} multiply at least to max(sequence_length, least_common_mult_chunk_length): max({}, {})".format(
-                self.axial_pos_shape, sequence_length, self.least_common_mult_chunk_length,
+                self.axial_pos_shape,
+                sequence_length,
+                self.least_common_mult_chunk_length,
             )
 
             # compute how many columns are needed
@@ -218,8 +221,7 @@ class AxialPositionEmbeddings(nn.Module):
 
 
 class PositionEmbeddings(nn.Module):
-    """Constructs conventional position embeddings of shape `[max_pos_embeddings, hidden_size]`.
-    """
+    """Constructs conventional position embeddings of shape `[max_pos_embeddings, hidden_size]`."""
 
     def __init__(self, config):
         super().__init__()
@@ -233,8 +235,7 @@ class PositionEmbeddings(nn.Module):
 
 
 class ReformerEmbeddings(nn.Module):
-    """Construct the embeddings from word, position and token_type embeddings.
-    """
+    """Construct the embeddings from word, position and token_type embeddings."""
 
     def __init__(self, config):
         super().__init__()
@@ -285,16 +286,16 @@ class EfficientAttentionMixin:
     """
 
     def _look_adjacent(self, vectors, num_chunks_before, num_chunks_after):
-        """ Used to implement attention between consecutive chunks.
+        """Used to implement attention between consecutive chunks.
 
-            Args:
-                vectors: array of shape [batch_size, num_attention_heads, n_chunks, chunk_len, ...]
-                num_chunks_before: chunks before current chunk to include in attention
-                num_chunks_after: chunks after current chunk to include in attention
+        Args:
+            vectors: array of shape [batch_size, num_attention_heads, n_chunks, chunk_len, ...]
+            num_chunks_before: chunks before current chunk to include in attention
+            num_chunks_after: chunks after current chunk to include in attention
 
-            Returns:
-                tensor of shape [num_chunks, N * chunk_length, ...], where
-                N = (1 + num_chunks_before + num_chunks_after).
+        Returns:
+            tensor of shape [num_chunks, N * chunk_length, ...], where
+            N = (1 + num_chunks_before + num_chunks_after).
         """
         if num_chunks_before == 0 and num_chunks_after == 0:
             return vectors
@@ -309,7 +310,7 @@ class EfficientAttentionMixin:
 
     def _split_hidden_size_dim(self, x, num_attn_heads, attn_head_size):
         """
-            splits hidden_size dim into attn_head_size and num_attn_heads
+        splits hidden_size dim into attn_head_size and num_attn_heads
         """
         new_x_shape = x.size()[:-1] + (num_attn_heads, attn_head_size)
         x = x.view(*new_x_shape)
@@ -317,14 +318,14 @@ class EfficientAttentionMixin:
 
     def _merge_hidden_size_dims(self, x, num_attn_heads, attn_head_size):
         """
-            merges attn_head_size dim and num_attn_heads dim into hidden_size
+        merges attn_head_size dim and num_attn_heads dim into hidden_size
         """
         x = x.permute(0, 2, 1, 3)
         return torch.reshape(x, (x.size()[0], -1, num_attn_heads * attn_head_size))
 
     def _split_seq_length_dim_to(self, vectors, dim_factor_1, dim_factor_2, num_attn_heads, attn_head_size=None):
         """
-            splits sequence length dim of vectors into `dim_factor_1` and `dim_factor_2` dims
+        splits sequence length dim of vectors into `dim_factor_1` and `dim_factor_2` dims
         """
         batch_size = vectors.shape[0]
         split_dim_shape = (batch_size, num_attn_heads, dim_factor_1, dim_factor_2)
@@ -418,10 +419,18 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
                 # split key & value vectors by num hashes to apply
                 # self attention on each separately
                 query_key_vectors = self._split_seq_length_dim_to(
-                    query_key_vectors, num_hashes, -1, self.num_attention_heads, self.attention_head_size,
+                    query_key_vectors,
+                    num_hashes,
+                    -1,
+                    self.num_attention_heads,
+                    self.attention_head_size,
                 )
                 value_vectors = self._split_seq_length_dim_to(
-                    value_vectors, num_hashes, -1, self.num_attention_heads, self.attention_head_size,
+                    value_vectors,
+                    num_hashes,
+                    -1,
+                    self.num_attention_heads,
+                    self.attention_head_size,
                 )
                 # repeat query vectors across hash dimension
                 query_vectors = query_vectors.unsqueeze(2).repeat(1, 1, num_hashes, 1, 1)
@@ -496,10 +505,18 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
             query_key_vectors = self._gather_by_expansion(query_key_vectors, sorted_bucket_idx_per_hash, num_hashes)
             value_vectors = self._gather_by_expansion(value_vectors, sorted_bucket_idx_per_hash, num_hashes)
             query_key_vectors = self._split_seq_length_dim_to(
-                query_key_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size,
+                query_key_vectors,
+                -1,
+                self.chunk_length,
+                self.num_attention_heads,
+                self.attention_head_size,
             )
             value_vectors = self._split_seq_length_dim_to(
-                value_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size,
+                value_vectors,
+                -1,
+                self.chunk_length,
+                self.num_attention_heads,
+                self.attention_head_size,
             )
 
             if self.chunk_length is None:
@@ -548,10 +565,18 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
             # sum up all hash rounds
             if num_hashes > 1:
                 out_vectors = self._split_seq_length_dim_to(
-                    out_vectors, num_hashes, sequence_length, self.num_attention_heads, self.attention_head_size,
+                    out_vectors,
+                    num_hashes,
+                    sequence_length,
+                    self.num_attention_heads,
+                    self.attention_head_size,
                 )
                 logits = self._split_seq_length_dim_to(
-                    logits, num_hashes, sequence_length, self.num_attention_heads, self.attention_head_size,
+                    logits,
+                    num_hashes,
+                    sequence_length,
+                    self.num_attention_heads,
+                    self.attention_head_size,
                 ).unsqueeze(-1)
 
                 probs_vectors = torch.exp(logits - torch.logsumexp(logits, dim=2, keepdim=True))
@@ -697,7 +722,8 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
 
         # factorize `num_buckets` if `num_buckets` becomes too large
         num_buckets_limit = 2 * max(
-            int((self.max_position_embeddings // self.chunk_length) ** (0.5)), self.chunk_length,
+            int((self.max_position_embeddings // self.chunk_length) ** (0.5)),
+            self.chunk_length,
         )
         if num_buckets > num_buckets_limit:
             num_buckets = [2 ** (num_buckets_pow_2 // 2), 2 ** (num_buckets_pow_2 - num_buckets_pow_2 // 2)]
@@ -946,7 +972,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
 
     def _len_and_dim_norm(self, vectors):
         """
-            length and attention head size dim normalization
+        length and attention head size dim normalization
         """
         vectors = self._len_norm(vectors)
         vectors = vectors * torch.rsqrt(
@@ -956,7 +982,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
 
     def _len_norm(self, x, epsilon=1e-6):
         """
-            length normalization
+        length normalization
         """
         variance = torch.mean(x ** 2, -1, keepdim=True)
         norm_x = x * torch.rsqrt(variance + epsilon)
@@ -964,7 +990,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
 
     def _gather_by_expansion(self, vectors, idxs, num_hashes):
         """
-            expand dims of idxs and vectors for all hashes and gather
+        expand dims of idxs and vectors for all hashes and gather
         """
         expanded_idxs = idxs.unsqueeze(-1).expand(-1, -1, -1, self.attention_head_size)
         vectors = vectors.repeat(1, 1, num_hashes, 1)
@@ -973,11 +999,11 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
 
 class ReverseSort(Function):
     """
-        After chunked attention is applied which sorted clusters,
-        original ordering has to be restored.
-        Since customized backward function is used for Reformer,
-        the gradients of the output vectors have to be explicitely
-        sorted here.
+    After chunked attention is applied which sorted clusters,
+    original ordering has to be restored.
+    Since customized backward function is used for Reformer,
+    the gradients of the output vectors have to be explicitely
+    sorted here.
     """
 
     @staticmethod
@@ -1113,13 +1139,25 @@ class LocalSelfAttention(nn.Module, EfficientAttentionMixin):
             # chunk vectors
             # B x Num_Attn_Head x Seq_Len // chunk_len x chunk_len  x  attn_head_size
             query_vectors = self._split_seq_length_dim_to(
-                query_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size,
+                query_vectors,
+                -1,
+                self.chunk_length,
+                self.num_attention_heads,
+                self.attention_head_size,
             )
             key_vectors = self._split_seq_length_dim_to(
-                key_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size,
+                key_vectors,
+                -1,
+                self.chunk_length,
+                self.num_attention_heads,
+                self.attention_head_size,
             )
             value_vectors = self._split_seq_length_dim_to(
-                value_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size,
+                value_vectors,
+                -1,
+                self.chunk_length,
+                self.num_attention_heads,
+                self.attention_head_size,
             )
 
             # chunk indices
@@ -1179,7 +1217,12 @@ class LocalSelfAttention(nn.Module, EfficientAttentionMixin):
         if not do_standard_self_attention:
             out_vectors = out_vectors.flatten(start_dim=2, end_dim=3)
 
-        assert out_vectors.shape == (batch_size, self.num_attention_heads, sequence_length, self.attention_head_size,)
+        assert out_vectors.shape == (
+            batch_size,
+            self.num_attention_heads,
+            sequence_length,
+            self.attention_head_size,
+        )
 
         out_vectors = self._merge_hidden_size_dims(out_vectors, self.num_attention_heads, self.attention_head_size)
 
@@ -1321,7 +1364,9 @@ class ReformerAttention(nn.Module):
         attention_output = self.output(self_attention_outputs.hidden_states)
 
         return AttentionOutput(
-            hidden_states=attention_output, attention_probs=self_attention_outputs.attention_probs, buckets=buckets,
+            hidden_states=attention_output,
+            attention_probs=self_attention_outputs.attention_probs,
+            buckets=buckets,
         )
 
 
@@ -1369,7 +1414,10 @@ class ChunkReformerFeedForward(nn.Module):
 
     def forward(self, attention_output):
         return apply_chunking_to_forward(
-            self.forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output,
+            self.forward_chunk,
+            self.chunk_size_feed_forward,
+            self.seq_len_dim,
+            attention_output,
         )
 
     def forward_chunk(self, hidden_states):
@@ -1391,11 +1439,11 @@ class ReformerLayer(nn.Module):
 
     def _init_attention_seed(self):
         """
-            This function sets a new seed for the
-            attention layer to make dropout deterministic
-            for both forward calls: 1 normal forward
-            call and 1 forward call in backward
-            to recalculate activations.
+        This function sets a new seed for the
+        attention layer to make dropout deterministic
+        for both forward calls: 1 normal forward
+        call and 1 forward call in backward
+        to recalculate activations.
         """
 
         # randomize seeds
@@ -1412,11 +1460,11 @@ class ReformerLayer(nn.Module):
 
     def _init_feed_forward_seed(self):
         """
-            This function sets a new seed for the
-            feed forward layer to make dropout deterministic
-            for both forward calls: 1 normal forward
-            call and 1 forward call in backward
-            to recalculate activations.
+        This function sets a new seed for the
+        feed forward layer to make dropout deterministic
+        for both forward calls: 1 normal forward
+        call and 1 forward call in backward
+        to recalculate activations.
         """
         # randomize seeds
         # use cuda generator if available
@@ -1520,7 +1568,10 @@ class ReformerLayer(nn.Module):
             # f(X_2)
             # use cached buckets for backprob if buckets not None for LSHSelfAttention
             output = self.attention(
-                hidden_states=hidden_states, head_mask=head_mask, attention_mask=attention_mask, buckets=buckets,
+                hidden_states=hidden_states,
+                head_mask=head_mask,
+                attention_mask=attention_mask,
+                buckets=buckets,
             ).hidden_states
             output.backward(grad_attn_output, retain_graph=True)
 
@@ -1738,8 +1789,8 @@ class ReformerOnlyLMHead(nn.Module):
 
 
 class ReformerPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = ReformerConfig
@@ -1947,9 +1998,9 @@ class ReformerModel(ReformerPreTrainedModel):
         self.embeddings.word_embeddings = value
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        See base class PreTrainedModel
         """
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
@@ -2099,7 +2150,10 @@ class ReformerModel(ReformerPreTrainedModel):
         )
 
         padded_input_ids = torch.full(
-            (input_shape[0], padding_length), self.config.pad_token_id, device=device, dtype=torch.long,
+            (input_shape[0], padding_length),
+            self.config.pad_token_id,
+            device=device,
+            dtype=torch.long,
         )
 
         # Extend `attention_mask`
@@ -2369,11 +2423,11 @@ class ReformerForSequenceClassification(ReformerPreTrainedModel):
         return_dict=None,
     ):
         r"""
-            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-                Labels for computing the sequence classification/regression loss.
-                Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
-                If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
-                If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+            Labels for computing the sequence classification/regression loss.
+            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
+            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
@@ -2407,7 +2461,10 @@ class ReformerForSequenceClassification(ReformerPreTrainedModel):
             return ((loss,) + output) if loss is not None else output
 
         return SequenceClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
diff --git a/src/transformers/modeling_retribert.py b/src/transformers/modeling_retribert.py
index 9d889ce07b..285e79b7fc 100644
--- a/src/transformers/modeling_retribert.py
+++ b/src/transformers/modeling_retribert.py
@@ -40,8 +40,8 @@ RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
 
 # INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
 class RetriBertPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = RetriBertConfig
@@ -73,7 +73,8 @@ RETRIBERT_START_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    """Bert Based model to embed queries or document for document retreival. """, RETRIBERT_START_DOCSTRING,
+    """Bert Based model to embed queries or document for document retreival. """,
+    RETRIBERT_START_DOCSTRING,
 )
 class RetriBertModel(RetriBertPreTrainedModel):
     def __init__(self, config):
@@ -91,7 +92,11 @@ class RetriBertModel(RetriBertPreTrainedModel):
         self.init_weights()
 
     def embed_sentences_checkpointed(
-        self, input_ids, attention_mask, sent_encoder, checkpoint_batch_size=-1,
+        self,
+        input_ids,
+        attention_mask,
+        sent_encoder,
+        checkpoint_batch_size=-1,
     ):
         # reproduces BERT forward pass with checkpointing
         if checkpoint_batch_size < 0 or input_ids.shape[0] < checkpoint_batch_size:
@@ -108,7 +113,11 @@ class RetriBertModel(RetriBertPreTrainedModel):
 
             # define function for cehckpointing
             def partial_encode(*inputs):
-                encoder_outputs = sent_encoder.encoder(inputs[0], attention_mask=inputs[1], head_mask=head_mask,)
+                encoder_outputs = sent_encoder.encoder(
+                    inputs[0],
+                    attention_mask=inputs[1],
+                    head_mask=head_mask,
+                )
                 sequence_output = encoder_outputs[0]
                 pooled_output = sent_encoder.pooler(sequence_output)
                 return pooled_output
@@ -127,13 +136,24 @@ class RetriBertModel(RetriBertPreTrainedModel):
             return torch.cat(pooled_output_list, dim=0)
 
     def embed_questions(
-        self, input_ids, attention_mask=None, checkpoint_batch_size=-1,
+        self,
+        input_ids,
+        attention_mask=None,
+        checkpoint_batch_size=-1,
     ):
-        q_reps = self.embed_sentences_checkpointed(input_ids, attention_mask, self.bert_query, checkpoint_batch_size,)
+        q_reps = self.embed_sentences_checkpointed(
+            input_ids,
+            attention_mask,
+            self.bert_query,
+            checkpoint_batch_size,
+        )
         return self.project_query(q_reps)
 
     def embed_answers(
-        self, input_ids, attention_mask=None, checkpoint_batch_size=-1,
+        self,
+        input_ids,
+        attention_mask=None,
+        checkpoint_batch_size=-1,
     ):
         a_reps = self.embed_sentences_checkpointed(
             input_ids,
@@ -147,33 +167,33 @@ class RetriBertModel(RetriBertPreTrainedModel):
         self, input_ids_query, attention_mask_query, input_ids_doc, attention_mask_doc, checkpoint_batch_size=-1
     ):
         r"""
-    Args:
-        input_ids_query (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary for the queries in a batch.
+        Args:
+            input_ids_query (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary for the queries in a batch.
 
-            Indices can be obtained using :class:`transformers.RetriBertTokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.__call__` for details.
+                Indices can be obtained using :class:`transformers.RetriBertTokenizer`.
+                See :func:`transformers.PreTrainedTokenizer.encode` and
+                :func:`transformers.PreTrainedTokenizer.__call__` for details.
 
-            `What are input IDs? <../glossary.html#input-ids>`__
-        attention_mask_query (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on queries padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+                `What are input IDs? <../glossary.html#input-ids>`__
+            attention_mask_query (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+                Mask to avoid performing attention on queries padding token indices.
+                Mask values selected in ``[0, 1]``:
+                ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
 
-            `What are attention masks? <../glossary.html#attention-mask>`__
-        input_ids_doc (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary for the documents in a batch.
-        attention_mask_doc (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on documents padding token indices.
+                `What are attention masks? <../glossary.html#attention-mask>`__
+            input_ids_doc (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary for the documents in a batch.
+            attention_mask_doc (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+                Mask to avoid performing attention on documents padding token indices.
 
-        checkpoint_batch_size (:obj:`int`, `optional`, defaults to `:obj:`-1`):
-            If greater than 0, uses gradient checkpointing to only compute sequence representation on checkpoint_batch_size examples at a time
-            on the GPU. All query representations are still compared to all document representations in the batch.
+            checkpoint_batch_size (:obj:`int`, `optional`, defaults to `:obj:`-1`):
+                If greater than 0, uses gradient checkpointing to only compute sequence representation on checkpoint_batch_size examples at a time
+                on the GPU. All query representations are still compared to all document representations in the batch.
 
-    Return:
-        :obj:`torch.FloatTensor` the bi-directional cross-entropy loss obtained while trying to match each query to its corresponding document
-        and each cocument to its corresponding query in the batch
+        Return:
+            :obj:`torch.FloatTensor` the bi-directional cross-entropy loss obtained while trying to match each query to its corresponding document
+            and each cocument to its corresponding query in the batch
         """
         device = input_ids_query.device
         q_reps = self.embed_questions(input_ids_query, attention_mask_query, checkpoint_batch_size)
diff --git a/src/transformers/modeling_roberta.py b/src/transformers/modeling_roberta.py
index ae4bdf1fa8..8da4aea592 100644
--- a/src/transformers/modeling_roberta.py
+++ b/src/transformers/modeling_roberta.py
@@ -83,7 +83,7 @@ class RobertaEmbeddings(BertEmbeddings):
         )
 
     def create_position_ids_from_inputs_embeds(self, inputs_embeds):
-        """ We are provided embeddings directly. We cannot infer which are padded so just generate
+        """We are provided embeddings directly. We cannot infer which are padded so just generate
         sequential position ids.
 
         :param torch.Tensor inputs_embeds:
@@ -220,36 +220,36 @@ class RobertaForCausalLM(BertPreTrainedModel):
         return_dict=None,
     ):
         r"""
-        encoder_hidden_states  (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
-            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
-            if the model is configured as a decoder.
-        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on the padding token indices of the encoder input. This mask
-            is used in the cross-attention if the model is configured as a decoder.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the left-to-right language modeling loss (next word prediction).
-            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
-            in ``[0, ..., config.vocab_size]``
+            encoder_hidden_states  (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                if the model is configured as a decoder.
+            encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+                Mask to avoid performing attention on the padding token indices of the encoder input. This mask
+                is used in the cross-attention if the model is configured as a decoder.
+                Mask values selected in ``[0, 1]``:
+                ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the left-to-right language modeling loss (next word prediction).
+                Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+                Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+                in ``[0, ..., config.vocab_size]``
 
-    Returns:
+        Returns:
 
-    Example::
+        Example::
 
-        >>> from transformers import RobertaTokenizer, RobertaLMHeadModel, RobertaConfig
-        >>> import torch
+            >>> from transformers import RobertaTokenizer, RobertaLMHeadModel, RobertaConfig
+            >>> import torch
 
-        >>> tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        >>> config = RobertaConfig.from_pretrained("roberta-base")
-        >>> config.is_decoder = True
-        >>> model = RobertaLMHeadModel.from_pretrained('roberta-base', config=config, return_dict=True)
+            >>> tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
+            >>> config = RobertaConfig.from_pretrained("roberta-base")
+            >>> config.is_decoder = True
+            >>> model = RobertaLMHeadModel.from_pretrained('roberta-base', config=config, return_dict=True)
 
-        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
-        >>> outputs = model(**inputs)
+            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+            >>> outputs = model(**inputs)
 
-        >>> prediction_logits = outputs.logits
+            >>> prediction_logits = outputs.logits
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
@@ -283,7 +283,10 @@ class RobertaForCausalLM(BertPreTrainedModel):
             return ((lm_loss,) + output) if lm_loss is not None else output
 
         return CausalLMOutput(
-            loss=lm_loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=lm_loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
     def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
@@ -493,7 +496,10 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
             return ((loss,) + output) if loss is not None else output
 
         return SequenceClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -581,7 +587,10 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
             return ((loss,) + output) if loss is not None else output
 
         return MultipleChoiceModelOutput(
-            loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -667,7 +676,10 @@ class RobertaForTokenClassification(BertPreTrainedModel):
             return ((loss,) + output) if loss is not None else output
 
         return TokenClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -791,7 +803,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
 
 
 def create_position_ids_from_input_ids(input_ids, padding_idx):
-    """ Replace non-padding symbols with their position numbers. Position numbers begin at
+    """Replace non-padding symbols with their position numbers. Position numbers begin at
     padding_idx+1. Padding symbols are ignored. This is modified from fairseq's
     `utils.make_positions`.
 
diff --git a/src/transformers/modeling_t5.py b/src/transformers/modeling_t5.py
index 26a2cd813e..6e5d3c4c83 100644
--- a/src/transformers/modeling_t5.py
+++ b/src/transformers/modeling_t5.py
@@ -62,8 +62,7 @@ T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
 # More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28
 ####################################################
 def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
-    """ Load tf checkpoints in a pytorch model.
-    """
+    """Load tf checkpoints in a pytorch model."""
     try:
         import re
 
@@ -156,8 +155,8 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
 
 class T5LayerNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
-        """ Construct a layernorm module in the T5 style
-            No bias and no substraction of mean.
+        """Construct a layernorm module in the T5 style
+        No bias and no substraction of mean.
         """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -569,8 +568,8 @@ class T5Block(nn.Module):
 
 
 class T5PreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = T5Config
@@ -913,9 +912,9 @@ class T5Model(T5PreTrainedModel):
         return self.decoder
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        See base class PreTrainedModel
         """
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
@@ -940,19 +939,19 @@ class T5Model(T5PreTrainedModel):
         **kwargs,
     ):
         r"""
-    Returns:
+        Returns:
 
-    Example::
+        Example::
 
-        >>> from transformers import T5Tokenizer, T5Model
+            >>> from transformers import T5Tokenizer, T5Model
 
-        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        >>> model = T5Model.from_pretrained('t5-small')
+            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+            >>> model = T5Model.from_pretrained('t5-small')
 
-        >>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
-        >>> outputs = model(input_ids=input_ids)
+            >>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
+            >>> outputs = model(input_ids=input_ids)
 
-        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+            >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
         """
         if "decoder_past_key_value_states" in kwargs:
             warnings.warn(
@@ -1093,31 +1092,31 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
         **kwargs,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the sequence classification/regression loss.
-            Indices should be in :obj:`[-100, 0, ..., config.vocab_size - 1]`.
-            All labels set to ``-100`` are ignored (masked), the loss is only
-            computed for labels in ``[0, ..., config.vocab_size]``
-        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
-            Used to hide legacy arguments that have been deprecated.
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the sequence classification/regression loss.
+                Indices should be in :obj:`[-100, 0, ..., config.vocab_size - 1]`.
+                All labels set to ``-100`` are ignored (masked), the loss is only
+                computed for labels in ``[0, ..., config.vocab_size]``
+            kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
+                Used to hide legacy arguments that have been deprecated.
 
-    Returns:
+        Returns:
 
-    Examples::
+        Examples::
 
-        >>> from transformers import T5Tokenizer, T5ForConditionalGeneration
+            >>> from transformers import T5Tokenizer, T5ForConditionalGeneration
 
-        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        >>> model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)
-        >>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
-        >>> outputs = model(input_ids=input_ids, labels=input_ids)
-        >>> loss = outputs.loss
-        >>> logits = outputs.logits
+            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+            >>> model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)
+            >>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
+            >>> outputs = model(input_ids=input_ids, labels=input_ids)
+            >>> loss = outputs.loss
+            >>> logits = outputs.logits
 
-        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        >>> model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)
-        >>> input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt")  # Batch size 1
-        >>> outputs = model.generate(input_ids)
+            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+            >>> model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)
+            >>> input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt")  # Batch size 1
+            >>> outputs = model.generate(input_ids)
         """
 
         if "lm_labels" in kwargs:
diff --git a/src/transformers/modeling_tf_albert.py b/src/transformers/modeling_tf_albert.py
index 5958101c0c..45fa546bc8 100644
--- a/src/transformers/modeling_tf_albert.py
+++ b/src/transformers/modeling_tf_albert.py
@@ -74,8 +74,7 @@ TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
 
 
 class TFAlbertEmbeddings(tf.keras.layers.Layer):
-    """Construct the embeddings from word, position and token_type embeddings.
-    """
+    """Construct the embeddings from word, position and token_type embeddings."""
 
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
@@ -169,10 +168,10 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
 
     def _linear(self, inputs):
         """Computes logits by running inputs through a linear layer.
-            Args:
-                inputs: A float32 tensor with shape [batch_size, length, embedding_size]
-            Returns:
-                float32 tensor with shape [batch_size, length, vocab_size].
+        Args:
+            inputs: A float32 tensor with shape [batch_size, length, embedding_size]
+        Returns:
+            float32 tensor with shape [batch_size, length, vocab_size].
         """
         batch_size = shape_list(inputs)[0]
         length = shape_list(inputs)[1]
@@ -478,8 +477,8 @@ class TFAlbertTransformer(tf.keras.layers.Layer):
 
 
 class TFAlbertPreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = AlbertConfig
@@ -551,9 +550,9 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
         raise NotImplementedError
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        See base class PreTrainedModel
         """
         raise NotImplementedError
 
@@ -655,7 +654,10 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
         pooled_output = self.pooler(sequence_output[:, 0])
 
         if not return_dict:
-            return (sequence_output, pooled_output,) + encoder_outputs[1:]
+            return (
+                sequence_output,
+                pooled_output,
+            ) + encoder_outputs[1:]
 
         return TFBaseModelOutputWithPooling(
             last_hidden_state=sequence_output,
@@ -821,16 +823,16 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel):
     @replace_return_docstrings(output_type=TFAlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
     def call(self, inputs, **kwargs):
         r"""
-    Return:
+        Return:
 
-    Examples::
-        import tensorflow as tf
-        from transformers import AlbertTokenizer, TFAlbertForPreTraining
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = TFAlbertForPreTraining.from_pretrained('albert-base-v2')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores, sop_scores = outputs[:2]
+        Examples::
+            import tensorflow as tf
+            from transformers import AlbertTokenizer, TFAlbertForPreTraining
+            tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
+            model = TFAlbertForPreTraining.from_pretrained('albert-base-v2')
+            input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
+            outputs = model(input_ids)
+            prediction_scores, sop_scores = outputs[:2]
         """
         return_dict = kwargs.get("return_dict")
         return_dict = return_dict if return_dict is not None else self.albert.return_dict
@@ -856,7 +858,9 @@ class TFAlbertSOPHead(tf.keras.layers.Layer):
 
         self.dropout = tf.keras.layers.Dropout(config.classifier_dropout_prob)
         self.classifier = tf.keras.layers.Dense(
-            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier",
+            config.num_labels,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="classifier",
         )
 
     def call(self, pooled_output, training: bool):
@@ -935,7 +939,10 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss)
             return ((loss,) + output) if loss is not None else output
 
         return TFMaskedLMOutput(
-            loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -1016,7 +1023,10 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
             return ((loss,) + output) if loss is not None else output
 
         return TFSequenceClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -1095,7 +1105,10 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
             return ((loss,) + output) if loss is not None else output
 
         return TFTokenClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -1211,7 +1224,7 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
 
     @property
     def dummy_inputs(self):
-        """ Dummy inputs to build the network.
+        """Dummy inputs to build the network.
 
         Returns:
             tf.Tensor with dummy inputs
@@ -1316,5 +1329,8 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
             return ((loss,) + output) if loss is not None else output
 
         return TFMultipleChoiceModelOutput(
-            loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
diff --git a/src/transformers/modeling_tf_auto.py b/src/transformers/modeling_tf_auto.py
index 77f0cf4c48..b7b221798a 100644
--- a/src/transformers/modeling_tf_auto.py
+++ b/src/transformers/modeling_tf_auto.py
@@ -310,27 +310,27 @@ TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
 
 class TFAutoModel(object):
     r"""
-        :class:`~transformers.TFAutoModel` is a generic model class
-        that will be instantiated as one of the base model classes of the library
-        when created with the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)`
-        class method.
+    :class:`~transformers.TFAutoModel` is a generic model class
+    that will be instantiated as one of the base model classes of the library
+    when created with the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)`
+    class method.
 
-        The `from_pretrained()` method takes care of returning the correct model class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string:
+    The `from_pretrained()` method takes care of returning the correct model class instance
+    based on the `model_type` property of the config object, or when it's missing,
+    falling back to using pattern matching on the `pretrained_model_name_or_path` string:
 
-            - `t5`: TFT5Model (T5 model)
-            - `distilbert`: TFDistilBertModel (DistilBERT model)
-            - `roberta`: TFRobertaModel (RoBERTa model)
-            - `bert`: TFBertModel (Bert model)
-            - `openai-gpt`: TFOpenAIGPTModel (OpenAI GPT model)
-            - `gpt2`: TFGPT2Model (OpenAI GPT-2 model)
-            - `transfo-xl`: TFTransfoXLModel (Transformer-XL model)
-            - `xlnet`: TFXLNetModel (XLNet model)
-            - `xlm`: TFXLMModel (XLM model)
-            - `ctrl`: TFCTRLModel (CTRL model)
+        - `t5`: TFT5Model (T5 model)
+        - `distilbert`: TFDistilBertModel (DistilBERT model)
+        - `roberta`: TFRobertaModel (RoBERTa model)
+        - `bert`: TFBertModel (Bert model)
+        - `openai-gpt`: TFOpenAIGPTModel (OpenAI GPT model)
+        - `gpt2`: TFGPT2Model (OpenAI GPT-2 model)
+        - `transfo-xl`: TFTransfoXLModel (Transformer-XL model)
+        - `xlnet`: TFXLNetModel (XLNet model)
+        - `xlm`: TFXLMModel (XLM model)
+        - `ctrl`: TFCTRLModel (CTRL model)
 
-        This class cannot be instantiated using `__init__()` (throws an error).
+    This class cannot be instantiated using `__init__()` (throws an error).
     """
 
     def __init__(self):
@@ -342,7 +342,7 @@ class TFAutoModel(object):
 
     @classmethod
     def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
+        r"""Instantiates one of the base model classes of the library
         from a configuration.
 
         Note:
@@ -381,7 +381,7 @@ class TFAutoModel(object):
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the base model classes of the library
+        r"""Instantiates one of the base model classes of the library
         from a pre-trained model configuration.
 
         The `from_pretrained()` method takes care of returning the correct model class instance
@@ -477,11 +477,11 @@ class TFAutoModel(object):
 
 class TFAutoModelForPreTraining(object):
     r"""
-        :class:`~transformers.TFAutoModelForPreTraining` is a generic model class
-        that will be instantiated as one of the model classes of the library -with the architecture used for pretraining this model– when created with the `TFAutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)`
-        class method.
+    :class:`~transformers.TFAutoModelForPreTraining` is a generic model class
+    that will be instantiated as one of the model classes of the library -with the architecture used for pretraining this model– when created with the `TFAutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)`
+    class method.
 
-        This class cannot be instantiated using `__init__()` (throws an error).
+    This class cannot be instantiated using `__init__()` (throws an error).
     """
 
     def __init__(self):
@@ -493,7 +493,7 @@ class TFAutoModelForPreTraining(object):
 
     @classmethod
     def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
+        r"""Instantiates one of the base model classes of the library
         from a configuration.
 
         Note:
@@ -532,7 +532,7 @@ class TFAutoModelForPreTraining(object):
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the model classes of the library -with the architecture used for pretraining this model– from a pre-trained model configuration.
+        r"""Instantiates one of the model classes of the library -with the architecture used for pretraining this model– from a pre-trained model configuration.
 
         The `from_pretrained()` method takes care of returning the correct model class instance
         based on the `model_type` property of the config object, or when it's missing,
@@ -630,27 +630,27 @@ class TFAutoModelForPreTraining(object):
 
 class TFAutoModelWithLMHead(object):
     r"""
-        :class:`~transformers.TFAutoModelWithLMHead` is a generic model class
-        that will be instantiated as one of the language modeling model classes of the library
-        when created with the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
-        class method.
+    :class:`~transformers.TFAutoModelWithLMHead` is a generic model class
+    that will be instantiated as one of the language modeling model classes of the library
+    when created with the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
+    class method.
 
-        The `from_pretrained()` method takes care of returning the correct model class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string:
+    The `from_pretrained()` method takes care of returning the correct model class instance
+    based on the `model_type` property of the config object, or when it's missing,
+    falling back to using pattern matching on the `pretrained_model_name_or_path` string:
 
-            - `t5`: TFT5ForConditionalGeneration (T5 model)
-            - `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
-            - `roberta`: TFRobertaForMaskedLM (RoBERTa model)
-            - `bert`: TFBertForMaskedLM (Bert model)
-            - `openai-gpt`: TFOpenAIGPTLMHeadModel (OpenAI GPT model)
-            - `gpt2`: TFGPT2LMHeadModel (OpenAI GPT-2 model)
-            - `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model)
-            - `xlnet`: TFXLNetLMHeadModel (XLNet model)
-            - `xlm`: TFXLMWithLMHeadModel (XLM model)
-            - `ctrl`: TFCTRLLMHeadModel (CTRL model)
+        - `t5`: TFT5ForConditionalGeneration (T5 model)
+        - `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
+        - `roberta`: TFRobertaForMaskedLM (RoBERTa model)
+        - `bert`: TFBertForMaskedLM (Bert model)
+        - `openai-gpt`: TFOpenAIGPTLMHeadModel (OpenAI GPT model)
+        - `gpt2`: TFGPT2LMHeadModel (OpenAI GPT-2 model)
+        - `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model)
+        - `xlnet`: TFXLNetLMHeadModel (XLNet model)
+        - `xlm`: TFXLMWithLMHeadModel (XLM model)
+        - `ctrl`: TFCTRLLMHeadModel (CTRL model)
 
-        This class cannot be instantiated using `__init__()` (throws an error).
+    This class cannot be instantiated using `__init__()` (throws an error).
     """
 
     def __init__(self):
@@ -662,7 +662,7 @@ class TFAutoModelWithLMHead(object):
 
     @classmethod
     def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
+        r"""Instantiates one of the base model classes of the library
         from a configuration.
 
         Note:
@@ -705,7 +705,7 @@ class TFAutoModelWithLMHead(object):
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the language modeling model classes of the library
+        r"""Instantiates one of the language modeling model classes of the library
         from a pre-trained model configuration.
 
         The `from_pretrained()` method takes care of returning the correct model class instance
@@ -808,18 +808,18 @@ class TFAutoModelWithLMHead(object):
 
 class TFAutoModelForMultipleChoice:
     r"""
-        :class:`~transformers.TFAutoModelForMultipleChoice` is a generic model class
-        that will be instantiated as one of the multiple choice model classes of the library
-        when created with the `TFAutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)`
-        class method.
+    :class:`~transformers.TFAutoModelForMultipleChoice` is a generic model class
+    that will be instantiated as one of the multiple choice model classes of the library
+    when created with the `TFAutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)`
+    class method.
 
-        The `from_pretrained()` method takes care of returning the correct model class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string:
-            - `albert`: TFAlbertForMultipleChoice (Albert model)
-            - `bert`: TFBertForMultipleChoice (Bert model)
+    The `from_pretrained()` method takes care of returning the correct model class instance
+    based on the `model_type` property of the config object, or when it's missing,
+    falling back to using pattern matching on the `pretrained_model_name_or_path` string:
+        - `albert`: TFAlbertForMultipleChoice (Albert model)
+        - `bert`: TFBertForMultipleChoice (Bert model)
 
-        This class cannot be instantiated using `__init__()` (throws an error).
+    This class cannot be instantiated using `__init__()` (throws an error).
     """
 
     def __init__(self):
@@ -831,7 +831,7 @@ class TFAutoModelForMultipleChoice:
 
     @classmethod
     def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
+        r"""Instantiates one of the base model classes of the library
         from a configuration.
 
         Note:
@@ -864,7 +864,7 @@ class TFAutoModelForMultipleChoice:
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the multiple choice model classes of the library
+        r"""Instantiates one of the multiple choice model classes of the library
         from a pre-trained model configuration.
 
         The `from_pretrained()` method takes care of returning the correct model class instance
@@ -958,12 +958,12 @@ class TFAutoModelForMultipleChoice:
 
 class TFAutoModelForCausalLM:
     r"""
-        :class:`~transformers.TFAutoModelForCausalLM` is a generic model class
-        that will be instantiated as one of the language modeling model classes of the library
-        when created with the `TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)`
-        class method.
+    :class:`~transformers.TFAutoModelForCausalLM` is a generic model class
+    that will be instantiated as one of the language modeling model classes of the library
+    when created with the `TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)`
+    class method.
 
-        This class cannot be instantiated using `__init__()` (throws an error).
+    This class cannot be instantiated using `__init__()` (throws an error).
     """
 
     def __init__(self):
@@ -975,7 +975,7 @@ class TFAutoModelForCausalLM:
 
     @classmethod
     def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
+        r"""Instantiates one of the base model classes of the library
         from a configuration.
 
         Note:
@@ -1011,7 +1011,7 @@ class TFAutoModelForCausalLM:
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the language modeling model classes of the library
+        r"""Instantiates one of the language modeling model classes of the library
         from a pre-trained model configuration.
 
         The `from_pretrained()` method takes care of returning the correct model class instance
@@ -1093,12 +1093,12 @@ class TFAutoModelForCausalLM:
 
 class TFAutoModelForMaskedLM:
     r"""
-        :class:`~transformers.TFAutoModelForMaskedLM` is a generic model class
-        that will be instantiated as one of the language modeling model classes of the library
-        when created with the `TFAutoModelForMaskedLM.from_pretrained(pretrained_model_name_or_path)`
-        class method.
+    :class:`~transformers.TFAutoModelForMaskedLM` is a generic model class
+    that will be instantiated as one of the language modeling model classes of the library
+    when created with the `TFAutoModelForMaskedLM.from_pretrained(pretrained_model_name_or_path)`
+    class method.
 
-        This class cannot be instantiated using `__init__()` (throws an error).
+    This class cannot be instantiated using `__init__()` (throws an error).
     """
 
     def __init__(self):
@@ -1110,7 +1110,7 @@ class TFAutoModelForMaskedLM:
 
     @classmethod
     def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
+        r"""Instantiates one of the base model classes of the library
         from a configuration.
 
         Note:
@@ -1149,7 +1149,7 @@ class TFAutoModelForMaskedLM:
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the language modeling model classes of the library
+        r"""Instantiates one of the language modeling model classes of the library
         from a pre-trained model configuration.
 
         The `from_pretrained()` method takes care of returning the correct model class instance
@@ -1235,12 +1235,12 @@ class TFAutoModelForMaskedLM:
 
 class TFAutoModelForSeq2SeqLM:
     r"""
-        :class:`~transformers.TFAutoModelForSeq2SeqLM` is a generic model class
-        that will be instantiated as one of the language modeling model classes of the library
-        when created with the `TFAutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path)`
-        class method.
+    :class:`~transformers.TFAutoModelForSeq2SeqLM` is a generic model class
+    that will be instantiated as one of the language modeling model classes of the library
+    when created with the `TFAutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path)`
+    class method.
 
-        This class cannot be instantiated using `__init__()` (throws an error).
+    This class cannot be instantiated using `__init__()` (throws an error).
     """
 
     def __init__(self):
@@ -1252,7 +1252,7 @@ class TFAutoModelForSeq2SeqLM:
 
     @classmethod
     def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
+        r"""Instantiates one of the base model classes of the library
         from a configuration.
 
         Note:
@@ -1285,7 +1285,7 @@ class TFAutoModelForSeq2SeqLM:
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the language modeling model classes of the library
+        r"""Instantiates one of the language modeling model classes of the library
         from a pre-trained model configuration.
 
         The `from_pretrained()` method takes care of returning the correct model class instance
@@ -1364,22 +1364,22 @@ class TFAutoModelForSeq2SeqLM:
 
 class TFAutoModelForSequenceClassification(object):
     r"""
-        :class:`~transformers.TFAutoModelForSequenceClassification` is a generic model class
-        that will be instantiated as one of the sequence classification model classes of the library
-        when created with the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
-        class method.
+    :class:`~transformers.TFAutoModelForSequenceClassification` is a generic model class
+    that will be instantiated as one of the sequence classification model classes of the library
+    when created with the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
+    class method.
 
-        The `from_pretrained()` method takes care of returning the correct model class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string:
+    The `from_pretrained()` method takes care of returning the correct model class instance
+    based on the `model_type` property of the config object, or when it's missing,
+    falling back to using pattern matching on the `pretrained_model_name_or_path` string:
 
-            - `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model)
-            - `roberta`: TFRobertaForSequenceClassification (RoBERTa model)
-            - `bert`: TFBertForSequenceClassification (Bert model)
-            - `xlnet`: TFXLNetForSequenceClassification (XLNet model)
-            - `xlm`: TFXLMForSequenceClassification (XLM model)
+        - `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model)
+        - `roberta`: TFRobertaForSequenceClassification (RoBERTa model)
+        - `bert`: TFBertForSequenceClassification (Bert model)
+        - `xlnet`: TFXLNetForSequenceClassification (XLNet model)
+        - `xlm`: TFXLMForSequenceClassification (XLM model)
 
-        This class cannot be instantiated using `__init__()` (throws an error).
+    This class cannot be instantiated using `__init__()` (throws an error).
     """
 
     def __init__(self):
@@ -1391,7 +1391,7 @@ class TFAutoModelForSequenceClassification(object):
 
     @classmethod
     def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
+        r"""Instantiates one of the base model classes of the library
         from a configuration.
 
         Note:
@@ -1428,7 +1428,7 @@ class TFAutoModelForSequenceClassification(object):
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the sequence classification model classes of the library
+        r"""Instantiates one of the sequence classification model classes of the library
         from a pre-trained model configuration.
 
         The `from_pretrained()` method takes care of returning the correct model class instance
@@ -1525,23 +1525,23 @@ class TFAutoModelForSequenceClassification(object):
 
 class TFAutoModelForQuestionAnswering(object):
     r"""
-        :class:`~transformers.TFAutoModelForQuestionAnswering` is a generic model class
-        that will be instantiated as one of the question answering model classes of the library
-        when created with the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
-        class method.
+    :class:`~transformers.TFAutoModelForQuestionAnswering` is a generic model class
+    that will be instantiated as one of the question answering model classes of the library
+    when created with the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
+    class method.
 
-        The `from_pretrained()` method takes care of returning the correct model class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string:
+    The `from_pretrained()` method takes care of returning the correct model class instance
+    based on the `model_type` property of the config object, or when it's missing,
+    falling back to using pattern matching on the `pretrained_model_name_or_path` string:
 
-            - `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model)
-            - `albert`: TFAlbertForQuestionAnswering (ALBERT model)
-            - `roberta`: TFRobertaForQuestionAnswering (RoBERTa model)
-            - `bert`: TFBertForQuestionAnswering (Bert model)
-            - `xlnet`: TFXLNetForQuestionAnswering (XLNet model)
-            - `xlm`: TFXLMForQuestionAnswering (XLM model)
+        - `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model)
+        - `albert`: TFAlbertForQuestionAnswering (ALBERT model)
+        - `roberta`: TFRobertaForQuestionAnswering (RoBERTa model)
+        - `bert`: TFBertForQuestionAnswering (Bert model)
+        - `xlnet`: TFXLNetForQuestionAnswering (XLNet model)
+        - `xlm`: TFXLMForQuestionAnswering (XLM model)
 
-        This class cannot be instantiated using `__init__()` (throws an error).
+    This class cannot be instantiated using `__init__()` (throws an error).
     """
 
     def __init__(self):
@@ -1553,7 +1553,7 @@ class TFAutoModelForQuestionAnswering(object):
 
     @classmethod
     def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
+        r"""Instantiates one of the base model classes of the library
         from a configuration.
 
         Note:
@@ -1591,7 +1591,7 @@ class TFAutoModelForQuestionAnswering(object):
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the question answering model classes of the library
+        r"""Instantiates one of the question answering model classes of the library
         from a pre-trained model configuration.
 
         The `from_pretrained()` method takes care of returning the correct model class instance
@@ -1697,7 +1697,7 @@ class TFAutoModelForTokenClassification:
 
     @classmethod
     def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
+        r"""Instantiates one of the base model classes of the library
         from a configuration.
 
         Note:
@@ -1733,7 +1733,7 @@ class TFAutoModelForTokenClassification:
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the question answering model classes of the library
+        r"""Instantiates one of the question answering model classes of the library
         from a pre-trained model configuration.
 
         The `from_pretrained()` method takes care of returning the correct model class instance
diff --git a/src/transformers/modeling_tf_bert.py b/src/transformers/modeling_tf_bert.py
index c0f7050f51..f40414826a 100644
--- a/src/transformers/modeling_tf_bert.py
+++ b/src/transformers/modeling_tf_bert.py
@@ -89,7 +89,7 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
 
 
 def gelu(x):
-    """ Gaussian Error Linear Unit.
+    """Gaussian Error Linear Unit.
     Original Implementation of the gelu activation function in Google Bert repo when initially created.
         For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
         0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
@@ -127,8 +127,7 @@ ACT2FN = {
 
 
 class TFBertEmbeddings(tf.keras.layers.Layer):
-    """Construct the embeddings from word, position and token_type embeddings.
-    """
+    """Construct the embeddings from word, position and token_type embeddings."""
 
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
@@ -225,10 +224,10 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
 
     def _linear(self, inputs):
         """Computes logits by running inputs through a linear layer.
-            Args:
-                inputs: A float32 tensor with shape [batch_size, length, hidden_size]
-            Returns:
-                float32 tensor with shape [batch_size, length, vocab_size].
+        Args:
+            inputs: A float32 tensor with shape [batch_size, length, hidden_size]
+        Returns:
+            float32 tensor with shape [batch_size, length, vocab_size].
         """
         batch_size = shape_list(inputs)[0]
         length = shape_list(inputs)[1]
@@ -551,9 +550,9 @@ class TFBertMainLayer(tf.keras.layers.Layer):
         self.embeddings.vocab_size = value.shape[0]
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        See base class PreTrainedModel
         """
         raise NotImplementedError
 
@@ -656,7 +655,10 @@ class TFBertMainLayer(tf.keras.layers.Layer):
         pooled_output = self.pooler(sequence_output)
 
         if not return_dict:
-            return (sequence_output, pooled_output,) + encoder_outputs[1:]
+            return (
+                sequence_output,
+                pooled_output,
+            ) + encoder_outputs[1:]
 
         return TFBaseModelOutputWithPooling(
             last_hidden_state=sequence_output,
@@ -667,8 +669,8 @@ class TFBertMainLayer(tf.keras.layers.Layer):
 
 
 class TFBertPreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = BertConfig
@@ -824,18 +826,18 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
     @replace_return_docstrings(output_type=TFBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
     def call(self, inputs, **kwargs):
         r"""
-    Return:
+        Return:
 
-    Examples::
+        Examples::
 
-        import tensorflow as tf
-        from transformers import BertTokenizer, TFBertForPreTraining
+            import tensorflow as tf
+            from transformers import BertTokenizer, TFBertForPreTraining
 
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFBertForPreTraining.from_pretrained('bert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores, seq_relationship_scores = outputs[:2]
+            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+            model = TFBertForPreTraining.from_pretrained('bert-base-uncased')
+            input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
+            outputs = model(input_ids)
+            prediction_scores, seq_relationship_scores = outputs[:2]
 
         """
         return_dict = kwargs.get("return_dict")
@@ -933,7 +935,10 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):
             return ((loss,) + output) if loss is not None else output
 
         return TFMaskedLMOutput(
-            loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -1011,12 +1016,16 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):
             return ((loss,) + output) if loss is not None else output
 
         return TFCausalLMOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
 @add_start_docstrings(
-    """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING,
+    """Bert Model with a `next sentence prediction (classification)` head on top. """,
+    BERT_START_DOCSTRING,
 )
 class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
     def __init__(self, config, *inputs, **kwargs):
@@ -1029,22 +1038,22 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
     @replace_return_docstrings(output_type=TFNextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
     def call(self, inputs, **kwargs):
         r"""
-    Return:
+        Return:
 
-    Examples::
+        Examples::
 
-        import tensorflow as tf
-        from transformers import BertTokenizer, TFBertForNextSentencePrediction
+            import tensorflow as tf
+            from transformers import BertTokenizer, TFBertForNextSentencePrediction
 
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
+            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+            model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
 
-        prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-        encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
+            prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+            next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+            encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
 
-        logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
-        assert logits[0][0] < logits[0][1] # the next sentence was random
+            logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
+            assert logits[0][0] < logits[0][1] # the next sentence was random
         """
         return_dict = kwargs.get("return_dict")
         return_dict = return_dict if return_dict is not None else self.bert.return_dict
@@ -1057,7 +1066,9 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
             return (seq_relationship_score,) + outputs[2:]
 
         return TFNextSentencePredictorOutput(
-            logits=seq_relationship_score, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            logits=seq_relationship_score,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -1138,7 +1149,10 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
             return ((loss,) + output) if loss is not None else output
 
         return TFSequenceClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -1159,7 +1173,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
 
     @property
     def dummy_inputs(self):
-        """ Dummy inputs to build the network.
+        """Dummy inputs to build the network.
 
         Returns:
             tf.Tensor with dummy inputs
@@ -1261,7 +1275,10 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
             return ((loss,) + output) if loss is not None else output
 
         return TFMultipleChoiceModelOutput(
-            loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -1340,7 +1357,10 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
             return ((loss,) + output) if loss is not None else output
 
         return TFTokenClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
diff --git a/src/transformers/modeling_tf_camembert.py b/src/transformers/modeling_tf_camembert.py
index d59bd7c657..2a6768d18d 100644
--- a/src/transformers/modeling_tf_camembert.py
+++ b/src/transformers/modeling_tf_camembert.py
@@ -77,7 +77,8 @@ class TFCamembertModel(TFRobertaModel):
 
 
 @add_start_docstrings(
-    """CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING,
+    """CamemBERT Model with a `language modeling` head on top. """,
+    CAMEMBERT_START_DOCSTRING,
 )
 class TFCamembertForMaskedLM(TFRobertaForMaskedLM):
     """
diff --git a/src/transformers/modeling_tf_ctrl.py b/src/transformers/modeling_tf_ctrl.py
index f2b7e13954..b9e856996e 100644
--- a/src/transformers/modeling_tf_ctrl.py
+++ b/src/transformers/modeling_tf_ctrl.py
@@ -245,8 +245,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
         raise NotImplementedError
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-                heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
         """
         raise NotImplementedError
 
@@ -426,8 +426,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
 
 
 class TFCTRLPreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = CTRLConfig
diff --git a/src/transformers/modeling_tf_distilbert.py b/src/transformers/modeling_tf_distilbert.py
index b196f26390..93e0760c34 100644
--- a/src/transformers/modeling_tf_distilbert.py
+++ b/src/transformers/modeling_tf_distilbert.py
@@ -70,7 +70,7 @@ TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
 
 # UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE #
 def gelu(x):
-    """ Gaussian Error Linear Unit.
+    """Gaussian Error Linear Unit.
     Original Implementation of the gelu activation function in Google Bert repo when initially created.
         For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
         0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
@@ -177,10 +177,10 @@ class TFEmbeddings(tf.keras.layers.Layer):
 
     def _linear(self, inputs):
         """Computes logits by running inputs through a linear layer.
-            Args:
-                inputs: A float32 tensor with shape [batch_size, length, hidden_size]
-            Returns:
-                float32 tensor with shape [batch_size, length, vocab_size].
+        Args:
+            inputs: A float32 tensor with shape [batch_size, length, hidden_size]
+        Returns:
+            float32 tensor with shape [batch_size, length, vocab_size].
         """
         batch_size = shape_list(inputs)[0]
         length = shape_list(inputs)[1]
@@ -518,8 +518,8 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
 
 # INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
 class TFDistilBertPreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = DistilBertConfig
@@ -634,7 +634,8 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
 
 
 @add_start_docstrings(
-    """DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING,
+    """DistilBert Model with a `masked language modeling` head on top. """,
+    DISTILBERT_START_DOCSTRING,
 )
 class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModelingLoss):
     def __init__(self, config, *inputs, **kwargs):
@@ -875,7 +876,10 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
             return ((loss,) + output) if loss is not None else output
 
         return TFTokenClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -902,7 +906,7 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
 
     @property
     def dummy_inputs(self):
-        """ Dummy inputs to build the network.
+        """Dummy inputs to build the network.
 
         Returns:
             tf.Tensor with dummy inputs
diff --git a/src/transformers/modeling_tf_electra.py b/src/transformers/modeling_tf_electra.py
index a562882bf1..05d7469996 100644
--- a/src/transformers/modeling_tf_electra.py
+++ b/src/transformers/modeling_tf_electra.py
@@ -54,8 +54,7 @@ TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
 
 
 class TFElectraEmbeddings(tf.keras.layers.Layer):
-    """Construct the embeddings from word, position and token_type embeddings.
-    """
+    """Construct the embeddings from word, position and token_type embeddings."""
 
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
@@ -94,7 +93,13 @@ class TFElectraEmbeddings(tf.keras.layers.Layer):
         super().build(input_shape)
 
     def call(
-        self, input_ids, position_ids=None, token_type_ids=None, inputs_embeds=None, mode="embedding", training=False,
+        self,
+        input_ids,
+        position_ids=None,
+        token_type_ids=None,
+        inputs_embeds=None,
+        mode="embedding",
+        training=False,
     ):
         """Get token embeddings of inputs.
         Args:
@@ -144,10 +149,10 @@ class TFElectraEmbeddings(tf.keras.layers.Layer):
 
     def _linear(self, inputs):
         """Computes logits by running inputs through a linear layer.
-            Args:
-                inputs: A float32 tensor with shape [batch_size, length, hidden_size]
-            Returns:
-                float32 tensor with shape [batch_size, length, vocab_size].
+        Args:
+            inputs: A float32 tensor with shape [batch_size, length, hidden_size]
+        Returns:
+            float32 tensor with shape [batch_size, length, vocab_size].
         """
         batch_size = shape_list(inputs)[0]
         length = shape_list(inputs)[1]
@@ -250,9 +255,9 @@ class TFElectraMainLayer(TFElectraPreTrainedModel):
         raise NotImplementedError
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        See base class PreTrainedModel
         """
         raise NotImplementedError
 
@@ -491,18 +496,18 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel):
         training=False,
     ):
         r"""
-    Returns:
+        Returns:
 
-    Examples::
+        Examples::
 
-        import tensorflow as tf
-        from transformers import ElectraTokenizer, TFElectraForPreTraining
+            import tensorflow as tf
+            from transformers import ElectraTokenizer, TFElectraForPreTraining
 
-        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
-        model = TFElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        scores = outputs[0]
+            tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
+            model = TFElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
+            input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+            outputs = model(input_ids)
+            scores = outputs[0]
         """
         return_dict = return_dict if return_dict is not None else self.electra.config.return_dict
 
@@ -729,7 +734,10 @@ class TFElectraForSequenceClassification(TFElectraPreTrainedModel, TFSequenceCla
             return ((loss,) + output) if loss is not None else output
 
         return TFSequenceClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -752,7 +760,7 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss)
 
     @property
     def dummy_inputs(self):
-        """ Dummy inputs to build the network.
+        """Dummy inputs to build the network.
 
         Returns:
             tf.Tensor with dummy inputs
@@ -853,7 +861,10 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss)
             return ((loss,) + output) if loss is not None else output
 
         return TFMultipleChoiceModelOutput(
-            loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -1020,7 +1031,10 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
             loss = self.compute_loss(labels, (start_logits, end_logits))
 
         if not return_dict:
-            output = (start_logits, end_logits,) + discriminator_hidden_states[1:]
+            output = (
+                start_logits,
+                end_logits,
+            ) + discriminator_hidden_states[1:]
             return ((loss,) + output) if loss is not None else output
 
         return TFQuestionAnsweringModelOutput(
diff --git a/src/transformers/modeling_tf_gpt2.py b/src/transformers/modeling_tf_gpt2.py
index 86db28a177..e603643c25 100644
--- a/src/transformers/modeling_tf_gpt2.py
+++ b/src/transformers/modeling_tf_gpt2.py
@@ -252,8 +252,8 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
         self.wte.vocab_size = self.wte.weight.shape[0]
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
         """
         raise NotImplementedError
 
@@ -417,8 +417,8 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
 
 
 class TFGPT2PreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = GPT2Config
@@ -698,34 +698,34 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
         training=False,
     ):
         r"""
-        mc_token_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
-            Index of the classification token in each input sequence.
-            Selected in the range ``[0, input_ids.size(-1) - 1[``.
+            mc_token_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
+                Index of the classification token in each input sequence.
+                Selected in the range ``[0, input_ids.size(-1) - 1[``.
 
-    Return:
+        Return:
 
-    Examples::
+        Examples::
 
-        >>> import tensorflow as tf
-        >>> from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
+            >>> import tensorflow as tf
+            >>> from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
 
-        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        >>> model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')
+            >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+            >>> model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')
 
-        >>> # Add a [CLS] to the vocabulary (we should train it also!)
-        >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
+            >>> # Add a [CLS] to the vocabulary (we should train it also!)
+            >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
 
-        >>> embedding_layer = model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
+            >>> embedding_layer = model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
 
-        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
-        >>> encoded_choices = [tokenizer.encode(s) for s in choices]
-        >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
+            >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
+            >>> encoded_choices = [tokenizer.encode(s) for s in choices]
+            >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
 
-        >>> input_ids = tf.constant(encoded_choices)[None, :]  # Batch size: 1, number of choices: 2
-        >>> mc_token_ids = tf.constant([cls_token_location])  # Batch size: 1
+            >>> input_ids = tf.constant(encoded_choices)[None, :]  # Batch size: 1, number of choices: 2
+            >>> mc_token_ids = tf.constant([cls_token_location])  # Batch size: 1
 
-        >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
-        >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
+            >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
+            >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
 
         """
         if isinstance(inputs, (tuple, list)):
diff --git a/src/transformers/modeling_tf_longformer.py b/src/transformers/modeling_tf_longformer.py
index 948c5e6714..52768b5dd9 100644
--- a/src/transformers/modeling_tf_longformer.py
+++ b/src/transformers/modeling_tf_longformer.py
@@ -55,9 +55,9 @@ TF_LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
 
 def _compute_global_attention_mask(input_ids_shape, sep_token_indices, before_sep_token=True):
     """
-        Computes global attention mask by putting attention on all tokens
-        before `sep_token_id` if `before_sep_token is True` else after
-        `sep_token_id`.
+    Computes global attention mask by putting attention on all tokens
+    before `sep_token_id` if `before_sep_token is True` else after
+    `sep_token_id`.
     """
 
     assert sep_token_indices.shape[1] == 2, "`input_ids` should have two dimensions"
@@ -72,11 +72,14 @@ def _compute_global_attention_mask(input_ids_shape, sep_token_indices, before_se
         )
     else:
         # last token is separation token and should not be counted and in the middle are two separation tokens
-        attention_mask = tf.cast(
-            tf.broadcast_to(attention_mask, input_ids_shape)
-            > tf.broadcast_to(question_end_index + 1, input_ids_shape),
-            tf.dtypes.int32,
-        ) * tf.cast(tf.broadcast_to(attention_mask, input_ids_shape) < input_ids_shape[-1], tf.dtypes.int32)
+        attention_mask = (
+            tf.cast(
+                tf.broadcast_to(attention_mask, input_ids_shape)
+                > tf.broadcast_to(question_end_index + 1, input_ids_shape),
+                tf.dtypes.int32,
+            )
+            * tf.cast(tf.broadcast_to(attention_mask, input_ids_shape) < input_ids_shape[-1], tf.dtypes.int32)
+        )
 
     return attention_mask
 
@@ -130,7 +133,9 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
         self.one_sided_attn_window_size = attention_window // 2
 
     def call(
-        self, inputs, training=False,
+        self,
+        inputs,
+        training=False,
     ):
         """
         LongformerSelfAttention expects `len(hidden_states)` to be multiple of `attention_window`.
@@ -433,7 +438,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
     def _sliding_chunks_matmul_attn_probs_value(self, attn_probs, value, window_overlap):
 
         """Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors.
-           Returned tensor will be of the same shape as `attn_probs`"""
+        Returned tensor will be of the same shape as `attn_probs`"""
 
         batch_size, seq_len, num_heads, head_dim = shape_list(value)
 
@@ -508,17 +513,17 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
     @staticmethod
     def _pad_and_diagonalize(chunked_hidden_states):
         """shift every row 1 step right, converting columns into diagonals.
-           Example:
-                 chunked_hidden_states: [ 0.4983,  2.6918, -0.0071,  1.0492,
-                                          -1.8348,  0.7672,  0.2986,  0.0285,
-                                          -0.7584,  0.4206, -0.0405,  0.1599,
-                                          2.0514, -1.1600,  0.5372,  0.2629 ]
-                 window_overlap = num_rows = 4
-                (pad & diagonilize) =>
-                [ 0.4983,  2.6918, -0.0071,  1.0492, 0.0000,  0.0000,  0.0000
-                  0.0000,  -1.8348,  0.7672,  0.2986,  0.0285, 0.0000,  0.0000
-                  0.0000,  0.0000, -0.7584,  0.4206, -0.0405,  0.1599, 0.0000
-                  0.0000,  0.0000,  0.0000, 2.0514, -1.1600,  0.5372,  0.2629 ]
+        Example:
+              chunked_hidden_states: [ 0.4983,  2.6918, -0.0071,  1.0492,
+                                       -1.8348,  0.7672,  0.2986,  0.0285,
+                                       -0.7584,  0.4206, -0.0405,  0.1599,
+                                       2.0514, -1.1600,  0.5372,  0.2629 ]
+              window_overlap = num_rows = 4
+             (pad & diagonilize) =>
+             [ 0.4983,  2.6918, -0.0071,  1.0492, 0.0000,  0.0000,  0.0000
+               0.0000,  -1.8348,  0.7672,  0.2986,  0.0285, 0.0000,  0.0000
+               0.0000,  0.0000, -0.7584,  0.4206, -0.0405,  0.1599, 0.0000
+               0.0000,  0.0000,  0.0000, 2.0514, -1.1600,  0.5372,  0.2629 ]
         """
         total_num_heads, num_chunks, window_overlap, hidden_dim = shape_list(chunked_hidden_states)
 
@@ -779,7 +784,8 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
             tf.transpose(global_attn_output, (0, 2, 1, 3)), is_local_index_global_attn_nonzero
         )
         nonzero_global_attn_output = tf.reshape(
-            nonzero_global_attn_output, (shape_list(is_local_index_global_attn_nonzero)[0], -1),
+            nonzero_global_attn_output,
+            (shape_list(is_local_index_global_attn_nonzero)[0], -1),
         )
 
         # overwrite values with global attention
@@ -910,9 +916,9 @@ class TFLongformerMainLayer(tf.keras.layers.Layer):
         self.embeddings.vocab_size = value.shape[0]
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        See base class PreTrainedModel
         """
         raise NotImplementedError
 
@@ -1021,7 +1027,10 @@ class TFLongformerMainLayer(tf.keras.layers.Layer):
             sequence_output = sequence_output[:, :-padding_len]
 
         if not return_dict:
-            return (sequence_output, pooled_output,) + encoder_outputs[1:]
+            return (
+                sequence_output,
+                pooled_output,
+            ) + encoder_outputs[1:]
 
         return TFBaseModelOutputWithPooling(
             last_hidden_state=sequence_output,
@@ -1031,7 +1040,13 @@ class TFLongformerMainLayer(tf.keras.layers.Layer):
         )
 
     def _pad_to_window_size(
-        self, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds, pad_token_id,
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        position_ids,
+        inputs_embeds,
+        pad_token_id,
     ):
         """A helper function to pad tokens and mask to work with implementation of Longformer selfattention."""
         # padding
@@ -1083,8 +1098,8 @@ class TFLongformerMainLayer(tf.keras.layers.Layer):
 
 
 class TFLongformerPreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = LongformerConfig
@@ -1286,7 +1301,10 @@ class TFLongformerForMaskedLM(TFLongformerPreTrainedModel, TFMaskedLanguageModel
             return ((loss,) + output) if loss is not None else output
 
         return TFMaskedLMOutput(
-            loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
diff --git a/src/transformers/modeling_tf_mobilebert.py b/src/transformers/modeling_tf_mobilebert.py
index 6f7c2d087d..67681be039 100644
--- a/src/transformers/modeling_tf_mobilebert.py
+++ b/src/transformers/modeling_tf_mobilebert.py
@@ -99,8 +99,7 @@ NORM2FN = {"layer_norm": TFLayerNorm, "no_norm": TFNoNorm}
 
 
 class TFMobileBertEmbeddings(tf.keras.layers.Layer):
-    """Construct the embeddings from word, position and token_type embeddings.
-    """
+    """Construct the embeddings from word, position and token_type embeddings."""
 
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
@@ -223,10 +222,10 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer):
 
     def _linear(self, inputs):
         """Computes logits by running inputs through a linear layer.
-            Args:
-                inputs: A float32 tensor with shape [batch_size, length, hidden_size]
-            Returns:
-                float32 tensor with shape [batch_size, length, vocab_size].
+        Args:
+            inputs: A float32 tensor with shape [batch_size, length, hidden_size]
+        Returns:
+            float32 tensor with shape [batch_size, length, vocab_size].
         """
         batch_size = shape_list(inputs)[0]
         length = shape_list(inputs)[1]
@@ -696,9 +695,9 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
         raise NotImplementedError
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        See base class PreTrainedModel
         """
         raise NotImplementedError
 
@@ -799,7 +798,10 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
         pooled_output = self.pooler(sequence_output)
 
         if not return_dict:
-            return (sequence_output, pooled_output,) + encoder_outputs[1:]
+            return (
+                sequence_output,
+                pooled_output,
+            ) + encoder_outputs[1:]
 
         return TFBaseModelOutputWithPooling(
             last_hidden_state=sequence_output,
@@ -810,8 +812,8 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
 
 
 class TFMobileBertPreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = MobileBertConfig
@@ -967,18 +969,18 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
     @replace_return_docstrings(output_type=TFMobileBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
     def call(self, inputs, **kwargs):
         r"""
-    Return:
+        Return:
 
-    Examples::
+        Examples::
 
-        >>> import tensorflow as tf
-        >>> from transformers import MobileBertTokenizer, TFMobileBertForPreTraining
+            >>> import tensorflow as tf
+            >>> from transformers import MobileBertTokenizer, TFMobileBertForPreTraining
 
-        >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
-        >>> model = TFMobileBertForPreTraining.from_pretrained('google/mobilebert-uncased')
-        >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        >>> outputs = model(input_ids)
-        >>> prediction_scores, seq_relationship_scores = outputs[:2]
+            >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
+            >>> model = TFMobileBertForPreTraining.from_pretrained('google/mobilebert-uncased')
+            >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+            >>> outputs = model(input_ids)
+            >>> prediction_scores, seq_relationship_scores = outputs[:2]
 
         """
         return_dict = kwargs.get("return_dict")
@@ -1069,7 +1071,10 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModel
             return ((loss,) + output) if loss is not None else output
 
         return TFMaskedLMOutput(
-            loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -1098,21 +1103,21 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel):
     @replace_return_docstrings(output_type=TFNextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
     def call(self, inputs, **kwargs):
         r"""
-    Return:
+        Return:
 
-    Examples::
+        Examples::
 
-        >>> import tensorflow as tf
-        >>> from transformers import MobileBertTokenizer, TFMobileBertForNextSentencePrediction
+            >>> import tensorflow as tf
+            >>> from transformers import MobileBertTokenizer, TFMobileBertForNextSentencePrediction
 
-        >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
-        >>> model = TFMobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
+            >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
+            >>> model = TFMobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
 
-        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-        >>> encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
+            >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+            >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+            >>> encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
 
-        >>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
+            >>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
         """
         return_dict = kwargs.get("return_dict")
         return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict
@@ -1125,7 +1130,9 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel):
             return (seq_relationship_score,) + outputs[2:]
 
         return TFNextSentencePredictorOutput(
-            logits=seq_relationship_score, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            logits=seq_relationship_score,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -1206,7 +1213,10 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
             return ((loss,) + output) if loss is not None else output
 
         return TFSequenceClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -1323,7 +1333,7 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
 
     @property
     def dummy_inputs(self):
-        """ Dummy inputs to build the network.
+        """Dummy inputs to build the network.
 
         Returns:
             tf.Tensor with dummy inputs
@@ -1425,7 +1435,10 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
             return ((loss,) + output) if loss is not None else output
 
         return TFMultipleChoiceModelOutput(
-            loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -1504,5 +1517,8 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
             return ((loss,) + output) if loss is not None else output
 
         return TFTokenClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
diff --git a/src/transformers/modeling_tf_openai.py b/src/transformers/modeling_tf_openai.py
index 931f3b7108..49ca4de86c 100644
--- a/src/transformers/modeling_tf_openai.py
+++ b/src/transformers/modeling_tf_openai.py
@@ -243,8 +243,8 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
         self.tokens_embed.vocab_size = value.shape[0]
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
         """
         raise NotImplementedError
 
@@ -373,13 +373,15 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
             return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
 
         return TFBaseModelOutput(
-            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions,
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
         )
 
 
 class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = OpenAIGPTConfig
@@ -630,31 +632,31 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
         training=False,
     ):
         r"""
-        mc_token_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
-            Index of the classification token in each input sequence.
-            Selected in the range ``[0, input_ids.size(-1) - 1]``.
+            mc_token_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
+                Index of the classification token in each input sequence.
+                Selected in the range ``[0, input_ids.size(-1) - 1]``.
 
-    Return:
+        Return:
 
-    Examples::
+        Examples::
 
-        >>> import tensorflow as tf
-        >>> from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
+            >>> import tensorflow as tf
+            >>> from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
 
-        >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        >>> model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
+            >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+            >>> model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
 
-        >>> # Add a [CLS] to the vocabulary (we should train it also!)
-        >>> tokenizer.add_special_tokens({'cls_token': '[CLS]'})
-        >>> model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
-        >>> print(tokenizer.cls_token_id, len(tokenizer))  # The newly token the last token of the vocabulary
+            >>> # Add a [CLS] to the vocabulary (we should train it also!)
+            >>> tokenizer.add_special_tokens({'cls_token': '[CLS]'})
+            >>> model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
+            >>> print(tokenizer.cls_token_id, len(tokenizer))  # The newly token the last token of the vocabulary
 
-        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
-        >>> encoding = tokenizer(choices, return_tensors="tf")
-        >>> inputs = {k: tf.expand_dims(v, 0) for k, v in encoding.items()}
-        >>> inputs["mc_token_ids"]= tf.constant([inputs["input_ids"].shape[-1] - 1, inputs["input_ids"].shape[-1] - 1])[None, :]  # Batch size 1
-        >>> outputs = model(inputs)
-        >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
+            >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
+            >>> encoding = tokenizer(choices, return_tensors="tf")
+            >>> inputs = {k: tf.expand_dims(v, 0) for k, v in encoding.items()}
+            >>> inputs["mc_token_ids"]= tf.constant([inputs["input_ids"].shape[-1] - 1, inputs["input_ids"].shape[-1] - 1])[None, :]  # Batch size 1
+            >>> outputs = model(inputs)
+            >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
         """
 
         if isinstance(inputs, (tuple, list)):
diff --git a/src/transformers/modeling_tf_pytorch_utils.py b/src/transformers/modeling_tf_pytorch_utils.py
index 40e32f8bac..5600f8d663 100644
--- a/src/transformers/modeling_tf_pytorch_utils.py
+++ b/src/transformers/modeling_tf_pytorch_utils.py
@@ -28,15 +28,15 @@ logger = logging.get_logger(__name__)
 
 
 def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove=""):
-    """ Convert a TF 2.0 model variable name in a pytorch model weight name.
+    """Convert a TF 2.0 model variable name in a pytorch model weight name.
 
-        Conventions for TF2.0 scopes -> PyTorch attribute names conversions:
-            - '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch)
-            - '_._' is replaced by a new level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList)
+    Conventions for TF2.0 scopes -> PyTorch attribute names conversions:
+        - '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch)
+        - '_._' is replaced by a new level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList)
 
-        return tuple with:
-            - pytorch model weight name
-            - transpose: boolean indicating weither TF2.0 and PyTorch weights matrices are transposed with regards to each other
+    return tuple with:
+        - pytorch model weight name
+        - transpose: boolean indicating weither TF2.0 and PyTorch weights matrices are transposed with regards to each other
     """
     tf_name = tf_name.replace(":0", "")  # device ids
     tf_name = re.sub(
@@ -72,8 +72,7 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove="")
 
 
 def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=None, allow_missing_keys=False):
-    """ Load pytorch checkpoints in a TF 2.0 model
-    """
+    """Load pytorch checkpoints in a TF 2.0 model"""
     try:
         import tensorflow as tf  # noqa: F401
         import torch  # noqa: F401
@@ -96,8 +95,7 @@ def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_i
 
 
 def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_missing_keys=False):
-    """ Load pytorch checkpoints in a TF 2.0 model
-    """
+    """Load pytorch checkpoints in a TF 2.0 model"""
     pt_state_dict = pt_model.state_dict()
 
     return load_pytorch_weights_in_tf2_model(
@@ -106,8 +104,7 @@ def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_mi
 
 
 def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, allow_missing_keys=False):
-    """ Load pytorch state_dict in a TF 2.0 model.
-    """
+    """Load pytorch state_dict in a TF 2.0 model."""
     try:
         import tensorflow as tf  # noqa: F401
         import torch  # noqa: F401
@@ -230,9 +227,9 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
 
 
 def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs=None, allow_missing_keys=False):
-    """ Load TF 2.0 HDF5 checkpoint in a PyTorch model
-        We use HDF5 to easily do transfer learning
-        (see https://github.com/tensorflow/tensorflow/blob/ee16fcac960ae660e0e4496658a366e2f745e1f0/tensorflow/python/keras/engine/network.py#L1352-L1357).
+    """Load TF 2.0 HDF5 checkpoint in a PyTorch model
+    We use HDF5 to easily do transfer learning
+    (see https://github.com/tensorflow/tensorflow/blob/ee16fcac960ae660e0e4496658a366e2f745e1f0/tensorflow/python/keras/engine/network.py#L1352-L1357).
     """
     try:
         import tensorflow as tf  # noqa: F401
@@ -265,16 +262,14 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs
 
 
 def load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=False):
-    """ Load TF 2.0 model in a pytorch model
-    """
+    """Load TF 2.0 model in a pytorch model"""
     weights = tf_model.weights
 
     return load_tf2_weights_in_pytorch_model(pt_model, weights, allow_missing_keys=allow_missing_keys)
 
 
 def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=False):
-    """ Load TF2.0 symbolic weights in a PyTorch model
-    """
+    """Load TF2.0 symbolic weights in a PyTorch model"""
     try:
         import tensorflow as tf  # noqa: F401
         import torch  # noqa: F401
diff --git a/src/transformers/modeling_tf_roberta.py b/src/transformers/modeling_tf_roberta.py
index cd2befbea1..2ad8456933 100644
--- a/src/transformers/modeling_tf_roberta.py
+++ b/src/transformers/modeling_tf_roberta.py
@@ -73,7 +73,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
         self.padding_idx = 1
 
     def create_position_ids_from_input_ids(self, x):
-        """ Replace non-padding symbols with their position numbers. Position numbers begin at
+        """Replace non-padding symbols with their position numbers. Position numbers begin at
         padding_idx+1. Padding symbols are ignored. This is modified from fairseq's
         `utils.make_positions`.
         :param tf.Tensor x:
@@ -84,7 +84,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
         return incremental_indicies + self.padding_idx
 
     def create_position_ids_from_inputs_embeds(self, inputs_embeds):
-        """ We are provided embeddings directly. We cannot infer which are padded so just generate
+        """We are provided embeddings directly. We cannot infer which are padded so just generate
         sequential position ids.
         :param tf.Tensor inputs_embeds:
         :return tf.Tensor:
@@ -120,8 +120,8 @@ class TFRobertaMainLayer(TFBertMainLayer):
 
 
 class TFRobertaPreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = RobertaConfig
@@ -330,7 +330,10 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos
             return ((loss,) + output) if loss is not None else output
 
         return TFMaskedLMOutput(
-            loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -431,7 +434,10 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
             return ((loss,) + output) if loss is not None else output
 
         return TFSequenceClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -452,7 +458,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
 
     @property
     def dummy_inputs(self):
-        """ Dummy inputs to build the network.
+        """Dummy inputs to build the network.
 
         Returns:
             tf.Tensor with dummy inputs
@@ -549,7 +555,10 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
             return ((loss,) + output) if loss is not None else output
 
         return TFMultipleChoiceModelOutput(
-            loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -628,7 +637,10 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
             return ((loss,) + output) if loss is not None else output
 
         return TFTokenClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
diff --git a/src/transformers/modeling_tf_t5.py b/src/transformers/modeling_tf_t5.py
index 80e4af724f..6a4379c0f6 100644
--- a/src/transformers/modeling_tf_t5.py
+++ b/src/transformers/modeling_tf_t5.py
@@ -67,8 +67,8 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
 
 class TFT5LayerNorm(tf.keras.layers.Layer):
     def __init__(self, epsilon=1e-6, **kwargs):
-        """ Construct a layernorm module in the T5 style
-            No bias and no substraction of mean.
+        """Construct a layernorm module in the T5 style
+        No bias and no substraction of mean.
         """
         super().__init__(**kwargs)
         self.variance_epsilon = epsilon
@@ -140,7 +140,9 @@ class TFT5Attention(tf.keras.layers.Layer):
 
         if self.has_relative_attention_bias:
             self.relative_attention_bias = tf.keras.layers.Embedding(
-                self.relative_attention_num_buckets, self.n_heads, name="relative_attention_bias",
+                self.relative_attention_num_buckets,
+                self.n_heads,
+                name="relative_attention_bias",
             )
         self.pruned_heads = set()
 
@@ -199,7 +201,9 @@ class TFT5Attention(tf.keras.layers.Layer):
         memory_position = tf.range(klen)[None, :]
         relative_position = memory_position - context_position  # shape (qlen, klen)
         rp_bucket = self._relative_position_bucket(
-            relative_position, bidirectional=not self.is_decoder, num_buckets=self.relative_attention_num_buckets,
+            relative_position,
+            bidirectional=not self.is_decoder,
+            num_buckets=self.relative_attention_num_buckets,
         )
         values = self.relative_attention_bias(rp_bucket)  # shape (qlen, klen, num_heads)
         values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0)  # shape (1, num_heads, qlen, klen)
@@ -316,7 +320,9 @@ class TFT5LayerSelfAttention(tf.keras.layers.Layer):
     def __init__(self, config, has_relative_attention_bias=False, **kwargs):
         super().__init__(**kwargs)
         self.SelfAttention = TFT5Attention(
-            config, has_relative_attention_bias=has_relative_attention_bias, name="SelfAttention",
+            config,
+            has_relative_attention_bias=has_relative_attention_bias,
+            name="SelfAttention",
         )
         self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
         self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
@@ -353,7 +359,9 @@ class TFT5LayerCrossAttention(tf.keras.layers.Layer):
     def __init__(self, config, has_relative_attention_bias=False, **kwargs):
         super().__init__(**kwargs)
         self.EncDecAttention = TFT5Attention(
-            config, has_relative_attention_bias=has_relative_attention_bias, name="EncDecAttention",
+            config,
+            has_relative_attention_bias=has_relative_attention_bias,
+            name="EncDecAttention",
         )
         self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
         self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
@@ -396,12 +404,18 @@ class TFT5Block(tf.keras.layers.Layer):
         self.is_decoder = config.is_decoder
         self.layer = []
         self.layer.append(
-            TFT5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias, name="layer_._0",)
+            TFT5LayerSelfAttention(
+                config,
+                has_relative_attention_bias=has_relative_attention_bias,
+                name="layer_._0",
+            )
         )
         if self.is_decoder:
             self.layer.append(
                 TFT5LayerCrossAttention(
-                    config, has_relative_attention_bias=has_relative_attention_bias, name="layer_._1",
+                    config,
+                    has_relative_attention_bias=has_relative_attention_bias,
+                    name="layer_._1",
                 )
             )
 
@@ -490,9 +504,9 @@ class TFT5Block(tf.keras.layers.Layer):
 
 class _NoLayerEmbedTokens:
     """
-     this class wraps a the TFSharedEmbeddingTokens layer into a python 'no-keras-layer'
-     class to avoid problem with weight restoring. Also it makes sure that the layer is
-     called from the correct scope to avoid problem with saving/storing the correct weights
+    this class wraps a the TFSharedEmbeddingTokens layer into a python 'no-keras-layer'
+    class to avoid problem with weight restoring. Also it makes sure that the layer is
+    called from the correct scope to avoid problem with saving/storing the correct weights
     """
 
     def __init__(self, layer, abs_scope_name=None):
@@ -539,7 +553,11 @@ class TFT5MainLayer(tf.keras.layers.Layer):
         self.num_hidden_layers = config.num_layers
 
         self.block = [
-            TFT5Block(config, has_relative_attention_bias=bool(i == 0), name="block_._{}".format(i),)
+            TFT5Block(
+                config,
+                has_relative_attention_bias=bool(i == 0),
+                name="block_._{}".format(i),
+            )
             for i in range(config.num_layers)
         ]
         self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="final_layer_norm")
@@ -654,7 +672,8 @@ class TFT5MainLayer(tf.keras.layers.Layer):
             if self.is_decoder:
                 seq_ids = tf.range(mask_seq_length)
                 causal_mask = tf.less_equal(
-                    tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)), seq_ids[None, :, None],
+                    tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)),
+                    seq_ids[None, :, None],
                 )
                 causal_mask = tf.cast(causal_mask, dtype=tf.float32)
                 extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
@@ -765,8 +784,8 @@ class TFT5MainLayer(tf.keras.layers.Layer):
 # pointers for your model.
 ####################################################
 class TFT5PreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = T5Config
@@ -961,17 +980,17 @@ class TFT5Model(TFT5PreTrainedModel):
         training=False,
     ):
         r"""
-    Returns:
+        Returns:
 
-    Examples::
+        Examples::
 
-        >>> from transformers import T5Tokenizer, TFT5Model
+            >>> from transformers import T5Tokenizer, TFT5Model
 
-        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        >>> model = TFT5Model.from_pretrained('t5-small')
-        >>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf")  # Batch size 1
-        >>> outputs = model(inputs, decoder_input_ids=inputs)
-        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+            >>> model = TFT5Model.from_pretrained('t5-small')
+            >>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf")  # Batch size 1
+            >>> outputs = model(inputs, decoder_input_ids=inputs)
+            >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
         """
         if isinstance(inputs, (tuple, list)):
@@ -1157,26 +1176,26 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
         training=False,
     ):
         r"""
-        labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the cross entropy classification loss.
-            Indices should be in ``[0, ..., config.vocab_size - 1]``.
+            labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the cross entropy classification loss.
+                Indices should be in ``[0, ..., config.vocab_size - 1]``.
 
-    Returns:
+        Returns:
 
-    Examples::
+        Examples::
 
-        >>> from transformers import T5Tokenizer, TFT5ForConditionalGeneration
+            >>> from transformers import T5Tokenizer, TFT5ForConditionalGeneration
 
-        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        >>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
-        >>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf")  # Batch size 1
-        >>> outputs = model(inputs, decoder_input_ids=inputs)
-        >>> prediction_scores = outputs[0]
+            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+            >>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
+            >>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf")  # Batch size 1
+            >>> outputs = model(inputs, decoder_input_ids=inputs)
+            >>> prediction_scores = outputs[0]
 
-        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        >>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
-        >>> inputs = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="tf")  # Batch size 1
-        >>> result = model.generate(inputs)
+            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+            >>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
+            >>> inputs = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="tf")  # Batch size 1
+            >>> result = model.generate(inputs)
 
         """
         if isinstance(inputs, (tuple, list)):
diff --git a/src/transformers/modeling_tf_transfo_xl.py b/src/transformers/modeling_tf_transfo_xl.py
index 4766322843..a9ae5a66be 100644
--- a/src/transformers/modeling_tf_transfo_xl.py
+++ b/src/transformers/modeling_tf_transfo_xl.py
@@ -628,7 +628,13 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
                 hids.append(core_out)
                 mems_i = None if mems is None else mems[i]
                 layer_outputs = layer(
-                    core_out, pos_emb, dec_attn_mask, mems_i, head_mask[i], output_attentions, training=training,
+                    core_out,
+                    pos_emb,
+                    dec_attn_mask,
+                    mems_i,
+                    head_mask[i],
+                    output_attentions,
+                    training=training,
                 )
                 core_out = layer_outputs[0]
                 if output_attentions:
@@ -657,13 +663,16 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
             return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None)
 
         return TFTransfoXLModelOutput(
-            last_hidden_state=core_out, mems=new_mems, hidden_states=hids, attentions=attentions,
+            last_hidden_state=core_out,
+            mems=new_mems,
+            hidden_states=hids,
+            attentions=attentions,
         )
 
 
 class TFTransfoXLPreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = TransfoXLConfig
@@ -852,8 +861,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
         )
 
     def get_output_embeddings(self):
-        """ Double-check if you are using adaptive softmax.
-        """
+        """Double-check if you are using adaptive softmax."""
         if len(self.crit.out_layers) > 0:
             return self.crit.out_layers[-1]
         return None
diff --git a/src/transformers/modeling_tf_transfo_xl_utilities.py b/src/transformers/modeling_tf_transfo_xl_utilities.py
index 7b7757eeb5..656f463da6 100644
--- a/src/transformers/modeling_tf_transfo_xl_utilities.py
+++ b/src/transformers/modeling_tf_transfo_xl_utilities.py
@@ -64,7 +64,10 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
                 else:
                     self.out_projs.append(None)
                 weight = self.add_weight(
-                    shape=(self.vocab_size, self.d_embed,),
+                    shape=(
+                        self.vocab_size,
+                        self.d_embed,
+                    ),
                     initializer="zeros",
                     trainable=True,
                     name="out_layers_._{}_._weight".format(i),
@@ -86,7 +89,10 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
                 )
                 self.out_projs.append(weight)
                 weight = self.add_weight(
-                    shape=(r_idx - l_idx, d_emb_i,),
+                    shape=(
+                        r_idx - l_idx,
+                        d_emb_i,
+                    ),
                     initializer="zeros",
                     trainable=True,
                     name="out_layers_._{}_._weight".format(i),
diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index 1fdc062202..c1d41beb06 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -207,13 +207,12 @@ class TFMultipleChoiceLoss(TFSequenceClassificationLoss):
 
 class TFMaskedLanguageModelingLoss(TFCausalLanguageModelingLoss):
     """
-   Loss function suitable for masked language modeling (MLM), that is, the task of guessing the masked tokens.
+    Loss function suitable for masked language modeling (MLM), that is, the task of guessing the masked tokens.
 
-   .. note::
+    .. note::
 
-        Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
-
-"""
+         Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
+    """
 
 
 class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
diff --git a/src/transformers/modeling_tf_xlm.py b/src/transformers/modeling_tf_xlm.py
index 7f9b8572a6..55e72697c2 100644
--- a/src/transformers/modeling_tf_xlm.py
+++ b/src/transformers/modeling_tf_xlm.py
@@ -83,7 +83,7 @@ def create_sinusoidal_embeddings(n_pos, dim, out):
 
 
 def gelu(x):
-    """ Gaussian Error Linear Unit.
+    """Gaussian Error Linear Unit.
     Original Implementation of the gelu activation function in Google Bert repo when initially created.
         For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
         0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
@@ -333,9 +333,9 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
         raise NotImplementedError
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        See base class PreTrainedModel
         """
         raise NotImplementedError
 
@@ -516,8 +516,8 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
 
 
 class TFXLMPreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = XLMConfig
@@ -858,7 +858,7 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
 
     @property
     def dummy_inputs(self):
-        """ Dummy inputs to build the network.
+        """Dummy inputs to build the network.
 
         Returns:
             tf.Tensor with dummy inputs
diff --git a/src/transformers/modeling_tf_xlm_roberta.py b/src/transformers/modeling_tf_xlm_roberta.py
index 8f4c2ae6d7..5ea919b3e4 100644
--- a/src/transformers/modeling_tf_xlm_roberta.py
+++ b/src/transformers/modeling_tf_xlm_roberta.py
@@ -77,7 +77,8 @@ class TFXLMRobertaModel(TFRobertaModel):
 
 
 @add_start_docstrings(
-    """XLM-RoBERTa Model with a `language modeling` head on top. """, XLM_ROBERTA_START_DOCSTRING,
+    """XLM-RoBERTa Model with a `language modeling` head on top. """,
+    XLM_ROBERTA_START_DOCSTRING,
 )
 class TFXLMRobertaForMaskedLM(TFRobertaForMaskedLM):
     """
diff --git a/src/transformers/modeling_tf_xlnet.py b/src/transformers/modeling_tf_xlnet.py
index 333d0ab72d..f9568c7e3d 100644
--- a/src/transformers/modeling_tf_xlnet.py
+++ b/src/transformers/modeling_tf_xlnet.py
@@ -62,9 +62,9 @@ TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
 
 
 def gelu(x):
-    """ Implementation of the gelu activation function.
-        XLNet is using OpenAI GPT's gelu
-        Also see https://arxiv.org/abs/1606.08415
+    """Implementation of the gelu activation function.
+    XLNet is using OpenAI GPT's gelu
+    Also see https://arxiv.org/abs/1606.08415
     """
     cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
     return x * cdf
@@ -807,8 +807,8 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
 
 
 class TFXLNetPreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = XLNetConfig
@@ -1213,33 +1213,33 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
         training=False,
     ):
         r"""
-        labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the cross entropy classification loss.
-            Indices should be in ``[0, ..., config.vocab_size - 1]``.
+            labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the cross entropy classification loss.
+                Indices should be in ``[0, ..., config.vocab_size - 1]``.
 
-    Return:
+        Return:
 
-    Examples::
+        Examples::
 
-        import tensorflow as tf
-        import numpy as np
-        from transformers import XLNetTokenizer, TFXLNetLMHeadModel
+            import tensorflow as tf
+            import numpy as np
+            from transformers import XLNetTokenizer, TFXLNetLMHeadModel
 
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = TFXLNetLMHeadModel.from_pretrained('xlnet-large-cased')
+            tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+            model = TFXLNetLMHeadModel.from_pretrained('xlnet-large-cased')
 
-        # We show how to setup inputs to predict a next token using a bi-directional context.
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[None, :]  # We will predict the masked token
+            # We show how to setup inputs to predict a next token using a bi-directional context.
+            input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[None, :]  # We will predict the masked token
 
-        perm_mask = np.zeros((1, input_ids.shape[1], input_ids.shape[1]))
-        perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
+            perm_mask = np.zeros((1, input_ids.shape[1], input_ids.shape[1]))
+            perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
 
-        target_mapping = np.zeros((1, 1, input_ids.shape[1]))  # Shape [1, 1, seq_length] => let's predict one token
-        target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
+            target_mapping = np.zeros((1, 1, input_ids.shape[1]))  # Shape [1, 1, seq_length] => let's predict one token
+            target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
 
-        outputs = model(input_ids, perm_mask=tf.constant(perm_mask, dtype=tf.float32), target_mapping=tf.constant(target_mapping, dtype=tf.float32))
+            outputs = model(input_ids, perm_mask=tf.constant(perm_mask, dtype=tf.float32), target_mapping=tf.constant(target_mapping, dtype=tf.float32))
 
-        next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
+            next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
 
         """
         return_dict = return_dict if return_dict is not None else self.transformer.return_dict
@@ -1401,7 +1401,7 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
 
     @property
     def dummy_inputs(self):
-        """ Dummy inputs to build the network.
+        """Dummy inputs to build the network.
 
         Returns:
             tf.Tensor with dummy inputs
diff --git a/src/transformers/modeling_transfo_xl.py b/src/transformers/modeling_transfo_xl.py
index 10fae29e97..c57be4afd3 100644
--- a/src/transformers/modeling_transfo_xl.py
+++ b/src/transformers/modeling_transfo_xl.py
@@ -45,8 +45,8 @@ TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
 
 
 def build_tf_to_pytorch_map(model, config):
-    """ A map of modules from TF to PyTorch.
-        This time I use a map to keep the PyTorch model as identical to the original PyTorch model as possible.
+    """A map of modules from TF to PyTorch.
+    This time I use a map to keep the PyTorch model as identical to the original PyTorch model as possible.
     """
     tf_to_pt_map = {}
 
@@ -112,8 +112,7 @@ def build_tf_to_pytorch_map(model, config):
 
 
 def load_tf_weights_in_transfo_xl(model, config, tf_path):
-    """ Load tf checkpoints in a pytorch model
-    """
+    """Load tf checkpoints in a pytorch model"""
     try:
         import numpy as np
         import tensorflow as tf
@@ -386,7 +385,12 @@ class RelPartialLearnableDecoderLayer(nn.Module):
     def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None, output_attentions=False):
 
         attn_outputs = self.dec_attn(
-            dec_inp, r, attn_mask=dec_attn_mask, mems=mems, head_mask=head_mask, output_attentions=output_attentions,
+            dec_inp,
+            r,
+            attn_mask=dec_attn_mask,
+            mems=mems,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
         )
         ff_output = self.pos_ff(attn_outputs[0])
 
@@ -456,8 +460,8 @@ class AdaptiveEmbedding(nn.Module):
 
 
 class TransfoXLPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = TransfoXLConfig
@@ -474,8 +478,7 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
         nn.init.constant_(bias, 0.0)
 
     def _init_weights(self, m):
-        """ Initialize the weights.
-        """
+        """Initialize the weights."""
         classname = m.__class__.__name__
         if classname.find("Linear") != -1:
             if hasattr(m, "weight") and m.weight is not None:
@@ -515,7 +518,7 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
                 self._init_bias(m.r_bias)
 
     def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, layer: Optional[int] = -1):
-        """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
+        """Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
         Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
 
         Arguments:
@@ -948,7 +951,10 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
             return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None)
 
         return TransfoXLModelOutput(
-            last_hidden_state=core_out, mems=new_mems, hidden_states=hids, attentions=attentions,
+            last_hidden_state=core_out,
+            mems=new_mems,
+            hidden_states=hids,
+            attentions=attentions,
         )
 
 
@@ -1064,8 +1070,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         )
 
     def get_output_embeddings(self):
-        """ Double-check if you are using adaptive softmax.
-        """
+        """Double-check if you are using adaptive softmax."""
         if self.sample_softmax > 0:
             return self.out_layer
         else:
diff --git a/src/transformers/modeling_transfo_xl_utilities.py b/src/transformers/modeling_transfo_xl_utilities.py
index c5043db79b..edd58104bb 100644
--- a/src/transformers/modeling_transfo_xl_utilities.py
+++ b/src/transformers/modeling_transfo_xl_utilities.py
@@ -85,17 +85,17 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
 
     def forward(self, hidden, labels=None, keep_order=False):
         """
-            Params:
-                hidden :: [len*bsz x d_proj]
-                labels :: [len*bsz]
-            Return:
-                if labels is None:
-                    out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary
-                else:
-                    out :: [(len-1)*bsz] Negative log likelihood
-            We could replace this implementation by the native PyTorch one
-            if their's had an option to set bias on all clusters in the native one.
-            here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
+        Params:
+            hidden :: [len*bsz x d_proj]
+            labels :: [len*bsz]
+        Return:
+            if labels is None:
+                out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary
+            else:
+                out :: [(len-1)*bsz] Negative log likelihood
+        We could replace this implementation by the native PyTorch one
+        if their's had an option to set bias on all clusters in the native one.
+        here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
         """
 
         if labels is not None:
@@ -191,7 +191,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
         return out
 
     def log_prob(self, hidden):
-        r""" Computes log probabilities for all :math:`n\_classes`
+        r"""Computes log probabilities for all :math:`n\_classes`
         From: https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py
         Args:
             hidden (Tensor): a minibatch of examples
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index ef7e2dfcc7..2d93d7975b 100755
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -51,8 +51,7 @@ try:
 except ImportError:
     # Older PyTorch compatibility
     class Identity(nn.Module):
-        r"""A placeholder identity operator that is argument-insensitive.
-        """
+        r"""A placeholder identity operator that is argument-insensitive."""
 
         def __init__(self, *args, **kwargs):
             super().__init__()
@@ -488,8 +487,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
             )
 
     def _tie_or_clone_weights(self, output_embeddings, input_embeddings):
-        """ Tie or clone module weights depending of whether we are using TorchScript or not
-        """
+        """Tie or clone module weights depending of whether we are using TorchScript or not"""
         if self.config.torchscript:
             output_embeddings.weight = nn.Parameter(input_embeddings.weight.clone())
         else:
@@ -498,7 +496,10 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
         if getattr(output_embeddings, "bias", None) is not None:
             output_embeddings.bias.data = torch.nn.functional.pad(
                 output_embeddings.bias.data,
-                (0, output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0],),
+                (
+                    0,
+                    output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0],
+                ),
                 "constant",
                 0,
             )
@@ -906,7 +907,13 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
             def load(module: nn.Module, prefix=""):
                 local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
                 module._load_from_state_dict(
-                    state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs,
+                    state_dict,
+                    prefix,
+                    local_metadata,
+                    True,
+                    missing_keys,
+                    unexpected_keys,
+                    error_msgs,
                 )
                 for name, child in module._modules.items():
                     if child is not None:
@@ -1242,24 +1249,24 @@ class SQuADHead(nn.Module):
         return_dict: bool = False,
     ) -> Union[SquadHeadOutput, Tuple[torch.FloatTensor]]:
         """
-    Args:
-        hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`):
-            Final hidden states of the model on the sequence tokens.
-        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
-            Positions of the first token for the labeled span.
-        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
-            Positions of the last token for the labeled span.
-        cls_index (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
-            Position of the CLS token for each sentence in the batch. If :obj:`None`, takes the last token.
-        is_impossible (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
-            Whether the question has a possible answer in the paragraph or not.
-        p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
-            Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS).
-            1.0 means token should be masked.
-        return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not to return a :class:`~transformers.file_utils.ModelOuput` instead of a plain tuple.
+        Args:
+            hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`):
+                Final hidden states of the model on the sequence tokens.
+            start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+                Positions of the first token for the labeled span.
+            end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+                Positions of the last token for the labeled span.
+            cls_index (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+                Position of the CLS token for each sentence in the batch. If :obj:`None`, takes the last token.
+            is_impossible (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+                Whether the question has a possible answer in the paragraph or not.
+            p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
+                Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS).
+                1.0 means token should be masked.
+            return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Whether or not to return a :class:`~transformers.file_utils.ModelOuput` instead of a plain tuple.
 
-    Returns:
+        Returns:
         """
         start_logits = self.start_logits(hidden_states, p_mask=p_mask)
 
@@ -1375,7 +1382,7 @@ class SequenceSummary(nn.Module):
             self.summary = nn.Linear(config.hidden_size, num_classes)
 
         activation_string = getattr(config, "summary_activation", None)
-        self.activation: Callable = (get_activation(activation_string) if activation_string else Identity())
+        self.activation: Callable = get_activation(activation_string) if activation_string else Identity()
 
         self.first_dropout = Identity()
         if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
@@ -1409,7 +1416,11 @@ class SequenceSummary(nn.Module):
             output = hidden_states.mean(dim=1)
         elif self.summary_type == "cls_index":
             if cls_index is None:
-                cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2] - 1, dtype=torch.long,)
+                cls_index = torch.full_like(
+                    hidden_states[..., :1, :],
+                    hidden_states.shape[-2] - 1,
+                    dtype=torch.long,
+                )
             else:
                 cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
                 cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
diff --git a/src/transformers/modeling_xlm.py b/src/transformers/modeling_xlm.py
index c70eaf2638..0b5bf6daf6 100755
--- a/src/transformers/modeling_xlm.py
+++ b/src/transformers/modeling_xlm.py
@@ -228,8 +228,8 @@ class TransformerFFN(nn.Module):
 
 
 class XLMPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = XLMConfig
@@ -462,9 +462,9 @@ class XLMModel(XLMPreTrainedModel):
         self.embeddings = new_embeddings
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        See base class PreTrainedModel
         """
         for layer, heads in heads_to_prune.items():
             self.attentions[layer].prune_heads(heads)
@@ -572,7 +572,11 @@ class XLMModel(XLMPreTrainedModel):
 
             # self attention
             attn_outputs = self.attentions[i](
-                tensor, attn_mask, cache=cache, head_mask=head_mask[i], output_attentions=output_attentions,
+                tensor,
+                attn_mask,
+                cache=cache,
+                head_mask=head_mask[i],
+                output_attentions=output_attentions,
             )
             attn = attn_outputs[0]
             if output_attentions:
@@ -633,8 +637,7 @@ class XLMPredLayer(nn.Module):
             )
 
     def forward(self, x, y=None):
-        """ Compute the loss, and optionally the scores.
-        """
+        """Compute the loss, and optionally the scores."""
         outputs = ()
         if self.asm is False:
             scores = self.proj(x)
@@ -969,38 +972,38 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
         return_dict=None,
     ):
         r"""
-        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-        is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
-            Labels whether a question has an answer or no answer (SQuAD 2.0)
-        cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
-        p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
-            Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...).
-            1.0 means token should be masked. 0.0 mean token is not masked.
+            start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for position (index) of the start of the labelled span for computing the token classification loss.
+                Positions are clamped to the length of the sequence (`sequence_length`).
+                Position outside of the sequence are not taken into account for computing the loss.
+            end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for position (index) of the end of the labelled span for computing the token classification loss.
+                Positions are clamped to the length of the sequence (`sequence_length`).
+                Position outside of the sequence are not taken into account for computing the loss.
+            is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
+                Labels whether a question has an answer or no answer (SQuAD 2.0)
+            cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
+                Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
+            p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
+                Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...).
+                1.0 means token should be masked. 0.0 mean token is not masked.
 
-    Returns:
+        Returns:
 
-    Example::
+        Example::
 
-        >>> from transformers import XLMTokenizer, XLMForQuestionAnswering
-        >>> import torch
+            >>> from transformers import XLMTokenizer, XLMForQuestionAnswering
+            >>> import torch
 
-        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        >>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048', return_dict=True)
+            >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+            >>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048', return_dict=True)
 
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        >>> start_positions = torch.tensor([1])
-        >>> end_positions = torch.tensor([3])
+            >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+            >>> start_positions = torch.tensor([1])
+            >>> end_positions = torch.tensor([3])
 
-        >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        >>> loss = outputs.loss
+            >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+            >>> loss = outputs.loss
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
@@ -1131,7 +1134,10 @@ class XLMForTokenClassification(XLMPreTrainedModel):
             return ((loss,) + output) if loss is not None else output
 
         return TokenClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
diff --git a/src/transformers/modeling_xlm_roberta.py b/src/transformers/modeling_xlm_roberta.py
index 00be724410..31bd816844 100644
--- a/src/transformers/modeling_xlm_roberta.py
+++ b/src/transformers/modeling_xlm_roberta.py
@@ -68,7 +68,8 @@ class XLMRobertaModel(RobertaModel):
 
 
 @add_start_docstrings(
-    """XLM-RoBERTa Model with a `language modeling` head on top. """, XLM_ROBERTA_START_DOCSTRING,
+    """XLM-RoBERTa Model with a `language modeling` head on top. """,
+    XLM_ROBERTA_START_DOCSTRING,
 )
 class XLMRobertaForMaskedLM(RobertaForMaskedLM):
     """
diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py
index f8949ddadb..04d760acb9 100755
--- a/src/transformers/modeling_xlnet.py
+++ b/src/transformers/modeling_xlnet.py
@@ -58,9 +58,9 @@ XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
 
 
 def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
-    """ A map of modules from TF to PyTorch.
-        I use a map to keep the PyTorch model as
-        identical to the original PyTorch model as possible.
+    """A map of modules from TF to PyTorch.
+    I use a map to keep the PyTorch model as
+    identical to the original PyTorch model as possible.
     """
 
     tf_to_pt_map = {}
@@ -141,8 +141,7 @@ def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
 
 
 def load_tf_weights_in_xlnet(model, config, tf_path):
-    """ Load tf checkpoints in a pytorch model
-    """
+    """Load tf checkpoints in a pytorch model"""
     try:
         import numpy as np
         import tensorflow as tf
@@ -548,8 +547,8 @@ class XLNetLayer(nn.Module):
 
 
 class XLNetPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = XLNetConfig
@@ -557,8 +556,7 @@ class XLNetPreTrainedModel(PreTrainedModel):
     base_model_prefix = "transformer"
 
     def _init_weights(self, module):
-        """ Initialize the weights.
-        """
+        """Initialize the weights."""
         if isinstance(module, (nn.Linear, nn.Embedding)):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
@@ -1350,46 +1348,46 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         return_dict=None,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`, defaults to :obj:`None`):
-            Labels for masked language modeling.
-            `num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict` corresponds to `sequence_length`.
-            The labels should correspond to the masked input words that should be predicted and depends on `target_mapping`. Note in order to perform standard auto-regressive language modeling a `<mask>` token has to be added to the `input_ids` (see `prepare_inputs_for_generation` fn and examples below)
-            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored, the loss is only
-            computed for labels in ``[0, ..., config.vocab_size]``
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`, defaults to :obj:`None`):
+                Labels for masked language modeling.
+                `num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict` corresponds to `sequence_length`.
+                The labels should correspond to the masked input words that should be predicted and depends on `target_mapping`. Note in order to perform standard auto-regressive language modeling a `<mask>` token has to be added to the `input_ids` (see `prepare_inputs_for_generation` fn and examples below)
+                Indices are selected in ``[-100, 0, ..., config.vocab_size]``
+                All labels set to ``-100`` are ignored, the loss is only
+                computed for labels in ``[0, ..., config.vocab_size]``
 
-    Return:
+        Return:
 
-    Examples::
+        Examples::
 
-        from transformers import XLNetTokenizer, XLNetLMHeadModel
-        import torch
+            from transformers import XLNetTokenizer, XLNetLMHeadModel
+            import torch
 
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased', return_dict=True)
+            tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+            model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased', return_dict=True)
 
-        # We show how to setup inputs to predict a next token using a bi-directional context.
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0)  # We will predict the masked token
-        perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
-        perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
-        target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
-        target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
+            # We show how to setup inputs to predict a next token using a bi-directional context.
+            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0)  # We will predict the masked token
+            perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
+            perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
+            target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
+            target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
 
-        outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
-        next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
+            outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
+            next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
 
-        # The same way can the XLNetLMHeadModel be used to be trained by standard auto-regressive language modeling.
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0)  # We will predict the masked token
-        labels = torch.tensor(tokenizer.encode("cute", add_special_tokens=False)).unsqueeze(0)
-        assert labels.shape[0] == 1, 'only one word will be predicted'
-        perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
-        perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token as is done in standard auto-regressive lm training
-        target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
-        target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
+            # The same way can the XLNetLMHeadModel be used to be trained by standard auto-regressive language modeling.
+            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0)  # We will predict the masked token
+            labels = torch.tensor(tokenizer.encode("cute", add_special_tokens=False)).unsqueeze(0)
+            assert labels.shape[0] == 1, 'only one word will be predicted'
+            perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
+            perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token as is done in standard auto-regressive lm training
+            target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
+            target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
 
-        outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping, labels=labels)
-        loss = outputs.loss
-        next_token_logits = outputs.logits  # Logits have shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
+            outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping, labels=labels)
+            loss = outputs.loss
+            next_token_logits = outputs.logits  # Logits have shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
@@ -1867,38 +1865,38 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
         return_dict=None,
     ):
         r"""
-        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-        is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
-            Labels whether a question has an answer or no answer (SQuAD 2.0)
-        cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
-        p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
-            Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...).
-            1.0 means token should be masked. 0.0 mean token is not masked.
+            start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for position (index) of the start of the labelled span for computing the token classification loss.
+                Positions are clamped to the length of the sequence (`sequence_length`).
+                Position outside of the sequence are not taken into account for computing the loss.
+            end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for position (index) of the end of the labelled span for computing the token classification loss.
+                Positions are clamped to the length of the sequence (`sequence_length`).
+                Position outside of the sequence are not taken into account for computing the loss.
+            is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
+                Labels whether a question has an answer or no answer (SQuAD 2.0)
+            cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
+                Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
+            p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
+                Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...).
+                1.0 means token should be masked. 0.0 mean token is not masked.
 
-    Returns:
+        Returns:
 
-    Example::
+        Example::
 
-        >>> from transformers import XLNetTokenizer, XLNetForQuestionAnswering
-        >>> import torch
+            >>> from transformers import XLNetTokenizer, XLNetForQuestionAnswering
+            >>> import torch
 
-        >>> tokenizer =  XLNetTokenizer.from_pretrained('xlnet-base-cased')
-        >>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased', return_dict=True)
+            >>> tokenizer =  XLNetTokenizer.from_pretrained('xlnet-base-cased')
+            >>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased', return_dict=True)
 
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        >>> start_positions = torch.tensor([1])
-        >>> end_positions = torch.tensor([3])
-        >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+            >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+            >>> start_positions = torch.tensor([1])
+            >>> end_positions = torch.tensor([3])
+            >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
 
-        >>> loss = outputs.loss
+            >>> loss = outputs.loss
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
diff --git a/src/transformers/optimization_tf.py b/src/transformers/optimization_tf.py
index 610a7b23bb..a3e6e0423f 100644
--- a/src/transformers/optimization_tf.py
+++ b/src/transformers/optimization_tf.py
@@ -122,7 +122,9 @@ def create_optimizer(
     )
     if num_warmup_steps:
         lr_schedule = WarmUp(
-            initial_learning_rate=init_lr, decay_schedule_fn=lr_schedule, warmup_steps=num_warmup_steps,
+            initial_learning_rate=init_lr,
+            decay_schedule_fn=lr_schedule,
+            warmup_steps=num_warmup_steps,
         )
     if weight_decay_rate > 0.0:
         optimizer = AdamWeightDecay(
diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 8d6799a722..26c8b91e3f 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -208,7 +208,11 @@ class PipelineDataFormat:
     SUPPORTED_FORMATS = ["json", "csv", "pipe"]
 
     def __init__(
-        self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite: bool = False,
+        self,
+        output_path: Optional[str],
+        input_path: Optional[str],
+        column: Optional[str],
+        overwrite: bool = False,
     ):
         self.output_path = output_path
         self.input_path = input_path
@@ -261,7 +265,11 @@ class PipelineDataFormat:
 
     @staticmethod
     def from_str(
-        format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
+        format: str,
+        output_path: Optional[str],
+        input_path: Optional[str],
+        column: Optional[str],
+        overwrite=False,
     ) -> "PipelineDataFormat":
         """
         Creates an instance of the right subclass of :class:`~transformers.pipelines.PipelineDataFormat` depending
@@ -305,7 +313,11 @@ class CsvPipelineDataFormat(PipelineDataFormat):
     """
 
     def __init__(
-        self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
+        self,
+        output_path: Optional[str],
+        input_path: Optional[str],
+        column: Optional[str],
+        overwrite=False,
     ):
         super().__init__(output_path, input_path, column, overwrite=overwrite)
 
@@ -346,7 +358,11 @@ class JsonPipelineDataFormat(PipelineDataFormat):
     """
 
     def __init__(
-        self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
+        self,
+        output_path: Optional[str],
+        input_path: Optional[str],
+        column: Optional[str],
+        overwrite=False,
     ):
         super().__init__(output_path, input_path, column, overwrite=overwrite)
 
@@ -610,7 +626,10 @@ class Pipeline(_ScikitCompat):
         # Parse arguments
         inputs = self._args_parser(*args, **kwargs)
         inputs = self.tokenizer(
-            inputs, add_special_tokens=add_special_tokens, return_tensors=self.framework, padding=padding,
+            inputs,
+            add_special_tokens=add_special_tokens,
+            return_tensors=self.framework,
+            padding=padding,
         )
 
         return inputs
@@ -1349,7 +1368,10 @@ class TokenClassificationPipeline(Pipeline):
             with self.device_placement():
 
                 tokens = self.tokenizer(
-                    sentence, return_attention_mask=False, return_tensors=self.framework, truncation=True,
+                    sentence,
+                    return_attention_mask=False,
+                    return_tensors=self.framework,
+                    truncation=True,
                 )
 
                 # Forward
@@ -1925,7 +1947,9 @@ class SummarizationPipeline(Pipeline):
                 )
 
             summaries = self.model.generate(
-                inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs,
+                inputs["input_ids"],
+                attention_mask=inputs["attention_mask"],
+                **generate_kwargs,
             )
 
             results = []
@@ -1935,7 +1959,9 @@ class SummarizationPipeline(Pipeline):
                     record["summary_token_ids"] = summary
                 if return_text:
                     record["summary_text"] = self.tokenizer.decode(
-                        summary, skip_special_tokens=True, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+                        summary,
+                        skip_special_tokens=True,
+                        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                     )
                 results.append(record)
             return results
@@ -2032,7 +2058,9 @@ class TranslationPipeline(Pipeline):
                 )
 
             translations = self.model.generate(
-                inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs,
+                inputs["input_ids"],
+                attention_mask=inputs["attention_mask"],
+                **generate_kwargs,
             )
             results = []
             for translation in translations:
@@ -2271,7 +2299,9 @@ class ConversationalPipeline(Pipeline):
                     "You might consider trimming the early phase of the conversation".format(input_length, max_length)
                 )
             generated_responses = self.model.generate(
-                inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs,
+                inputs["input_ids"],
+                attention_mask=inputs["attention_mask"],
+                **generate_kwargs,
             )
 
             cleaned_history = self._clean_padding_history(generated_responses)
@@ -2355,7 +2385,8 @@ class ConversationalPipeline(Pipeline):
         max_len = max([len(item) for item in outputs])
         outputs = [output + [self.pad_token_id] * (max_len - len(output)) for output in outputs]
         outputs = BatchEncoding(
-            {"input_ids": outputs, "attention_mask": [[1] * len(outputs)]}, tensor_type=self.framework,
+            {"input_ids": outputs, "attention_mask": [[1] * len(outputs)]},
+            tensor_type=self.framework,
         )
         return outputs
 
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index a3339a0fce..17d6b0e5bf 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -169,7 +169,7 @@ def assert_screenout(out, what):
 
 
 class CaptureStd:
-    """ Context manager to capture:
+    """Context manager to capture:
     stdout, clean it up and make it available via obj.out
     stderr, and make it available via obj.err
 
diff --git a/src/transformers/tokenization_auto.py b/src/transformers/tokenization_auto.py
index 49b80f2f8f..f35b445287 100644
--- a/src/transformers/tokenization_auto.py
+++ b/src/transformers/tokenization_auto.py
@@ -105,31 +105,31 @@ TOKENIZER_MAPPING = OrderedDict(
 
 class AutoTokenizer:
     r""":class:`~transformers.AutoTokenizer` is a generic tokenizer class
-        that will be instantiated as one of the tokenizer classes of the library
-        when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)`
-        class method.
+    that will be instantiated as one of the tokenizer classes of the library
+    when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)`
+    class method.
 
-        The `from_pretrained()` method takes care of returning the correct tokenizer class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string:
+    The `from_pretrained()` method takes care of returning the correct tokenizer class instance
+    based on the `model_type` property of the config object, or when it's missing,
+    falling back to using pattern matching on the `pretrained_model_name_or_path` string:
 
-            - `t5`: T5Tokenizer (T5 model)
-            - `distilbert`: DistilBertTokenizer (DistilBert model)
-            - `albert`: AlbertTokenizer (ALBERT model)
-            - `camembert`: CamembertTokenizer (CamemBERT model)
-            - `xlm-roberta`: XLMRobertaTokenizer (XLM-RoBERTa model)
-            - `longformer`: LongformerTokenizer (AllenAI Longformer model)
-            - `roberta`: RobertaTokenizer (RoBERTa model)
-            - `bert`: BertTokenizer (Bert model)
-            - `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
-            - `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
-            - `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
-            - `xlnet`: XLNetTokenizer (XLNet model)
-            - `xlm`: XLMTokenizer (XLM model)
-            - `ctrl`: CTRLTokenizer (Salesforce CTRL model)
-            - `electra`: ElectraTokenizer (Google ELECTRA model)
+        - `t5`: T5Tokenizer (T5 model)
+        - `distilbert`: DistilBertTokenizer (DistilBert model)
+        - `albert`: AlbertTokenizer (ALBERT model)
+        - `camembert`: CamembertTokenizer (CamemBERT model)
+        - `xlm-roberta`: XLMRobertaTokenizer (XLM-RoBERTa model)
+        - `longformer`: LongformerTokenizer (AllenAI Longformer model)
+        - `roberta`: RobertaTokenizer (RoBERTa model)
+        - `bert`: BertTokenizer (Bert model)
+        - `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
+        - `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
+        - `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
+        - `xlnet`: XLNetTokenizer (XLNet model)
+        - `xlm`: XLMTokenizer (XLM model)
+        - `ctrl`: CTRLTokenizer (Salesforce CTRL model)
+        - `electra`: ElectraTokenizer (Google ELECTRA model)
 
-        This class cannot be instantiated using `__init__()` (throw an error).
+    This class cannot be instantiated using `__init__()` (throw an error).
     """
 
     def __init__(self):
@@ -140,7 +140,7 @@ class AutoTokenizer:
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
-        r""" Instantiate one of the tokenizer classes of the library
+        r"""Instantiate one of the tokenizer classes of the library
         from a pre-trained model vocabulary.
 
         The tokenizer class to instantiate is selected
diff --git a/src/transformers/tokenization_bert.py b/src/transformers/tokenization_bert.py
index 10a5b46540..e4c56c1b1a 100644
--- a/src/transformers/tokenization_bert.py
+++ b/src/transformers/tokenization_bert.py
@@ -359,7 +359,7 @@ class BasicTokenizer(object):
     """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
 
     def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
-        """ Constructs a BasicTokenizer.
+        """Constructs a BasicTokenizer.
 
         Args:
             **do_lower_case**: Whether to lower case the input.
@@ -383,7 +383,7 @@ class BasicTokenizer(object):
         self.strip_accents = strip_accents
 
     def tokenize(self, text, never_split=None):
-        """ Basic Tokenization of a piece of text.
+        """Basic Tokenization of a piece of text.
             Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer.
 
         Args:
diff --git a/src/transformers/tokenization_ctrl.py b/src/transformers/tokenization_ctrl.py
index 8130b1f21e..bcff0dca2f 100644
--- a/src/transformers/tokenization_ctrl.py
+++ b/src/transformers/tokenization_ctrl.py
@@ -202,8 +202,7 @@ class CTRLTokenizer(PreTrainedTokenizer):
         return word
 
     def _tokenize(self, text):
-        """ Tokenize a string.
-        """
+        """Tokenize a string."""
         split_tokens = []
 
         words = re.findall(r"\S+\n?", text)
diff --git a/src/transformers/tokenization_dpr.py b/src/transformers/tokenization_dpr.py
index 41f6efcc2c..b1b2d66ea4 100644
--- a/src/transformers/tokenization_dpr.py
+++ b/src/transformers/tokenization_dpr.py
@@ -330,7 +330,11 @@ class CustomDPRReaderTokenizerMixin:
         return nbest_spans_predictions[:num_spans]
 
     def _get_best_spans(
-        self, start_logits: List[int], end_logits: List[int], max_answer_length: int, top_spans: int,
+        self,
+        start_logits: List[int],
+        end_logits: List[int],
+        max_answer_length: int,
+        top_spans: int,
     ) -> List[DPRSpanPrediction]:
         """
         Finds the best answer span for the extractive Q&A model for one passage.
diff --git a/src/transformers/tokenization_marian.py b/src/transformers/tokenization_marian.py
index 7584531c72..f883d41288 100644
--- a/src/transformers/tokenization_marian.py
+++ b/src/transformers/tokenization_marian.py
@@ -137,9 +137,7 @@ class MarianTokenizer(PreTrainedTokenizer):
         padding="longest",
         **unused,
     ) -> BatchEncoding:
-        """Prepare model inputs for translation. For best performance, translate one sentence at a time.
-
-        """
+        """Prepare model inputs for translation. For best performance, translate one sentence at a time."""
         if "" in src_texts:
             raise ValueError(f"found empty string in src_texts: {src_texts}")
         self.current_spm = self.spm_source
diff --git a/src/transformers/tokenization_reformer.py b/src/transformers/tokenization_reformer.py
index 81ac73b31e..0cfd2c51cb 100644
--- a/src/transformers/tokenization_reformer.py
+++ b/src/transformers/tokenization_reformer.py
@@ -53,29 +53,29 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 
 class ReformerTokenizer(PreTrainedTokenizer):
     """
-        Constructs an Reformer tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__ .
+    Constructs an Reformer tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__ .
 
-        This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
-        should refer to the superclass for more information regarding methods.
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+    should refer to the superclass for more information regarding methods.
 
-        Args:
-            vocab_file (:obj:`string`):
-                `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
-                contains the vocabulary necessary to instantiate a tokenizer.
-            eos_token (:obj:`string`, `optional`, defaults to "</s>"):
-                The end of sequence token.
+    Args:
+        vocab_file (:obj:`string`):
+            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        eos_token (:obj:`string`, `optional`, defaults to "</s>"):
+            The end of sequence token.
 
-                .. note::
+            .. note::
 
-                    When building a sequence using special tokens, this is not the token that is used for the end
-                    of sequence. The token used is the :obj:`sep_token`.
-            unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
-                The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-                token instead.
-            pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
-                The token used for padding, for example when batching sequences of different lengths.
-            additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`None`):
-                Additional special tokens used by the tokenizer.
+                When building a sequence using special tokens, this is not the token that is used for the end
+                of sequence. The token used is the :obj:`sep_token`.
+        unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
+            The token used for padding, for example when batching sequences of different lengths.
+        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`None`):
+            Additional special tokens used by the tokenizer.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -142,8 +142,7 @@ class ReformerTokenizer(PreTrainedTokenizer):
         self.sp_model.Load(self.vocab_file)
 
     def _tokenize(self, text, sample=False):
-        """ Take as input a string and return a list of strings (tokens) for words/sub-words
-        """
+        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
         if not sample:
             pieces = self.sp_model.EncodeAsPieces(text)
         else:
@@ -166,8 +165,8 @@ class ReformerTokenizer(PreTrainedTokenizer):
         return out_string
 
     def save_vocabulary(self, save_directory):
-        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
-            to a directory.
+        """Save the sentencepiece vocabulary (copy original file) and special tokens file
+        to a directory.
         """
         if not os.path.isdir(save_directory):
             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
diff --git a/src/transformers/tokenization_t5.py b/src/transformers/tokenization_t5.py
index a21bfa766b..ce686612c0 100644
--- a/src/transformers/tokenization_t5.py
+++ b/src/transformers/tokenization_t5.py
@@ -63,34 +63,34 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 
 class T5Tokenizer(PreTrainedTokenizer):
     """
-        Constructs a T5 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__ .
+    Constructs a T5 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__ .
 
-        This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
-        should refer to the superclass for more information regarding methods.
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+    should refer to the superclass for more information regarding methods.
 
-        Args:
-            vocab_file (:obj:`string`):
-                `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
-                contains the vocabulary necessary to instantiate a tokenizer.
-            eos_token (:obj:`string`, `optional`, defaults to "</s>"):
-                The end of sequence token.
+    Args:
+        vocab_file (:obj:`string`):
+            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        eos_token (:obj:`string`, `optional`, defaults to "</s>"):
+            The end of sequence token.
 
-                .. note::
+            .. note::
 
-                    When building a sequence using special tokens, this is not the token that is used for the end
-                    of sequence. The token used is the :obj:`sep_token`.
-            unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
-                The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-                token instead.
-            pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
-                The token used for padding, for example when batching sequences of different lengths.
-            extra_ids (:obj:`List[str]`, `optional`, defaults to :obj:`100`):
-                Add a number of extra ids added to the end of the vocabulary for use as sentinels.
-                These tokens are accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1.
-                Extra tokens are indexed from the end of the vocabulary up to beginnning ("<extra_id_0>" is the last token in the vocabulary like in T5 preprocessing
-                see: https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)
-            additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`None`):
-                Additional special tokens used by the tokenizer.
+                When building a sequence using special tokens, this is not the token that is used for the end
+                of sequence. The token used is the :obj:`sep_token`.
+        unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
+            The token used for padding, for example when batching sequences of different lengths.
+        extra_ids (:obj:`List[str]`, `optional`, defaults to :obj:`100`):
+            Add a number of extra ids added to the end of the vocabulary for use as sentinels.
+            These tokens are accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1.
+            Extra tokens are indexed from the end of the vocabulary up to beginnning ("<extra_id_0>" is the last token in the vocabulary like in T5 preprocessing
+            see: https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)
+        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`None`):
+            Additional special tokens used by the tokenizer.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -236,8 +236,7 @@ class T5Tokenizer(PreTrainedTokenizer):
         self.sp_model.Load(self.vocab_file)
 
     def _tokenize(self, text, sample=False):
-        """ Take as input a string and return a list of strings (tokens) for words/sub-words
-        """
+        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
         if not sample:
             pieces = self.sp_model.EncodeAsPieces(text)
         else:
@@ -266,8 +265,8 @@ class T5Tokenizer(PreTrainedTokenizer):
         return out_string
 
     def save_vocabulary(self, save_directory):
-        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
-            to a directory.
+        """Save the sentencepiece vocabulary (copy original file) and special tokens file
+        to a directory.
         """
         if not os.path.isdir(save_directory):
             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
diff --git a/src/transformers/tokenization_transfo_xl.py b/src/transformers/tokenization_transfo_xl.py
index 531fc5b06a..3f9035a5e0 100644
--- a/src/transformers/tokenization_transfo_xl.py
+++ b/src/transformers/tokenization_transfo_xl.py
@@ -163,7 +163,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
 
     def count_sents(self, sents, verbose=False):
         """
-            sents : a list of sentences, each a list of tokenized symbols
+        sents : a list of sentences, each a list of tokenized symbols
         """
         if verbose:
             logger.info("counting {} sents ...".format(len(sents)))
@@ -496,7 +496,7 @@ class TransfoXLTokenizerFast(PreTrainedTokenizerFast):
 class LMOrderedIterator(object):
     def __init__(self, data, bsz, bptt, device="cpu", ext_len=None):
         """
-            data -- LongTensor -- the LongTensor is strictly ordered
+        data -- LongTensor -- the LongTensor is strictly ordered
         """
         self.bsz = bsz
         self.bptt = bptt
@@ -555,7 +555,7 @@ class LMOrderedIterator(object):
 class LMShuffledIterator(object):
     def __init__(self, data, bsz, bptt, device="cpu", ext_len=None, shuffle=False):
         """
-            data -- list[LongTensor] -- there is no order among the LongTensors
+        data -- list[LongTensor] -- there is no order among the LongTensors
         """
         self.data = data
 
diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py
index 4f133f4ec6..2ddae67a2f 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -139,12 +139,12 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         return_length: bool = False,
         verbose: bool = True,
     ) -> Dict[str, Any]:
-        """ Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict.
+        """Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict.
 
-            Overflowing tokens are converted to additional examples (like batches) so the output values of
-            the dict are lists (overflows) of lists (tokens).
+        Overflowing tokens are converted to additional examples (like batches) so the output values of
+        the dict are lists (overflows) of lists (tokens).
 
-            Output shape: (overflows, sequence length)
+        Output shape: (overflows, sequence length)
         """
         if return_token_type_ids is None:
             return_token_type_ids = "token_type_ids" in self.model_input_names
diff --git a/src/transformers/tokenization_xlm.py b/src/transformers/tokenization_xlm.py
index 91ff481113..529be4b16b 100644
--- a/src/transformers/tokenization_xlm.py
+++ b/src/transformers/tokenization_xlm.py
@@ -902,7 +902,12 @@ class XLMTokenizer(PreTrainedTokenizer):
                     "You should not supply a second sequence if the provided sequence of "
                     "ids is already formated with special tokens for the model."
                 )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0,))
+            return list(
+                map(
+                    lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0,
+                    token_ids_0,
+                )
+            )
 
         if token_ids_1 is not None:
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 25eabe6e5b..a9e0948dfe 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -141,10 +141,12 @@ class TrainingArguments:
     do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."})
     do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."})
     evaluate_during_training: bool = field(
-        default=False, metadata={"help": "Run evaluation during training at each logging step."},
+        default=False,
+        metadata={"help": "Run evaluation during training at each logging step."},
     )
     prediction_loss_only: bool = field(
-        default=False, metadata={"help": "When performing evaluation and predictions, only returns the loss."},
+        default=False,
+        metadata={"help": "When performing evaluation and predictions, only returns the loss."},
     )
 
     per_device_train_batch_size: int = field(
diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py
index b1f3666389..94e5c3f320 100644
--- a/src/transformers/training_args_tf.py
+++ b/src/transformers/training_args_tf.py
@@ -100,7 +100,8 @@ class TFTrainingArguments(TrainingArguments):
     """
 
     tpu_name: str = field(
-        default=None, metadata={"help": "Name of TPU"},
+        default=None,
+        metadata={"help": "Name of TPU"},
     )
 
     @cached_property
diff --git a/templates/adding_a_new_example_script/utils_xxx.py b/templates/adding_a_new_example_script/utils_xxx.py
index bc2a219f9d..48967b3366 100644
--- a/templates/adding_a_new_example_script/utils_xxx.py
+++ b/templates/adding_a_new_example_script/utils_xxx.py
@@ -703,10 +703,10 @@ def write_predictions_extended(
     tokenizer,
     verbose_logging,
 ):
-    """ XLNet write prediction logic (more complex than Bert's).
-        Write final predictions to the json file and log-odds of null if needed.
+    """XLNet write prediction logic (more complex than Bert's).
+    Write final predictions to the json file and log-odds of null if needed.
 
-        Requires utils_squad_evaluate.py
+    Requires utils_squad_evaluate.py
     """
     _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
         "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"]
diff --git a/templates/adding_a_new_model/configuration_xxx.py b/templates/adding_a_new_model/configuration_xxx.py
index 5d59949e10..411a9d78c1 100644
--- a/templates/adding_a_new_model/configuration_xxx.py
+++ b/templates/adding_a_new_model/configuration_xxx.py
@@ -31,47 +31,47 @@ XXX_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class XxxConfig(PretrainedConfig):
     r"""
-        This is the configuration class to store the configuration of a :class:`~transformers.XXXModel`.
-        It is used to instantiate a XXX model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the XXX `xxx-base-uncased <https://huggingface.co/xxx/xxx-base-uncased>`__ architecture.
+    This is the configuration class to store the configuration of a :class:`~transformers.XXXModel`.
+    It is used to instantiate a XXX model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the XXX `xxx-base-uncased <https://huggingface.co/xxx/xxx-base-uncased>`__ architecture.
 
-        Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
-        for more information.
+    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
+    for more information.
 
 
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 30522):
-                Vocabulary size of the XXX model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XXXModel`.
-            hidden_size (:obj:`int`, optional, defaults to 768):
-                Dimensionality of the encoder layers and the pooler layer.
-            num_hidden_layers (:obj:`int`, optional, defaults to 12):
-                Number of hidden layers in the Transformer encoder.
-            num_attention_heads (:obj:`int`, optional, defaults to 12):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            hidden_act (:obj:`str` or :obj:`function`, optional, defaults to :obj:`"gelu"`):
-                The non-linear activation function (function or string) in the encoder and pooler.
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 30522):
+            Vocabulary size of the XXX model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XXXModel`.
+        hidden_size (:obj:`int`, optional, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (:obj:`int`, optional, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (:obj:`int`, optional, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to :obj:`"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler.
 
-                If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
-            hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-            attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention probabilities.
-            max_position_embeddings (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            type_vocab_size (:obj:`int`, optional, defaults to 2):
-                The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the :obj:`truncated_normal_initializer` for initializing all weight matrices.
-            layer_norm_eps (:obj:`float`, optional, defaults to 1e-5):
-                The epsilon used by the layer normalization layers.
-            gradient_checkpointing (:obj:`bool`, optional, defaults to :obj:`False`):
-                If :obj:`True`, use gradient checkpointing to save memory at the expense of slower backward pass.
-            kwargs:
-                Additional arguments for common configurations, passed to :class:`~transformers.PretrainedConfig`.
+            If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+        hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (:obj:`int`, optional, defaults to 512):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (:obj:`int`, optional, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
+        initializer_range (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the :obj:`truncated_normal_initializer` for initializing all weight matrices.
+        layer_norm_eps (:obj:`float`, optional, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        gradient_checkpointing (:obj:`bool`, optional, defaults to :obj:`False`):
+            If :obj:`True`, use gradient checkpointing to save memory at the expense of slower backward pass.
+        kwargs:
+            Additional arguments for common configurations, passed to :class:`~transformers.PretrainedConfig`.
     """
     model_type = "xxx"
 
diff --git a/templates/adding_a_new_model/modeling_tf_xxx.py b/templates/adding_a_new_model/modeling_tf_xxx.py
index 76dd0f08d3..f83d5de4ec 100644
--- a/templates/adding_a_new_model/modeling_tf_xxx.py
+++ b/templates/adding_a_new_model/modeling_tf_xxx.py
@@ -223,7 +223,10 @@ class TFXxxMainLayer(tf.keras.layers.Layer):
         pooled_output = self.pooler(sequence_output)
 
         if not return_dict:
-            return (sequence_output, pooled_output,) + encoder_outputs[1:]
+            return (
+                sequence_output,
+                pooled_output,
+            ) + encoder_outputs[1:]
 
         return TFBaseModelOutputWithPooling(
             last_hidden_state=sequence_output,
@@ -241,8 +244,8 @@ class TFXxxMainLayer(tf.keras.layers.Layer):
 # pointers for your model.
 ####################################################
 class TFXxxPreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = XxxConfig
@@ -422,7 +425,10 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel, TFMaskedLanguageModelingLoss):
             return ((loss,) + output) if loss is not None else output
 
         return TFMaskedLMOutput(
-            loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -503,7 +509,10 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel, TFSequenceClassificat
             return ((loss,) + output) if loss is not None else output
 
         return TFSequenceClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -524,7 +533,7 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss):
 
     @property
     def dummy_inputs(self):
-        """ Dummy inputs to build the network.
+        """Dummy inputs to build the network.
 
         Returns:
             tf.Tensor with dummy inputs
@@ -631,7 +640,10 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss):
             return ((loss,) + output) if loss is not None else output
 
         return TFMultipleChoiceModelOutput(
-            loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -710,7 +722,10 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel, TFTokenClassificationLos
             return ((loss,) + output) if loss is not None else output
 
         return TFTokenClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py
index b073d2d158..8cb24d032f 100644
--- a/templates/adding_a_new_model/modeling_xxx.py
+++ b/templates/adding_a_new_model/modeling_xxx.py
@@ -59,8 +59,7 @@ XXX_PRETRAINED_MODEL_ARCHIVE_LIST = [
 # More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28
 ####################################################
 def load_tf_weights_in_xxx(model, config, tf_checkpoint_path):
-    """ Load tf checkpoints in a pytorch model.
-    """
+    """Load tf checkpoints in a pytorch model."""
     try:
         import re
 
@@ -189,8 +188,8 @@ XxxPooler = nn.Module
 
 
 class XxxPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
     """
 
     config_class = XxxConfig
@@ -290,9 +289,9 @@ class XxxModel(XxxPreTrainedModel):
         self.embeddings.word_embeddings = new_embeddings
 
     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        See base class PreTrainedModel
         """
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
@@ -517,7 +516,10 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
             return ((loss,) + output) if loss is not None else output
 
         return SequenceClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -603,7 +605,10 @@ class XxxForMultipleChoice(XxxPreTrainedModel):
             return ((loss,) + output) if loss is not None else output
 
         return MultipleChoiceModelOutput(
-            loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -686,7 +691,10 @@ class XxxForTokenClassification(XxxPreTrainedModel):
             return ((loss,) + output) if loss is not None else output
 
         return TokenClassifierOutput(
-            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
diff --git a/tests/test_modeling_albert.py b/tests/test_modeling_albert.py
index d26f8b40c3..9040e1a548 100644
--- a/tests/test_modeling_albert.py
+++ b/tests/test_modeling_albert.py
@@ -39,7 +39,8 @@ if is_torch_available():
 
 class AlbertModelTester:
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 13
diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py
index 285f7af5d7..74306556a6 100644
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -54,7 +54,8 @@ PGE_ARTICLE = """ PG&E stated it scheduled the blackouts in response to forecast
 @require_torch
 class ModelTester:
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 13
@@ -76,7 +77,9 @@ class ModelTester:
         torch.manual_seed(0)
 
     def prepare_config_and_inputs_for_common(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(3,)
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(
+            3,
+        )
         input_ids[:, -1] = 2  # Eos Token
 
         config = BartConfig(
@@ -100,7 +103,9 @@ class ModelTester:
 
 
 def prepare_bart_inputs_dict(
-    config, input_ids, attention_mask=None,
+    config,
+    input_ids,
+    attention_mask=None,
 ):
     if attention_mask is None:
         attention_mask = input_ids.ne(config.pad_token_id)
@@ -261,7 +266,11 @@ class BartHeadTests(unittest.TestCase):
         sequence_labels = ids_tensor([batch_size], 2).to(torch_device)
         model = BartForQuestionAnswering(config)
         model.to(torch_device)
-        outputs = model(input_ids=input_ids, start_positions=sequence_labels, end_positions=sequence_labels,)
+        outputs = model(
+            input_ids=input_ids,
+            start_positions=sequence_labels,
+            end_positions=sequence_labels,
+        )
 
         self.assertEqual(outputs["start_logits"].shape, input_ids.shape)
         self.assertEqual(outputs["end_logits"].shape, input_ids.shape)
@@ -491,7 +500,11 @@ class BartModelIntegrationTests(unittest.TestCase):
 
         EXPECTED_SUMMARY = "California's largest power company has begun shutting off electricity to thousands of customers in the state."
         dct = tok.batch_encode_plus(
-            [PGE_ARTICLE], max_length=1024, padding="max_length", truncation=True, return_tensors="pt",
+            [PGE_ARTICLE],
+            max_length=1024,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt",
         ).to(torch_device)
 
         hypotheses_batch = model.generate(
@@ -506,7 +519,10 @@ class BartModelIntegrationTests(unittest.TestCase):
             decoder_start_token_id=model.config.eos_token_id,
         )
 
-        decoded = tok.batch_decode(hypotheses_batch, skip_special_tokens=True,)
+        decoded = tok.batch_decode(
+            hypotheses_batch,
+            skip_special_tokens=True,
+        )
         self.assertEqual(EXPECTED_SUMMARY, decoded[0])
 
     def test_xsum_config_generation_params(self):
diff --git a/tests/test_modeling_bert.py b/tests/test_modeling_bert.py
index 0e640f5f35..a24de56362 100755
--- a/tests/test_modeling_bert.py
+++ b/tests/test_modeling_bert.py
@@ -264,7 +264,10 @@ class BertModelTester:
         model.to(torch_device)
         model.eval()
         result = model(
-            input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels,
+            input_ids,
+            attention_mask=input_mask,
+            token_type_ids=token_type_ids,
+            next_sentence_label=sequence_labels,
         )
         self.parent.assertEqual(result.logits.shape, (self.batch_size, 2))
 
diff --git a/tests/test_modeling_camembert.py b/tests/test_modeling_camembert.py
index a536b4a8f5..f278d72216 100644
--- a/tests/test_modeling_camembert.py
+++ b/tests/test_modeling_camembert.py
@@ -33,7 +33,9 @@ class CamembertModelIntegrationTest(unittest.TestCase):
         model.to(torch_device)
 
         input_ids = torch.tensor(
-            [[5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]], device=torch_device, dtype=torch.long,
+            [[5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]],
+            device=torch_device,
+            dtype=torch.long,
         )  # J'aime le camembert !
         output = model(input_ids)["last_hidden_state"]
         expected_shape = torch.Size((1, 10, 768))
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 696c4f59f6..6d0306d744 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -330,7 +330,9 @@ class ModelTesterMixin:
             # Prepare head_mask
             # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
             head_mask = torch.ones(
-                self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device,
+                self.model_tester.num_hidden_layers,
+                self.model_tester.num_attention_heads,
+                device=torch_device,
             )
             head_mask[0, 0] = 0
             head_mask[-1, :-1] = 0
@@ -370,7 +372,10 @@ class ModelTesterMixin:
             return
 
         for model_class in self.all_model_classes:
-            (config, inputs_dict,) = self.model_tester.prepare_config_and_inputs_for_common()
+            (
+                config,
+                inputs_dict,
+            ) = self.model_tester.prepare_config_and_inputs_for_common()
 
             if "head_mask" in inputs_dict:
                 del inputs_dict["head_mask"]
@@ -399,7 +404,10 @@ class ModelTesterMixin:
             return
 
         for model_class in self.all_model_classes:
-            (config, inputs_dict,) = self.model_tester.prepare_config_and_inputs_for_common()
+            (
+                config,
+                inputs_dict,
+            ) = self.model_tester.prepare_config_and_inputs_for_common()
 
             if "head_mask" in inputs_dict:
                 del inputs_dict["head_mask"]
@@ -432,7 +440,10 @@ class ModelTesterMixin:
             return
 
         for model_class in self.all_model_classes:
-            (config, inputs_dict,) = self.model_tester.prepare_config_and_inputs_for_common()
+            (
+                config,
+                inputs_dict,
+            ) = self.model_tester.prepare_config_and_inputs_for_common()
 
             if "head_mask" in inputs_dict:
                 del inputs_dict["head_mask"]
@@ -463,7 +474,10 @@ class ModelTesterMixin:
             return
 
         for model_class in self.all_model_classes:
-            (config, inputs_dict,) = self.model_tester.prepare_config_and_inputs_for_common()
+            (
+                config,
+                inputs_dict,
+            ) = self.model_tester.prepare_config_and_inputs_for_common()
 
             if "head_mask" in inputs_dict:
                 del inputs_dict["head_mask"]
@@ -534,7 +548,8 @@ class ModelTesterMixin:
                 seq_length = self.model_tester.seq_length
 
             self.assertListEqual(
-                list(hidden_states[0].shape[-2:]), [seq_length, self.model_tester.hidden_size],
+                list(hidden_states[0].shape[-2:]),
+                [seq_length, self.model_tester.hidden_size],
             )
 
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -550,7 +565,10 @@ class ModelTesterMixin:
             check_hidden_states_output(inputs_dict, config, model_class)
 
     def test_feed_forward_chunking(self):
-        (original_config, inputs_dict,) = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            original_config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         for model_class in self.all_model_classes:
             torch.manual_seed(0)
             config = copy.deepcopy(original_config)
@@ -570,7 +588,10 @@ class ModelTesterMixin:
             self.assertTrue(torch.allclose(hidden_states_no_chunk, hidden_states_with_chunk, atol=1e-3))
 
     def test_resize_tokens_embeddings(self):
-        (original_config, inputs_dict,) = self.model_tester.prepare_config_and_inputs_for_common()
+        (
+            original_config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
         if not self.test_resize_embeddings:
             return
 
@@ -844,7 +865,14 @@ class ModelTesterMixin:
                 model.generate(input_ids, do_sample=False, num_return_sequences=3, num_beams=2)
 
             # num_return_sequences > 1, sample
-            self._check_generated_ids(model.generate(input_ids, do_sample=True, num_beams=2, num_return_sequences=2,))
+            self._check_generated_ids(
+                model.generate(
+                    input_ids,
+                    do_sample=True,
+                    num_beams=2,
+                    num_return_sequences=2,
+                )
+            )
             # num_return_sequences > 1, greedy
             self._check_generated_ids(model.generate(input_ids, do_sample=False, num_beams=2, num_return_sequences=2))
 
diff --git a/tests/test_modeling_ctrl.py b/tests/test_modeling_ctrl.py
index 2acafe3dba..39598b8ee6 100644
--- a/tests/test_modeling_ctrl.py
+++ b/tests/test_modeling_ctrl.py
@@ -30,7 +30,8 @@ if is_torch_available():
 
 class CTRLModelTester:
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 14
diff --git a/tests/test_modeling_distilbert.py b/tests/test_modeling_distilbert.py
index c1f3548c17..fb4dc4e4eb 100644
--- a/tests/test_modeling_distilbert.py
+++ b/tests/test_modeling_distilbert.py
@@ -179,7 +179,9 @@ if is_torch_available():
             multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
             multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
             result = model(
-                multiple_choice_inputs_ids, attention_mask=multiple_choice_input_mask, labels=choice_labels,
+                multiple_choice_inputs_ids,
+                attention_mask=multiple_choice_input_mask,
+                labels=choice_labels,
             )
             self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
 
diff --git a/tests/test_modeling_dpr.py b/tests/test_modeling_dpr.py
index 666c7898bf..ad6b860288 100644
--- a/tests/test_modeling_dpr.py
+++ b/tests/test_modeling_dpr.py
@@ -149,7 +149,10 @@ class DPRModelTester:
         model = DPRReader(config=config)
         model.to(torch_device)
         model.eval()
-        result = model(input_ids, attention_mask=input_mask,)
+        result = model(
+            input_ids,
+            attention_mask=input_mask,
+        )
 
         self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
         self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
@@ -173,7 +176,15 @@ class DPRModelTester:
 @require_torch
 class DPRModelTest(ModelTesterMixin, unittest.TestCase):
 
-    all_model_classes = (DPRContextEncoder, DPRQuestionEncoder, DPRReader,) if is_torch_available() else ()
+    all_model_classes = (
+        (
+            DPRContextEncoder,
+            DPRQuestionEncoder,
+            DPRReader,
+        )
+        if is_torch_available()
+        else ()
+    )
 
     test_resize_embeddings = False
     test_missing_keys = False  # why?
diff --git a/tests/test_modeling_electra.py b/tests/test_modeling_electra.py
index 1113c636c0..29bc782f93 100644
--- a/tests/test_modeling_electra.py
+++ b/tests/test_modeling_electra.py
@@ -39,7 +39,8 @@ if is_torch_available():
 
 class ElectraModelTester:
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 13
diff --git a/tests/test_modeling_encoder_decoder.py b/tests/test_modeling_encoder_decoder.py
index 2a92c29faa..3af9fbc9c7 100644
--- a/tests/test_modeling_encoder_decoder.py
+++ b/tests/test_modeling_encoder_decoder.py
@@ -391,7 +391,11 @@ class EncoderDecoderMixin:
         decoder_input_ids = ids_tensor([13, 1], model_2.config.encoder.vocab_size)
         attention_mask = ids_tensor([13, 5], vocab_size=2)
         with torch.no_grad():
-            outputs = model_2(input_ids=input_ids, decoder_input_ids=decoder_input_ids, attention_mask=attention_mask,)
+            outputs = model_2(
+                input_ids=input_ids,
+                decoder_input_ids=decoder_input_ids,
+                attention_mask=attention_mask,
+            )
             out_2 = outputs[0].cpu().numpy()
             out_2[np.isnan(out_2)] = 0
 
@@ -401,7 +405,9 @@ class EncoderDecoderMixin:
                 model_1.to(torch_device)
 
                 after_outputs = model_1(
-                    input_ids=input_ids, decoder_input_ids=decoder_input_ids, attention_mask=attention_mask,
+                    input_ids=input_ids,
+                    decoder_input_ids=decoder_input_ids,
+                    attention_mask=attention_mask,
                 )
                 out_1 = after_outputs[0].cpu().numpy()
                 out_1[np.isnan(out_1)] = 0
diff --git a/tests/test_modeling_flaubert.py b/tests/test_modeling_flaubert.py
index ca00786545..6694d9c912 100644
--- a/tests/test_modeling_flaubert.py
+++ b/tests/test_modeling_flaubert.py
@@ -39,7 +39,8 @@ if is_torch_available():
 
 class FlaubertModelTester(object):
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 13
diff --git a/tests/test_modeling_gpt2.py b/tests/test_modeling_gpt2.py
index 725bd85853..17e0a6bc48 100644
--- a/tests/test_modeling_gpt2.py
+++ b/tests/test_modeling_gpt2.py
@@ -244,7 +244,8 @@ class GPT2ModelTester:
         # append to next input_ids and attn_mask
         next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
         attn_mask = torch.cat(
-            [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], dim=1,
+            [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)],
+            dim=1,
         )
 
         # get two different outputs
diff --git a/tests/test_modeling_longformer.py b/tests/test_modeling_longformer.py
index 59d240df4c..85430b0fd8 100644
--- a/tests/test_modeling_longformer.py
+++ b/tests/test_modeling_longformer.py
@@ -40,7 +40,8 @@ if is_torch_available():
 
 class LongformerModelTester:
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 13
diff --git a/tests/test_modeling_mobilebert.py b/tests/test_modeling_mobilebert.py
index dc83064d6d..149494d20a 100644
--- a/tests/test_modeling_mobilebert.py
+++ b/tests/test_modeling_mobilebert.py
@@ -217,7 +217,10 @@ class MobileBertModelTester:
         model.to(torch_device)
         model.eval()
         result = model(
-            input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels,
+            input_ids,
+            attention_mask=input_mask,
+            token_type_ids=token_type_ids,
+            next_sentence_label=sequence_labels,
         )
         self.parent.assertEqual(result.logits.shape, (self.batch_size, 2))
 
@@ -397,7 +400,11 @@ class MobileBertModelTest(ModelTesterMixin, unittest.TestCase):
 
 
 def _long_tensor(tok_lst):
-    return torch.tensor(tok_lst, dtype=torch.long, device=torch_device,)
+    return torch.tensor(
+        tok_lst,
+        dtype=torch.long,
+        device=torch_device,
+    )
 
 
 TOLERANCE = 1e-3
diff --git a/tests/test_modeling_openai.py b/tests/test_modeling_openai.py
index b77441b237..1014e1eea4 100644
--- a/tests/test_modeling_openai.py
+++ b/tests/test_modeling_openai.py
@@ -37,7 +37,8 @@ if is_torch_available():
 
 class OpenAIGPTModelTester:
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 13
diff --git a/tests/test_modeling_reformer.py b/tests/test_modeling_reformer.py
index 6973f4bdd9..14aa6550be 100644
--- a/tests/test_modeling_reformer.py
+++ b/tests/test_modeling_reformer.py
@@ -240,14 +240,19 @@ class ReformerModelTester:
         half_input_ids = input_ids[:, :half_seq_len]
 
         # normal padded
-        attn_mask = torch.cat([torch.ones_like(half_input_ids), torch.zeros_like(half_input_ids)], dim=-1,)
+        attn_mask = torch.cat(
+            [torch.ones_like(half_input_ids), torch.zeros_like(half_input_ids)],
+            dim=-1,
+        )
         input_ids_padded = torch.cat(
-            [half_input_ids, ids_tensor((self.batch_size, half_seq_len), self.vocab_size)], dim=-1,
+            [half_input_ids, ids_tensor((self.batch_size, half_seq_len), self.vocab_size)],
+            dim=-1,
         )
 
         # shifted padded
         input_ids_roll = torch.cat(
-            [half_input_ids, ids_tensor((self.batch_size, half_seq_len), self.vocab_size)], dim=-1,
+            [half_input_ids, ids_tensor((self.batch_size, half_seq_len), self.vocab_size)],
+            dim=-1,
         )
         input_ids_roll = torch.roll(input_ids_roll, roll, dims=-1)
         attn_mask_roll = torch.roll(attn_mask, roll, dims=-1)
@@ -283,13 +288,21 @@ class ReformerModelTester:
         torch.manual_seed(layer.attention_seed)
         attn_outputs = layer.attention(hidden_states, attention_mask=input_mask)
         self.parent.assertTrue(
-            torch.allclose(prev_attn_output + attn_outputs.hidden_states, next_attn_output, atol=1e-3,)
+            torch.allclose(
+                prev_attn_output + attn_outputs.hidden_states,
+                next_attn_output,
+                atol=1e-3,
+            )
         )
 
         torch.manual_seed(layer.feed_forward_seed)
         feed_forward_hidden_states = layer.feed_forward(next_attn_output)
         self.parent.assertTrue(
-            torch.allclose(next_hidden_states, hidden_states + feed_forward_hidden_states, atol=1e-3,)
+            torch.allclose(
+                next_hidden_states,
+                hidden_states + feed_forward_hidden_states,
+                atol=1e-3,
+            )
         )
 
     def create_and_check_reformer_feed_backward_chunking(self, config, input_ids, input_mask, choice_labels):
@@ -416,7 +429,10 @@ class ReformerModelTester:
         model.to(torch_device)
         model.eval()
         result = model(
-            input_ids, attention_mask=input_mask, start_positions=choice_labels, end_positions=choice_labels,
+            input_ids,
+            attention_mask=input_mask,
+            start_positions=choice_labels,
+            end_positions=choice_labels,
         )
         self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
         self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
@@ -468,7 +484,7 @@ class ReformerModelTester:
 
 class ReformerTesterMixin:
     """
-        Reformer Local and Reformer LSH run essentially the same tests
+    Reformer Local and Reformer LSH run essentially the same tests
     """
 
     def test_config(self):
@@ -887,7 +903,9 @@ class ReformerIntegrationTests(unittest.TestCase):
         reformer_output = layer(prev_attn_output=hidden_states.clone(), hidden_states=hidden_states)
         output_slice = reformer_output.hidden_states[0, 0, :5]
         expected_output_slice = torch.tensor(
-            [1.6879, -1.3083, -0.4708, 1.3555, -0.6292], dtype=torch.float, device=torch_device,
+            [1.6879, -1.3083, -0.4708, 1.3555, -0.6292],
+            dtype=torch.float,
+            device=torch_device,
         )
         self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
 
@@ -902,11 +920,15 @@ class ReformerIntegrationTests(unittest.TestCase):
         layer = ReformerLayer(ReformerConfig(**config)).to(torch_device)
         layer.eval()
         reformer_output = layer(
-            prev_attn_output=hidden_states.clone(), hidden_states=hidden_states, attention_mask=attn_mask,
+            prev_attn_output=hidden_states.clone(),
+            hidden_states=hidden_states,
+            attention_mask=attn_mask,
         )
         output_slice = reformer_output.hidden_states[0, 0, :5]
         expected_output_slice = torch.tensor(
-            [1.6439, -1.2306, -0.5108, 1.3006, -0.6537], dtype=torch.float, device=torch_device,
+            [1.6439, -1.2306, -0.5108, 1.3006, -0.6537],
+            dtype=torch.float,
+            device=torch_device,
         )
         self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
 
@@ -922,7 +944,9 @@ class ReformerIntegrationTests(unittest.TestCase):
         reformer_output = layer(prev_attn_output=hidden_states, hidden_states=hidden_states)
         output_slice = reformer_output.hidden_states[0, 0, :5]
         expected_output_slice = torch.tensor(
-            [1.4212, -2.0576, -0.9688, 1.4599, -0.1344], dtype=torch.float, device=torch_device,
+            [1.4212, -2.0576, -0.9688, 1.4599, -0.1344],
+            dtype=torch.float,
+            device=torch_device,
         )
         self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
 
@@ -935,10 +959,16 @@ class ReformerIntegrationTests(unittest.TestCase):
         torch.manual_seed(0)
         layer = ReformerLayer(ReformerConfig(**config)).to(torch_device)
         layer.eval()
-        reformer_output = layer(prev_attn_output=hidden_states, hidden_states=hidden_states, attention_mask=attn_mask,)
+        reformer_output = layer(
+            prev_attn_output=hidden_states,
+            hidden_states=hidden_states,
+            attention_mask=attn_mask,
+        )
         output_slice = reformer_output.hidden_states[0, 0, :5]
         expected_output_slice = torch.tensor(
-            [1.4750, -2.0235, -0.9743, 1.4463, -0.1269], dtype=torch.float, device=torch_device,
+            [1.4750, -2.0235, -0.9743, 1.4463, -0.1269],
+            dtype=torch.float,
+            device=torch_device,
         )
         self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
 
@@ -953,7 +983,9 @@ class ReformerIntegrationTests(unittest.TestCase):
         hidden_states = model(input_ids=input_ids, attention_mask=attn_mask)[0]
         output_slice = hidden_states[0, 0, :5]
         expected_output_slice = torch.tensor(
-            [-0.9896, -0.9396, -1.0831, -0.0597, 0.2456], dtype=torch.float, device=torch_device,
+            [-0.9896, -0.9396, -1.0831, -0.0597, 0.2456],
+            dtype=torch.float,
+            device=torch_device,
         )
         self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
 
@@ -967,7 +999,9 @@ class ReformerIntegrationTests(unittest.TestCase):
         hidden_states = model(input_ids=input_ids, attention_mask=attn_mask)[0]
         output_slice = hidden_states[0, 0, :5]
         expected_output_slice = torch.tensor(
-            [-1.6791, 0.7171, 0.1594, 0.4063, 1.2584], dtype=torch.float, device=torch_device,
+            [-1.6791, 0.7171, 0.1594, 0.4063, 1.2584],
+            dtype=torch.float,
+            device=torch_device,
         )
         self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
 
@@ -983,7 +1017,9 @@ class ReformerIntegrationTests(unittest.TestCase):
         hidden_states = model(input_ids=input_ids, attention_mask=attn_mask)[0]
         output_slice = hidden_states[1, -1, :5]
         expected_output_slice = torch.tensor(
-            [0.0256, -0.0121, 0.0636, 0.0024, -0.0393], dtype=torch.float, device=torch_device,
+            [0.0256, -0.0121, 0.0636, 0.0024, -0.0393],
+            dtype=torch.float,
+            device=torch_device,
         )
         self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
 
@@ -1005,15 +1041,21 @@ class ReformerIntegrationTests(unittest.TestCase):
         # check last grads to cover all proable errors
         grad_slice_word = model.reformer.embeddings.word_embeddings.weight.grad[0, :5]
         expected_grad_slice_word = torch.tensor(
-            [-0.0005, 0.0001, 0.0002, 0.0003, 0.0006], dtype=torch.float, device=torch_device,
+            [-0.0005, 0.0001, 0.0002, 0.0003, 0.0006],
+            dtype=torch.float,
+            device=torch_device,
         )
         grad_slice_position_factor_1 = model.reformer.embeddings.position_embeddings.weights[0][1, 0, -5:]
         expected_grad_slice_pos_fac_1 = torch.tensor(
-            [0.0037, -1.3793, -1.0231, -1.5230, -2.5306], dtype=torch.float, device=torch_device,
+            [0.0037, -1.3793, -1.0231, -1.5230, -2.5306],
+            dtype=torch.float,
+            device=torch_device,
         )
         grad_slice_position_factor_2 = model.reformer.embeddings.position_embeddings.weights[1][0, 1, :5]
         expected_grad_slice_pos_fac_2 = torch.tensor(
-            [-1.3165, 0.5168, 0.7785, 1.0811, -0.9830], dtype=torch.float, device=torch_device,
+            [-1.3165, 0.5168, 0.7785, 1.0811, -0.9830],
+            dtype=torch.float,
+            device=torch_device,
         )
         self.assertTrue(torch.allclose(grad_slice_word, expected_grad_slice_word, atol=1e-3))
         self.assertTrue(torch.allclose(grad_slice_position_factor_1, expected_grad_slice_pos_fac_1, atol=1e-3))
@@ -1038,15 +1080,21 @@ class ReformerIntegrationTests(unittest.TestCase):
         # check last grads to cover all proable errors
         grad_slice_word = model.reformer.embeddings.word_embeddings.weight.grad[0, :5]
         expected_grad_slice_word = torch.tensor(
-            [2.6357e-05, 4.3358e-04, -8.4985e-04, 1.0094e-04, 3.8954e-04], dtype=torch.float, device=torch_device,
+            [2.6357e-05, 4.3358e-04, -8.4985e-04, 1.0094e-04, 3.8954e-04],
+            dtype=torch.float,
+            device=torch_device,
         )
         grad_slice_position_factor_1 = model.reformer.embeddings.position_embeddings.weights[0][1, 0, -5:]
         expected_grad_slice_pos_fac_1 = torch.tensor(
-            [-0.0984, 0.6283, 0.4282, 1.2960, 0.6897], dtype=torch.float, device=torch_device,
+            [-0.0984, 0.6283, 0.4282, 1.2960, 0.6897],
+            dtype=torch.float,
+            device=torch_device,
         )
         grad_slice_position_factor_2 = model.reformer.embeddings.position_embeddings.weights[1][0, 1, :5]
         expected_grad_slice_pos_fac_2 = torch.tensor(
-            [0.4626, -0.0231, -0.0172, 0.1081, 0.3805], dtype=torch.float, device=torch_device,
+            [0.4626, -0.0231, -0.0172, 0.1081, 0.3805],
+            dtype=torch.float,
+            device=torch_device,
         )
         self.assertTrue(torch.allclose(grad_slice_word, expected_grad_slice_word, atol=1e-3))
         self.assertTrue(torch.allclose(grad_slice_position_factor_1, expected_grad_slice_pos_fac_1, atol=1e-3))
diff --git a/tests/test_modeling_roberta.py b/tests/test_modeling_roberta.py
index a1d1efe68d..cd1aa3a7ac 100644
--- a/tests/test_modeling_roberta.py
+++ b/tests/test_modeling_roberta.py
@@ -45,7 +45,8 @@ if is_torch_available():
 
 class RobertaModelTester:
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 13
@@ -352,7 +353,7 @@ class RobertaModelTest(ModelTesterMixin, unittest.TestCase):
             self.assertIsNotNone(model)
 
     def test_create_position_ids_respects_padding_index(self):
-        """ Ensure that the default position ids only assign a sequential . This is a regression
+        """Ensure that the default position ids only assign a sequential . This is a regression
         test for https://github.com/huggingface/transformers/issues/1761
 
         The position ids should be masked with the embedding object's padding index. Therefore, the
@@ -371,7 +372,7 @@ class RobertaModelTest(ModelTesterMixin, unittest.TestCase):
         self.assertTrue(torch.all(torch.eq(position_ids, expected_positions)))
 
     def test_create_position_ids_from_inputs_embeds(self):
-        """ Ensure that the default position ids only assign a sequential . This is a regression
+        """Ensure that the default position ids only assign a sequential . This is a regression
         test for https://github.com/huggingface/transformers/issues/1761
 
         The position ids should be masked with the embedding object's padding index. Therefore, the
diff --git a/tests/test_modeling_t5.py b/tests/test_modeling_t5.py
index 61d1bf71bc..fef623807c 100644
--- a/tests/test_modeling_t5.py
+++ b/tests/test_modeling_t5.py
@@ -101,7 +101,13 @@ class T5ModelTester:
         )
 
     def check_prepare_lm_labels_via_shift_left(
-        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
+        self,
+        config,
+        input_ids,
+        decoder_input_ids,
+        attention_mask,
+        decoder_attention_mask,
+        lm_labels,
     ):
         model = T5Model(config=config)
         model.to(torch_device)
@@ -134,7 +140,13 @@ class T5ModelTester:
                 self.parent.assertListEqual(decoder_input_ids_slice[1:].tolist(), lm_labels_slice[:-1].tolist())
 
     def create_and_check_model(
-        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
+        self,
+        config,
+        input_ids,
+        decoder_input_ids,
+        attention_mask,
+        decoder_attention_mask,
+        lm_labels,
     ):
         model = T5Model(config=config)
         model.to(torch_device)
@@ -160,7 +172,13 @@ class T5ModelTester:
         self.parent.assertEqual(len(decoder_past[1][0]), 4)
 
     def create_and_check_with_lm_head(
-        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
+        self,
+        config,
+        input_ids,
+        decoder_input_ids,
+        attention_mask,
+        decoder_attention_mask,
+        lm_labels,
     ):
         model = T5ForConditionalGeneration(config=config).to(torch_device).eval()
         outputs = model(
@@ -174,7 +192,13 @@ class T5ModelTester:
         self.parent.assertEqual(outputs["loss"].size(), ())
 
     def create_and_check_decoder_model_past(
-        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
+        self,
+        config,
+        input_ids,
+        decoder_input_ids,
+        attention_mask,
+        decoder_attention_mask,
+        lm_labels,
     ):
         model = T5Model(config=config).get_decoder().to(torch_device).eval()
         # first forward pass
@@ -205,7 +229,13 @@ class T5ModelTester:
         self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
 
     def create_and_check_decoder_model_attention_mask_past(
-        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
+        self,
+        config,
+        input_ids,
+        decoder_input_ids,
+        attention_mask,
+        decoder_attention_mask,
+        lm_labels,
     ):
         model = T5Model(config=config).get_decoder()
         model.to(torch_device)
@@ -231,7 +261,8 @@ class T5ModelTester:
         # append to next input_ids and attn_mask
         next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
         attn_mask = torch.cat(
-            [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], dim=1,
+            [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)],
+            dim=1,
         )
 
         # get two different outputs
@@ -249,7 +280,13 @@ class T5ModelTester:
         self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
 
     def create_and_check_generate_with_past_key_value_states(
-        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
+        self,
+        config,
+        input_ids,
+        decoder_input_ids,
+        attention_mask,
+        decoder_attention_mask,
+        lm_labels,
     ):
         model = T5ForConditionalGeneration(config=config).to(torch_device).eval()
         torch.manual_seed(0)
@@ -261,14 +298,26 @@ class T5ModelTester:
         self.parent.assertTrue(torch.all(output_with_past_cache == output_without_past_cache))
 
     def create_and_check_model_fp16_forward(
-        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
+        self,
+        config,
+        input_ids,
+        decoder_input_ids,
+        attention_mask,
+        decoder_attention_mask,
+        lm_labels,
     ):
         model = T5Model(config=config).to(torch_device).half().eval()
         output = model(input_ids, decoder_input_ids=input_ids, attention_mask=attention_mask)["last_hidden_state"]
         self.parent.assertFalse(torch.isnan(output).any().item())
 
     def create_and_check_encoder_decoder_shared_weights(
-        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
+        self,
+        config,
+        input_ids,
+        decoder_input_ids,
+        attention_mask,
+        decoder_attention_mask,
+        lm_labels,
     ):
         for model_class in [T5Model, T5ForConditionalGeneration]:
             torch.manual_seed(0)
@@ -339,7 +388,14 @@ class T5ModelTester:
 
     def prepare_config_and_inputs_for_common(self):
         config_and_inputs = self.prepare_config_and_inputs()
-        (config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,) = config_and_inputs
+        (
+            config,
+            input_ids,
+            decoder_input_ids,
+            attention_mask,
+            decoder_attention_mask,
+            lm_labels,
+        ) = config_and_inputs
 
         inputs_dict = {
             "input_ids": input_ids,
@@ -412,7 +468,11 @@ class T5ModelTest(ModelTesterMixin, unittest.TestCase):
         model = T5Model(config_and_inputs[0]).to(torch_device)
         with tempfile.TemporaryDirectory() as tmpdirname:
             torch.onnx.export(
-                model, config_and_inputs[1], f"{tmpdirname}/t5_test.onnx", export_params=True, opset_version=9,
+                model,
+                config_and_inputs[1],
+                f"{tmpdirname}/t5_test.onnx",
+                export_params=True,
+                opset_version=9,
             )
 
 
@@ -469,7 +529,8 @@ class T5ModelIntegrationTests(unittest.TestCase):
         )
         decoded = tok.batch_decode(hypotheses_batch, skip_special_tokens=True, clean_up_tokenization_spaces=False)
         self.assertListEqual(
-            expected_summaries, decoded,
+            expected_summaries,
+            decoded,
         )
 
     @slow
diff --git a/tests/test_modeling_tf_camembert.py b/tests/test_modeling_tf_camembert.py
index 5378bff686..865fc3be08 100644
--- a/tests/test_modeling_tf_camembert.py
+++ b/tests/test_modeling_tf_camembert.py
@@ -33,7 +33,8 @@ class TFCamembertModelIntegrationTest(unittest.TestCase):
         model = TFCamembertModel.from_pretrained("jplu/tf-camembert-base")
 
         input_ids = tf.convert_to_tensor(
-            [[5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]], dtype=tf.int32,
+            [[5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]],
+            dtype=tf.int32,
         )  # J'aime le camembert !"
 
         output = model(input_ids)["last_hidden_state"]
@@ -41,7 +42,8 @@ class TFCamembertModelIntegrationTest(unittest.TestCase):
         self.assertEqual(output.shape, expected_shape)
         # compare the actual values for a slice.
         expected_slice = tf.convert_to_tensor(
-            [[[-0.0254, 0.0235, 0.1027], [0.0606, -0.1811, -0.0418], [-0.1561, -0.1127, 0.2687]]], dtype=tf.float32,
+            [[[-0.0254, 0.0235, 0.1027], [0.0606, -0.1811, -0.0418], [-0.1561, -0.1127, 0.2687]]],
+            dtype=tf.float32,
         )
         # camembert = torch.hub.load('pytorch/fairseq', 'camembert.v0')
         # camembert.eval()
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index df361f80a3..478f8353eb 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -155,7 +155,8 @@ class TFModelTesterMixin:
                 self.assertEqual(len(outputs), num_out)
                 self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
                 self.assertListEqual(
-                    list(hidden_states[0].shape[-2:]), [self.model_tester.seq_length, self.model_tester.hidden_size],
+                    list(hidden_states[0].shape[-2:]),
+                    [self.model_tester.seq_length, self.model_tester.hidden_size],
                 )
 
     @slow
@@ -486,7 +487,8 @@ class TFModelTesterMixin:
             hidden_states = [t.numpy() for t in outputs[-1]]
             self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
             self.assertListEqual(
-                list(hidden_states[0].shape[-2:]), [self.model_tester.seq_length, self.model_tester.hidden_size],
+                list(hidden_states[0].shape[-2:]),
+                [self.model_tester.seq_length, self.model_tester.hidden_size],
             )
 
         for model_class in self.all_model_classes:
@@ -591,9 +593,15 @@ class TFModelTesterMixin:
                     x = wte([input_ids, None, None, None], mode="embedding")
                 except Exception:
                     if hasattr(self.model_tester, "embedding_size"):
-                        x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32,)
+                        x = tf.ones(
+                            input_ids.shape + [self.model_tester.embedding_size],
+                            dtype=tf.dtypes.float32,
+                        )
                     else:
-                        x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32,)
+                        x = tf.ones(
+                            input_ids.shape + [self.model_tester.hidden_size],
+                            dtype=tf.dtypes.float32,
+                        )
         return x
 
     def test_inputs_embeds(self):
@@ -700,7 +708,14 @@ class TFModelTesterMixin:
                 model.generate(input_ids, do_sample=False, num_return_sequences=3, num_beams=2)
 
             # num_return_sequences > 1, sample
-            self._check_generated_ids(model.generate(input_ids, do_sample=True, num_beams=2, num_return_sequences=2,))
+            self._check_generated_ids(
+                model.generate(
+                    input_ids,
+                    do_sample=True,
+                    num_beams=2,
+                    num_return_sequences=2,
+                )
+            )
             # num_return_sequences > 1, greedy
             self._check_generated_ids(model.generate(input_ids, do_sample=False, num_beams=2, num_return_sequences=2))
 
@@ -895,7 +910,8 @@ class UtilsFunctionsTest(unittest.TestCase):
         )
 
         non_inf_expected_idx = tf.convert_to_tensor(
-            [[0, 0], [0, 9], [0, 10], [0, 25], [0, 26], [1, 13], [1, 17], [1, 18], [1, 20], [1, 27]], dtype=tf.int32,
+            [[0, 0], [0, 9], [0, 10], [0, 25], [0, 26], [1, 13], [1, 17], [1, 18], [1, 20], [1, 27]],
+            dtype=tf.int32,
         )  # expected non filtered idx as noted above
 
         non_inf_expected_output = tf.convert_to_tensor(
@@ -907,7 +923,8 @@ class UtilsFunctionsTest(unittest.TestCase):
 
         non_inf_output = output[output != -float("inf")]
         non_inf_idx = tf.cast(
-            tf.where(tf.not_equal(output, tf.constant(-float("inf"), dtype=tf.float32))), dtype=tf.int32,
+            tf.where(tf.not_equal(output, tf.constant(-float("inf"), dtype=tf.float32))),
+            dtype=tf.int32,
         )
 
         tf.debugging.assert_near(non_inf_output, non_inf_expected_output, rtol=1e-12)
diff --git a/tests/test_modeling_tf_ctrl.py b/tests/test_modeling_tf_ctrl.py
index cbd0abb4cd..be9ba0111d 100644
--- a/tests/test_modeling_tf_ctrl.py
+++ b/tests/test_modeling_tf_ctrl.py
@@ -31,7 +31,8 @@ if is_tf_available():
 
 class TFCTRLModelTester(object):
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 13
diff --git a/tests/test_modeling_tf_distilbert.py b/tests/test_modeling_tf_distilbert.py
index 45930d2b0d..2c09e0fa8e 100644
--- a/tests/test_modeling_tf_distilbert.py
+++ b/tests/test_modeling_tf_distilbert.py
@@ -39,7 +39,8 @@ if is_tf_available():
 
 class TFDistilBertModelTester:
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 13
diff --git a/tests/test_modeling_tf_electra.py b/tests/test_modeling_tf_electra.py
index 1004576c84..2c1daf4557 100644
--- a/tests/test_modeling_tf_electra.py
+++ b/tests/test_modeling_tf_electra.py
@@ -39,7 +39,8 @@ if is_tf_available():
 
 class TFElectraModelTester:
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 13
diff --git a/tests/test_modeling_tf_flaubert.py b/tests/test_modeling_tf_flaubert.py
index cc44166902..dbbdc15b2a 100644
--- a/tests/test_modeling_tf_flaubert.py
+++ b/tests/test_modeling_tf_flaubert.py
@@ -40,7 +40,8 @@ if is_tf_available():
 
 class TFFlaubertModelTester:
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 13
@@ -337,7 +338,8 @@ class TFFlaubertModelIntegrationTest(unittest.TestCase):
         model = TFFlaubertModel.from_pretrained("jplu/tf-flaubert-small-cased")
 
         input_ids = tf.convert_to_tensor(
-            [[0, 158, 735, 2592, 1424, 6727, 82, 1]], dtype=tf.int32,
+            [[0, 158, 735, 2592, 1424, 6727, 82, 1]],
+            dtype=tf.int32,
         )  # "J'aime flaubert !"
 
         output = model(input_ids)[0]
diff --git a/tests/test_modeling_tf_gpt2.py b/tests/test_modeling_tf_gpt2.py
index 9a4aa1ca01..41b973719e 100644
--- a/tests/test_modeling_tf_gpt2.py
+++ b/tests/test_modeling_tf_gpt2.py
@@ -37,7 +37,8 @@ if is_tf_available():
 
 class TFGPT2ModelTester:
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 13
diff --git a/tests/test_modeling_tf_longformer.py b/tests/test_modeling_tf_longformer.py
index da884bffd4..090a61c84b 100644
--- a/tests/test_modeling_tf_longformer.py
+++ b/tests/test_modeling_tf_longformer.py
@@ -36,7 +36,7 @@ if is_tf_available():
 
     def shape_list(x):
         """
-            copied from transformers.modeling_tf_utils
+        copied from transformers.modeling_tf_utils
         """
         static = x.shape.as_list()
         dynamic = tf.shape(x)
@@ -45,7 +45,8 @@ if is_tf_available():
 
 class TFLongformerModelTester:
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 13
@@ -228,7 +229,8 @@ class TFLongformerModelTester:
         # global attention mask has to be partly defined
         # to trace all weights
         global_attention_mask = tf.concat(
-            [tf.zeros_like(input_ids)[:, :-1], tf.ones_like(input_ids)[:, -1:]], axis=-1,
+            [tf.zeros_like(input_ids)[:, :-1], tf.ones_like(input_ids)[:, -1:]],
+            axis=-1,
         )
 
         inputs_dict = {
@@ -267,7 +269,13 @@ class TFLongformerModelTest(TFModelTesterMixin, unittest.TestCase):
     test_torchscript = False
 
     all_model_classes = (
-        (TFLongformerModel, TFLongformerForMaskedLM, TFLongformerForQuestionAnswering,) if is_tf_available() else ()
+        (
+            TFLongformerModel,
+            TFLongformerForMaskedLM,
+            TFLongformerForQuestionAnswering,
+        )
+        if is_tf_available()
+        else ()
     )
 
     def setUp(self):
diff --git a/tests/test_modeling_tf_openai.py b/tests/test_modeling_tf_openai.py
index c73bb40763..e3bd82dae2 100644
--- a/tests/test_modeling_tf_openai.py
+++ b/tests/test_modeling_tf_openai.py
@@ -36,7 +36,8 @@ if is_tf_available():
 
 class TFOpenAIGPTModelTester:
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 13
diff --git a/tests/test_modeling_tf_roberta.py b/tests/test_modeling_tf_roberta.py
index 2380acd455..9a4d0b037d 100644
--- a/tests/test_modeling_tf_roberta.py
+++ b/tests/test_modeling_tf_roberta.py
@@ -40,7 +40,8 @@ if is_tf_available():
 
 class TFRobertaModelTester:
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 13
diff --git a/tests/test_modeling_tf_t5.py b/tests/test_modeling_tf_t5.py
index 9e2e03f434..eb575f5131 100644
--- a/tests/test_modeling_tf_t5.py
+++ b/tests/test_modeling_tf_t5.py
@@ -32,7 +32,8 @@ if is_tf_available():
 
 class TFT5ModelTester:
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 13
@@ -181,7 +182,10 @@ class TFT5ModelTester:
 
         # append to next input_ids and attn_mask
         next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        attn_mask = tf.concat([attn_mask, tf.ones((attn_mask.shape[0], 1), dtype=tf.int32)], axis=1,)
+        attn_mask = tf.concat(
+            [attn_mask, tf.ones((attn_mask.shape[0], 1), dtype=tf.int32)],
+            axis=1,
+        )
 
         # get two different outputs
         output_from_no_past = model(next_input_ids, attention_mask=attn_mask)[0]
diff --git a/tests/test_modeling_tf_transfo_xl.py b/tests/test_modeling_tf_transfo_xl.py
index 6525e4c45d..641d1165ec 100644
--- a/tests/test_modeling_tf_transfo_xl.py
+++ b/tests/test_modeling_tf_transfo_xl.py
@@ -32,7 +32,8 @@ if is_tf_available():
 
 class TFTransfoXLModelTester:
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 13
diff --git a/tests/test_modeling_tf_xlm.py b/tests/test_modeling_tf_xlm.py
index faee68f7ff..be7f7d1cf3 100644
--- a/tests/test_modeling_tf_xlm.py
+++ b/tests/test_modeling_tf_xlm.py
@@ -40,7 +40,8 @@ if is_tf_available():
 
 class TFXLMModelTester:
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 13
diff --git a/tests/test_modeling_tf_xlnet.py b/tests/test_modeling_tf_xlnet.py
index acf8cf80ae..9707686201 100644
--- a/tests/test_modeling_tf_xlnet.py
+++ b/tests/test_modeling_tf_xlnet.py
@@ -40,7 +40,8 @@ if is_tf_available():
 
 class TFXLNetModelTester:
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 13
diff --git a/tests/test_modeling_transfo_xl.py b/tests/test_modeling_transfo_xl.py
index ce04809c18..2c93243f95 100644
--- a/tests/test_modeling_transfo_xl.py
+++ b/tests/test_modeling_transfo_xl.py
@@ -32,7 +32,8 @@ if is_torch_available():
 
 class TransfoXLModelTester:
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 14
diff --git a/tests/test_modeling_xlm.py b/tests/test_modeling_xlm.py
index af77ad0402..da1ef130c0 100644
--- a/tests/test_modeling_xlm.py
+++ b/tests/test_modeling_xlm.py
@@ -41,7 +41,8 @@ if is_torch_available():
 
 class XLMModelTester:
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 13
diff --git a/tests/test_modeling_xlnet.py b/tests/test_modeling_xlnet.py
index 56f1810a09..9b38254016 100644
--- a/tests/test_modeling_xlnet.py
+++ b/tests/test_modeling_xlnet.py
@@ -104,10 +104,20 @@ class XLNetModelTester:
 
         input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size)
         perm_mask = torch.zeros(
-            self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float, device=torch_device,
+            self.batch_size,
+            self.seq_length + 1,
+            self.seq_length + 1,
+            dtype=torch.float,
+            device=torch_device,
         )
         perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
-        target_mapping = torch.zeros(self.batch_size, 1, self.seq_length + 1, dtype=torch.float, device=torch_device,)
+        target_mapping = torch.zeros(
+            self.batch_size,
+            1,
+            self.seq_length + 1,
+            dtype=torch.float,
+            device=torch_device,
+        )
         target_mapping[:, 0, -1] = 1.0  # predict last token
 
         sequence_labels = None
@@ -217,7 +227,11 @@ class XLNetModelTester:
 
         # first forward pass
         causal_mask = torch.ones(
-            input_ids_1.shape[0], input_ids_1.shape[1], input_ids_1.shape[1], dtype=torch.float, device=torch_device,
+            input_ids_1.shape[0],
+            input_ids_1.shape[1],
+            input_ids_1.shape[1],
+            dtype=torch.float,
+            device=torch_device,
         )
         causal_mask = torch.triu(causal_mask, diagonal=0)
         outputs_cache = model(input_ids_1, use_cache=True, perm_mask=causal_mask)
@@ -363,7 +377,11 @@ class XLNetModelTester:
 
         total_loss, mems = result_with_labels.to_tuple()
 
-        result_with_labels = model(input_ids_1, start_positions=sequence_labels, end_positions=sequence_labels,)
+        result_with_labels = model(
+            input_ids_1,
+            start_positions=sequence_labels,
+            end_positions=sequence_labels,
+        )
 
         total_loss, mems = result_with_labels.to_tuple()
 
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 205db94b4c..406ac765a4 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -164,7 +164,8 @@ class MonoColumnInputTestCase(unittest.TestCase):
             for result, expect in zip(multi_result, expected_multi_result):
                 for key in expected_check_keys or []:
                     self.assertEqual(
-                        set([o[key] for o in result]), set([o[key] for o in expect]),
+                        set([o[key] for o in result]),
+                        set([o[key] for o in expect]),
                     )
 
         if isinstance(multi_result[0], list):
@@ -214,7 +215,13 @@ class MonoColumnInputTestCase(unittest.TestCase):
             "This is"  # No mask_token is not supported
         ]
         for model_name in FILL_MASK_FINETUNED_MODELS:
-            nlp = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="pt", topk=2,)
+            nlp = pipeline(
+                task="fill-mask",
+                model=model_name,
+                tokenizer=model_name,
+                framework="pt",
+                topk=2,
+            )
             self._test_mono_column_pipeline(
                 nlp, valid_inputs, mandatory_keys, invalid_inputs, expected_check_keys=["sequence"]
             )
@@ -231,7 +238,13 @@ class MonoColumnInputTestCase(unittest.TestCase):
             "This is"  # No mask_token is not supported
         ]
         for model_name in FILL_MASK_FINETUNED_MODELS:
-            nlp = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="tf", topk=2,)
+            nlp = pipeline(
+                task="fill-mask",
+                model=model_name,
+                tokenizer=model_name,
+                framework="tf",
+                topk=2,
+            )
             self._test_mono_column_pipeline(
                 nlp, valid_inputs, mandatory_keys, invalid_inputs, expected_check_keys=["sequence"]
             )
@@ -274,7 +287,13 @@ class MonoColumnInputTestCase(unittest.TestCase):
         ]
         valid_targets = [" Patrick", " Clara"]
         for model_name in LARGE_FILL_MASK_FINETUNED_MODELS:
-            nlp = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="pt", topk=2,)
+            nlp = pipeline(
+                task="fill-mask",
+                model=model_name,
+                tokenizer=model_name,
+                framework="pt",
+                topk=2,
+            )
             self._test_mono_column_pipeline(
                 nlp,
                 valid_inputs,
@@ -343,7 +362,12 @@ class MonoColumnInputTestCase(unittest.TestCase):
         invalid_inputs = [4, "<mask>"]
         mandatory_keys = ["summary_text"]
         for model_name in TF_SUMMARIZATION_FINETUNED_MODELS:
-            nlp = pipeline(task="summarization", model=model_name, tokenizer=model_name, framework="tf",)
+            nlp = pipeline(
+                task="summarization",
+                model=model_name,
+                tokenizer=model_name,
+                framework="tf",
+            )
             self._test_mono_column_pipeline(
                 nlp, VALID_INPUTS, mandatory_keys, invalid_inputs=invalid_inputs, **SUMMARIZATION_KWARGS
             )
@@ -355,7 +379,10 @@ class MonoColumnInputTestCase(unittest.TestCase):
         for model_name, task in TRANSLATION_FINETUNED_MODELS:
             nlp = pipeline(task=task, model=model_name, tokenizer=model_name)
             self._test_mono_column_pipeline(
-                nlp, VALID_INPUTS, mandatory_keys, invalid_inputs,
+                nlp,
+                VALID_INPUTS,
+                mandatory_keys,
+                invalid_inputs,
             )
 
     @require_tf
@@ -655,7 +682,9 @@ class QAPipelineTests(unittest.TestCase):
 
 class NerPipelineTests(unittest.TestCase):
     def _test_ner_pipeline(
-        self, nlp: Pipeline, output_keys: Iterable[str],
+        self,
+        nlp: Pipeline,
+        output_keys: Iterable[str],
     ):
 
         ungrouped_ner_inputs = [
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index 8da99531e6..55c14bfacb 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -882,8 +882,7 @@ class TokenizerTesterMixin:
                 assert encoded_sequence == padded_sequence_left
 
     def test_padding_to_max_length(self):
-        """ We keep this test for backward compatibility but it should be remove when `pad_to_max_length` will e deprecated
-        """
+        """We keep this test for backward compatibility but it should be remove when `pad_to_max_length` will e deprecated"""
         tokenizers = self.get_tokenizers(do_lower_case=False)
         for tokenizer in tokenizers:
             with self.subTest(f"{tokenizer.__class__.__name__}"):
@@ -972,7 +971,11 @@ class TokenizerTesterMixin:
                 # Test 'longest' and 'no_padding' don't do anything
                 tokenizer.padding_side = "right"
 
-                not_padded_sequence = tokenizer.encode_plus(sequence, padding=True, return_special_tokens_mask=True,)
+                not_padded_sequence = tokenizer.encode_plus(
+                    sequence,
+                    padding=True,
+                    return_special_tokens_mask=True,
+                )
                 not_padded_input_ids = not_padded_sequence["input_ids"]
 
                 not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
@@ -982,7 +985,11 @@ class TokenizerTesterMixin:
                 assert input_ids == not_padded_input_ids
                 assert special_tokens_mask == not_padded_special_tokens_mask
 
-                not_padded_sequence = tokenizer.encode_plus(sequence, padding=False, return_special_tokens_mask=True,)
+                not_padded_sequence = tokenizer.encode_plus(
+                    sequence,
+                    padding=False,
+                    return_special_tokens_mask=True,
+                )
                 not_padded_input_ids = not_padded_sequence["input_ids"]
 
                 not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
@@ -1148,7 +1155,8 @@ class TokenizerTesterMixin:
                 )
                 for key in encoded_sequences_batch_padded_1.keys():
                     self.assertListEqual(
-                        encoded_sequences_batch_padded_1[key], encoded_sequences_batch_padded_2[key],
+                        encoded_sequences_batch_padded_1[key],
+                        encoded_sequences_batch_padded_2[key],
                     )
 
                 # check 'no_padding' is unsensitive to a max length
@@ -1158,7 +1166,8 @@ class TokenizerTesterMixin:
                 )
                 for key in encoded_sequences_batch_padded_1.keys():
                     self.assertListEqual(
-                        encoded_sequences_batch_padded_1[key], encoded_sequences_batch_padded_2[key],
+                        encoded_sequences_batch_padded_1[key],
+                        encoded_sequences_batch_padded_2[key],
                     )
 
     def test_added_token_serializable(self):
@@ -1361,10 +1370,18 @@ class TokenizerTesterMixin:
 
                 if tokenizer.pad_token_id is None:
                     self.assertRaises(
-                        ValueError, tokenizer.batch_encode_plus, sequences, padding=True, return_tensors="pt",
+                        ValueError,
+                        tokenizer.batch_encode_plus,
+                        sequences,
+                        padding=True,
+                        return_tensors="pt",
                     )
                     self.assertRaises(
-                        ValueError, tokenizer.batch_encode_plus, sequences, padding="longest", return_tensors="tf",
+                        ValueError,
+                        tokenizer.batch_encode_plus,
+                        sequences,
+                        padding="longest",
+                        return_tensors="tf",
                     )
                 else:
                     pytorch_tensor = tokenizer.batch_encode_plus(sequences, padding=True, return_tensors="pt")
diff --git a/tests/test_tokenization_fast.py b/tests/test_tokenization_fast.py
index 399aae8b78..4dcf0bf896 100644
--- a/tests/test_tokenization_fast.py
+++ b/tests/test_tokenization_fast.py
@@ -228,7 +228,8 @@ class CommonFastTokenizerTest(unittest.TestCase):
     def assert_special_tokens_map_equal(self, tokenizer_r, tokenizer_p):
         # Assert the set of special tokens match.
         self.assertSequenceEqual(
-            tokenizer_p.special_tokens_map.items(), tokenizer_r.special_tokens_map.items(),
+            tokenizer_p.special_tokens_map.items(),
+            tokenizer_r.special_tokens_map.items(),
         )
 
     def assert_add_tokens(self, tokenizer_r):
@@ -544,18 +545,26 @@ class CommonFastTokenizerTest(unittest.TestCase):
         assert_batch_padded_input_match(input_r, input_p, max_length)
 
         input_r = tokenizer_r.batch_encode_plus(
-            ["This is a simple input 1", "This is a simple input 2"], max_length=max_length, padding="max_length",
+            ["This is a simple input 1", "This is a simple input 2"],
+            max_length=max_length,
+            padding="max_length",
         )
         input_p = tokenizer_p.batch_encode_plus(
-            ["This is a simple input 1", "This is a simple input 2"], max_length=max_length, padding="max_length",
+            ["This is a simple input 1", "This is a simple input 2"],
+            max_length=max_length,
+            padding="max_length",
         )
         assert_batch_padded_input_match(input_r, input_p, max_length)
 
         input_r = tokenizer_r.batch_encode_plus(
-            ["This is a simple input 1", "This is a simple input 2"], max_length=max_length, padding="longest",
+            ["This is a simple input 1", "This is a simple input 2"],
+            max_length=max_length,
+            padding="longest",
         )
         input_p = tokenizer_p.batch_encode_plus(
-            ["This is a simple input 1", "This is a simple input 2"], max_length=max_length, padding=True,
+            ["This is a simple input 1", "This is a simple input 2"],
+            max_length=max_length,
+            padding=True,
         )
         assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))
 
@@ -865,7 +874,11 @@ class NoPaddingTokenFastTokenizerMatchingTest(CommonFastTokenizerTest):
 
         # Simple input
         self.assertRaises(
-            ValueError, tokenizer_r.batch_encode_plus, s2, max_length=max_length, padding="max_length",
+            ValueError,
+            tokenizer_r.batch_encode_plus,
+            s2,
+            max_length=max_length,
+            padding="max_length",
         )
 
         # Pair input
@@ -876,7 +889,11 @@ class NoPaddingTokenFastTokenizerMatchingTest(CommonFastTokenizerTest):
 
         # Pair input
         self.assertRaises(
-            ValueError, tokenizer_r.batch_encode_plus, p2, max_length=max_length, padding="max_length",
+            ValueError,
+            tokenizer_r.batch_encode_plus,
+            p2,
+            max_length=max_length,
+            padding="max_length",
         )
 
 
diff --git a/tests/test_tokenization_mbart.py b/tests/test_tokenization_mbart.py
index d8b1ae18f4..bda0be3aec 100644
--- a/tests/test_tokenization_mbart.py
+++ b/tests/test_tokenization_mbart.py
@@ -125,7 +125,9 @@ class MBartEnroIntegrationTest(unittest.TestCase):
 
     def test_enro_tokenizer_prepare_seq2seq_batch(self):
         batch = self.tokenizer.prepare_seq2seq_batch(
-            self.src_text, tgt_texts=self.tgt_text, max_length=len(self.expected_src_tokens),
+            self.src_text,
+            tgt_texts=self.tgt_text,
+            max_length=len(self.expected_src_tokens),
         )
         self.assertIsInstance(batch, BatchEncoding)
 
diff --git a/tests/test_tokenization_reformer.py b/tests/test_tokenization_reformer.py
index 1e926a3b66..239ce1d594 100644
--- a/tests/test_tokenization_reformer.py
+++ b/tests/test_tokenization_reformer.py
@@ -44,7 +44,8 @@ class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
 
         self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382],
+            tokenizer.convert_tokens_to_ids(tokens),
+            [285, 46, 10, 170, 382],
         )
 
         tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
@@ -76,7 +77,8 @@ class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         )
         ids = tokenizer.convert_tokens_to_ids(tokens)
         self.assertListEqual(
-            ids, [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4],
+            ids,
+            [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4],
         )
 
         back_tokens = tokenizer.convert_ids_to_tokens(ids)
diff --git a/tests/test_tokenization_t5.py b/tests/test_tokenization_t5.py
index eca64ad16a..a974da8baf 100644
--- a/tests/test_tokenization_t5.py
+++ b/tests/test_tokenization_t5.py
@@ -126,7 +126,11 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             "Another summary.",
         ]
         expected_src_tokens = [71, 307, 8986, 21, 4505, 51, 52, 1707, 5, tokenizer.eos_token_id]
-        batch = tokenizer.prepare_seq2seq_batch(src_text, tgt_texts=tgt_text, return_tensors=FRAMEWORK,)
+        batch = tokenizer.prepare_seq2seq_batch(
+            src_text,
+            tgt_texts=tgt_text,
+            return_tensors=FRAMEWORK,
+        )
         self.assertIsInstance(batch, BatchEncoding)
         result = list(batch.input_ids.numpy()[0])
         self.assertListEqual(expected_src_tokens, result)
diff --git a/tests/test_trainer.py b/tests/test_trainer.py
index 4f0a56ec78..e8dc86ea6f 100755
--- a/tests/test_trainer.py
+++ b/tests/test_trainer.py
@@ -275,7 +275,9 @@ class TrainerIntegrationTest(unittest.TestCase):
         MODEL_ID = "distilroberta-base"
         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
         dataset = LineByLineTextDataset(
-            tokenizer=tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=tokenizer.max_len_single_sentence,
+            tokenizer=tokenizer,
+            file_path=PATH_SAMPLE_TEXT,
+            block_size=tokenizer.max_len_single_sentence,
         )
         self.assertEqual(len(dataset), 31)
 
diff --git a/utils/link_tester.py b/utils/link_tester.py
index ff53eb7049..3c8a58f93e 100644
--- a/utils/link_tester.py
+++ b/utils/link_tester.py
@@ -18,7 +18,7 @@ S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"
 
 
 def list_python_files_in_repository():
-    """ List all python files in the repository.
+    """List all python files in the repository.
 
     This function assumes that the script is executed in the root folder.
     """
@@ -43,7 +43,7 @@ def find_all_links(file_paths):
 
 
 def scan_code_for_links(source):
-    """ Scans the file to find links using a regular expression.
+    """Scans the file to find links using a regular expression.
     Returns a list of links.
     """
     with open(source, "r") as content:
@@ -55,7 +55,7 @@ def scan_code_for_links(source):
 
 
 def check_all_links(links):
-    """ Check that the provided links are valid.
+    """Check that the provided links are valid.
 
     Links are considered valid if a HEAD request to the server
     returns a 200 status code.